{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 7344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013616557734204794, "grad_norm": 114.1466307381031, "learning_rate": 1.0884353741496598e-09, "logits/chosen": -8.211344718933105, "logits/rejected": -7.574012756347656, "logps/chosen": -1.1677236557006836, "logps/rejected": -1.2345831394195557, "loss": 5.8192, "rewards/accuracies": 0.5, "rewards/chosen": -11.677236557006836, "rewards/margins": 0.6685943603515625, "rewards/rejected": -12.345830917358398, "step": 1 }, { "epoch": 0.0002723311546840959, "grad_norm": 76.86048464045298, "learning_rate": 2.1768707482993195e-09, "logits/chosen": -9.145951271057129, "logits/rejected": -7.078125476837158, "logps/chosen": -1.578005313873291, "logps/rejected": -2.109044075012207, "loss": 5.2105, "rewards/accuracies": 0.5, "rewards/chosen": -15.780054092407227, "rewards/margins": 5.310388088226318, "rewards/rejected": -21.090442657470703, "step": 2 }, { "epoch": 0.0004084967320261438, "grad_norm": 76.43961845113635, "learning_rate": 3.2653061224489797e-09, "logits/chosen": -7.927372932434082, "logits/rejected": -6.205313205718994, "logps/chosen": -2.5349032878875732, "logps/rejected": -2.0752487182617188, "loss": 4.8378, "rewards/accuracies": 0.5, "rewards/chosen": -25.34903335571289, "rewards/margins": -4.596545696258545, "rewards/rejected": -20.752487182617188, "step": 3 }, { "epoch": 0.0005446623093681918, "grad_norm": 77.98031568789776, "learning_rate": 4.353741496598639e-09, "logits/chosen": -5.9748406410217285, "logits/rejected": -7.572969436645508, "logps/chosen": -3.1493289470672607, "logps/rejected": -2.46872615814209, "loss": 5.215, "rewards/accuracies": 0.5, "rewards/chosen": -31.493289947509766, "rewards/margins": -6.8060302734375, "rewards/rejected": -24.6872615814209, "step": 4 }, { "epoch": 0.0006808278867102397, "grad_norm": 87.3306205376835, "learning_rate": 5.442176870748299e-09, "logits/chosen": -7.935755729675293, "logits/rejected": -7.728808879852295, "logps/chosen": -1.6264891624450684, "logps/rejected": -1.3496191501617432, "loss": 4.7414, "rewards/accuracies": 0.25, "rewards/chosen": -16.264892578125, "rewards/margins": -2.768700361251831, "rewards/rejected": -13.496191024780273, "step": 5 }, { "epoch": 0.0008169934640522876, "grad_norm": 164.49533571987374, "learning_rate": 6.5306122448979594e-09, "logits/chosen": -7.936881065368652, "logits/rejected": -7.454012393951416, "logps/chosen": -1.0871236324310303, "logps/rejected": -1.2872685194015503, "loss": 6.2777, "rewards/accuracies": 0.5, "rewards/chosen": -10.871236801147461, "rewards/margins": 2.0014491081237793, "rewards/rejected": -12.872684478759766, "step": 6 }, { "epoch": 0.0009531590413943355, "grad_norm": 167.48787329226752, "learning_rate": 7.61904761904762e-09, "logits/chosen": -8.905806541442871, "logits/rejected": -6.656863212585449, "logps/chosen": -1.5222654342651367, "logps/rejected": -3.3753795623779297, "loss": 6.6024, "rewards/accuracies": 1.0, "rewards/chosen": -15.222654342651367, "rewards/margins": 18.531143188476562, "rewards/rejected": -33.7537956237793, "step": 7 }, { "epoch": 0.0010893246187363835, "grad_norm": 127.86369516545541, "learning_rate": 8.707482993197278e-09, "logits/chosen": -8.361207008361816, "logits/rejected": -7.8086347579956055, "logps/chosen": -1.3292715549468994, "logps/rejected": -3.623223304748535, "loss": 6.387, "rewards/accuracies": 1.0, "rewards/chosen": -13.292716026306152, "rewards/margins": 22.939516067504883, "rewards/rejected": -36.23223114013672, "step": 8 }, { "epoch": 0.0012254901960784314, "grad_norm": 171.52450106839234, "learning_rate": 9.795918367346937e-09, "logits/chosen": -8.539985656738281, "logits/rejected": -7.584209442138672, "logps/chosen": -1.1267136335372925, "logps/rejected": -1.5293104648590088, "loss": 5.6042, "rewards/accuracies": 0.5, "rewards/chosen": -11.267135620117188, "rewards/margins": 4.0259690284729, "rewards/rejected": -15.293105125427246, "step": 9 }, { "epoch": 0.0013616557734204794, "grad_norm": 187.39916682211575, "learning_rate": 1.0884353741496598e-08, "logits/chosen": -8.87921142578125, "logits/rejected": -9.262184143066406, "logps/chosen": -0.7852182388305664, "logps/rejected": -0.6963953971862793, "loss": 6.9377, "rewards/accuracies": 0.25, "rewards/chosen": -7.852182388305664, "rewards/margins": -0.8882288932800293, "rewards/rejected": -6.963953495025635, "step": 10 }, { "epoch": 0.0014978213507625272, "grad_norm": 117.86843909063884, "learning_rate": 1.1972789115646258e-08, "logits/chosen": -8.689042091369629, "logits/rejected": -9.15873908996582, "logps/chosen": -1.6882120370864868, "logps/rejected": -1.5802668333053589, "loss": 7.3226, "rewards/accuracies": 0.5, "rewards/chosen": -16.88212013244629, "rewards/margins": -1.0794520378112793, "rewards/rejected": -15.802667617797852, "step": 11 }, { "epoch": 0.0016339869281045752, "grad_norm": 113.2337766979685, "learning_rate": 1.3061224489795919e-08, "logits/chosen": -7.202520370483398, "logits/rejected": -8.234952926635742, "logps/chosen": -2.2109673023223877, "logps/rejected": -2.203378915786743, "loss": 5.1287, "rewards/accuracies": 0.25, "rewards/chosen": -22.10967445373535, "rewards/margins": -0.07588529586791992, "rewards/rejected": -22.033788681030273, "step": 12 }, { "epoch": 0.001770152505446623, "grad_norm": 94.17930024704908, "learning_rate": 1.414965986394558e-08, "logits/chosen": -8.136855125427246, "logits/rejected": -8.138833045959473, "logps/chosen": -1.1407005786895752, "logps/rejected": -1.114087462425232, "loss": 5.523, "rewards/accuracies": 0.5, "rewards/chosen": -11.407005310058594, "rewards/margins": -0.26613128185272217, "rewards/rejected": -11.140874862670898, "step": 13 }, { "epoch": 0.001906318082788671, "grad_norm": 86.03672701426065, "learning_rate": 1.523809523809524e-08, "logits/chosen": -9.734512329101562, "logits/rejected": -9.69428539276123, "logps/chosen": -1.0596859455108643, "logps/rejected": -1.180835485458374, "loss": 4.694, "rewards/accuracies": 0.5, "rewards/chosen": -10.596858978271484, "rewards/margins": 1.2114964723587036, "rewards/rejected": -11.808355331420898, "step": 14 }, { "epoch": 0.002042483660130719, "grad_norm": 79.29614050556592, "learning_rate": 1.6326530612244897e-08, "logits/chosen": -7.513303756713867, "logits/rejected": -8.469420433044434, "logps/chosen": -1.335381031036377, "logps/rejected": -1.1892902851104736, "loss": 5.5952, "rewards/accuracies": 0.5, "rewards/chosen": -13.353809356689453, "rewards/margins": -1.460907220840454, "rewards/rejected": -11.892902374267578, "step": 15 }, { "epoch": 0.002178649237472767, "grad_norm": 125.4236844549834, "learning_rate": 1.7414965986394556e-08, "logits/chosen": -9.610332489013672, "logits/rejected": -9.503217697143555, "logps/chosen": -1.2437952756881714, "logps/rejected": -1.767426609992981, "loss": 5.9875, "rewards/accuracies": 0.75, "rewards/chosen": -12.43795394897461, "rewards/margins": 5.236311912536621, "rewards/rejected": -17.674264907836914, "step": 16 }, { "epoch": 0.0023148148148148147, "grad_norm": 130.21391405647876, "learning_rate": 1.8503401360544215e-08, "logits/chosen": -8.501277923583984, "logits/rejected": -9.421195030212402, "logps/chosen": -1.3410167694091797, "logps/rejected": -1.0022532939910889, "loss": 6.4125, "rewards/accuracies": 0.0, "rewards/chosen": -13.410167694091797, "rewards/margins": -3.387634754180908, "rewards/rejected": -10.02253246307373, "step": 17 }, { "epoch": 0.0024509803921568627, "grad_norm": 115.77977594533623, "learning_rate": 1.9591836734693874e-08, "logits/chosen": -9.150215148925781, "logits/rejected": -7.175837516784668, "logps/chosen": -1.1750270128250122, "logps/rejected": -1.6741033792495728, "loss": 6.0578, "rewards/accuracies": 1.0, "rewards/chosen": -11.75027084350586, "rewards/margins": 4.990763187408447, "rewards/rejected": -16.74103355407715, "step": 18 }, { "epoch": 0.0025871459694989107, "grad_norm": 118.1429288190624, "learning_rate": 2.0680272108843536e-08, "logits/chosen": -8.59371280670166, "logits/rejected": -8.491493225097656, "logps/chosen": -4.535824298858643, "logps/rejected": -1.898592233657837, "loss": 5.3852, "rewards/accuracies": 0.5, "rewards/chosen": -45.358238220214844, "rewards/margins": -26.372318267822266, "rewards/rejected": -18.98592185974121, "step": 19 }, { "epoch": 0.0027233115468409588, "grad_norm": 127.5721256093001, "learning_rate": 2.1768707482993195e-08, "logits/chosen": -7.222818374633789, "logits/rejected": -7.5780534744262695, "logps/chosen": -1.5290307998657227, "logps/rejected": -2.143995523452759, "loss": 6.2404, "rewards/accuracies": 0.75, "rewards/chosen": -15.290307998657227, "rewards/margins": 6.149646759033203, "rewards/rejected": -21.439956665039062, "step": 20 }, { "epoch": 0.0028594771241830064, "grad_norm": 99.40704965752145, "learning_rate": 2.2857142857142854e-08, "logits/chosen": -7.122776031494141, "logits/rejected": -8.08615779876709, "logps/chosen": -1.611696481704712, "logps/rejected": -1.4275271892547607, "loss": 5.4325, "rewards/accuracies": 0.25, "rewards/chosen": -16.11696434020996, "rewards/margins": -1.8416929244995117, "rewards/rejected": -14.27527141571045, "step": 21 }, { "epoch": 0.0029956427015250544, "grad_norm": 89.36148588522657, "learning_rate": 2.3945578231292517e-08, "logits/chosen": -7.899971008300781, "logits/rejected": -7.134831428527832, "logps/chosen": -1.1833109855651855, "logps/rejected": -1.4166796207427979, "loss": 4.9541, "rewards/accuracies": 0.5, "rewards/chosen": -11.833109855651855, "rewards/margins": 2.333686113357544, "rewards/rejected": -14.16679573059082, "step": 22 }, { "epoch": 0.0031318082788671024, "grad_norm": 152.42598823063548, "learning_rate": 2.503401360544218e-08, "logits/chosen": -7.501503944396973, "logits/rejected": -7.411863327026367, "logps/chosen": -1.0078234672546387, "logps/rejected": -1.2176735401153564, "loss": 6.4242, "rewards/accuracies": 0.5, "rewards/chosen": -10.078235626220703, "rewards/margins": 2.0984997749328613, "rewards/rejected": -12.176734924316406, "step": 23 }, { "epoch": 0.0032679738562091504, "grad_norm": 117.57280305127702, "learning_rate": 2.6122448979591838e-08, "logits/chosen": -8.515674591064453, "logits/rejected": -8.054911613464355, "logps/chosen": -1.6368861198425293, "logps/rejected": -1.5409890413284302, "loss": 6.3687, "rewards/accuracies": 0.25, "rewards/chosen": -16.368860244750977, "rewards/margins": -0.9589707851409912, "rewards/rejected": -15.409890174865723, "step": 24 }, { "epoch": 0.0034041394335511985, "grad_norm": 74.29078068598226, "learning_rate": 2.7210884353741497e-08, "logits/chosen": -8.20907974243164, "logits/rejected": -7.5154643058776855, "logps/chosen": -1.421484112739563, "logps/rejected": -1.5936205387115479, "loss": 5.1423, "rewards/accuracies": 0.25, "rewards/chosen": -14.214841842651367, "rewards/margins": 1.721365213394165, "rewards/rejected": -15.936205863952637, "step": 25 }, { "epoch": 0.003540305010893246, "grad_norm": 73.30416088801161, "learning_rate": 2.829931972789116e-08, "logits/chosen": -8.965385437011719, "logits/rejected": -8.74022102355957, "logps/chosen": -1.210025429725647, "logps/rejected": -1.1828711032867432, "loss": 5.1587, "rewards/accuracies": 0.5, "rewards/chosen": -12.10025405883789, "rewards/margins": -0.2715420722961426, "rewards/rejected": -11.82871150970459, "step": 26 }, { "epoch": 0.003676470588235294, "grad_norm": 69.57707526154249, "learning_rate": 2.9387755102040818e-08, "logits/chosen": -7.099234104156494, "logits/rejected": -7.478131294250488, "logps/chosen": -1.679986596107483, "logps/rejected": -1.6992101669311523, "loss": 4.7157, "rewards/accuracies": 0.75, "rewards/chosen": -16.79986572265625, "rewards/margins": 0.19223499298095703, "rewards/rejected": -16.992101669311523, "step": 27 }, { "epoch": 0.003812636165577342, "grad_norm": 76.89270114914073, "learning_rate": 3.047619047619048e-08, "logits/chosen": -8.429234504699707, "logits/rejected": -8.70054817199707, "logps/chosen": -1.4596924781799316, "logps/rejected": -1.416865587234497, "loss": 4.9061, "rewards/accuracies": 0.5, "rewards/chosen": -14.596925735473633, "rewards/margins": -0.42826974391937256, "rewards/rejected": -14.168655395507812, "step": 28 }, { "epoch": 0.00394880174291939, "grad_norm": 195.9487427258406, "learning_rate": 3.156462585034013e-08, "logits/chosen": -9.023954391479492, "logits/rejected": -8.18747329711914, "logps/chosen": -2.179352045059204, "logps/rejected": -2.6804957389831543, "loss": 5.9604, "rewards/accuracies": 0.75, "rewards/chosen": -21.793519973754883, "rewards/margins": 5.011435508728027, "rewards/rejected": -26.804956436157227, "step": 29 }, { "epoch": 0.004084967320261438, "grad_norm": 137.52642175006892, "learning_rate": 3.2653061224489795e-08, "logits/chosen": -10.156384468078613, "logits/rejected": -8.7210693359375, "logps/chosen": -1.4117178916931152, "logps/rejected": -1.9476935863494873, "loss": 6.4322, "rewards/accuracies": 1.0, "rewards/chosen": -14.117178916931152, "rewards/margins": 5.359757423400879, "rewards/rejected": -19.47693634033203, "step": 30 }, { "epoch": 0.004221132897603486, "grad_norm": 77.64172540319154, "learning_rate": 3.3741496598639454e-08, "logits/chosen": -8.296278953552246, "logits/rejected": -8.093879699707031, "logps/chosen": -1.420203685760498, "logps/rejected": -1.0650157928466797, "loss": 5.6083, "rewards/accuracies": 0.25, "rewards/chosen": -14.202037811279297, "rewards/margins": -3.551880121231079, "rewards/rejected": -10.650157928466797, "step": 31 }, { "epoch": 0.004357298474945534, "grad_norm": 224.1673890854367, "learning_rate": 3.482993197278911e-08, "logits/chosen": -9.610221862792969, "logits/rejected": -8.1019287109375, "logps/chosen": -1.2892544269561768, "logps/rejected": -1.3113367557525635, "loss": 6.3824, "rewards/accuracies": 0.5, "rewards/chosen": -12.892545700073242, "rewards/margins": 0.22082161903381348, "rewards/rejected": -13.113367080688477, "step": 32 }, { "epoch": 0.004493464052287581, "grad_norm": 97.9362092198681, "learning_rate": 3.591836734693877e-08, "logits/chosen": -8.641178131103516, "logits/rejected": -7.257102966308594, "logps/chosen": -0.9310708045959473, "logps/rejected": -1.1190235614776611, "loss": 5.8542, "rewards/accuracies": 0.5, "rewards/chosen": -9.310707092285156, "rewards/margins": 1.8795278072357178, "rewards/rejected": -11.190235137939453, "step": 33 }, { "epoch": 0.004629629629629629, "grad_norm": 78.90814047381855, "learning_rate": 3.700680272108843e-08, "logits/chosen": -9.328125, "logits/rejected": -7.129678726196289, "logps/chosen": -0.792837381362915, "logps/rejected": -1.184739589691162, "loss": 5.437, "rewards/accuracies": 1.0, "rewards/chosen": -7.928374290466309, "rewards/margins": 3.91902232170105, "rewards/rejected": -11.847396850585938, "step": 34 }, { "epoch": 0.004765795206971677, "grad_norm": 83.84449815163184, "learning_rate": 3.809523809523809e-08, "logits/chosen": -10.108911514282227, "logits/rejected": -8.289352416992188, "logps/chosen": -0.83558589220047, "logps/rejected": -0.9919118881225586, "loss": 5.3822, "rewards/accuracies": 0.75, "rewards/chosen": -8.355859756469727, "rewards/margins": 1.5632598400115967, "rewards/rejected": -9.919118881225586, "step": 35 }, { "epoch": 0.004901960784313725, "grad_norm": 56.733587648120654, "learning_rate": 3.918367346938775e-08, "logits/chosen": -7.068085670471191, "logits/rejected": -7.348333835601807, "logps/chosen": -1.414238691329956, "logps/rejected": -1.5801136493682861, "loss": 4.5908, "rewards/accuracies": 0.25, "rewards/chosen": -14.142387390136719, "rewards/margins": 1.6587491035461426, "rewards/rejected": -15.801136016845703, "step": 36 }, { "epoch": 0.0050381263616557734, "grad_norm": 147.24495848753503, "learning_rate": 4.0272108843537414e-08, "logits/chosen": -7.519924163818359, "logits/rejected": -8.549153327941895, "logps/chosen": -1.178124189376831, "logps/rejected": -1.1782786846160889, "loss": 5.1081, "rewards/accuracies": 0.5, "rewards/chosen": -11.781242370605469, "rewards/margins": 0.0015451908111572266, "rewards/rejected": -11.782787322998047, "step": 37 }, { "epoch": 0.0051742919389978215, "grad_norm": 82.33515551791946, "learning_rate": 4.136054421768707e-08, "logits/chosen": -8.169211387634277, "logits/rejected": -7.195664882659912, "logps/chosen": -1.086193323135376, "logps/rejected": -2.0211470127105713, "loss": 4.4568, "rewards/accuracies": 1.0, "rewards/chosen": -10.861932754516602, "rewards/margins": 9.349535942077637, "rewards/rejected": -20.211467742919922, "step": 38 }, { "epoch": 0.0053104575163398695, "grad_norm": 154.33189417762586, "learning_rate": 4.244897959183673e-08, "logits/chosen": -6.354782581329346, "logits/rejected": -6.38871955871582, "logps/chosen": -2.1581192016601562, "logps/rejected": -1.4443436861038208, "loss": 6.4702, "rewards/accuracies": 0.25, "rewards/chosen": -21.581192016601562, "rewards/margins": -7.137753963470459, "rewards/rejected": -14.443437576293945, "step": 39 }, { "epoch": 0.0054466230936819175, "grad_norm": 84.10312094932894, "learning_rate": 4.353741496598639e-08, "logits/chosen": -11.454967498779297, "logits/rejected": -7.408640384674072, "logps/chosen": -1.6280879974365234, "logps/rejected": -1.2988227605819702, "loss": 5.1652, "rewards/accuracies": 0.5, "rewards/chosen": -16.280879974365234, "rewards/margins": -3.292651414871216, "rewards/rejected": -12.988227844238281, "step": 40 }, { "epoch": 0.0055827886710239655, "grad_norm": 98.41552039031836, "learning_rate": 4.462585034013605e-08, "logits/chosen": -9.39912223815918, "logits/rejected": -8.352947235107422, "logps/chosen": -0.8734592795372009, "logps/rejected": -1.1101105213165283, "loss": 6.844, "rewards/accuracies": 1.0, "rewards/chosen": -8.73459243774414, "rewards/margins": 2.366513252258301, "rewards/rejected": -11.101105690002441, "step": 41 }, { "epoch": 0.005718954248366013, "grad_norm": 133.43674955444925, "learning_rate": 4.571428571428571e-08, "logits/chosen": -7.2400221824646, "logits/rejected": -8.104711532592773, "logps/chosen": -2.300874710083008, "logps/rejected": -1.7017452716827393, "loss": 6.6227, "rewards/accuracies": 0.25, "rewards/chosen": -23.008747100830078, "rewards/margins": -5.991292953491211, "rewards/rejected": -17.017454147338867, "step": 42 }, { "epoch": 0.005855119825708061, "grad_norm": 124.26319880689016, "learning_rate": 4.680272108843537e-08, "logits/chosen": -6.776821136474609, "logits/rejected": -7.68208122253418, "logps/chosen": -1.5143163204193115, "logps/rejected": -1.5395972728729248, "loss": 6.8612, "rewards/accuracies": 0.5, "rewards/chosen": -15.143162727355957, "rewards/margins": 0.2528102397918701, "rewards/rejected": -15.395973205566406, "step": 43 }, { "epoch": 0.005991285403050109, "grad_norm": 144.04833755610252, "learning_rate": 4.789115646258503e-08, "logits/chosen": -9.737911224365234, "logits/rejected": -7.971320152282715, "logps/chosen": -0.9367937445640564, "logps/rejected": -1.076307773590088, "loss": 7.0768, "rewards/accuracies": 0.5, "rewards/chosen": -9.367937088012695, "rewards/margins": 1.395140290260315, "rewards/rejected": -10.763077735900879, "step": 44 }, { "epoch": 0.006127450980392157, "grad_norm": 101.72183976160838, "learning_rate": 4.897959183673469e-08, "logits/chosen": -7.6925578117370605, "logits/rejected": -7.423181533813477, "logps/chosen": -1.545872449874878, "logps/rejected": -1.6742762327194214, "loss": 5.727, "rewards/accuracies": 0.5, "rewards/chosen": -15.458723068237305, "rewards/margins": 1.2840385437011719, "rewards/rejected": -16.74276351928711, "step": 45 }, { "epoch": 0.006263616557734205, "grad_norm": 125.3616686943247, "learning_rate": 5.006802721088436e-08, "logits/chosen": -7.420891761779785, "logits/rejected": -6.859945297241211, "logps/chosen": -4.174806118011475, "logps/rejected": -1.7693819999694824, "loss": 6.2751, "rewards/accuracies": 0.75, "rewards/chosen": -41.7480583190918, "rewards/margins": -24.054237365722656, "rewards/rejected": -17.693819046020508, "step": 46 }, { "epoch": 0.006399782135076253, "grad_norm": 96.64040500873067, "learning_rate": 5.115646258503401e-08, "logits/chosen": -8.11874771118164, "logits/rejected": -7.624804496765137, "logps/chosen": -1.5863895416259766, "logps/rejected": -1.7539196014404297, "loss": 5.3442, "rewards/accuracies": 0.75, "rewards/chosen": -15.863895416259766, "rewards/margins": 1.6753010749816895, "rewards/rejected": -17.539196014404297, "step": 47 }, { "epoch": 0.006535947712418301, "grad_norm": 99.52356907473843, "learning_rate": 5.2244897959183676e-08, "logits/chosen": -7.503182411193848, "logits/rejected": -7.31477165222168, "logps/chosen": -1.638970136642456, "logps/rejected": -2.0264317989349365, "loss": 5.4639, "rewards/accuracies": 0.5, "rewards/chosen": -16.38970184326172, "rewards/margins": 3.8746156692504883, "rewards/rejected": -20.26431655883789, "step": 48 }, { "epoch": 0.006672113289760349, "grad_norm": 108.7423549903766, "learning_rate": 5.333333333333333e-08, "logits/chosen": -6.89417839050293, "logits/rejected": -7.197060585021973, "logps/chosen": -1.3763270378112793, "logps/rejected": -0.9428555965423584, "loss": 5.5456, "rewards/accuracies": 0.0, "rewards/chosen": -13.763269424438477, "rewards/margins": -4.334713935852051, "rewards/rejected": -9.428556442260742, "step": 49 }, { "epoch": 0.006808278867102397, "grad_norm": 130.15832449324566, "learning_rate": 5.4421768707482993e-08, "logits/chosen": -9.584125518798828, "logits/rejected": -7.581586837768555, "logps/chosen": -1.2842180728912354, "logps/rejected": -1.396111011505127, "loss": 5.9542, "rewards/accuracies": 0.5, "rewards/chosen": -12.842180252075195, "rewards/margins": 1.1189295053482056, "rewards/rejected": -13.961109161376953, "step": 50 }, { "epoch": 0.006944444444444444, "grad_norm": 110.64279836560613, "learning_rate": 5.551020408163265e-08, "logits/chosen": -7.62509298324585, "logits/rejected": -7.249032020568848, "logps/chosen": -1.2819865942001343, "logps/rejected": -1.0427690744400024, "loss": 5.719, "rewards/accuracies": 0.0, "rewards/chosen": -12.819866180419922, "rewards/margins": -2.3921751976013184, "rewards/rejected": -10.427690505981445, "step": 51 }, { "epoch": 0.007080610021786492, "grad_norm": 119.87864609847475, "learning_rate": 5.659863945578232e-08, "logits/chosen": -6.505054473876953, "logits/rejected": -9.39389419555664, "logps/chosen": -4.63499116897583, "logps/rejected": -1.8232948780059814, "loss": 6.5677, "rewards/accuracies": 0.5, "rewards/chosen": -46.34991455078125, "rewards/margins": -28.11696434020996, "rewards/rejected": -18.232948303222656, "step": 52 }, { "epoch": 0.00721677559912854, "grad_norm": 81.26726532006398, "learning_rate": 5.768707482993197e-08, "logits/chosen": -9.371644020080566, "logits/rejected": -9.768196105957031, "logps/chosen": -1.5649704933166504, "logps/rejected": -1.9932059049606323, "loss": 4.8789, "rewards/accuracies": 0.5, "rewards/chosen": -15.64970588684082, "rewards/margins": 4.282353401184082, "rewards/rejected": -19.932058334350586, "step": 53 }, { "epoch": 0.007352941176470588, "grad_norm": 113.15510629076843, "learning_rate": 5.8775510204081636e-08, "logits/chosen": -7.878812789916992, "logits/rejected": -5.903256416320801, "logps/chosen": -1.3815128803253174, "logps/rejected": -1.8615334033966064, "loss": 5.9815, "rewards/accuracies": 0.75, "rewards/chosen": -13.815129280090332, "rewards/margins": 4.800204277038574, "rewards/rejected": -18.615333557128906, "step": 54 }, { "epoch": 0.007489106753812636, "grad_norm": 97.88784055960298, "learning_rate": 5.986394557823129e-08, "logits/chosen": -8.673227310180664, "logits/rejected": -7.9167280197143555, "logps/chosen": -1.2534761428833008, "logps/rejected": -1.3685142993927002, "loss": 5.7756, "rewards/accuracies": 0.5, "rewards/chosen": -12.534761428833008, "rewards/margins": 1.150381326675415, "rewards/rejected": -13.68514347076416, "step": 55 }, { "epoch": 0.007625272331154684, "grad_norm": 107.17320325974283, "learning_rate": 6.095238095238095e-08, "logits/chosen": -7.264116287231445, "logits/rejected": -9.411066055297852, "logps/chosen": -1.898555040359497, "logps/rejected": -1.3082913160324097, "loss": 5.4008, "rewards/accuracies": 0.25, "rewards/chosen": -18.985549926757812, "rewards/margins": -5.902637481689453, "rewards/rejected": -13.08291244506836, "step": 56 }, { "epoch": 0.007761437908496732, "grad_norm": 63.47571034378853, "learning_rate": 6.20408163265306e-08, "logits/chosen": -9.199972152709961, "logits/rejected": -8.315970420837402, "logps/chosen": -1.6043345928192139, "logps/rejected": -1.5709304809570312, "loss": 4.5587, "rewards/accuracies": 0.5, "rewards/chosen": -16.043346405029297, "rewards/margins": -0.3340415954589844, "rewards/rejected": -15.709305763244629, "step": 57 }, { "epoch": 0.00789760348583878, "grad_norm": 70.99979376886064, "learning_rate": 6.312925170068026e-08, "logits/chosen": -8.813874244689941, "logits/rejected": -9.394254684448242, "logps/chosen": -1.4622546434402466, "logps/rejected": -1.5640861988067627, "loss": 5.174, "rewards/accuracies": 0.5, "rewards/chosen": -14.622546195983887, "rewards/margins": 1.0183155536651611, "rewards/rejected": -15.640861511230469, "step": 58 }, { "epoch": 0.008033769063180828, "grad_norm": 72.15643816550936, "learning_rate": 6.421768707482992e-08, "logits/chosen": -8.310294151306152, "logits/rejected": -8.120220184326172, "logps/chosen": -1.091858148574829, "logps/rejected": -1.5291668176651, "loss": 4.9232, "rewards/accuracies": 0.75, "rewards/chosen": -10.918581008911133, "rewards/margins": 4.373086929321289, "rewards/rejected": -15.291667938232422, "step": 59 }, { "epoch": 0.008169934640522876, "grad_norm": 67.97018547407833, "learning_rate": 6.530612244897959e-08, "logits/chosen": -6.65509557723999, "logits/rejected": -9.490769386291504, "logps/chosen": -1.9426493644714355, "logps/rejected": -0.8362451791763306, "loss": 5.0841, "rewards/accuracies": 0.0, "rewards/chosen": -19.426494598388672, "rewards/margins": -11.064042091369629, "rewards/rejected": -8.362451553344727, "step": 60 }, { "epoch": 0.008306100217864924, "grad_norm": 74.71438410984017, "learning_rate": 6.639455782312925e-08, "logits/chosen": -8.316003799438477, "logits/rejected": -7.666031837463379, "logps/chosen": -1.1824958324432373, "logps/rejected": -1.5117268562316895, "loss": 4.5549, "rewards/accuracies": 0.5, "rewards/chosen": -11.824957847595215, "rewards/margins": 3.2923097610473633, "rewards/rejected": -15.117267608642578, "step": 61 }, { "epoch": 0.008442265795206972, "grad_norm": 138.6088203982334, "learning_rate": 6.748299319727891e-08, "logits/chosen": -9.967666625976562, "logits/rejected": -8.454381942749023, "logps/chosen": -1.2460039854049683, "logps/rejected": -1.7386822700500488, "loss": 6.5243, "rewards/accuracies": 0.5, "rewards/chosen": -12.460041046142578, "rewards/margins": 4.92678165435791, "rewards/rejected": -17.386821746826172, "step": 62 }, { "epoch": 0.00857843137254902, "grad_norm": 70.43622085386934, "learning_rate": 6.857142857142857e-08, "logits/chosen": -8.940287590026855, "logits/rejected": -6.79775333404541, "logps/chosen": -1.1265895366668701, "logps/rejected": -2.7872064113616943, "loss": 4.7779, "rewards/accuracies": 0.75, "rewards/chosen": -11.265893936157227, "rewards/margins": 16.606170654296875, "rewards/rejected": -27.8720645904541, "step": 63 }, { "epoch": 0.008714596949891068, "grad_norm": 99.77522156713466, "learning_rate": 6.965986394557823e-08, "logits/chosen": -7.091365814208984, "logits/rejected": -6.561487197875977, "logps/chosen": -1.318249225616455, "logps/rejected": -1.9107496738433838, "loss": 4.8264, "rewards/accuracies": 1.0, "rewards/chosen": -13.18249225616455, "rewards/margins": 5.925003528594971, "rewards/rejected": -19.10749626159668, "step": 64 }, { "epoch": 0.008850762527233115, "grad_norm": 122.70216789510816, "learning_rate": 7.074829931972789e-08, "logits/chosen": -8.370696067810059, "logits/rejected": -7.31377649307251, "logps/chosen": -1.2015694379806519, "logps/rejected": -1.3841997385025024, "loss": 5.8268, "rewards/accuracies": 0.75, "rewards/chosen": -12.015693664550781, "rewards/margins": 1.8263037204742432, "rewards/rejected": -13.841998100280762, "step": 65 }, { "epoch": 0.008986928104575163, "grad_norm": 132.0032304264201, "learning_rate": 7.183673469387754e-08, "logits/chosen": -8.399620056152344, "logits/rejected": -7.441420078277588, "logps/chosen": -1.150929570198059, "logps/rejected": -1.578810214996338, "loss": 6.2916, "rewards/accuracies": 1.0, "rewards/chosen": -11.509295463562012, "rewards/margins": 4.278807163238525, "rewards/rejected": -15.788103103637695, "step": 66 }, { "epoch": 0.00912309368191721, "grad_norm": 63.150089606353056, "learning_rate": 7.292517006802721e-08, "logits/chosen": -7.126203536987305, "logits/rejected": -7.484006404876709, "logps/chosen": -1.4870810508728027, "logps/rejected": -1.5404748916625977, "loss": 4.7745, "rewards/accuracies": 0.5, "rewards/chosen": -14.870810508728027, "rewards/margins": 0.533939003944397, "rewards/rejected": -15.404748916625977, "step": 67 }, { "epoch": 0.009259259259259259, "grad_norm": 125.45364597271929, "learning_rate": 7.401360544217686e-08, "logits/chosen": -9.911307334899902, "logits/rejected": -9.331811904907227, "logps/chosen": -1.6816699504852295, "logps/rejected": -2.4589548110961914, "loss": 6.4537, "rewards/accuracies": 1.0, "rewards/chosen": -16.816699981689453, "rewards/margins": 7.772848129272461, "rewards/rejected": -24.58954620361328, "step": 68 }, { "epoch": 0.009395424836601307, "grad_norm": 86.1941974771448, "learning_rate": 7.510204081632653e-08, "logits/chosen": -6.7245893478393555, "logits/rejected": -5.919681549072266, "logps/chosen": -1.358107089996338, "logps/rejected": -1.5384711027145386, "loss": 5.7238, "rewards/accuracies": 0.75, "rewards/chosen": -13.581069946289062, "rewards/margins": 1.8036401271820068, "rewards/rejected": -15.384710311889648, "step": 69 }, { "epoch": 0.009531590413943355, "grad_norm": 94.41013818517813, "learning_rate": 7.619047619047618e-08, "logits/chosen": -7.906219482421875, "logits/rejected": -7.004551887512207, "logps/chosen": -1.636581301689148, "logps/rejected": -1.9006057977676392, "loss": 5.26, "rewards/accuracies": 0.5, "rewards/chosen": -16.365814208984375, "rewards/margins": 2.640244722366333, "rewards/rejected": -19.006057739257812, "step": 70 }, { "epoch": 0.009667755991285403, "grad_norm": 82.20978279761624, "learning_rate": 7.727891156462584e-08, "logits/chosen": -8.558752059936523, "logits/rejected": -7.616976261138916, "logps/chosen": -0.8557428121566772, "logps/rejected": -1.2832086086273193, "loss": 4.9706, "rewards/accuracies": 0.75, "rewards/chosen": -8.557427406311035, "rewards/margins": 4.274658203125, "rewards/rejected": -12.832085609436035, "step": 71 }, { "epoch": 0.00980392156862745, "grad_norm": 72.28125414239635, "learning_rate": 7.83673469387755e-08, "logits/chosen": -7.2157487869262695, "logits/rejected": -6.2108306884765625, "logps/chosen": -1.5208988189697266, "logps/rejected": -1.798633098602295, "loss": 5.6179, "rewards/accuracies": 0.5, "rewards/chosen": -15.208988189697266, "rewards/margins": 2.7773427963256836, "rewards/rejected": -17.986331939697266, "step": 72 }, { "epoch": 0.009940087145969499, "grad_norm": 174.15594229895135, "learning_rate": 7.945578231292516e-08, "logits/chosen": -8.816532135009766, "logits/rejected": -8.565436363220215, "logps/chosen": -1.0466110706329346, "logps/rejected": -1.3531405925750732, "loss": 8.2356, "rewards/accuracies": 0.75, "rewards/chosen": -10.466110229492188, "rewards/margins": 3.0652964115142822, "rewards/rejected": -13.53140640258789, "step": 73 }, { "epoch": 0.010076252723311547, "grad_norm": 114.1189808677808, "learning_rate": 8.054421768707483e-08, "logits/chosen": -7.946718215942383, "logits/rejected": -8.267077445983887, "logps/chosen": -1.184006690979004, "logps/rejected": -1.359445333480835, "loss": 5.4727, "rewards/accuracies": 0.25, "rewards/chosen": -11.840066909790039, "rewards/margins": 1.7543872594833374, "rewards/rejected": -13.594453811645508, "step": 74 }, { "epoch": 0.010212418300653595, "grad_norm": 99.28729028496868, "learning_rate": 8.16326530612245e-08, "logits/chosen": -7.435127258300781, "logits/rejected": -7.248805999755859, "logps/chosen": -2.4092087745666504, "logps/rejected": -2.0611274242401123, "loss": 6.359, "rewards/accuracies": 0.75, "rewards/chosen": -24.09208869934082, "rewards/margins": -3.4808149337768555, "rewards/rejected": -20.61127281188965, "step": 75 }, { "epoch": 0.010348583877995643, "grad_norm": 71.36210910793139, "learning_rate": 8.272108843537415e-08, "logits/chosen": -7.562510013580322, "logits/rejected": -8.113588333129883, "logps/chosen": -1.5316396951675415, "logps/rejected": -0.9470953941345215, "loss": 4.6281, "rewards/accuracies": 0.0, "rewards/chosen": -15.31639575958252, "rewards/margins": -5.845442295074463, "rewards/rejected": -9.470952987670898, "step": 76 }, { "epoch": 0.010484749455337691, "grad_norm": 152.5020634367813, "learning_rate": 8.380952380952381e-08, "logits/chosen": -8.81529426574707, "logits/rejected": -7.513816833496094, "logps/chosen": -1.0823626518249512, "logps/rejected": -1.5705209970474243, "loss": 6.0376, "rewards/accuracies": 0.5, "rewards/chosen": -10.823626518249512, "rewards/margins": 4.881583213806152, "rewards/rejected": -15.705209732055664, "step": 77 }, { "epoch": 0.010620915032679739, "grad_norm": 115.58135216339514, "learning_rate": 8.489795918367346e-08, "logits/chosen": -6.990825176239014, "logits/rejected": -6.871254920959473, "logps/chosen": -1.3340160846710205, "logps/rejected": -1.018347144126892, "loss": 6.2653, "rewards/accuracies": 0.25, "rewards/chosen": -13.340160369873047, "rewards/margins": -3.156689167022705, "rewards/rejected": -10.183470726013184, "step": 78 }, { "epoch": 0.010757080610021787, "grad_norm": 91.04966573031922, "learning_rate": 8.598639455782313e-08, "logits/chosen": -8.935365676879883, "logits/rejected": -9.263147354125977, "logps/chosen": -0.8826298117637634, "logps/rejected": -0.7888429760932922, "loss": 4.7763, "rewards/accuracies": 0.25, "rewards/chosen": -8.826297760009766, "rewards/margins": -0.9378681182861328, "rewards/rejected": -7.888429641723633, "step": 79 }, { "epoch": 0.010893246187363835, "grad_norm": 92.66196323244837, "learning_rate": 8.707482993197278e-08, "logits/chosen": -7.85068416595459, "logits/rejected": -8.759309768676758, "logps/chosen": -1.5066897869110107, "logps/rejected": -1.4703220129013062, "loss": 6.0878, "rewards/accuracies": 0.5, "rewards/chosen": -15.066898345947266, "rewards/margins": -0.3636772632598877, "rewards/rejected": -14.70322036743164, "step": 80 }, { "epoch": 0.011029411764705883, "grad_norm": 66.24374178406546, "learning_rate": 8.816326530612245e-08, "logits/chosen": -6.682755470275879, "logits/rejected": -7.729180335998535, "logps/chosen": -1.4993747472763062, "logps/rejected": -1.3412177562713623, "loss": 5.0086, "rewards/accuracies": 0.5, "rewards/chosen": -14.99374771118164, "rewards/margins": -1.5815701484680176, "rewards/rejected": -13.412177085876465, "step": 81 }, { "epoch": 0.011165577342047931, "grad_norm": 107.79983831497081, "learning_rate": 8.92517006802721e-08, "logits/chosen": -7.459165096282959, "logits/rejected": -8.513818740844727, "logps/chosen": -1.28342866897583, "logps/rejected": -1.2030141353607178, "loss": 5.5921, "rewards/accuracies": 0.75, "rewards/chosen": -12.8342866897583, "rewards/margins": -0.8041467666625977, "rewards/rejected": -12.030139923095703, "step": 82 }, { "epoch": 0.011301742919389977, "grad_norm": 114.67483127909163, "learning_rate": 9.034013605442176e-08, "logits/chosen": -7.378748416900635, "logits/rejected": -6.690046787261963, "logps/chosen": -1.793397068977356, "logps/rejected": -2.0308127403259277, "loss": 6.2229, "rewards/accuracies": 0.75, "rewards/chosen": -17.933971405029297, "rewards/margins": 2.374157667160034, "rewards/rejected": -20.308128356933594, "step": 83 }, { "epoch": 0.011437908496732025, "grad_norm": 76.31372176217329, "learning_rate": 9.142857142857142e-08, "logits/chosen": -7.920049667358398, "logits/rejected": -9.507408142089844, "logps/chosen": -1.574668288230896, "logps/rejected": -1.28892183303833, "loss": 5.2408, "rewards/accuracies": 0.25, "rewards/chosen": -15.746682167053223, "rewards/margins": -2.8574633598327637, "rewards/rejected": -12.889219284057617, "step": 84 }, { "epoch": 0.011574074074074073, "grad_norm": 53.19374567724993, "learning_rate": 9.251700680272108e-08, "logits/chosen": -7.838123321533203, "logits/rejected": -8.164039611816406, "logps/chosen": -1.3105769157409668, "logps/rejected": -1.5724979639053345, "loss": 4.7178, "rewards/accuracies": 0.5, "rewards/chosen": -13.105770111083984, "rewards/margins": 2.6192095279693604, "rewards/rejected": -15.724979400634766, "step": 85 }, { "epoch": 0.011710239651416121, "grad_norm": 83.11818058397047, "learning_rate": 9.360544217687074e-08, "logits/chosen": -8.956910133361816, "logits/rejected": -7.127848148345947, "logps/chosen": -0.9617190361022949, "logps/rejected": -1.3131762742996216, "loss": 4.5313, "rewards/accuracies": 0.75, "rewards/chosen": -9.61719036102295, "rewards/margins": 3.514573335647583, "rewards/rejected": -13.131763458251953, "step": 86 }, { "epoch": 0.01184640522875817, "grad_norm": 135.50909691562592, "learning_rate": 9.46938775510204e-08, "logits/chosen": -8.015868186950684, "logits/rejected": -7.18034553527832, "logps/chosen": -1.4160012006759644, "logps/rejected": -1.899852991104126, "loss": 6.3869, "rewards/accuracies": 0.75, "rewards/chosen": -14.160012245178223, "rewards/margins": 4.838517189025879, "rewards/rejected": -18.9985294342041, "step": 87 }, { "epoch": 0.011982570806100218, "grad_norm": 194.8526131887022, "learning_rate": 9.578231292517007e-08, "logits/chosen": -8.310699462890625, "logits/rejected": -7.483238220214844, "logps/chosen": -1.567845344543457, "logps/rejected": -2.027216672897339, "loss": 6.5591, "rewards/accuracies": 0.75, "rewards/chosen": -15.678454399108887, "rewards/margins": 4.593711853027344, "rewards/rejected": -20.272167205810547, "step": 88 }, { "epoch": 0.012118736383442266, "grad_norm": 86.41937950338823, "learning_rate": 9.687074829931973e-08, "logits/chosen": -8.387208938598633, "logits/rejected": -8.707887649536133, "logps/chosen": -1.0567429065704346, "logps/rejected": -1.2387934923171997, "loss": 5.2817, "rewards/accuracies": 0.75, "rewards/chosen": -10.567428588867188, "rewards/margins": 1.8205052614212036, "rewards/rejected": -12.387933731079102, "step": 89 }, { "epoch": 0.012254901960784314, "grad_norm": 167.52811527627333, "learning_rate": 9.795918367346938e-08, "logits/chosen": -8.506210327148438, "logits/rejected": -7.366031169891357, "logps/chosen": -1.1873722076416016, "logps/rejected": -1.6340794563293457, "loss": 5.9765, "rewards/accuracies": 0.75, "rewards/chosen": -11.873722076416016, "rewards/margins": 4.467071533203125, "rewards/rejected": -16.34079360961914, "step": 90 }, { "epoch": 0.012391067538126362, "grad_norm": 99.77974952104485, "learning_rate": 9.904761904761905e-08, "logits/chosen": -6.567556381225586, "logits/rejected": -6.088610649108887, "logps/chosen": -2.4839515686035156, "logps/rejected": -1.5262925624847412, "loss": 5.1157, "rewards/accuracies": 0.25, "rewards/chosen": -24.839515686035156, "rewards/margins": -9.576591491699219, "rewards/rejected": -15.262925148010254, "step": 91 }, { "epoch": 0.01252723311546841, "grad_norm": 138.5112647724549, "learning_rate": 1.0013605442176872e-07, "logits/chosen": -8.968034744262695, "logits/rejected": -7.947174072265625, "logps/chosen": -1.3343839645385742, "logps/rejected": -1.4076025485992432, "loss": 6.2193, "rewards/accuracies": 0.75, "rewards/chosen": -13.343840599060059, "rewards/margins": 0.7321840524673462, "rewards/rejected": -14.076025009155273, "step": 92 }, { "epoch": 0.012663398692810458, "grad_norm": 76.83693389649015, "learning_rate": 1.0122448979591835e-07, "logits/chosen": -8.755231857299805, "logits/rejected": -6.5800371170043945, "logps/chosen": -1.6095538139343262, "logps/rejected": -1.6626031398773193, "loss": 4.981, "rewards/accuracies": 0.25, "rewards/chosen": -16.095539093017578, "rewards/margins": 0.5304934978485107, "rewards/rejected": -16.62603187561035, "step": 93 }, { "epoch": 0.012799564270152506, "grad_norm": 67.67459755217072, "learning_rate": 1.0231292517006802e-07, "logits/chosen": -8.35530948638916, "logits/rejected": -7.346632957458496, "logps/chosen": -1.5239591598510742, "logps/rejected": -1.4855124950408936, "loss": 4.976, "rewards/accuracies": 0.5, "rewards/chosen": -15.239592552185059, "rewards/margins": -0.38446712493896484, "rewards/rejected": -14.855125427246094, "step": 94 }, { "epoch": 0.012935729847494554, "grad_norm": 138.79095674842807, "learning_rate": 1.0340136054421769e-07, "logits/chosen": -7.688117027282715, "logits/rejected": -5.764824867248535, "logps/chosen": -4.866403579711914, "logps/rejected": -1.7673399448394775, "loss": 6.2775, "rewards/accuracies": 0.5, "rewards/chosen": -48.664031982421875, "rewards/margins": -30.990633010864258, "rewards/rejected": -17.67340087890625, "step": 95 }, { "epoch": 0.013071895424836602, "grad_norm": 72.14711455913951, "learning_rate": 1.0448979591836735e-07, "logits/chosen": -8.103166580200195, "logits/rejected": -8.934942245483398, "logps/chosen": -1.0589070320129395, "logps/rejected": -1.3673222064971924, "loss": 5.4907, "rewards/accuracies": 0.5, "rewards/chosen": -10.589070320129395, "rewards/margins": 3.0841519832611084, "rewards/rejected": -13.673222541809082, "step": 96 }, { "epoch": 0.01320806100217865, "grad_norm": 80.5400932846621, "learning_rate": 1.0557823129251699e-07, "logits/chosen": -8.798177719116211, "logits/rejected": -7.312837600708008, "logps/chosen": -1.3897900581359863, "logps/rejected": -1.9143118858337402, "loss": 4.662, "rewards/accuracies": 1.0, "rewards/chosen": -13.89790153503418, "rewards/margins": 5.245217800140381, "rewards/rejected": -19.14311981201172, "step": 97 }, { "epoch": 0.013344226579520698, "grad_norm": 132.90113160721006, "learning_rate": 1.0666666666666666e-07, "logits/chosen": -6.230717658996582, "logits/rejected": -6.846138000488281, "logps/chosen": -1.3648370504379272, "logps/rejected": -1.1475584506988525, "loss": 5.6988, "rewards/accuracies": 0.25, "rewards/chosen": -13.648370742797852, "rewards/margins": -2.172786235809326, "rewards/rejected": -11.475584030151367, "step": 98 }, { "epoch": 0.013480392156862746, "grad_norm": 103.64860072059895, "learning_rate": 1.0775510204081632e-07, "logits/chosen": -8.25450325012207, "logits/rejected": -7.826421737670898, "logps/chosen": -1.0872471332550049, "logps/rejected": -1.0798908472061157, "loss": 4.986, "rewards/accuracies": 0.5, "rewards/chosen": -10.87247085571289, "rewards/margins": -0.07356202602386475, "rewards/rejected": -10.798908233642578, "step": 99 }, { "epoch": 0.013616557734204794, "grad_norm": 161.02047160147487, "learning_rate": 1.0884353741496599e-07, "logits/chosen": -7.526217460632324, "logits/rejected": -8.304452896118164, "logps/chosen": -1.9105037450790405, "logps/rejected": -1.7503979206085205, "loss": 5.6975, "rewards/accuracies": 0.5, "rewards/chosen": -19.105037689208984, "rewards/margins": -1.6010582447052002, "rewards/rejected": -17.503978729248047, "step": 100 }, { "epoch": 0.01375272331154684, "grad_norm": 87.66251614498431, "learning_rate": 1.0993197278911564e-07, "logits/chosen": -8.093244552612305, "logits/rejected": -8.369668960571289, "logps/chosen": -0.9477163553237915, "logps/rejected": -0.9063022136688232, "loss": 4.9535, "rewards/accuracies": 0.25, "rewards/chosen": -9.477163314819336, "rewards/margins": -0.4141418933868408, "rewards/rejected": -9.063021659851074, "step": 101 }, { "epoch": 0.013888888888888888, "grad_norm": 147.94781920097606, "learning_rate": 1.110204081632653e-07, "logits/chosen": -8.414246559143066, "logits/rejected": -8.22299861907959, "logps/chosen": -1.2817943096160889, "logps/rejected": -1.3437156677246094, "loss": 6.1949, "rewards/accuracies": 0.25, "rewards/chosen": -12.81794261932373, "rewards/margins": 0.6192144155502319, "rewards/rejected": -13.437156677246094, "step": 102 }, { "epoch": 0.014025054466230936, "grad_norm": 69.18705436076095, "learning_rate": 1.1210884353741497e-07, "logits/chosen": -7.203943729400635, "logits/rejected": -7.0678205490112305, "logps/chosen": -1.3164646625518799, "logps/rejected": -1.4119610786437988, "loss": 4.5866, "rewards/accuracies": 0.5, "rewards/chosen": -13.16464614868164, "rewards/margins": 0.9549641609191895, "rewards/rejected": -14.119610786437988, "step": 103 }, { "epoch": 0.014161220043572984, "grad_norm": 147.6829392865544, "learning_rate": 1.1319727891156464e-07, "logits/chosen": -7.827901363372803, "logits/rejected": -9.09907341003418, "logps/chosen": -1.3967491388320923, "logps/rejected": -1.0639358758926392, "loss": 5.6567, "rewards/accuracies": 0.5, "rewards/chosen": -13.967491149902344, "rewards/margins": -3.328132152557373, "rewards/rejected": -10.639358520507812, "step": 104 }, { "epoch": 0.014297385620915032, "grad_norm": 139.16878999895226, "learning_rate": 1.1428571428571427e-07, "logits/chosen": -8.653733253479004, "logits/rejected": -8.594332695007324, "logps/chosen": -1.2562005519866943, "logps/rejected": -1.2710366249084473, "loss": 5.9136, "rewards/accuracies": 0.5, "rewards/chosen": -12.562005043029785, "rewards/margins": 0.1483595371246338, "rewards/rejected": -12.71036434173584, "step": 105 }, { "epoch": 0.01443355119825708, "grad_norm": 74.16418578351004, "learning_rate": 1.1537414965986394e-07, "logits/chosen": -8.14253044128418, "logits/rejected": -8.070277214050293, "logps/chosen": -1.6088261604309082, "logps/rejected": -1.5226283073425293, "loss": 5.0765, "rewards/accuracies": 0.5, "rewards/chosen": -16.088260650634766, "rewards/margins": -0.8619790077209473, "rewards/rejected": -15.226282119750977, "step": 106 }, { "epoch": 0.014569716775599128, "grad_norm": 90.97523981322425, "learning_rate": 1.164625850340136e-07, "logits/chosen": -8.80117416381836, "logits/rejected": -6.526435852050781, "logps/chosen": -1.2978707551956177, "logps/rejected": -1.7596895694732666, "loss": 5.2639, "rewards/accuracies": 1.0, "rewards/chosen": -12.978708267211914, "rewards/margins": 4.618188858032227, "rewards/rejected": -17.596895217895508, "step": 107 }, { "epoch": 0.014705882352941176, "grad_norm": 152.92236713673708, "learning_rate": 1.1755102040816327e-07, "logits/chosen": -8.181175231933594, "logits/rejected": -8.921409606933594, "logps/chosen": -2.7746469974517822, "logps/rejected": -1.390120029449463, "loss": 4.6746, "rewards/accuracies": 0.5, "rewards/chosen": -27.746471405029297, "rewards/margins": -13.845272064208984, "rewards/rejected": -13.901199340820312, "step": 108 }, { "epoch": 0.014842047930283224, "grad_norm": 112.70207828013707, "learning_rate": 1.1863945578231291e-07, "logits/chosen": -8.3693208694458, "logits/rejected": -6.938942909240723, "logps/chosen": -1.2172410488128662, "logps/rejected": -1.351880431175232, "loss": 4.8407, "rewards/accuracies": 0.5, "rewards/chosen": -12.172409057617188, "rewards/margins": 1.3463950157165527, "rewards/rejected": -13.518804550170898, "step": 109 }, { "epoch": 0.014978213507625272, "grad_norm": 121.54226995243312, "learning_rate": 1.1972789115646258e-07, "logits/chosen": -6.318542957305908, "logits/rejected": -6.55689811706543, "logps/chosen": -3.979107141494751, "logps/rejected": -1.9599238634109497, "loss": 6.4685, "rewards/accuracies": 0.25, "rewards/chosen": -39.79107666015625, "rewards/margins": -20.191835403442383, "rewards/rejected": -19.599239349365234, "step": 110 }, { "epoch": 0.01511437908496732, "grad_norm": 84.58190857401989, "learning_rate": 1.2081632653061225e-07, "logits/chosen": -8.3256196975708, "logits/rejected": -7.410186767578125, "logps/chosen": -1.3572335243225098, "logps/rejected": -1.0426664352416992, "loss": 5.0825, "rewards/accuracies": 0.25, "rewards/chosen": -13.572334289550781, "rewards/margins": -3.14566969871521, "rewards/rejected": -10.426664352416992, "step": 111 }, { "epoch": 0.015250544662309368, "grad_norm": 65.64314693674002, "learning_rate": 1.219047619047619e-07, "logits/chosen": -6.951257705688477, "logits/rejected": -7.209904193878174, "logps/chosen": -1.1895802021026611, "logps/rejected": -1.2549169063568115, "loss": 4.868, "rewards/accuracies": 0.75, "rewards/chosen": -11.895801544189453, "rewards/margins": 0.6533675193786621, "rewards/rejected": -12.549169540405273, "step": 112 }, { "epoch": 0.015386710239651416, "grad_norm": 113.08861912889519, "learning_rate": 1.2299319727891156e-07, "logits/chosen": -8.145864486694336, "logits/rejected": -6.895856857299805, "logps/chosen": -1.325095772743225, "logps/rejected": -1.4878772497177124, "loss": 4.8132, "rewards/accuracies": 0.75, "rewards/chosen": -13.250957489013672, "rewards/margins": 1.627814769744873, "rewards/rejected": -14.878772735595703, "step": 113 }, { "epoch": 0.015522875816993464, "grad_norm": 75.60935770369265, "learning_rate": 1.240816326530612e-07, "logits/chosen": -7.004352569580078, "logits/rejected": -7.219120979309082, "logps/chosen": -1.666276454925537, "logps/rejected": -1.442216157913208, "loss": 4.8391, "rewards/accuracies": 0.25, "rewards/chosen": -16.662765502929688, "rewards/margins": -2.240602493286133, "rewards/rejected": -14.422163009643555, "step": 114 }, { "epoch": 0.01565904139433551, "grad_norm": 145.08244740594614, "learning_rate": 1.251700680272109e-07, "logits/chosen": -7.890751838684082, "logits/rejected": -7.44482421875, "logps/chosen": -0.9217901229858398, "logps/rejected": -1.1771790981292725, "loss": 5.7998, "rewards/accuracies": 0.75, "rewards/chosen": -9.217901229858398, "rewards/margins": 2.5538902282714844, "rewards/rejected": -11.771791458129883, "step": 115 }, { "epoch": 0.01579520697167756, "grad_norm": 81.64108183081876, "learning_rate": 1.2625850340136052e-07, "logits/chosen": -8.327146530151367, "logits/rejected": -7.732615947723389, "logps/chosen": -1.1325106620788574, "logps/rejected": -1.4629204273223877, "loss": 4.8724, "rewards/accuracies": 0.5, "rewards/chosen": -11.32510757446289, "rewards/margins": 3.3040976524353027, "rewards/rejected": -14.629205703735352, "step": 116 }, { "epoch": 0.015931372549019607, "grad_norm": 122.8120274021992, "learning_rate": 1.273469387755102e-07, "logits/chosen": -7.807650089263916, "logits/rejected": -8.775320053100586, "logps/chosen": -1.8602893352508545, "logps/rejected": -1.7113218307495117, "loss": 5.5495, "rewards/accuracies": 0.5, "rewards/chosen": -18.602893829345703, "rewards/margins": -1.48967444896698, "rewards/rejected": -17.113218307495117, "step": 117 }, { "epoch": 0.016067538126361657, "grad_norm": 109.5513659618724, "learning_rate": 1.2843537414965985e-07, "logits/chosen": -7.386325836181641, "logits/rejected": -7.472169399261475, "logps/chosen": -1.4960219860076904, "logps/rejected": -1.2435331344604492, "loss": 4.8499, "rewards/accuracies": 0.25, "rewards/chosen": -14.960220336914062, "rewards/margins": -2.5248897075653076, "rewards/rejected": -12.435330390930176, "step": 118 }, { "epoch": 0.016203703703703703, "grad_norm": 71.06846317108376, "learning_rate": 1.2952380952380953e-07, "logits/chosen": -8.483373641967773, "logits/rejected": -7.18443489074707, "logps/chosen": -0.824257493019104, "logps/rejected": -1.4479196071624756, "loss": 4.4634, "rewards/accuracies": 1.0, "rewards/chosen": -8.242574691772461, "rewards/margins": 6.236621856689453, "rewards/rejected": -14.479196548461914, "step": 119 }, { "epoch": 0.016339869281045753, "grad_norm": 155.3238168301606, "learning_rate": 1.3061224489795918e-07, "logits/chosen": -8.394693374633789, "logits/rejected": -7.223039627075195, "logps/chosen": -1.6742703914642334, "logps/rejected": -1.5777275562286377, "loss": 6.6937, "rewards/accuracies": 0.5, "rewards/chosen": -16.742706298828125, "rewards/margins": -0.9654300212860107, "rewards/rejected": -15.777275085449219, "step": 120 }, { "epoch": 0.0164760348583878, "grad_norm": 65.75487204851369, "learning_rate": 1.3170068027210883e-07, "logits/chosen": -8.552877426147461, "logits/rejected": -9.355611801147461, "logps/chosen": -1.0653274059295654, "logps/rejected": -1.0810471773147583, "loss": 5.119, "rewards/accuracies": 0.5, "rewards/chosen": -10.653273582458496, "rewards/margins": 0.1571979522705078, "rewards/rejected": -10.810471534729004, "step": 121 }, { "epoch": 0.01661220043572985, "grad_norm": 92.27963701703484, "learning_rate": 1.327891156462585e-07, "logits/chosen": -7.977911949157715, "logits/rejected": -8.875438690185547, "logps/chosen": -0.8868775367736816, "logps/rejected": -0.6830118894577026, "loss": 5.8881, "rewards/accuracies": 0.25, "rewards/chosen": -8.8687744140625, "rewards/margins": -2.0386557579040527, "rewards/rejected": -6.8301191329956055, "step": 122 }, { "epoch": 0.016748366013071895, "grad_norm": 133.28121418787444, "learning_rate": 1.3387755102040816e-07, "logits/chosen": -7.237972736358643, "logits/rejected": -7.679261207580566, "logps/chosen": -1.4854984283447266, "logps/rejected": -1.3006603717803955, "loss": 6.7854, "rewards/accuracies": 0.0, "rewards/chosen": -14.85498332977295, "rewards/margins": -1.848380208015442, "rewards/rejected": -13.006603240966797, "step": 123 }, { "epoch": 0.016884531590413945, "grad_norm": 116.64182000018063, "learning_rate": 1.3496598639455781e-07, "logits/chosen": -8.235772132873535, "logits/rejected": -7.443356990814209, "logps/chosen": -1.5524362325668335, "logps/rejected": -2.6640877723693848, "loss": 4.8964, "rewards/accuracies": 0.5, "rewards/chosen": -15.524361610412598, "rewards/margins": 11.116518020629883, "rewards/rejected": -26.640878677368164, "step": 124 }, { "epoch": 0.01702069716775599, "grad_norm": 77.24402407554578, "learning_rate": 1.3605442176870747e-07, "logits/chosen": -8.537908554077148, "logits/rejected": -9.160828590393066, "logps/chosen": -1.2190563678741455, "logps/rejected": -0.9468997716903687, "loss": 4.6712, "rewards/accuracies": 0.5, "rewards/chosen": -12.190563201904297, "rewards/margins": -2.7215657234191895, "rewards/rejected": -9.468997955322266, "step": 125 }, { "epoch": 0.01715686274509804, "grad_norm": 62.60496842150831, "learning_rate": 1.3714285714285715e-07, "logits/chosen": -8.927555084228516, "logits/rejected": -6.9217071533203125, "logps/chosen": -1.3026244640350342, "logps/rejected": -1.621842622756958, "loss": 4.8888, "rewards/accuracies": 0.75, "rewards/chosen": -13.026244163513184, "rewards/margins": 3.1921825408935547, "rewards/rejected": -16.218425750732422, "step": 126 }, { "epoch": 0.017293028322440087, "grad_norm": 71.76649279098766, "learning_rate": 1.382312925170068e-07, "logits/chosen": -8.929429054260254, "logits/rejected": -8.861654281616211, "logps/chosen": -1.2698887586593628, "logps/rejected": -1.2908586263656616, "loss": 4.4883, "rewards/accuracies": 0.75, "rewards/chosen": -12.698887825012207, "rewards/margins": 0.20969891548156738, "rewards/rejected": -12.908586502075195, "step": 127 }, { "epoch": 0.017429193899782137, "grad_norm": 90.7280684147843, "learning_rate": 1.3931972789115645e-07, "logits/chosen": -8.100556373596191, "logits/rejected": -7.284461975097656, "logps/chosen": -1.0994539260864258, "logps/rejected": -1.3498657941818237, "loss": 4.6904, "rewards/accuracies": 0.75, "rewards/chosen": -10.994539260864258, "rewards/margins": 2.5041182041168213, "rewards/rejected": -13.4986572265625, "step": 128 }, { "epoch": 0.017565359477124183, "grad_norm": 119.93755865533443, "learning_rate": 1.404081632653061e-07, "logits/chosen": -8.692374229431152, "logits/rejected": -9.704275131225586, "logps/chosen": -1.1250450611114502, "logps/rejected": -1.0680081844329834, "loss": 5.5376, "rewards/accuracies": 0.5, "rewards/chosen": -11.250452041625977, "rewards/margins": -0.5703698396682739, "rewards/rejected": -10.680082321166992, "step": 129 }, { "epoch": 0.01770152505446623, "grad_norm": 64.18488248079667, "learning_rate": 1.4149659863945578e-07, "logits/chosen": -7.641815662384033, "logits/rejected": -6.911394119262695, "logps/chosen": -1.1077734231948853, "logps/rejected": -1.3053556680679321, "loss": 4.7644, "rewards/accuracies": 0.75, "rewards/chosen": -11.077733993530273, "rewards/margins": 1.9758222103118896, "rewards/rejected": -13.053556442260742, "step": 130 }, { "epoch": 0.01783769063180828, "grad_norm": 96.93506026483561, "learning_rate": 1.4258503401360543e-07, "logits/chosen": -8.411888122558594, "logits/rejected": -7.937638282775879, "logps/chosen": -1.0048186779022217, "logps/rejected": -1.1728907823562622, "loss": 4.828, "rewards/accuracies": 0.75, "rewards/chosen": -10.048186302185059, "rewards/margins": 1.6807211637496948, "rewards/rejected": -11.728907585144043, "step": 131 }, { "epoch": 0.017973856209150325, "grad_norm": 96.99601743690654, "learning_rate": 1.4367346938775509e-07, "logits/chosen": -7.937776565551758, "logits/rejected": -8.09326171875, "logps/chosen": -1.8889554738998413, "logps/rejected": -1.9573417901992798, "loss": 4.5314, "rewards/accuracies": 0.5, "rewards/chosen": -18.889554977416992, "rewards/margins": 0.6838645935058594, "rewards/rejected": -19.57341766357422, "step": 132 }, { "epoch": 0.018110021786492375, "grad_norm": 169.39217875481538, "learning_rate": 1.4476190476190476e-07, "logits/chosen": -8.100971221923828, "logits/rejected": -6.834127426147461, "logps/chosen": -1.3643587827682495, "logps/rejected": -1.4150521755218506, "loss": 5.8794, "rewards/accuracies": 0.75, "rewards/chosen": -13.643587112426758, "rewards/margins": 0.5069348812103271, "rewards/rejected": -14.150522232055664, "step": 133 }, { "epoch": 0.01824618736383442, "grad_norm": 73.43780486551361, "learning_rate": 1.4585034013605442e-07, "logits/chosen": -8.857494354248047, "logits/rejected": -7.973879814147949, "logps/chosen": -1.2026863098144531, "logps/rejected": -1.3635804653167725, "loss": 4.6363, "rewards/accuracies": 0.5, "rewards/chosen": -12.026863098144531, "rewards/margins": 1.6089420318603516, "rewards/rejected": -13.635804176330566, "step": 134 }, { "epoch": 0.01838235294117647, "grad_norm": 132.97652969892755, "learning_rate": 1.4693877551020407e-07, "logits/chosen": -9.920654296875, "logits/rejected": -8.664923667907715, "logps/chosen": -0.8287444114685059, "logps/rejected": -0.9304681420326233, "loss": 5.2533, "rewards/accuracies": 0.5, "rewards/chosen": -8.287444114685059, "rewards/margins": 1.0172369480133057, "rewards/rejected": -9.304681777954102, "step": 135 }, { "epoch": 0.018518518518518517, "grad_norm": 108.6310445221214, "learning_rate": 1.4802721088435372e-07, "logits/chosen": -6.958468914031982, "logits/rejected": -8.03316879272461, "logps/chosen": -1.4720054864883423, "logps/rejected": -1.568041205406189, "loss": 5.461, "rewards/accuracies": 0.5, "rewards/chosen": -14.720054626464844, "rewards/margins": 0.9603571891784668, "rewards/rejected": -15.680412292480469, "step": 136 }, { "epoch": 0.018654684095860567, "grad_norm": 130.04682495899175, "learning_rate": 1.491156462585034e-07, "logits/chosen": -8.107259750366211, "logits/rejected": -7.727598667144775, "logps/chosen": -0.7272666096687317, "logps/rejected": -0.9790180921554565, "loss": 5.1799, "rewards/accuracies": 1.0, "rewards/chosen": -7.2726664543151855, "rewards/margins": 2.5175139904022217, "rewards/rejected": -9.790180206298828, "step": 137 }, { "epoch": 0.018790849673202614, "grad_norm": 125.55864439144133, "learning_rate": 1.5020408163265305e-07, "logits/chosen": -7.8524322509765625, "logits/rejected": -6.454014778137207, "logps/chosen": -1.3398516178131104, "logps/rejected": -1.2288532257080078, "loss": 5.5296, "rewards/accuracies": 0.5, "rewards/chosen": -13.398515701293945, "rewards/margins": -1.1099846363067627, "rewards/rejected": -12.288532257080078, "step": 138 }, { "epoch": 0.018927015250544663, "grad_norm": 120.82654758672258, "learning_rate": 1.5129251700680273e-07, "logits/chosen": -6.387835502624512, "logits/rejected": -5.812603950500488, "logps/chosen": -1.5713181495666504, "logps/rejected": -1.8111991882324219, "loss": 5.0277, "rewards/accuracies": 0.5, "rewards/chosen": -15.713180541992188, "rewards/margins": 2.398810625076294, "rewards/rejected": -18.11199188232422, "step": 139 }, { "epoch": 0.01906318082788671, "grad_norm": 71.8867716521012, "learning_rate": 1.5238095238095236e-07, "logits/chosen": -8.14934253692627, "logits/rejected": -7.855738639831543, "logps/chosen": -0.8292730450630188, "logps/rejected": -0.9465879797935486, "loss": 5.0979, "rewards/accuracies": 0.5, "rewards/chosen": -8.292731285095215, "rewards/margins": 1.1731492280960083, "rewards/rejected": -9.465880393981934, "step": 140 }, { "epoch": 0.01919934640522876, "grad_norm": 133.13783769484897, "learning_rate": 1.5346938775510204e-07, "logits/chosen": -7.077601909637451, "logits/rejected": -6.995731353759766, "logps/chosen": -1.3134288787841797, "logps/rejected": -1.7887707948684692, "loss": 5.9851, "rewards/accuracies": 0.75, "rewards/chosen": -13.13428783416748, "rewards/margins": 4.753420352935791, "rewards/rejected": -17.88770866394043, "step": 141 }, { "epoch": 0.019335511982570806, "grad_norm": 187.32748292609577, "learning_rate": 1.545578231292517e-07, "logits/chosen": -6.660077095031738, "logits/rejected": -6.529077529907227, "logps/chosen": -1.9430222511291504, "logps/rejected": -1.3025259971618652, "loss": 6.2829, "rewards/accuracies": 0.5, "rewards/chosen": -19.430221557617188, "rewards/margins": -6.40496301651001, "rewards/rejected": -13.025259971618652, "step": 142 }, { "epoch": 0.019471677559912855, "grad_norm": 93.32424058033999, "learning_rate": 1.5564625850340137e-07, "logits/chosen": -6.744956016540527, "logits/rejected": -6.147989273071289, "logps/chosen": -1.6654924154281616, "logps/rejected": -1.4076611995697021, "loss": 5.2559, "rewards/accuracies": 0.25, "rewards/chosen": -16.654924392700195, "rewards/margins": -2.578312873840332, "rewards/rejected": -14.076611518859863, "step": 143 }, { "epoch": 0.0196078431372549, "grad_norm": 105.58250000816324, "learning_rate": 1.56734693877551e-07, "logits/chosen": -7.31378173828125, "logits/rejected": -7.078433990478516, "logps/chosen": -1.2827730178833008, "logps/rejected": -1.2402223348617554, "loss": 6.6242, "rewards/accuracies": 0.5, "rewards/chosen": -12.827730178833008, "rewards/margins": -0.4255063533782959, "rewards/rejected": -12.402223587036133, "step": 144 }, { "epoch": 0.01974400871459695, "grad_norm": 88.00451371475083, "learning_rate": 1.5782312925170067e-07, "logits/chosen": -10.263345718383789, "logits/rejected": -7.425136566162109, "logps/chosen": -1.2247413396835327, "logps/rejected": -1.4101629257202148, "loss": 5.2318, "rewards/accuracies": 0.75, "rewards/chosen": -12.247413635253906, "rewards/margins": 1.854215145111084, "rewards/rejected": -14.101628303527832, "step": 145 }, { "epoch": 0.019880174291938998, "grad_norm": 101.8242574204444, "learning_rate": 1.5891156462585032e-07, "logits/chosen": -7.049863815307617, "logits/rejected": -6.860673904418945, "logps/chosen": -1.3652560710906982, "logps/rejected": -1.8334383964538574, "loss": 4.5453, "rewards/accuracies": 0.75, "rewards/chosen": -13.65256118774414, "rewards/margins": 4.681824207305908, "rewards/rejected": -18.33438491821289, "step": 146 }, { "epoch": 0.020016339869281044, "grad_norm": 137.92112931366847, "learning_rate": 1.6e-07, "logits/chosen": -9.203563690185547, "logits/rejected": -8.293973922729492, "logps/chosen": -3.522048234939575, "logps/rejected": -1.0164036750793457, "loss": 5.5492, "rewards/accuracies": 0.75, "rewards/chosen": -35.220481872558594, "rewards/margins": -25.056446075439453, "rewards/rejected": -10.16403579711914, "step": 147 }, { "epoch": 0.020152505446623094, "grad_norm": 142.7548544512285, "learning_rate": 1.6108843537414966e-07, "logits/chosen": -6.722068786621094, "logits/rejected": -6.706391334533691, "logps/chosen": -1.446770191192627, "logps/rejected": -1.3910293579101562, "loss": 4.8614, "rewards/accuracies": 0.25, "rewards/chosen": -14.467700958251953, "rewards/margins": -0.557408332824707, "rewards/rejected": -13.910293579101562, "step": 148 }, { "epoch": 0.02028867102396514, "grad_norm": 147.4414610504222, "learning_rate": 1.621768707482993e-07, "logits/chosen": -6.472548484802246, "logits/rejected": -7.359100341796875, "logps/chosen": -1.774766206741333, "logps/rejected": -1.749028205871582, "loss": 5.616, "rewards/accuracies": 0.75, "rewards/chosen": -17.747661590576172, "rewards/margins": -0.25737929344177246, "rewards/rejected": -17.49028205871582, "step": 149 }, { "epoch": 0.02042483660130719, "grad_norm": 85.40604401509445, "learning_rate": 1.63265306122449e-07, "logits/chosen": -8.287897109985352, "logits/rejected": -7.272768020629883, "logps/chosen": -0.9985866546630859, "logps/rejected": -1.0714044570922852, "loss": 5.0447, "rewards/accuracies": 0.5, "rewards/chosen": -9.98586654663086, "rewards/margins": 0.728177547454834, "rewards/rejected": -10.714043617248535, "step": 150 }, { "epoch": 0.020561002178649236, "grad_norm": 191.94433843514835, "learning_rate": 1.6435374149659864e-07, "logits/chosen": -7.810141563415527, "logits/rejected": -6.642199993133545, "logps/chosen": -1.408130407333374, "logps/rejected": -2.367379903793335, "loss": 5.586, "rewards/accuracies": 1.0, "rewards/chosen": -14.081304550170898, "rewards/margins": 9.59249496459961, "rewards/rejected": -23.673799514770508, "step": 151 }, { "epoch": 0.020697167755991286, "grad_norm": 84.36206782043772, "learning_rate": 1.654421768707483e-07, "logits/chosen": -8.025960922241211, "logits/rejected": -7.982983589172363, "logps/chosen": -1.2618099451065063, "logps/rejected": -1.185113549232483, "loss": 4.9386, "rewards/accuracies": 0.5, "rewards/chosen": -12.6181001663208, "rewards/margins": -0.7669644355773926, "rewards/rejected": -11.85113525390625, "step": 152 }, { "epoch": 0.020833333333333332, "grad_norm": 129.32292423879954, "learning_rate": 1.6653061224489794e-07, "logits/chosen": -8.213658332824707, "logits/rejected": -7.41917610168457, "logps/chosen": -1.7385635375976562, "logps/rejected": -1.744072675704956, "loss": 4.8346, "rewards/accuracies": 0.75, "rewards/chosen": -17.385635375976562, "rewards/margins": 0.05509233474731445, "rewards/rejected": -17.44072723388672, "step": 153 }, { "epoch": 0.020969498910675382, "grad_norm": 123.1360072588639, "learning_rate": 1.6761904761904762e-07, "logits/chosen": -5.90570592880249, "logits/rejected": -5.351279258728027, "logps/chosen": -1.4041664600372314, "logps/rejected": -1.6992828845977783, "loss": 5.4451, "rewards/accuracies": 0.75, "rewards/chosen": -14.041665077209473, "rewards/margins": 2.9511637687683105, "rewards/rejected": -16.992828369140625, "step": 154 }, { "epoch": 0.021105664488017428, "grad_norm": 92.59032196523893, "learning_rate": 1.6870748299319727e-07, "logits/chosen": -8.394842147827148, "logits/rejected": -6.3303375244140625, "logps/chosen": -1.135949969291687, "logps/rejected": -1.2834014892578125, "loss": 4.406, "rewards/accuracies": 0.75, "rewards/chosen": -11.35949993133545, "rewards/margins": 1.4745149612426758, "rewards/rejected": -12.834014892578125, "step": 155 }, { "epoch": 0.021241830065359478, "grad_norm": 102.56602321070297, "learning_rate": 1.6979591836734693e-07, "logits/chosen": -7.4587016105651855, "logits/rejected": -6.154718399047852, "logps/chosen": -1.2901796102523804, "logps/rejected": -1.79585862159729, "loss": 5.3973, "rewards/accuracies": 0.75, "rewards/chosen": -12.901796340942383, "rewards/margins": 5.056789875030518, "rewards/rejected": -17.958585739135742, "step": 156 }, { "epoch": 0.021377995642701524, "grad_norm": 97.38957721642002, "learning_rate": 1.7088435374149658e-07, "logits/chosen": -5.6771650314331055, "logits/rejected": -7.148407936096191, "logps/chosen": -1.198584794998169, "logps/rejected": -0.9971252083778381, "loss": 4.7677, "rewards/accuracies": 0.0, "rewards/chosen": -11.985848426818848, "rewards/margins": -2.0145962238311768, "rewards/rejected": -9.971251487731934, "step": 157 }, { "epoch": 0.021514161220043574, "grad_norm": 104.15165875462495, "learning_rate": 1.7197278911564626e-07, "logits/chosen": -6.29349422454834, "logits/rejected": -6.065828323364258, "logps/chosen": -1.5833168029785156, "logps/rejected": -1.5859633684158325, "loss": 4.5853, "rewards/accuracies": 0.5, "rewards/chosen": -15.833168983459473, "rewards/margins": 0.026464223861694336, "rewards/rejected": -15.859633445739746, "step": 158 }, { "epoch": 0.02165032679738562, "grad_norm": 79.15068341682391, "learning_rate": 1.730612244897959e-07, "logits/chosen": -7.3802618980407715, "logits/rejected": -6.974637031555176, "logps/chosen": -0.9901829957962036, "logps/rejected": -1.0578351020812988, "loss": 4.9174, "rewards/accuracies": 0.5, "rewards/chosen": -9.901830673217773, "rewards/margins": 0.676520824432373, "rewards/rejected": -10.578350067138672, "step": 159 }, { "epoch": 0.02178649237472767, "grad_norm": 68.74837348448261, "learning_rate": 1.7414965986394556e-07, "logits/chosen": -7.3688154220581055, "logits/rejected": -7.208442687988281, "logps/chosen": -1.0561609268188477, "logps/rejected": -1.7540125846862793, "loss": 5.2544, "rewards/accuracies": 1.0, "rewards/chosen": -10.561607360839844, "rewards/margins": 6.978518486022949, "rewards/rejected": -17.54012680053711, "step": 160 }, { "epoch": 0.021922657952069716, "grad_norm": 95.65614437023193, "learning_rate": 1.7523809523809524e-07, "logits/chosen": -6.960417747497559, "logits/rejected": -7.1131367683410645, "logps/chosen": -1.4105963706970215, "logps/rejected": -1.388922929763794, "loss": 4.5346, "rewards/accuracies": 0.5, "rewards/chosen": -14.105962753295898, "rewards/margins": -0.21673369407653809, "rewards/rejected": -13.889229774475098, "step": 161 }, { "epoch": 0.022058823529411766, "grad_norm": 88.99570284408347, "learning_rate": 1.763265306122449e-07, "logits/chosen": -5.26167631149292, "logits/rejected": -5.481692314147949, "logps/chosen": -1.5601022243499756, "logps/rejected": -2.136596918106079, "loss": 5.0081, "rewards/accuracies": 1.0, "rewards/chosen": -15.601022720336914, "rewards/margins": 5.764946460723877, "rewards/rejected": -21.365968704223633, "step": 162 }, { "epoch": 0.022194989106753812, "grad_norm": 137.2807080323382, "learning_rate": 1.7741496598639457e-07, "logits/chosen": -6.738339424133301, "logits/rejected": -5.6601057052612305, "logps/chosen": -1.0017204284667969, "logps/rejected": -0.9605573415756226, "loss": 6.3733, "rewards/accuracies": 0.25, "rewards/chosen": -10.017204284667969, "rewards/margins": -0.41163039207458496, "rewards/rejected": -9.605573654174805, "step": 163 }, { "epoch": 0.022331154684095862, "grad_norm": 71.07082662178453, "learning_rate": 1.785034013605442e-07, "logits/chosen": -6.039597511291504, "logits/rejected": -6.692374229431152, "logps/chosen": -1.6276044845581055, "logps/rejected": -1.286259412765503, "loss": 4.7965, "rewards/accuracies": 0.5, "rewards/chosen": -16.276046752929688, "rewards/margins": -3.413450002670288, "rewards/rejected": -12.862594604492188, "step": 164 }, { "epoch": 0.02246732026143791, "grad_norm": 54.70497607264681, "learning_rate": 1.7959183673469388e-07, "logits/chosen": -7.710261344909668, "logits/rejected": -8.2518892288208, "logps/chosen": -1.2027339935302734, "logps/rejected": -1.2235569953918457, "loss": 4.4275, "rewards/accuracies": 0.5, "rewards/chosen": -12.027338981628418, "rewards/margins": 0.20823073387145996, "rewards/rejected": -12.23556900024414, "step": 165 }, { "epoch": 0.022603485838779955, "grad_norm": 153.98278521390245, "learning_rate": 1.8068027210884353e-07, "logits/chosen": -8.509668350219727, "logits/rejected": -6.1804022789001465, "logps/chosen": -1.461782455444336, "logps/rejected": -2.0017993450164795, "loss": 5.5284, "rewards/accuracies": 0.75, "rewards/chosen": -14.61782455444336, "rewards/margins": 5.400167465209961, "rewards/rejected": -20.01799201965332, "step": 166 }, { "epoch": 0.022739651416122005, "grad_norm": 75.15901475261614, "learning_rate": 1.817687074829932e-07, "logits/chosen": -7.961825370788574, "logits/rejected": -6.909971237182617, "logps/chosen": -1.1711571216583252, "logps/rejected": -1.2304127216339111, "loss": 5.1794, "rewards/accuracies": 0.5, "rewards/chosen": -11.711570739746094, "rewards/margins": 0.5925552845001221, "rewards/rejected": -12.304126739501953, "step": 167 }, { "epoch": 0.02287581699346405, "grad_norm": 109.45754873895234, "learning_rate": 1.8285714285714283e-07, "logits/chosen": -6.563947677612305, "logits/rejected": -8.403340339660645, "logps/chosen": -1.0987374782562256, "logps/rejected": -0.9991467595100403, "loss": 5.7174, "rewards/accuracies": 0.25, "rewards/chosen": -10.987373352050781, "rewards/margins": -0.9959064722061157, "rewards/rejected": -9.991467475891113, "step": 168 }, { "epoch": 0.0230119825708061, "grad_norm": 112.1123860248158, "learning_rate": 1.8394557823129251e-07, "logits/chosen": -8.20693588256836, "logits/rejected": -8.455974578857422, "logps/chosen": -0.8681545257568359, "logps/rejected": -1.1759507656097412, "loss": 5.454, "rewards/accuracies": 0.25, "rewards/chosen": -8.68154525756836, "rewards/margins": 3.077962875366211, "rewards/rejected": -11.75950813293457, "step": 169 }, { "epoch": 0.023148148148148147, "grad_norm": 76.72218375026486, "learning_rate": 1.8503401360544217e-07, "logits/chosen": -6.5804033279418945, "logits/rejected": -5.652115821838379, "logps/chosen": -1.0899848937988281, "logps/rejected": -0.9910410642623901, "loss": 4.645, "rewards/accuracies": 0.5, "rewards/chosen": -10.899848937988281, "rewards/margins": -0.9894390106201172, "rewards/rejected": -9.91041088104248, "step": 170 }, { "epoch": 0.023284313725490197, "grad_norm": 79.72833006512947, "learning_rate": 1.8612244897959182e-07, "logits/chosen": -8.579032897949219, "logits/rejected": -6.2043914794921875, "logps/chosen": -1.2742339372634888, "logps/rejected": -1.6190030574798584, "loss": 5.2016, "rewards/accuracies": 1.0, "rewards/chosen": -12.742340087890625, "rewards/margins": 3.4476895332336426, "rewards/rejected": -16.19002914428711, "step": 171 }, { "epoch": 0.023420479302832243, "grad_norm": 97.46553575067091, "learning_rate": 1.8721088435374147e-07, "logits/chosen": -7.081697940826416, "logits/rejected": -7.289755821228027, "logps/chosen": -0.9433913230895996, "logps/rejected": -1.1108554601669312, "loss": 5.5086, "rewards/accuracies": 0.75, "rewards/chosen": -9.433913230895996, "rewards/margins": 1.6746413707733154, "rewards/rejected": -11.108553886413574, "step": 172 }, { "epoch": 0.023556644880174293, "grad_norm": 77.7655501824403, "learning_rate": 1.8829931972789115e-07, "logits/chosen": -8.084100723266602, "logits/rejected": -8.048393249511719, "logps/chosen": -1.1167643070220947, "logps/rejected": -0.8588655591011047, "loss": 4.5307, "rewards/accuracies": 0.5, "rewards/chosen": -11.167643547058105, "rewards/margins": -2.5789875984191895, "rewards/rejected": -8.588655471801758, "step": 173 }, { "epoch": 0.02369281045751634, "grad_norm": 73.69536119387801, "learning_rate": 1.893877551020408e-07, "logits/chosen": -7.671952724456787, "logits/rejected": -7.287993431091309, "logps/chosen": -1.3024232387542725, "logps/rejected": -1.4799177646636963, "loss": 4.2354, "rewards/accuracies": 0.75, "rewards/chosen": -13.024232864379883, "rewards/margins": 1.7749452590942383, "rewards/rejected": -14.799179077148438, "step": 174 }, { "epoch": 0.02382897603485839, "grad_norm": 73.30163669138452, "learning_rate": 1.9047619047619045e-07, "logits/chosen": -6.510768413543701, "logits/rejected": -6.433928489685059, "logps/chosen": -1.4896583557128906, "logps/rejected": -1.4321060180664062, "loss": 5.2671, "rewards/accuracies": 0.5, "rewards/chosen": -14.896583557128906, "rewards/margins": -0.5755228996276855, "rewards/rejected": -14.321060180664062, "step": 175 }, { "epoch": 0.023965141612200435, "grad_norm": 84.59154091137295, "learning_rate": 1.9156462585034013e-07, "logits/chosen": -7.444751739501953, "logits/rejected": -6.793341636657715, "logps/chosen": -0.9935526251792908, "logps/rejected": -1.2576138973236084, "loss": 5.225, "rewards/accuracies": 1.0, "rewards/chosen": -9.935525894165039, "rewards/margins": 2.6406126022338867, "rewards/rejected": -12.576138496398926, "step": 176 }, { "epoch": 0.024101307189542485, "grad_norm": 94.40683894409909, "learning_rate": 1.9265306122448978e-07, "logits/chosen": -8.296958923339844, "logits/rejected": -7.922490119934082, "logps/chosen": -0.9406330585479736, "logps/rejected": -1.1321996450424194, "loss": 5.1181, "rewards/accuracies": 0.75, "rewards/chosen": -9.406330108642578, "rewards/margins": 1.9156659841537476, "rewards/rejected": -11.321996688842773, "step": 177 }, { "epoch": 0.02423747276688453, "grad_norm": 99.27857201672498, "learning_rate": 1.9374149659863946e-07, "logits/chosen": -7.959944725036621, "logits/rejected": -6.8451433181762695, "logps/chosen": -1.232956886291504, "logps/rejected": -1.5422444343566895, "loss": 4.9527, "rewards/accuracies": 0.5, "rewards/chosen": -12.329568862915039, "rewards/margins": 3.0928754806518555, "rewards/rejected": -15.422444343566895, "step": 178 }, { "epoch": 0.02437363834422658, "grad_norm": 96.55958330988591, "learning_rate": 1.948299319727891e-07, "logits/chosen": -7.794098854064941, "logits/rejected": -5.9459075927734375, "logps/chosen": -1.4828112125396729, "logps/rejected": -2.5785632133483887, "loss": 5.2391, "rewards/accuracies": 0.75, "rewards/chosen": -14.82811164855957, "rewards/margins": 10.95751953125, "rewards/rejected": -25.78563117980957, "step": 179 }, { "epoch": 0.024509803921568627, "grad_norm": 83.74751977372985, "learning_rate": 1.9591836734693877e-07, "logits/chosen": -7.918519973754883, "logits/rejected": -6.533925533294678, "logps/chosen": -1.2089519500732422, "logps/rejected": -1.3907301425933838, "loss": 5.6945, "rewards/accuracies": 0.75, "rewards/chosen": -12.089519500732422, "rewards/margins": 1.8177810907363892, "rewards/rejected": -13.90730094909668, "step": 180 }, { "epoch": 0.024645969498910677, "grad_norm": 68.04222975287135, "learning_rate": 1.9700680272108842e-07, "logits/chosen": -7.509442329406738, "logits/rejected": -6.4137959480285645, "logps/chosen": -1.4295858144760132, "logps/rejected": -1.257176399230957, "loss": 5.2318, "rewards/accuracies": 0.5, "rewards/chosen": -14.295858383178711, "rewards/margins": -1.7240939140319824, "rewards/rejected": -12.57176399230957, "step": 181 }, { "epoch": 0.024782135076252723, "grad_norm": 70.04948463922646, "learning_rate": 1.980952380952381e-07, "logits/chosen": -6.858748435974121, "logits/rejected": -5.326076507568359, "logps/chosen": -1.170296311378479, "logps/rejected": -1.025686264038086, "loss": 5.2616, "rewards/accuracies": 0.25, "rewards/chosen": -11.702963829040527, "rewards/margins": -1.4461005926132202, "rewards/rejected": -10.25686264038086, "step": 182 }, { "epoch": 0.024918300653594773, "grad_norm": 67.06831822147421, "learning_rate": 1.9918367346938773e-07, "logits/chosen": -5.9540486335754395, "logits/rejected": -6.134858131408691, "logps/chosen": -1.4768328666687012, "logps/rejected": -1.4597630500793457, "loss": 4.5049, "rewards/accuracies": 0.75, "rewards/chosen": -14.768327713012695, "rewards/margins": -0.17069673538208008, "rewards/rejected": -14.597631454467773, "step": 183 }, { "epoch": 0.02505446623093682, "grad_norm": 80.87774508979096, "learning_rate": 2.0027210884353743e-07, "logits/chosen": -6.767053604125977, "logits/rejected": -6.290744304656982, "logps/chosen": -1.2690455913543701, "logps/rejected": -1.3472695350646973, "loss": 5.2294, "rewards/accuracies": 0.75, "rewards/chosen": -12.69045639038086, "rewards/margins": 0.7822391986846924, "rewards/rejected": -13.472695350646973, "step": 184 }, { "epoch": 0.025190631808278865, "grad_norm": 116.3431268533652, "learning_rate": 2.0136054421768706e-07, "logits/chosen": -8.071076393127441, "logits/rejected": -6.2148756980896, "logps/chosen": -0.7556376457214355, "logps/rejected": -1.154641032218933, "loss": 6.0319, "rewards/accuracies": 0.75, "rewards/chosen": -7.5563764572143555, "rewards/margins": 3.990034580230713, "rewards/rejected": -11.546411514282227, "step": 185 }, { "epoch": 0.025326797385620915, "grad_norm": 115.56135296089637, "learning_rate": 2.024489795918367e-07, "logits/chosen": -5.8168463706970215, "logits/rejected": -6.642218589782715, "logps/chosen": -1.6020214557647705, "logps/rejected": -1.114095687866211, "loss": 4.9616, "rewards/accuracies": 0.25, "rewards/chosen": -16.020214080810547, "rewards/margins": -4.879256725311279, "rewards/rejected": -11.140957832336426, "step": 186 }, { "epoch": 0.02546296296296296, "grad_norm": 67.26543233381646, "learning_rate": 2.035374149659864e-07, "logits/chosen": -6.950218200683594, "logits/rejected": -4.895829200744629, "logps/chosen": -1.0128856897354126, "logps/rejected": -1.8647680282592773, "loss": 4.0077, "rewards/accuracies": 0.75, "rewards/chosen": -10.128856658935547, "rewards/margins": 8.518823623657227, "rewards/rejected": -18.647680282592773, "step": 187 }, { "epoch": 0.02559912854030501, "grad_norm": 90.73890023080358, "learning_rate": 2.0462585034013604e-07, "logits/chosen": -6.8982133865356445, "logits/rejected": -7.051582336425781, "logps/chosen": -1.4017748832702637, "logps/rejected": -1.128211259841919, "loss": 5.2079, "rewards/accuracies": 0.5, "rewards/chosen": -14.017749786376953, "rewards/margins": -2.7356364727020264, "rewards/rejected": -11.282113075256348, "step": 188 }, { "epoch": 0.025735294117647058, "grad_norm": 96.41724944087588, "learning_rate": 2.057142857142857e-07, "logits/chosen": -6.817584037780762, "logits/rejected": -8.678714752197266, "logps/chosen": -1.083337664604187, "logps/rejected": -1.046867847442627, "loss": 5.2113, "rewards/accuracies": 0.5, "rewards/chosen": -10.833375930786133, "rewards/margins": -0.36469900608062744, "rewards/rejected": -10.468677520751953, "step": 189 }, { "epoch": 0.025871459694989107, "grad_norm": 83.25695770923153, "learning_rate": 2.0680272108843537e-07, "logits/chosen": -6.661531448364258, "logits/rejected": -6.242840766906738, "logps/chosen": -1.2485356330871582, "logps/rejected": -1.082529067993164, "loss": 4.4928, "rewards/accuracies": 0.25, "rewards/chosen": -12.485357284545898, "rewards/margins": -1.66006600856781, "rewards/rejected": -10.82529067993164, "step": 190 }, { "epoch": 0.026007625272331154, "grad_norm": 70.5737780269745, "learning_rate": 2.0789115646258502e-07, "logits/chosen": -6.130125045776367, "logits/rejected": -5.073531150817871, "logps/chosen": -1.3772631883621216, "logps/rejected": -1.27760910987854, "loss": 4.6737, "rewards/accuracies": 0.75, "rewards/chosen": -13.772631645202637, "rewards/margins": -0.9965405464172363, "rewards/rejected": -12.776091575622559, "step": 191 }, { "epoch": 0.026143790849673203, "grad_norm": 58.367081528081165, "learning_rate": 2.089795918367347e-07, "logits/chosen": -7.82505989074707, "logits/rejected": -6.476356506347656, "logps/chosen": -1.4139668941497803, "logps/rejected": -1.6912710666656494, "loss": 4.4833, "rewards/accuracies": 0.5, "rewards/chosen": -14.139669418334961, "rewards/margins": 2.773041248321533, "rewards/rejected": -16.912710189819336, "step": 192 }, { "epoch": 0.02627995642701525, "grad_norm": 61.6651807762063, "learning_rate": 2.1006802721088435e-07, "logits/chosen": -6.099628448486328, "logits/rejected": -7.124472618103027, "logps/chosen": -1.1536598205566406, "logps/rejected": -0.8781601786613464, "loss": 5.311, "rewards/accuracies": 0.0, "rewards/chosen": -11.536596298217773, "rewards/margins": -2.754995107650757, "rewards/rejected": -8.781601905822754, "step": 193 }, { "epoch": 0.0264161220043573, "grad_norm": 87.1504705483159, "learning_rate": 2.1115646258503398e-07, "logits/chosen": -7.453646659851074, "logits/rejected": -6.912619590759277, "logps/chosen": -0.8000520467758179, "logps/rejected": -0.9588027000427246, "loss": 5.1691, "rewards/accuracies": 0.75, "rewards/chosen": -8.000520706176758, "rewards/margins": 1.587506651878357, "rewards/rejected": -9.588027000427246, "step": 194 }, { "epoch": 0.026552287581699346, "grad_norm": 103.33590340183373, "learning_rate": 2.1224489795918369e-07, "logits/chosen": -7.2746901512146, "logits/rejected": -6.692259788513184, "logps/chosen": -1.3528649806976318, "logps/rejected": -1.20949125289917, "loss": 4.6167, "rewards/accuracies": 0.75, "rewards/chosen": -13.52864933013916, "rewards/margins": -1.433736801147461, "rewards/rejected": -12.0949125289917, "step": 195 }, { "epoch": 0.026688453159041396, "grad_norm": 118.7605415818877, "learning_rate": 2.133333333333333e-07, "logits/chosen": -6.24162483215332, "logits/rejected": -4.508004188537598, "logps/chosen": -0.903314471244812, "logps/rejected": -1.5306448936462402, "loss": 5.6855, "rewards/accuracies": 0.75, "rewards/chosen": -9.033143997192383, "rewards/margins": 6.273303031921387, "rewards/rejected": -15.306447982788086, "step": 196 }, { "epoch": 0.026824618736383442, "grad_norm": 72.31368849575523, "learning_rate": 2.1442176870748296e-07, "logits/chosen": -6.148506164550781, "logits/rejected": -7.032370567321777, "logps/chosen": -1.2420457601547241, "logps/rejected": -0.9879764914512634, "loss": 5.4145, "rewards/accuracies": 0.25, "rewards/chosen": -12.42045783996582, "rewards/margins": -2.5406928062438965, "rewards/rejected": -9.879764556884766, "step": 197 }, { "epoch": 0.02696078431372549, "grad_norm": 61.23065509770097, "learning_rate": 2.1551020408163264e-07, "logits/chosen": -7.058525085449219, "logits/rejected": -6.249241828918457, "logps/chosen": -1.1598646640777588, "logps/rejected": -1.441483974456787, "loss": 5.3416, "rewards/accuracies": 0.75, "rewards/chosen": -11.59864616394043, "rewards/margins": 2.816193103790283, "rewards/rejected": -14.414838790893555, "step": 198 }, { "epoch": 0.027096949891067538, "grad_norm": 116.70120451114593, "learning_rate": 2.165986394557823e-07, "logits/chosen": -7.153243064880371, "logits/rejected": -6.457721710205078, "logps/chosen": -0.8915928602218628, "logps/rejected": -1.0144236087799072, "loss": 4.8856, "rewards/accuracies": 0.75, "rewards/chosen": -8.91592788696289, "rewards/margins": 1.2283074855804443, "rewards/rejected": -10.144235610961914, "step": 199 }, { "epoch": 0.027233115468409588, "grad_norm": 72.40451493157046, "learning_rate": 2.1768707482993197e-07, "logits/chosen": -7.039863586425781, "logits/rejected": -5.742154121398926, "logps/chosen": -1.1692641973495483, "logps/rejected": -1.4311308860778809, "loss": 4.776, "rewards/accuracies": 0.75, "rewards/chosen": -11.692641258239746, "rewards/margins": 2.618666648864746, "rewards/rejected": -14.311307907104492, "step": 200 }, { "epoch": 0.027369281045751634, "grad_norm": 105.33046777142792, "learning_rate": 2.1877551020408163e-07, "logits/chosen": -6.790611267089844, "logits/rejected": -6.1442718505859375, "logps/chosen": -1.0044852495193481, "logps/rejected": -0.8278141617774963, "loss": 4.6958, "rewards/accuracies": 0.25, "rewards/chosen": -10.044853210449219, "rewards/margins": -1.766710877418518, "rewards/rejected": -8.278141021728516, "step": 201 }, { "epoch": 0.02750544662309368, "grad_norm": 56.480333990031674, "learning_rate": 2.1986394557823128e-07, "logits/chosen": -4.502762794494629, "logits/rejected": -4.234026908874512, "logps/chosen": -1.476064682006836, "logps/rejected": -1.517585277557373, "loss": 4.4695, "rewards/accuracies": 0.25, "rewards/chosen": -14.76064682006836, "rewards/margins": 0.41520583629608154, "rewards/rejected": -15.17585277557373, "step": 202 }, { "epoch": 0.02764161220043573, "grad_norm": 91.62946975206289, "learning_rate": 2.2095238095238096e-07, "logits/chosen": -6.68681001663208, "logits/rejected": -6.122259140014648, "logps/chosen": -1.2721357345581055, "logps/rejected": -1.5630892515182495, "loss": 5.5299, "rewards/accuracies": 1.0, "rewards/chosen": -12.721357345581055, "rewards/margins": 2.9095358848571777, "rewards/rejected": -15.630891799926758, "step": 203 }, { "epoch": 0.027777777777777776, "grad_norm": 75.07575471569352, "learning_rate": 2.220408163265306e-07, "logits/chosen": -5.518540382385254, "logits/rejected": -5.904209613800049, "logps/chosen": -2.094674825668335, "logps/rejected": -1.1049268245697021, "loss": 5.6318, "rewards/accuracies": 0.0, "rewards/chosen": -20.946748733520508, "rewards/margins": -9.897480010986328, "rewards/rejected": -11.04926872253418, "step": 204 }, { "epoch": 0.027913943355119826, "grad_norm": 77.50770347599598, "learning_rate": 2.2312925170068024e-07, "logits/chosen": -5.377975940704346, "logits/rejected": -5.207005500793457, "logps/chosen": -1.399886965751648, "logps/rejected": -1.221920371055603, "loss": 5.1013, "rewards/accuracies": 0.25, "rewards/chosen": -13.998869895935059, "rewards/margins": -1.7796659469604492, "rewards/rejected": -12.21920394897461, "step": 205 }, { "epoch": 0.028050108932461872, "grad_norm": 82.4322279404147, "learning_rate": 2.2421768707482994e-07, "logits/chosen": -5.701888084411621, "logits/rejected": -5.295608997344971, "logps/chosen": -1.2542833089828491, "logps/rejected": -1.351365089416504, "loss": 4.6043, "rewards/accuracies": 0.75, "rewards/chosen": -12.54283332824707, "rewards/margins": 0.9708186388015747, "rewards/rejected": -13.513651847839355, "step": 206 }, { "epoch": 0.028186274509803922, "grad_norm": 136.82930422236254, "learning_rate": 2.2530612244897957e-07, "logits/chosen": -6.315647125244141, "logits/rejected": -6.732501029968262, "logps/chosen": -1.0674160718917847, "logps/rejected": -1.1574244499206543, "loss": 5.053, "rewards/accuracies": 0.5, "rewards/chosen": -10.67416000366211, "rewards/margins": 0.9000828266143799, "rewards/rejected": -11.574243545532227, "step": 207 }, { "epoch": 0.02832244008714597, "grad_norm": 72.4359417661519, "learning_rate": 2.2639455782312927e-07, "logits/chosen": -5.756917953491211, "logits/rejected": -5.640669822692871, "logps/chosen": -1.105139970779419, "logps/rejected": -1.0349900722503662, "loss": 5.9381, "rewards/accuracies": 0.25, "rewards/chosen": -11.051400184631348, "rewards/margins": -0.7014998197555542, "rewards/rejected": -10.34990119934082, "step": 208 }, { "epoch": 0.028458605664488018, "grad_norm": 61.22684484759386, "learning_rate": 2.274829931972789e-07, "logits/chosen": -7.9694929122924805, "logits/rejected": -6.877322196960449, "logps/chosen": -0.625891923904419, "logps/rejected": -1.001348614692688, "loss": 5.3777, "rewards/accuracies": 1.0, "rewards/chosen": -6.2589192390441895, "rewards/margins": 3.7545676231384277, "rewards/rejected": -10.013486862182617, "step": 209 }, { "epoch": 0.028594771241830064, "grad_norm": 79.62462810757066, "learning_rate": 2.2857142857142855e-07, "logits/chosen": -6.145734786987305, "logits/rejected": -5.478681564331055, "logps/chosen": -1.0469872951507568, "logps/rejected": -1.3055447340011597, "loss": 4.6382, "rewards/accuracies": 1.0, "rewards/chosen": -10.46987247467041, "rewards/margins": 2.5855751037597656, "rewards/rejected": -13.055447578430176, "step": 210 }, { "epoch": 0.028730936819172114, "grad_norm": 71.31357401147491, "learning_rate": 2.2965986394557823e-07, "logits/chosen": -6.855096340179443, "logits/rejected": -4.953028678894043, "logps/chosen": -1.691826581954956, "logps/rejected": -1.3644262552261353, "loss": 5.4208, "rewards/accuracies": 0.5, "rewards/chosen": -16.91826629638672, "rewards/margins": -3.274003505706787, "rewards/rejected": -13.644262313842773, "step": 211 }, { "epoch": 0.02886710239651416, "grad_norm": 76.58918075000632, "learning_rate": 2.3074829931972788e-07, "logits/chosen": -7.463372230529785, "logits/rejected": -6.737998962402344, "logps/chosen": -1.1528640985488892, "logps/rejected": -1.1275687217712402, "loss": 4.6627, "rewards/accuracies": 0.5, "rewards/chosen": -11.528640747070312, "rewards/margins": -0.2529531717300415, "rewards/rejected": -11.275687217712402, "step": 212 }, { "epoch": 0.02900326797385621, "grad_norm": 136.91719761616997, "learning_rate": 2.3183673469387753e-07, "logits/chosen": -6.638874053955078, "logits/rejected": -6.179720878601074, "logps/chosen": -1.0043089389801025, "logps/rejected": -1.470382571220398, "loss": 5.1253, "rewards/accuracies": 0.75, "rewards/chosen": -10.043089866638184, "rewards/margins": 4.660735130310059, "rewards/rejected": -14.703824996948242, "step": 213 }, { "epoch": 0.029139433551198256, "grad_norm": 77.57457712666577, "learning_rate": 2.329251700680272e-07, "logits/chosen": -6.933961868286133, "logits/rejected": -5.310276031494141, "logps/chosen": -0.9395098686218262, "logps/rejected": -1.1492676734924316, "loss": 4.7131, "rewards/accuracies": 0.5, "rewards/chosen": -9.395098686218262, "rewards/margins": 2.097578525543213, "rewards/rejected": -11.492677688598633, "step": 214 }, { "epoch": 0.029275599128540306, "grad_norm": 63.55906381894036, "learning_rate": 2.3401360544217686e-07, "logits/chosen": -7.894040107727051, "logits/rejected": -6.691250801086426, "logps/chosen": -1.2535067796707153, "logps/rejected": -1.3696892261505127, "loss": 4.5394, "rewards/accuracies": 0.75, "rewards/chosen": -12.535067558288574, "rewards/margins": 1.1618250608444214, "rewards/rejected": -13.696892738342285, "step": 215 }, { "epoch": 0.029411764705882353, "grad_norm": 78.30885681022556, "learning_rate": 2.3510204081632654e-07, "logits/chosen": -5.697659492492676, "logits/rejected": -4.982385635375977, "logps/chosen": -1.7605178356170654, "logps/rejected": -2.1654295921325684, "loss": 4.9137, "rewards/accuracies": 0.5, "rewards/chosen": -17.605178833007812, "rewards/margins": 4.049118995666504, "rewards/rejected": -21.654296875, "step": 216 }, { "epoch": 0.029547930283224402, "grad_norm": 62.09341804124108, "learning_rate": 2.361904761904762e-07, "logits/chosen": -6.962115287780762, "logits/rejected": -6.143084526062012, "logps/chosen": -1.0354642868041992, "logps/rejected": -1.2692071199417114, "loss": 5.8084, "rewards/accuracies": 0.5, "rewards/chosen": -10.354642868041992, "rewards/margins": 2.3374288082122803, "rewards/rejected": -12.692071914672852, "step": 217 }, { "epoch": 0.02968409586056645, "grad_norm": 194.27908701239292, "learning_rate": 2.3727891156462582e-07, "logits/chosen": -5.615656852722168, "logits/rejected": -4.609704971313477, "logps/chosen": -1.2224977016448975, "logps/rejected": -1.3477003574371338, "loss": 5.1518, "rewards/accuracies": 0.75, "rewards/chosen": -12.2249755859375, "rewards/margins": 1.252026915550232, "rewards/rejected": -13.47700309753418, "step": 218 }, { "epoch": 0.0298202614379085, "grad_norm": 62.49515112751743, "learning_rate": 2.3836734693877553e-07, "logits/chosen": -5.848294258117676, "logits/rejected": -4.78947639465332, "logps/chosen": -1.5041086673736572, "logps/rejected": -1.9121339321136475, "loss": 4.5845, "rewards/accuracies": 0.75, "rewards/chosen": -15.041086196899414, "rewards/margins": 4.080253601074219, "rewards/rejected": -19.121339797973633, "step": 219 }, { "epoch": 0.029956427015250545, "grad_norm": 66.62784254424413, "learning_rate": 2.3945578231292515e-07, "logits/chosen": -7.249790191650391, "logits/rejected": -6.993615627288818, "logps/chosen": -1.1668280363082886, "logps/rejected": -1.1935391426086426, "loss": 4.5609, "rewards/accuracies": 0.5, "rewards/chosen": -11.668279647827148, "rewards/margins": 0.2671109437942505, "rewards/rejected": -11.93539047241211, "step": 220 }, { "epoch": 0.03009259259259259, "grad_norm": 58.40283878205376, "learning_rate": 2.405442176870748e-07, "logits/chosen": -6.601233005523682, "logits/rejected": -6.202434539794922, "logps/chosen": -1.5255262851715088, "logps/rejected": -1.5346779823303223, "loss": 4.6333, "rewards/accuracies": 0.75, "rewards/chosen": -15.255263328552246, "rewards/margins": 0.09151554107666016, "rewards/rejected": -15.346778869628906, "step": 221 }, { "epoch": 0.03022875816993464, "grad_norm": 104.48901953297994, "learning_rate": 2.416326530612245e-07, "logits/chosen": -4.526758670806885, "logits/rejected": -4.371284008026123, "logps/chosen": -0.9522086977958679, "logps/rejected": -1.0991909503936768, "loss": 5.8593, "rewards/accuracies": 0.75, "rewards/chosen": -9.522087097167969, "rewards/margins": 1.4698230028152466, "rewards/rejected": -10.991910934448242, "step": 222 }, { "epoch": 0.030364923747276687, "grad_norm": 60.14996073416937, "learning_rate": 2.427210884353741e-07, "logits/chosen": -6.021376609802246, "logits/rejected": -5.64418888092041, "logps/chosen": -1.384101152420044, "logps/rejected": -1.3105504512786865, "loss": 5.4161, "rewards/accuracies": 0.25, "rewards/chosen": -13.841011047363281, "rewards/margins": -0.7355060577392578, "rewards/rejected": -13.105504989624023, "step": 223 }, { "epoch": 0.030501089324618737, "grad_norm": 73.97540727183505, "learning_rate": 2.438095238095238e-07, "logits/chosen": -7.346739292144775, "logits/rejected": -6.452864646911621, "logps/chosen": -1.2245633602142334, "logps/rejected": -1.2087429761886597, "loss": 4.5699, "rewards/accuracies": 0.75, "rewards/chosen": -12.245634078979492, "rewards/margins": -0.15820443630218506, "rewards/rejected": -12.08742904663086, "step": 224 }, { "epoch": 0.030637254901960783, "grad_norm": 69.95087811826433, "learning_rate": 2.4489795918367347e-07, "logits/chosen": -7.0311970710754395, "logits/rejected": -7.328324317932129, "logps/chosen": -1.4359487295150757, "logps/rejected": -1.1880981922149658, "loss": 5.156, "rewards/accuracies": 0.25, "rewards/chosen": -14.359487533569336, "rewards/margins": -2.4785051345825195, "rewards/rejected": -11.880982398986816, "step": 225 }, { "epoch": 0.030773420479302833, "grad_norm": 59.44744527515022, "learning_rate": 2.459863945578231e-07, "logits/chosen": -7.7552618980407715, "logits/rejected": -6.726212024688721, "logps/chosen": -1.0393712520599365, "logps/rejected": -1.3850208520889282, "loss": 4.2634, "rewards/accuracies": 0.75, "rewards/chosen": -10.393712043762207, "rewards/margins": 3.4564967155456543, "rewards/rejected": -13.850208282470703, "step": 226 }, { "epoch": 0.03090958605664488, "grad_norm": 61.86202628007466, "learning_rate": 2.4707482993197277e-07, "logits/chosen": -6.196245193481445, "logits/rejected": -5.664061069488525, "logps/chosen": -1.2878174781799316, "logps/rejected": -1.5794470310211182, "loss": 4.3617, "rewards/accuracies": 1.0, "rewards/chosen": -12.878175735473633, "rewards/margins": 2.9162960052490234, "rewards/rejected": -15.794471740722656, "step": 227 }, { "epoch": 0.03104575163398693, "grad_norm": 65.2414416870879, "learning_rate": 2.481632653061224e-07, "logits/chosen": -5.041067123413086, "logits/rejected": -4.838307857513428, "logps/chosen": -1.2869311571121216, "logps/rejected": -1.5623878240585327, "loss": 4.495, "rewards/accuracies": 0.5, "rewards/chosen": -12.869311332702637, "rewards/margins": 2.7545652389526367, "rewards/rejected": -15.623876571655273, "step": 228 }, { "epoch": 0.031181917211328975, "grad_norm": 103.61266108847221, "learning_rate": 2.492517006802721e-07, "logits/chosen": -6.699453353881836, "logits/rejected": -6.316040992736816, "logps/chosen": -1.4382461309432983, "logps/rejected": -1.2933666706085205, "loss": 4.6222, "rewards/accuracies": 0.25, "rewards/chosen": -14.382461547851562, "rewards/margins": -1.4487944841384888, "rewards/rejected": -12.933666229248047, "step": 229 }, { "epoch": 0.03131808278867102, "grad_norm": 56.60251125164447, "learning_rate": 2.503401360544218e-07, "logits/chosen": -6.429636001586914, "logits/rejected": -4.45748233795166, "logps/chosen": -0.8151422739028931, "logps/rejected": -1.4166911840438843, "loss": 5.0471, "rewards/accuracies": 1.0, "rewards/chosen": -8.151422500610352, "rewards/margins": 6.01548957824707, "rewards/rejected": -14.166912078857422, "step": 230 }, { "epoch": 0.03145424836601307, "grad_norm": 103.04138187604374, "learning_rate": 2.5142857142857143e-07, "logits/chosen": -6.237385272979736, "logits/rejected": -6.88185453414917, "logps/chosen": -1.1841727495193481, "logps/rejected": -0.8830050230026245, "loss": 4.834, "rewards/accuracies": 0.25, "rewards/chosen": -11.841728210449219, "rewards/margins": -3.0116770267486572, "rewards/rejected": -8.830050468444824, "step": 231 }, { "epoch": 0.03159041394335512, "grad_norm": 62.129766564570225, "learning_rate": 2.5251700680272103e-07, "logits/chosen": -6.272815704345703, "logits/rejected": -6.050259590148926, "logps/chosen": -1.3385813236236572, "logps/rejected": -1.1527156829833984, "loss": 5.1696, "rewards/accuracies": 0.25, "rewards/chosen": -13.38581371307373, "rewards/margins": -1.8586556911468506, "rewards/rejected": -11.5271577835083, "step": 232 }, { "epoch": 0.03172657952069717, "grad_norm": 64.97048324187509, "learning_rate": 2.5360544217687074e-07, "logits/chosen": -6.52187967300415, "logits/rejected": -5.35231876373291, "logps/chosen": -0.7651889324188232, "logps/rejected": -1.1424862146377563, "loss": 5.0345, "rewards/accuracies": 0.75, "rewards/chosen": -7.651888847351074, "rewards/margins": 3.7729735374450684, "rewards/rejected": -11.424861907958984, "step": 233 }, { "epoch": 0.031862745098039214, "grad_norm": 71.6213463546921, "learning_rate": 2.546938775510204e-07, "logits/chosen": -5.955198287963867, "logits/rejected": -5.022669792175293, "logps/chosen": -1.0090972185134888, "logps/rejected": -1.6245440244674683, "loss": 4.2366, "rewards/accuracies": 1.0, "rewards/chosen": -10.090971946716309, "rewards/margins": 6.154468059539795, "rewards/rejected": -16.245439529418945, "step": 234 }, { "epoch": 0.03199891067538126, "grad_norm": 94.12603895643487, "learning_rate": 2.557823129251701e-07, "logits/chosen": -5.043076515197754, "logits/rejected": -4.527265548706055, "logps/chosen": -1.1433395147323608, "logps/rejected": -1.2495653629302979, "loss": 5.1634, "rewards/accuracies": 0.75, "rewards/chosen": -11.433395385742188, "rewards/margins": 1.0622570514678955, "rewards/rejected": -12.495652198791504, "step": 235 }, { "epoch": 0.03213507625272331, "grad_norm": 66.94369496685466, "learning_rate": 2.568707482993197e-07, "logits/chosen": -5.487996578216553, "logits/rejected": -5.347557067871094, "logps/chosen": -0.8969215154647827, "logps/rejected": -0.9316048622131348, "loss": 4.6605, "rewards/accuracies": 0.25, "rewards/chosen": -8.96921443939209, "rewards/margins": 0.34683430194854736, "rewards/rejected": -9.316048622131348, "step": 236 }, { "epoch": 0.03227124183006536, "grad_norm": 69.2373029335222, "learning_rate": 2.5795918367346935e-07, "logits/chosen": -5.951333999633789, "logits/rejected": -6.019161224365234, "logps/chosen": -1.0771560668945312, "logps/rejected": -0.8683271408081055, "loss": 5.1164, "rewards/accuracies": 0.25, "rewards/chosen": -10.771560668945312, "rewards/margins": -2.088289260864258, "rewards/rejected": -8.683271408081055, "step": 237 }, { "epoch": 0.032407407407407406, "grad_norm": 95.09701899094627, "learning_rate": 2.5904761904761905e-07, "logits/chosen": -4.775607585906982, "logits/rejected": -4.902871131896973, "logps/chosen": -1.538007378578186, "logps/rejected": -1.1899385452270508, "loss": 6.3959, "rewards/accuracies": 0.5, "rewards/chosen": -15.380073547363281, "rewards/margins": -3.48068904876709, "rewards/rejected": -11.899385452270508, "step": 238 }, { "epoch": 0.032543572984749455, "grad_norm": 117.22008093150545, "learning_rate": 2.601360544217687e-07, "logits/chosen": -5.904189586639404, "logits/rejected": -4.84073543548584, "logps/chosen": -1.3049538135528564, "logps/rejected": -1.7636070251464844, "loss": 5.2874, "rewards/accuracies": 0.75, "rewards/chosen": -13.049537658691406, "rewards/margins": 4.5865325927734375, "rewards/rejected": -17.636070251464844, "step": 239 }, { "epoch": 0.032679738562091505, "grad_norm": 96.66844165249017, "learning_rate": 2.6122448979591836e-07, "logits/chosen": -5.6741790771484375, "logits/rejected": -5.74574089050293, "logps/chosen": -1.1981593370437622, "logps/rejected": -1.5953609943389893, "loss": 5.3706, "rewards/accuracies": 0.5, "rewards/chosen": -11.981593132019043, "rewards/margins": 3.9720165729522705, "rewards/rejected": -15.953609466552734, "step": 240 }, { "epoch": 0.03281590413943355, "grad_norm": 63.68893045904735, "learning_rate": 2.62312925170068e-07, "logits/chosen": -6.569314002990723, "logits/rejected": -5.0147905349731445, "logps/chosen": -1.0044927597045898, "logps/rejected": -1.1354849338531494, "loss": 4.0248, "rewards/accuracies": 0.75, "rewards/chosen": -10.044926643371582, "rewards/margins": 1.309922695159912, "rewards/rejected": -11.354848861694336, "step": 241 }, { "epoch": 0.0329520697167756, "grad_norm": 73.3670993478221, "learning_rate": 2.6340136054421766e-07, "logits/chosen": -5.521923542022705, "logits/rejected": -5.294943809509277, "logps/chosen": -1.1957913637161255, "logps/rejected": -0.9369980692863464, "loss": 5.1792, "rewards/accuracies": 0.5, "rewards/chosen": -11.957913398742676, "rewards/margins": -2.587932825088501, "rewards/rejected": -9.369979858398438, "step": 242 }, { "epoch": 0.03308823529411765, "grad_norm": 67.84197497193351, "learning_rate": 2.6448979591836737e-07, "logits/chosen": -6.474400043487549, "logits/rejected": -5.056929111480713, "logps/chosen": -1.1984574794769287, "logps/rejected": -1.4924933910369873, "loss": 4.6672, "rewards/accuracies": 0.75, "rewards/chosen": -11.984574317932129, "rewards/margins": 2.940359354019165, "rewards/rejected": -14.924932479858398, "step": 243 }, { "epoch": 0.0332244008714597, "grad_norm": 57.308658956394545, "learning_rate": 2.65578231292517e-07, "logits/chosen": -5.604331970214844, "logits/rejected": -5.682088851928711, "logps/chosen": -1.3401546478271484, "logps/rejected": -1.7893145084381104, "loss": 4.4413, "rewards/accuracies": 0.75, "rewards/chosen": -13.401546478271484, "rewards/margins": 4.491598606109619, "rewards/rejected": -17.893146514892578, "step": 244 }, { "epoch": 0.03336056644880174, "grad_norm": 90.77730958492798, "learning_rate": 2.666666666666666e-07, "logits/chosen": -6.021014213562012, "logits/rejected": -6.30087947845459, "logps/chosen": -1.0005228519439697, "logps/rejected": -0.9910122156143188, "loss": 5.2992, "rewards/accuracies": 0.5, "rewards/chosen": -10.005228996276855, "rewards/margins": -0.09510648250579834, "rewards/rejected": -9.91012191772461, "step": 245 }, { "epoch": 0.03349673202614379, "grad_norm": 63.54756644605835, "learning_rate": 2.677551020408163e-07, "logits/chosen": -6.363530158996582, "logits/rejected": -5.341102600097656, "logps/chosen": -1.0328192710876465, "logps/rejected": -1.2250367403030396, "loss": 4.3594, "rewards/accuracies": 0.75, "rewards/chosen": -10.328191757202148, "rewards/margins": 1.922175645828247, "rewards/rejected": -12.250368118286133, "step": 246 }, { "epoch": 0.03363289760348584, "grad_norm": 71.25722896478057, "learning_rate": 2.68843537414966e-07, "logits/chosen": -5.572047233581543, "logits/rejected": -5.132349014282227, "logps/chosen": -1.0486743450164795, "logps/rejected": -1.3887054920196533, "loss": 4.8229, "rewards/accuracies": 0.5, "rewards/chosen": -10.486742973327637, "rewards/margins": 3.400311231613159, "rewards/rejected": -13.887054443359375, "step": 247 }, { "epoch": 0.03376906318082789, "grad_norm": 48.25483382944756, "learning_rate": 2.6993197278911563e-07, "logits/chosen": -5.587971210479736, "logits/rejected": -4.888079643249512, "logps/chosen": -1.018635630607605, "logps/rejected": -1.04044771194458, "loss": 3.9453, "rewards/accuracies": 0.5, "rewards/chosen": -10.186356544494629, "rewards/margins": 0.21812069416046143, "rewards/rejected": -10.4044771194458, "step": 248 }, { "epoch": 0.03390522875816993, "grad_norm": 58.15812929388971, "learning_rate": 2.710204081632653e-07, "logits/chosen": -6.11392879486084, "logits/rejected": -5.03648567199707, "logps/chosen": -1.0052895545959473, "logps/rejected": -1.0711268186569214, "loss": 4.6013, "rewards/accuracies": 0.5, "rewards/chosen": -10.052896499633789, "rewards/margins": 0.6583713293075562, "rewards/rejected": -10.711267471313477, "step": 249 }, { "epoch": 0.03404139433551198, "grad_norm": 65.30866071072909, "learning_rate": 2.7210884353741493e-07, "logits/chosen": -6.044029235839844, "logits/rejected": -5.183504104614258, "logps/chosen": -1.089360237121582, "logps/rejected": -1.3705317974090576, "loss": 4.4488, "rewards/accuracies": 0.75, "rewards/chosen": -10.89360237121582, "rewards/margins": 2.811715602874756, "rewards/rejected": -13.705318450927734, "step": 250 }, { "epoch": 0.03417755991285403, "grad_norm": 57.4053048100066, "learning_rate": 2.7319727891156464e-07, "logits/chosen": -6.623503684997559, "logits/rejected": -5.899845600128174, "logps/chosen": -1.10856032371521, "logps/rejected": -1.375737190246582, "loss": 4.9093, "rewards/accuracies": 0.5, "rewards/chosen": -11.085603713989258, "rewards/margins": 2.6717677116394043, "rewards/rejected": -13.757370948791504, "step": 251 }, { "epoch": 0.03431372549019608, "grad_norm": 68.93087216361234, "learning_rate": 2.742857142857143e-07, "logits/chosen": -4.927043914794922, "logits/rejected": -4.6741533279418945, "logps/chosen": -1.3462625741958618, "logps/rejected": -1.3670933246612549, "loss": 5.0532, "rewards/accuracies": 0.5, "rewards/chosen": -13.462625503540039, "rewards/margins": 0.20830845832824707, "rewards/rejected": -13.670934677124023, "step": 252 }, { "epoch": 0.034449891067538124, "grad_norm": 78.19863175129578, "learning_rate": 2.7537414965986394e-07, "logits/chosen": -9.363874435424805, "logits/rejected": -7.845788955688477, "logps/chosen": -0.8910379409790039, "logps/rejected": -1.08247709274292, "loss": 5.3969, "rewards/accuracies": 0.5, "rewards/chosen": -8.910379409790039, "rewards/margins": 1.914391040802002, "rewards/rejected": -10.824769973754883, "step": 253 }, { "epoch": 0.034586056644880174, "grad_norm": 98.14738407838406, "learning_rate": 2.764625850340136e-07, "logits/chosen": -5.11147403717041, "logits/rejected": -4.387681484222412, "logps/chosen": -1.1952247619628906, "logps/rejected": -1.1681249141693115, "loss": 5.6966, "rewards/accuracies": 0.5, "rewards/chosen": -11.95224666595459, "rewards/margins": -0.27099788188934326, "rewards/rejected": -11.681249618530273, "step": 254 }, { "epoch": 0.034722222222222224, "grad_norm": 77.57095842737702, "learning_rate": 2.7755102040816325e-07, "logits/chosen": -4.062272071838379, "logits/rejected": -5.8715410232543945, "logps/chosen": -1.1749825477600098, "logps/rejected": -1.3008275032043457, "loss": 5.1191, "rewards/accuracies": 0.5, "rewards/chosen": -11.749824523925781, "rewards/margins": 1.2584497928619385, "rewards/rejected": -13.00827407836914, "step": 255 }, { "epoch": 0.034858387799564274, "grad_norm": 88.12204704195358, "learning_rate": 2.786394557823129e-07, "logits/chosen": -4.978282928466797, "logits/rejected": -5.032160758972168, "logps/chosen": -1.1378146409988403, "logps/rejected": -1.1246583461761475, "loss": 3.9981, "rewards/accuracies": 0.5, "rewards/chosen": -11.37814712524414, "rewards/margins": -0.13156390190124512, "rewards/rejected": -11.246583938598633, "step": 256 }, { "epoch": 0.034994553376906316, "grad_norm": 73.8351288098107, "learning_rate": 2.797278911564626e-07, "logits/chosen": -6.188664436340332, "logits/rejected": -6.1422576904296875, "logps/chosen": -1.3975176811218262, "logps/rejected": -1.3318121433258057, "loss": 4.9609, "rewards/accuracies": 0.25, "rewards/chosen": -13.975176811218262, "rewards/margins": -0.6570560932159424, "rewards/rejected": -13.318120956420898, "step": 257 }, { "epoch": 0.035130718954248366, "grad_norm": 77.04059209661084, "learning_rate": 2.808163265306122e-07, "logits/chosen": -3.5865583419799805, "logits/rejected": -5.211180210113525, "logps/chosen": -1.1150599718093872, "logps/rejected": -1.1279733180999756, "loss": 4.7276, "rewards/accuracies": 0.75, "rewards/chosen": -11.150598526000977, "rewards/margins": 0.1291351318359375, "rewards/rejected": -11.279733657836914, "step": 258 }, { "epoch": 0.035266884531590416, "grad_norm": 66.66712116757404, "learning_rate": 2.819047619047619e-07, "logits/chosen": -5.319080352783203, "logits/rejected": -4.613959789276123, "logps/chosen": -1.2092119455337524, "logps/rejected": -1.38112473487854, "loss": 4.4729, "rewards/accuracies": 0.25, "rewards/chosen": -12.092119216918945, "rewards/margins": 1.7191277742385864, "rewards/rejected": -13.811246871948242, "step": 259 }, { "epoch": 0.03540305010893246, "grad_norm": 53.26694010917725, "learning_rate": 2.8299319727891156e-07, "logits/chosen": -7.482570648193359, "logits/rejected": -6.389812469482422, "logps/chosen": -1.184424877166748, "logps/rejected": -1.3781447410583496, "loss": 4.2651, "rewards/accuracies": 0.5, "rewards/chosen": -11.844247817993164, "rewards/margins": 1.9371987581253052, "rewards/rejected": -13.78144645690918, "step": 260 }, { "epoch": 0.03553921568627451, "grad_norm": 54.176787649214354, "learning_rate": 2.840816326530612e-07, "logits/chosen": -5.978750228881836, "logits/rejected": -5.551822662353516, "logps/chosen": -0.9772324562072754, "logps/rejected": -1.0748043060302734, "loss": 4.8221, "rewards/accuracies": 1.0, "rewards/chosen": -9.772324562072754, "rewards/margins": 0.9757180213928223, "rewards/rejected": -10.748042106628418, "step": 261 }, { "epoch": 0.03567538126361656, "grad_norm": 66.664806814064, "learning_rate": 2.8517006802721087e-07, "logits/chosen": -5.673116683959961, "logits/rejected": -5.13516902923584, "logps/chosen": -1.2682722806930542, "logps/rejected": -1.5272258520126343, "loss": 4.9832, "rewards/accuracies": 0.75, "rewards/chosen": -12.682723045349121, "rewards/margins": 2.5895354747772217, "rewards/rejected": -15.272258758544922, "step": 262 }, { "epoch": 0.03581154684095861, "grad_norm": 59.91748095731296, "learning_rate": 2.862585034013605e-07, "logits/chosen": -6.447942733764648, "logits/rejected": -5.6805524826049805, "logps/chosen": -0.9962539076805115, "logps/rejected": -1.2909497022628784, "loss": 4.461, "rewards/accuracies": 0.75, "rewards/chosen": -9.962538719177246, "rewards/margins": 2.946958541870117, "rewards/rejected": -12.909497261047363, "step": 263 }, { "epoch": 0.03594771241830065, "grad_norm": 55.97479846461005, "learning_rate": 2.8734693877551017e-07, "logits/chosen": -5.6748762130737305, "logits/rejected": -5.556204319000244, "logps/chosen": -1.0425621271133423, "logps/rejected": -0.7304791808128357, "loss": 4.7079, "rewards/accuracies": 0.25, "rewards/chosen": -10.425621032714844, "rewards/margins": -3.1208291053771973, "rewards/rejected": -7.3047919273376465, "step": 264 }, { "epoch": 0.0360838779956427, "grad_norm": 61.16058931205353, "learning_rate": 2.884353741496599e-07, "logits/chosen": -6.118993759155273, "logits/rejected": -5.219854354858398, "logps/chosen": -1.3427917957305908, "logps/rejected": -1.3493716716766357, "loss": 4.8154, "rewards/accuracies": 0.25, "rewards/chosen": -13.42791748046875, "rewards/margins": 0.06579875946044922, "rewards/rejected": -13.493717193603516, "step": 265 }, { "epoch": 0.03622004357298475, "grad_norm": 63.515818854204575, "learning_rate": 2.8952380952380953e-07, "logits/chosen": -5.471975326538086, "logits/rejected": -5.962667465209961, "logps/chosen": -1.1339610815048218, "logps/rejected": -0.893965482711792, "loss": 4.9408, "rewards/accuracies": 0.25, "rewards/chosen": -11.339611053466797, "rewards/margins": -2.3999555110931396, "rewards/rejected": -8.939655303955078, "step": 266 }, { "epoch": 0.0363562091503268, "grad_norm": 89.28554828495842, "learning_rate": 2.906122448979592e-07, "logits/chosen": -5.867156028747559, "logits/rejected": -5.653191566467285, "logps/chosen": -1.7523956298828125, "logps/rejected": -1.2520313262939453, "loss": 5.7334, "rewards/accuracies": 0.0, "rewards/chosen": -17.523956298828125, "rewards/margins": -5.003640174865723, "rewards/rejected": -12.52031421661377, "step": 267 }, { "epoch": 0.03649237472766884, "grad_norm": 60.89498108842642, "learning_rate": 2.9170068027210883e-07, "logits/chosen": -7.2450761795043945, "logits/rejected": -7.492772102355957, "logps/chosen": -0.693792998790741, "logps/rejected": -0.5410770177841187, "loss": 4.2041, "rewards/accuracies": 0.5, "rewards/chosen": -6.937929630279541, "rewards/margins": -1.5271596908569336, "rewards/rejected": -5.410769939422607, "step": 268 }, { "epoch": 0.03662854030501089, "grad_norm": 73.40782375570238, "learning_rate": 2.927891156462585e-07, "logits/chosen": -5.03182315826416, "logits/rejected": -5.203419208526611, "logps/chosen": -0.8716521263122559, "logps/rejected": -1.2112388610839844, "loss": 5.1114, "rewards/accuracies": 0.5, "rewards/chosen": -8.716522216796875, "rewards/margins": 3.3958678245544434, "rewards/rejected": -12.11238956451416, "step": 269 }, { "epoch": 0.03676470588235294, "grad_norm": 82.4989027353636, "learning_rate": 2.9387755102040814e-07, "logits/chosen": -5.914813041687012, "logits/rejected": -6.236295700073242, "logps/chosen": -1.0074737071990967, "logps/rejected": -1.2283413410186768, "loss": 4.345, "rewards/accuracies": 0.75, "rewards/chosen": -10.074736595153809, "rewards/margins": 2.208677291870117, "rewards/rejected": -12.283413887023926, "step": 270 }, { "epoch": 0.03690087145969499, "grad_norm": 56.683897816670466, "learning_rate": 2.949659863945578e-07, "logits/chosen": -5.1891303062438965, "logits/rejected": -4.342794418334961, "logps/chosen": -1.2542762756347656, "logps/rejected": -1.4795784950256348, "loss": 4.9206, "rewards/accuracies": 0.75, "rewards/chosen": -12.542762756347656, "rewards/margins": 2.253021717071533, "rewards/rejected": -14.795784950256348, "step": 271 }, { "epoch": 0.037037037037037035, "grad_norm": 73.00412768811887, "learning_rate": 2.9605442176870744e-07, "logits/chosen": -5.744339942932129, "logits/rejected": -4.7394890785217285, "logps/chosen": -1.1511955261230469, "logps/rejected": -1.0473639965057373, "loss": 4.8546, "rewards/accuracies": 0.5, "rewards/chosen": -11.511956214904785, "rewards/margins": -1.0383156538009644, "rewards/rejected": -10.473640441894531, "step": 272 }, { "epoch": 0.037173202614379085, "grad_norm": 47.943558420834904, "learning_rate": 2.9714285714285715e-07, "logits/chosen": -4.224227428436279, "logits/rejected": -4.2890095710754395, "logps/chosen": -1.323521614074707, "logps/rejected": -1.2335691452026367, "loss": 5.0834, "rewards/accuracies": 0.5, "rewards/chosen": -13.23521614074707, "rewards/margins": -0.899524450302124, "rewards/rejected": -12.335691452026367, "step": 273 }, { "epoch": 0.037309368191721135, "grad_norm": 45.23249096934942, "learning_rate": 2.982312925170068e-07, "logits/chosen": -5.175953388214111, "logits/rejected": -5.670047760009766, "logps/chosen": -1.3690056800842285, "logps/rejected": -1.2710869312286377, "loss": 4.4505, "rewards/accuracies": 0.75, "rewards/chosen": -13.690056800842285, "rewards/margins": -0.9791874885559082, "rewards/rejected": -12.710868835449219, "step": 274 }, { "epoch": 0.037445533769063184, "grad_norm": 101.81107993656474, "learning_rate": 2.9931972789115645e-07, "logits/chosen": -6.438827991485596, "logits/rejected": -4.477993488311768, "logps/chosen": -1.4407565593719482, "logps/rejected": -2.0651755332946777, "loss": 5.1641, "rewards/accuracies": 0.5, "rewards/chosen": -14.40756607055664, "rewards/margins": 6.24418830871582, "rewards/rejected": -20.65175437927246, "step": 275 }, { "epoch": 0.03758169934640523, "grad_norm": 50.52675591009575, "learning_rate": 3.004081632653061e-07, "logits/chosen": -6.486952781677246, "logits/rejected": -6.977458953857422, "logps/chosen": -1.0360772609710693, "logps/rejected": -1.4686963558197021, "loss": 4.6235, "rewards/accuracies": 0.5, "rewards/chosen": -10.360772132873535, "rewards/margins": 4.326190948486328, "rewards/rejected": -14.686963081359863, "step": 276 }, { "epoch": 0.03771786492374728, "grad_norm": 59.04572550867243, "learning_rate": 3.0149659863945576e-07, "logits/chosen": -6.62136173248291, "logits/rejected": -6.684057235717773, "logps/chosen": -1.1884666681289673, "logps/rejected": -1.4418880939483643, "loss": 5.0253, "rewards/accuracies": 0.75, "rewards/chosen": -11.88466739654541, "rewards/margins": 2.534213066101074, "rewards/rejected": -14.418880462646484, "step": 277 }, { "epoch": 0.03785403050108933, "grad_norm": 51.474232343825726, "learning_rate": 3.0258503401360546e-07, "logits/chosen": -4.66886043548584, "logits/rejected": -4.881121635437012, "logps/chosen": -1.4682432413101196, "logps/rejected": -0.8505520820617676, "loss": 4.6748, "rewards/accuracies": 0.0, "rewards/chosen": -14.682432174682617, "rewards/margins": -6.176910877227783, "rewards/rejected": -8.505520820617676, "step": 278 }, { "epoch": 0.03799019607843137, "grad_norm": 59.46845057819673, "learning_rate": 3.0367346938775506e-07, "logits/chosen": -4.250846862792969, "logits/rejected": -4.090045928955078, "logps/chosen": -0.9794827699661255, "logps/rejected": -0.9620204567909241, "loss": 5.0954, "rewards/accuracies": 0.5, "rewards/chosen": -9.79482650756836, "rewards/margins": -0.1746230125427246, "rewards/rejected": -9.62020492553711, "step": 279 }, { "epoch": 0.03812636165577342, "grad_norm": 81.45401538772694, "learning_rate": 3.047619047619047e-07, "logits/chosen": -5.700243949890137, "logits/rejected": -5.550249099731445, "logps/chosen": -1.2983471155166626, "logps/rejected": -1.1378917694091797, "loss": 5.4449, "rewards/accuracies": 0.25, "rewards/chosen": -12.983470916748047, "rewards/margins": -1.604552984237671, "rewards/rejected": -11.378917694091797, "step": 280 }, { "epoch": 0.03826252723311547, "grad_norm": 51.746731100736916, "learning_rate": 3.058503401360544e-07, "logits/chosen": -5.497616767883301, "logits/rejected": -5.22199010848999, "logps/chosen": -1.2074272632598877, "logps/rejected": -1.392014503479004, "loss": 4.2037, "rewards/accuracies": 0.5, "rewards/chosen": -12.074272155761719, "rewards/margins": 1.845873475074768, "rewards/rejected": -13.920145988464355, "step": 281 }, { "epoch": 0.03839869281045752, "grad_norm": 81.42446689293689, "learning_rate": 3.0693877551020407e-07, "logits/chosen": -5.306422710418701, "logits/rejected": -7.0699462890625, "logps/chosen": -1.1654083728790283, "logps/rejected": -1.1180654764175415, "loss": 4.2958, "rewards/accuracies": 0.5, "rewards/chosen": -11.654083251953125, "rewards/margins": -0.4734284281730652, "rewards/rejected": -11.180654525756836, "step": 282 }, { "epoch": 0.03853485838779956, "grad_norm": 85.28601029015509, "learning_rate": 3.080272108843537e-07, "logits/chosen": -5.311562538146973, "logits/rejected": -3.963003158569336, "logps/chosen": -1.0550737380981445, "logps/rejected": -1.9785308837890625, "loss": 5.7733, "rewards/accuracies": 1.0, "rewards/chosen": -10.550737380981445, "rewards/margins": 9.23457145690918, "rewards/rejected": -19.785308837890625, "step": 283 }, { "epoch": 0.03867102396514161, "grad_norm": 82.89479518815811, "learning_rate": 3.091156462585034e-07, "logits/chosen": -4.117852687835693, "logits/rejected": -3.5049514770507812, "logps/chosen": -1.2195240259170532, "logps/rejected": -1.4119523763656616, "loss": 5.0577, "rewards/accuracies": 0.25, "rewards/chosen": -12.195240020751953, "rewards/margins": 1.924283504486084, "rewards/rejected": -14.119524002075195, "step": 284 }, { "epoch": 0.03880718954248366, "grad_norm": 60.26087213923758, "learning_rate": 3.1020408163265303e-07, "logits/chosen": -7.015053749084473, "logits/rejected": -6.443515777587891, "logps/chosen": -0.6964254379272461, "logps/rejected": -1.0593734979629517, "loss": 4.6616, "rewards/accuracies": 0.5, "rewards/chosen": -6.964254379272461, "rewards/margins": 3.6294798851013184, "rewards/rejected": -10.593734741210938, "step": 285 }, { "epoch": 0.03894335511982571, "grad_norm": 76.71687439288748, "learning_rate": 3.1129251700680274e-07, "logits/chosen": -5.824742317199707, "logits/rejected": -5.608431816101074, "logps/chosen": -1.022223949432373, "logps/rejected": -1.494560718536377, "loss": 5.3093, "rewards/accuracies": 0.75, "rewards/chosen": -10.222238540649414, "rewards/margins": 4.723368167877197, "rewards/rejected": -14.945606231689453, "step": 286 }, { "epoch": 0.039079520697167754, "grad_norm": 48.11281649992343, "learning_rate": 3.123809523809524e-07, "logits/chosen": -6.561643600463867, "logits/rejected": -5.773690223693848, "logps/chosen": -1.1647915840148926, "logps/rejected": -1.3229095935821533, "loss": 4.3772, "rewards/accuracies": 0.5, "rewards/chosen": -11.647916793823242, "rewards/margins": 1.581178903579712, "rewards/rejected": -13.229095458984375, "step": 287 }, { "epoch": 0.0392156862745098, "grad_norm": 101.91462629033805, "learning_rate": 3.13469387755102e-07, "logits/chosen": -6.1120195388793945, "logits/rejected": -4.842982769012451, "logps/chosen": -1.114365577697754, "logps/rejected": -1.148498773574829, "loss": 5.1493, "rewards/accuracies": 0.5, "rewards/chosen": -11.143655776977539, "rewards/margins": 0.34133267402648926, "rewards/rejected": -11.48498821258545, "step": 288 }, { "epoch": 0.03935185185185185, "grad_norm": 58.586709869111985, "learning_rate": 3.145578231292517e-07, "logits/chosen": -5.140995979309082, "logits/rejected": -5.744998931884766, "logps/chosen": -1.2991633415222168, "logps/rejected": -0.978131890296936, "loss": 4.7759, "rewards/accuracies": 0.25, "rewards/chosen": -12.991634368896484, "rewards/margins": -3.2103145122528076, "rewards/rejected": -9.781318664550781, "step": 289 }, { "epoch": 0.0394880174291939, "grad_norm": 85.71902529513093, "learning_rate": 3.1564625850340134e-07, "logits/chosen": -6.510519504547119, "logits/rejected": -5.653317451477051, "logps/chosen": -0.7901195287704468, "logps/rejected": -0.8980304598808289, "loss": 5.4417, "rewards/accuracies": 0.5, "rewards/chosen": -7.901196002960205, "rewards/margins": 1.0791089534759521, "rewards/rejected": -8.980304718017578, "step": 290 }, { "epoch": 0.039624183006535946, "grad_norm": 63.6967584546626, "learning_rate": 3.16734693877551e-07, "logits/chosen": -5.364646911621094, "logits/rejected": -5.718958854675293, "logps/chosen": -0.8741910457611084, "logps/rejected": -0.9376577138900757, "loss": 5.5728, "rewards/accuracies": 0.5, "rewards/chosen": -8.741909980773926, "rewards/margins": 0.634666919708252, "rewards/rejected": -9.376577377319336, "step": 291 }, { "epoch": 0.039760348583877995, "grad_norm": 124.29152590483952, "learning_rate": 3.1782312925170065e-07, "logits/chosen": -6.3695173263549805, "logits/rejected": -6.648038864135742, "logps/chosen": -1.1529583930969238, "logps/rejected": -0.9785535335540771, "loss": 4.5874, "rewards/accuracies": 0.25, "rewards/chosen": -11.529583930969238, "rewards/margins": -1.7440489530563354, "rewards/rejected": -9.78553581237793, "step": 292 }, { "epoch": 0.039896514161220045, "grad_norm": 63.53746644444534, "learning_rate": 3.189115646258503e-07, "logits/chosen": -4.745383262634277, "logits/rejected": -4.805789947509766, "logps/chosen": -1.211514949798584, "logps/rejected": -1.1108471155166626, "loss": 4.4226, "rewards/accuracies": 0.5, "rewards/chosen": -12.115150451660156, "rewards/margins": -1.006678819656372, "rewards/rejected": -11.108470916748047, "step": 293 }, { "epoch": 0.04003267973856209, "grad_norm": 71.56180968292765, "learning_rate": 3.2e-07, "logits/chosen": -4.7049102783203125, "logits/rejected": -4.994217872619629, "logps/chosen": -1.4565553665161133, "logps/rejected": -1.3495374917984009, "loss": 4.8107, "rewards/accuracies": 0.5, "rewards/chosen": -14.56555461883545, "rewards/margins": -1.0701792240142822, "rewards/rejected": -13.49537467956543, "step": 294 }, { "epoch": 0.04016884531590414, "grad_norm": 64.27970013723564, "learning_rate": 3.2108843537414966e-07, "logits/chosen": -3.9368350505828857, "logits/rejected": -4.138067245483398, "logps/chosen": -1.4147156476974487, "logps/rejected": -1.2015376091003418, "loss": 4.8499, "rewards/accuracies": 0.25, "rewards/chosen": -14.147156715393066, "rewards/margins": -2.1317806243896484, "rewards/rejected": -12.015376091003418, "step": 295 }, { "epoch": 0.04030501089324619, "grad_norm": 69.40538567174809, "learning_rate": 3.221768707482993e-07, "logits/chosen": -6.214529991149902, "logits/rejected": -4.970919609069824, "logps/chosen": -0.8570600152015686, "logps/rejected": -1.350068211555481, "loss": 4.6123, "rewards/accuracies": 0.75, "rewards/chosen": -8.570600509643555, "rewards/margins": 4.930081367492676, "rewards/rejected": -13.500682830810547, "step": 296 }, { "epoch": 0.04044117647058824, "grad_norm": 62.912887250279155, "learning_rate": 3.2326530612244896e-07, "logits/chosen": -5.182707786560059, "logits/rejected": -5.297575950622559, "logps/chosen": -1.0879995822906494, "logps/rejected": -1.397923231124878, "loss": 4.6399, "rewards/accuracies": 0.75, "rewards/chosen": -10.879995346069336, "rewards/margins": 3.099235773086548, "rewards/rejected": -13.979232788085938, "step": 297 }, { "epoch": 0.04057734204793028, "grad_norm": 56.44378818566525, "learning_rate": 3.243537414965986e-07, "logits/chosen": -7.36495304107666, "logits/rejected": -6.3211565017700195, "logps/chosen": -0.8207796812057495, "logps/rejected": -1.0906308889389038, "loss": 4.9065, "rewards/accuracies": 0.75, "rewards/chosen": -8.207796096801758, "rewards/margins": 2.698512315750122, "rewards/rejected": -10.906309127807617, "step": 298 }, { "epoch": 0.04071350762527233, "grad_norm": 75.23329800036315, "learning_rate": 3.2544217687074827e-07, "logits/chosen": -5.045053482055664, "logits/rejected": -4.893669128417969, "logps/chosen": -1.4212563037872314, "logps/rejected": -1.2317540645599365, "loss": 5.4285, "rewards/accuracies": 0.5, "rewards/chosen": -14.212562561035156, "rewards/margins": -1.8950209617614746, "rewards/rejected": -12.317541122436523, "step": 299 }, { "epoch": 0.04084967320261438, "grad_norm": 53.10982447079129, "learning_rate": 3.26530612244898e-07, "logits/chosen": -4.828580379486084, "logits/rejected": -4.032795429229736, "logps/chosen": -1.0820941925048828, "logps/rejected": -1.4866735935211182, "loss": 5.3607, "rewards/accuracies": 0.5, "rewards/chosen": -10.820941925048828, "rewards/margins": 4.045794486999512, "rewards/rejected": -14.86673641204834, "step": 300 }, { "epoch": 0.04098583877995643, "grad_norm": 66.96904129367707, "learning_rate": 3.2761904761904757e-07, "logits/chosen": -4.872203826904297, "logits/rejected": -4.1192426681518555, "logps/chosen": -1.389347791671753, "logps/rejected": -2.259610652923584, "loss": 4.5198, "rewards/accuracies": 0.75, "rewards/chosen": -13.893478393554688, "rewards/margins": 8.702627182006836, "rewards/rejected": -22.596105575561523, "step": 301 }, { "epoch": 0.04112200435729847, "grad_norm": 73.52684445258411, "learning_rate": 3.287074829931973e-07, "logits/chosen": -8.252922058105469, "logits/rejected": -6.003087043762207, "logps/chosen": -0.6266273856163025, "logps/rejected": -0.892977237701416, "loss": 5.399, "rewards/accuracies": 1.0, "rewards/chosen": -6.2662739753723145, "rewards/margins": 2.6634984016418457, "rewards/rejected": -8.92977237701416, "step": 302 }, { "epoch": 0.04125816993464052, "grad_norm": 64.7885280541169, "learning_rate": 3.2979591836734693e-07, "logits/chosen": -5.001124858856201, "logits/rejected": -5.295206546783447, "logps/chosen": -1.039398193359375, "logps/rejected": -1.0232157707214355, "loss": 4.858, "rewards/accuracies": 0.5, "rewards/chosen": -10.39398193359375, "rewards/margins": -0.16182303428649902, "rewards/rejected": -10.232158660888672, "step": 303 }, { "epoch": 0.04139433551198257, "grad_norm": 53.44314392064221, "learning_rate": 3.308843537414966e-07, "logits/chosen": -3.8118338584899902, "logits/rejected": -3.767061471939087, "logps/chosen": -1.2660624980926514, "logps/rejected": -1.5525192022323608, "loss": 4.3837, "rewards/accuracies": 0.75, "rewards/chosen": -12.660625457763672, "rewards/margins": 2.8645665645599365, "rewards/rejected": -15.525192260742188, "step": 304 }, { "epoch": 0.04153050108932462, "grad_norm": 111.33391115443968, "learning_rate": 3.3197278911564624e-07, "logits/chosen": -5.365604400634766, "logits/rejected": -4.919011116027832, "logps/chosen": -0.9577054977416992, "logps/rejected": -1.2963093519210815, "loss": 5.3884, "rewards/accuracies": 0.75, "rewards/chosen": -9.577054977416992, "rewards/margins": 3.386038064956665, "rewards/rejected": -12.963092803955078, "step": 305 }, { "epoch": 0.041666666666666664, "grad_norm": 68.26294673178148, "learning_rate": 3.330612244897959e-07, "logits/chosen": -5.444345951080322, "logits/rejected": -5.243512153625488, "logps/chosen": -1.289650797843933, "logps/rejected": -1.1728914976119995, "loss": 4.8367, "rewards/accuracies": 0.25, "rewards/chosen": -12.89650821685791, "rewards/margins": -1.167593240737915, "rewards/rejected": -11.728914260864258, "step": 306 }, { "epoch": 0.041802832244008714, "grad_norm": 45.83251318701521, "learning_rate": 3.3414965986394554e-07, "logits/chosen": -5.622945785522461, "logits/rejected": -4.702096462249756, "logps/chosen": -0.8104870319366455, "logps/rejected": -1.3166258335113525, "loss": 4.1325, "rewards/accuracies": 1.0, "rewards/chosen": -8.104870796203613, "rewards/margins": 5.0613861083984375, "rewards/rejected": -13.16625690460205, "step": 307 }, { "epoch": 0.041938997821350764, "grad_norm": 65.01352811772412, "learning_rate": 3.3523809523809525e-07, "logits/chosen": -4.006753921508789, "logits/rejected": -4.324404239654541, "logps/chosen": -1.6145024299621582, "logps/rejected": -1.6190104484558105, "loss": 4.6983, "rewards/accuracies": 0.75, "rewards/chosen": -16.1450252532959, "rewards/margins": 0.04508066177368164, "rewards/rejected": -16.190105438232422, "step": 308 }, { "epoch": 0.042075163398692814, "grad_norm": 73.40716928272855, "learning_rate": 3.363265306122449e-07, "logits/chosen": -4.923980712890625, "logits/rejected": -4.357913017272949, "logps/chosen": -1.2101860046386719, "logps/rejected": -1.0742626190185547, "loss": 4.9399, "rewards/accuracies": 0.25, "rewards/chosen": -12.101859092712402, "rewards/margins": -1.3592329025268555, "rewards/rejected": -10.742626190185547, "step": 309 }, { "epoch": 0.042211328976034856, "grad_norm": 86.64242989388097, "learning_rate": 3.3741496598639455e-07, "logits/chosen": -5.090981960296631, "logits/rejected": -4.790233135223389, "logps/chosen": -1.2187621593475342, "logps/rejected": -1.46649968624115, "loss": 5.7045, "rewards/accuracies": 0.5, "rewards/chosen": -12.1876220703125, "rewards/margins": 2.47737455368042, "rewards/rejected": -14.664997100830078, "step": 310 }, { "epoch": 0.042347494553376906, "grad_norm": 84.52761647877323, "learning_rate": 3.385034013605442e-07, "logits/chosen": -4.32208251953125, "logits/rejected": -4.260677337646484, "logps/chosen": -1.5720665454864502, "logps/rejected": -1.6891428232192993, "loss": 4.703, "rewards/accuracies": 0.5, "rewards/chosen": -15.720664978027344, "rewards/margins": 1.1707630157470703, "rewards/rejected": -16.891427993774414, "step": 311 }, { "epoch": 0.042483660130718956, "grad_norm": 56.415207724586466, "learning_rate": 3.3959183673469385e-07, "logits/chosen": -4.906296730041504, "logits/rejected": -3.4032950401306152, "logps/chosen": -1.1015807390213013, "logps/rejected": -1.6876873970031738, "loss": 4.575, "rewards/accuracies": 1.0, "rewards/chosen": -11.015806198120117, "rewards/margins": 5.861067771911621, "rewards/rejected": -16.876874923706055, "step": 312 }, { "epoch": 0.042619825708061, "grad_norm": 112.02772938904974, "learning_rate": 3.4068027210884356e-07, "logits/chosen": -5.677821636199951, "logits/rejected": -4.527283668518066, "logps/chosen": -1.5534473657608032, "logps/rejected": -1.644795536994934, "loss": 4.8996, "rewards/accuracies": 0.5, "rewards/chosen": -15.534473419189453, "rewards/margins": 0.9134814739227295, "rewards/rejected": -16.447956085205078, "step": 313 }, { "epoch": 0.04275599128540305, "grad_norm": 67.18467578850294, "learning_rate": 3.4176870748299316e-07, "logits/chosen": -5.308394908905029, "logits/rejected": -5.2597455978393555, "logps/chosen": -0.9776167869567871, "logps/rejected": -1.4059151411056519, "loss": 4.9014, "rewards/accuracies": 1.0, "rewards/chosen": -9.776167869567871, "rewards/margins": 4.282983303070068, "rewards/rejected": -14.059150695800781, "step": 314 }, { "epoch": 0.0428921568627451, "grad_norm": 75.33241825063263, "learning_rate": 3.428571428571428e-07, "logits/chosen": -5.659082412719727, "logits/rejected": -4.746170997619629, "logps/chosen": -1.3662632703781128, "logps/rejected": -1.3791353702545166, "loss": 4.9001, "rewards/accuracies": 0.5, "rewards/chosen": -13.66263198852539, "rewards/margins": 0.12872076034545898, "rewards/rejected": -13.791353225708008, "step": 315 }, { "epoch": 0.04302832244008715, "grad_norm": 56.62935616086173, "learning_rate": 3.439455782312925e-07, "logits/chosen": -4.99162483215332, "logits/rejected": -4.908379554748535, "logps/chosen": -0.9431143999099731, "logps/rejected": -0.9593716263771057, "loss": 4.2346, "rewards/accuracies": 0.5, "rewards/chosen": -9.431144714355469, "rewards/margins": 0.16257143020629883, "rewards/rejected": -9.59371566772461, "step": 316 }, { "epoch": 0.04316448801742919, "grad_norm": 75.45895738515944, "learning_rate": 3.4503401360544217e-07, "logits/chosen": -5.662214279174805, "logits/rejected": -4.625615119934082, "logps/chosen": -2.197333335876465, "logps/rejected": -1.1858606338500977, "loss": 5.417, "rewards/accuracies": 0.5, "rewards/chosen": -21.97333526611328, "rewards/margins": -10.114727973937988, "rewards/rejected": -11.858606338500977, "step": 317 }, { "epoch": 0.04330065359477124, "grad_norm": 69.00190768507453, "learning_rate": 3.461224489795918e-07, "logits/chosen": -6.073953628540039, "logits/rejected": -5.639466762542725, "logps/chosen": -0.8199307322502136, "logps/rejected": -0.9989339113235474, "loss": 6.7057, "rewards/accuracies": 0.75, "rewards/chosen": -8.19930648803711, "rewards/margins": 1.790032148361206, "rewards/rejected": -9.989338874816895, "step": 318 }, { "epoch": 0.04343681917211329, "grad_norm": 47.132847487006686, "learning_rate": 3.472108843537415e-07, "logits/chosen": -5.244969367980957, "logits/rejected": -4.629319190979004, "logps/chosen": -0.9945318102836609, "logps/rejected": -0.9217130541801453, "loss": 5.2587, "rewards/accuracies": 0.5, "rewards/chosen": -9.945318222045898, "rewards/margins": -0.7281875610351562, "rewards/rejected": -9.217130661010742, "step": 319 }, { "epoch": 0.04357298474945534, "grad_norm": 80.67572060200001, "learning_rate": 3.482993197278911e-07, "logits/chosen": -6.84930419921875, "logits/rejected": -5.082592964172363, "logps/chosen": -0.8338396549224854, "logps/rejected": -0.7946495413780212, "loss": 4.6799, "rewards/accuracies": 0.25, "rewards/chosen": -8.338396072387695, "rewards/margins": -0.39190101623535156, "rewards/rejected": -7.946495056152344, "step": 320 }, { "epoch": 0.04370915032679738, "grad_norm": 53.715382075407234, "learning_rate": 3.4938775510204083e-07, "logits/chosen": -4.414486885070801, "logits/rejected": -4.325369834899902, "logps/chosen": -1.1800625324249268, "logps/rejected": -1.2570714950561523, "loss": 4.3018, "rewards/accuracies": 0.75, "rewards/chosen": -11.80062484741211, "rewards/margins": 0.7700886726379395, "rewards/rejected": -12.570714950561523, "step": 321 }, { "epoch": 0.04384531590413943, "grad_norm": 76.3922283044073, "learning_rate": 3.504761904761905e-07, "logits/chosen": -5.061430931091309, "logits/rejected": -5.193187713623047, "logps/chosen": -0.9471786022186279, "logps/rejected": -1.0105222463607788, "loss": 4.7564, "rewards/accuracies": 0.75, "rewards/chosen": -9.471786499023438, "rewards/margins": 0.6334362030029297, "rewards/rejected": -10.10522174835205, "step": 322 }, { "epoch": 0.04398148148148148, "grad_norm": 78.80229104300813, "learning_rate": 3.515646258503401e-07, "logits/chosen": -3.73931884765625, "logits/rejected": -5.390448570251465, "logps/chosen": -1.020543098449707, "logps/rejected": -0.8303165435791016, "loss": 4.9138, "rewards/accuracies": 0.25, "rewards/chosen": -10.20543098449707, "rewards/margins": -1.902264952659607, "rewards/rejected": -8.303165435791016, "step": 323 }, { "epoch": 0.04411764705882353, "grad_norm": 82.43983763027742, "learning_rate": 3.526530612244898e-07, "logits/chosen": -4.164395809173584, "logits/rejected": -4.700032711029053, "logps/chosen": -1.0258872509002686, "logps/rejected": -1.0510218143463135, "loss": 5.5313, "rewards/accuracies": 0.75, "rewards/chosen": -10.25887393951416, "rewards/margins": 0.25134384632110596, "rewards/rejected": -10.510217666625977, "step": 324 }, { "epoch": 0.044253812636165575, "grad_norm": 65.96171987316494, "learning_rate": 3.5374149659863944e-07, "logits/chosen": -4.379007339477539, "logits/rejected": -7.247016906738281, "logps/chosen": -1.0006804466247559, "logps/rejected": -0.7425208687782288, "loss": 4.8815, "rewards/accuracies": 0.5, "rewards/chosen": -10.006805419921875, "rewards/margins": -2.5815958976745605, "rewards/rejected": -7.425209045410156, "step": 325 }, { "epoch": 0.044389978213507625, "grad_norm": 67.045670852833, "learning_rate": 3.5482993197278915e-07, "logits/chosen": -4.473821640014648, "logits/rejected": -5.27337646484375, "logps/chosen": -1.364248275756836, "logps/rejected": -0.941375195980072, "loss": 5.2318, "rewards/accuracies": 0.0, "rewards/chosen": -13.64248275756836, "rewards/margins": -4.228731155395508, "rewards/rejected": -9.413751602172852, "step": 326 }, { "epoch": 0.044526143790849675, "grad_norm": 75.04393921776482, "learning_rate": 3.5591836734693875e-07, "logits/chosen": -5.228488445281982, "logits/rejected": -4.250424385070801, "logps/chosen": -1.8979551792144775, "logps/rejected": -1.6222800016403198, "loss": 4.702, "rewards/accuracies": 0.5, "rewards/chosen": -18.979551315307617, "rewards/margins": -2.756751537322998, "rewards/rejected": -16.222801208496094, "step": 327 }, { "epoch": 0.044662309368191724, "grad_norm": 49.55325983822202, "learning_rate": 3.570068027210884e-07, "logits/chosen": -5.328608989715576, "logits/rejected": -4.373261451721191, "logps/chosen": -1.0470333099365234, "logps/rejected": -1.5911309719085693, "loss": 4.4238, "rewards/accuracies": 0.5, "rewards/chosen": -10.470333099365234, "rewards/margins": 5.440976142883301, "rewards/rejected": -15.911310195922852, "step": 328 }, { "epoch": 0.04479847494553377, "grad_norm": 49.88808470156949, "learning_rate": 3.580952380952381e-07, "logits/chosen": -5.277864456176758, "logits/rejected": -3.787621259689331, "logps/chosen": -0.8987516164779663, "logps/rejected": -1.3171722888946533, "loss": 4.8759, "rewards/accuracies": 1.0, "rewards/chosen": -8.987516403198242, "rewards/margins": 4.184206962585449, "rewards/rejected": -13.171723365783691, "step": 329 }, { "epoch": 0.04493464052287582, "grad_norm": 56.35995975770236, "learning_rate": 3.5918367346938776e-07, "logits/chosen": -4.28376579284668, "logits/rejected": -3.742021083831787, "logps/chosen": -1.1659579277038574, "logps/rejected": -1.6126478910446167, "loss": 4.1243, "rewards/accuracies": 0.75, "rewards/chosen": -11.659578323364258, "rewards/margins": 4.466899394989014, "rewards/rejected": -16.12647819519043, "step": 330 }, { "epoch": 0.04507080610021787, "grad_norm": 52.58990131690406, "learning_rate": 3.602721088435374e-07, "logits/chosen": -4.0313568115234375, "logits/rejected": -5.071317672729492, "logps/chosen": -1.0978147983551025, "logps/rejected": -0.9905543923377991, "loss": 4.6989, "rewards/accuracies": 0.25, "rewards/chosen": -10.9781494140625, "rewards/margins": -1.0726053714752197, "rewards/rejected": -9.90554428100586, "step": 331 }, { "epoch": 0.04520697167755991, "grad_norm": 51.608049607727914, "learning_rate": 3.6136054421768706e-07, "logits/chosen": -4.0459394454956055, "logits/rejected": -3.6338887214660645, "logps/chosen": -1.3316205739974976, "logps/rejected": -1.599829912185669, "loss": 5.2421, "rewards/accuracies": 0.75, "rewards/chosen": -13.316205978393555, "rewards/margins": 2.682093381881714, "rewards/rejected": -15.998298645019531, "step": 332 }, { "epoch": 0.04534313725490196, "grad_norm": 69.42275051159423, "learning_rate": 3.624489795918367e-07, "logits/chosen": -3.898859739303589, "logits/rejected": -3.7162675857543945, "logps/chosen": -1.6424038410186768, "logps/rejected": -1.857055425643921, "loss": 4.8124, "rewards/accuracies": 0.75, "rewards/chosen": -16.42403793334961, "rewards/margins": 2.1465158462524414, "rewards/rejected": -18.570552825927734, "step": 333 }, { "epoch": 0.04547930283224401, "grad_norm": 53.622499284965485, "learning_rate": 3.635374149659864e-07, "logits/chosen": -4.4776411056518555, "logits/rejected": -4.470993518829346, "logps/chosen": -1.687505841255188, "logps/rejected": -1.3818199634552002, "loss": 4.875, "rewards/accuracies": 0.25, "rewards/chosen": -16.875059127807617, "rewards/margins": -3.0568594932556152, "rewards/rejected": -13.81820011138916, "step": 334 }, { "epoch": 0.04561546840958606, "grad_norm": 59.11941552248355, "learning_rate": 3.64625850340136e-07, "logits/chosen": -5.771968364715576, "logits/rejected": -4.812268257141113, "logps/chosen": -0.8832271099090576, "logps/rejected": -1.2285106182098389, "loss": 4.221, "rewards/accuracies": 1.0, "rewards/chosen": -8.832271575927734, "rewards/margins": 3.452834129333496, "rewards/rejected": -12.28510570526123, "step": 335 }, { "epoch": 0.0457516339869281, "grad_norm": 88.1622514093425, "learning_rate": 3.6571428571428567e-07, "logits/chosen": -4.847055912017822, "logits/rejected": -4.330475330352783, "logps/chosen": -0.914954662322998, "logps/rejected": -1.245988130569458, "loss": 5.2333, "rewards/accuracies": 0.75, "rewards/chosen": -9.149547576904297, "rewards/margins": 3.310335159301758, "rewards/rejected": -12.459882736206055, "step": 336 }, { "epoch": 0.04588779956427015, "grad_norm": 52.001334009645625, "learning_rate": 3.668027210884354e-07, "logits/chosen": -6.903139114379883, "logits/rejected": -5.824688911437988, "logps/chosen": -0.5480578541755676, "logps/rejected": -0.6635133028030396, "loss": 4.3156, "rewards/accuracies": 0.75, "rewards/chosen": -5.480578422546387, "rewards/margins": 1.1545543670654297, "rewards/rejected": -6.635132789611816, "step": 337 }, { "epoch": 0.0460239651416122, "grad_norm": 80.05198967809932, "learning_rate": 3.6789115646258503e-07, "logits/chosen": -4.128746032714844, "logits/rejected": -4.49485969543457, "logps/chosen": -1.1353280544281006, "logps/rejected": -1.1029114723205566, "loss": 5.3823, "rewards/accuracies": 0.5, "rewards/chosen": -11.353281021118164, "rewards/margins": -0.32416510581970215, "rewards/rejected": -11.029115676879883, "step": 338 }, { "epoch": 0.04616013071895425, "grad_norm": 81.94806348695083, "learning_rate": 3.689795918367347e-07, "logits/chosen": -4.851099014282227, "logits/rejected": -4.12558650970459, "logps/chosen": -1.1570490598678589, "logps/rejected": -1.3034899234771729, "loss": 4.7139, "rewards/accuracies": 0.75, "rewards/chosen": -11.570490837097168, "rewards/margins": 1.464409351348877, "rewards/rejected": -13.034900665283203, "step": 339 }, { "epoch": 0.046296296296296294, "grad_norm": 103.62559251002375, "learning_rate": 3.7006802721088433e-07, "logits/chosen": -5.333596229553223, "logits/rejected": -4.665900230407715, "logps/chosen": -1.1381239891052246, "logps/rejected": -1.030791997909546, "loss": 5.3047, "rewards/accuracies": 0.5, "rewards/chosen": -11.381240844726562, "rewards/margins": -1.0733213424682617, "rewards/rejected": -10.3079195022583, "step": 340 }, { "epoch": 0.046432461873638343, "grad_norm": 52.47840897312059, "learning_rate": 3.71156462585034e-07, "logits/chosen": -3.387547016143799, "logits/rejected": -4.031194686889648, "logps/chosen": -1.4097591638565063, "logps/rejected": -1.5441861152648926, "loss": 4.533, "rewards/accuracies": 0.5, "rewards/chosen": -14.097591400146484, "rewards/margins": 1.3442699909210205, "rewards/rejected": -15.441862106323242, "step": 341 }, { "epoch": 0.04656862745098039, "grad_norm": 64.26360427992691, "learning_rate": 3.7224489795918364e-07, "logits/chosen": -4.687727928161621, "logits/rejected": -3.7767446041107178, "logps/chosen": -1.1348202228546143, "logps/rejected": -1.4387342929840088, "loss": 4.6277, "rewards/accuracies": 1.0, "rewards/chosen": -11.348201751708984, "rewards/margins": 3.0391411781311035, "rewards/rejected": -14.387343406677246, "step": 342 }, { "epoch": 0.04670479302832244, "grad_norm": 47.44529818345173, "learning_rate": 3.7333333333333334e-07, "logits/chosen": -3.479304313659668, "logits/rejected": -3.206601858139038, "logps/chosen": -1.4744844436645508, "logps/rejected": -1.1833782196044922, "loss": 4.8163, "rewards/accuracies": 0.25, "rewards/chosen": -14.744844436645508, "rewards/margins": -2.911062240600586, "rewards/rejected": -11.833782196044922, "step": 343 }, { "epoch": 0.046840958605664486, "grad_norm": 64.26531673086846, "learning_rate": 3.7442176870748294e-07, "logits/chosen": -3.7847156524658203, "logits/rejected": -4.481882095336914, "logps/chosen": -1.8097883462905884, "logps/rejected": -1.3306057453155518, "loss": 4.6497, "rewards/accuracies": 0.25, "rewards/chosen": -18.097885131835938, "rewards/margins": -4.791826248168945, "rewards/rejected": -13.306057929992676, "step": 344 }, { "epoch": 0.046977124183006536, "grad_norm": 72.58462310643739, "learning_rate": 3.7551020408163265e-07, "logits/chosen": -5.742620468139648, "logits/rejected": -3.8809614181518555, "logps/chosen": -0.7879772186279297, "logps/rejected": -1.2834994792938232, "loss": 4.6548, "rewards/accuracies": 1.0, "rewards/chosen": -7.879772186279297, "rewards/margins": 4.955223083496094, "rewards/rejected": -12.83499526977539, "step": 345 }, { "epoch": 0.047113289760348585, "grad_norm": 75.7942208749751, "learning_rate": 3.765986394557823e-07, "logits/chosen": -5.122191905975342, "logits/rejected": -3.5494017601013184, "logps/chosen": -1.184884786605835, "logps/rejected": -1.0418821573257446, "loss": 4.7957, "rewards/accuracies": 0.5, "rewards/chosen": -11.848846435546875, "rewards/margins": -1.430025339126587, "rewards/rejected": -10.418821334838867, "step": 346 }, { "epoch": 0.047249455337690635, "grad_norm": 84.8430919001219, "learning_rate": 3.7768707482993195e-07, "logits/chosen": -4.72030782699585, "logits/rejected": -4.733547210693359, "logps/chosen": -1.1555423736572266, "logps/rejected": -1.5096678733825684, "loss": 4.648, "rewards/accuracies": 0.5, "rewards/chosen": -11.55542278289795, "rewards/margins": 3.5412545204162598, "rewards/rejected": -15.096677780151367, "step": 347 }, { "epoch": 0.04738562091503268, "grad_norm": 61.499147763932804, "learning_rate": 3.787755102040816e-07, "logits/chosen": -4.381627082824707, "logits/rejected": -3.8670363426208496, "logps/chosen": -0.9360236525535583, "logps/rejected": -1.106676697731018, "loss": 4.1167, "rewards/accuracies": 0.75, "rewards/chosen": -9.360236167907715, "rewards/margins": 1.7065308094024658, "rewards/rejected": -11.066766738891602, "step": 348 }, { "epoch": 0.04752178649237473, "grad_norm": 70.53092724254896, "learning_rate": 3.7986394557823126e-07, "logits/chosen": -2.729912519454956, "logits/rejected": -4.400382995605469, "logps/chosen": -1.1213295459747314, "logps/rejected": -2.0715765953063965, "loss": 4.9453, "rewards/accuracies": 0.75, "rewards/chosen": -11.213296890258789, "rewards/margins": 9.502470016479492, "rewards/rejected": -20.71576690673828, "step": 349 }, { "epoch": 0.04765795206971678, "grad_norm": 60.41701033824524, "learning_rate": 3.809523809523809e-07, "logits/chosen": -6.25025749206543, "logits/rejected": -5.233442306518555, "logps/chosen": -1.1553184986114502, "logps/rejected": -1.1237400770187378, "loss": 5.2371, "rewards/accuracies": 0.5, "rewards/chosen": -11.553184509277344, "rewards/margins": -0.3157843351364136, "rewards/rejected": -11.23740005493164, "step": 350 }, { "epoch": 0.04779411764705882, "grad_norm": 39.688776218157535, "learning_rate": 3.820408163265306e-07, "logits/chosen": -3.9296278953552246, "logits/rejected": -3.9206624031066895, "logps/chosen": -1.246860146522522, "logps/rejected": -1.5343701839447021, "loss": 4.0967, "rewards/accuracies": 0.5, "rewards/chosen": -12.468602180480957, "rewards/margins": 2.8751001358032227, "rewards/rejected": -15.34370231628418, "step": 351 }, { "epoch": 0.04793028322440087, "grad_norm": 91.18671826620812, "learning_rate": 3.8312925170068026e-07, "logits/chosen": -3.2646350860595703, "logits/rejected": -2.839343309402466, "logps/chosen": -1.4661448001861572, "logps/rejected": -1.6291890144348145, "loss": 4.7386, "rewards/accuracies": 0.5, "rewards/chosen": -14.661447525024414, "rewards/margins": 1.6304430961608887, "rewards/rejected": -16.29189109802246, "step": 352 }, { "epoch": 0.04806644880174292, "grad_norm": 99.64857481042004, "learning_rate": 3.842176870748299e-07, "logits/chosen": -5.036593437194824, "logits/rejected": -4.73375129699707, "logps/chosen": -1.1730133295059204, "logps/rejected": -1.1294384002685547, "loss": 4.4026, "rewards/accuracies": 0.5, "rewards/chosen": -11.730133056640625, "rewards/margins": -0.4357491731643677, "rewards/rejected": -11.294384002685547, "step": 353 }, { "epoch": 0.04820261437908497, "grad_norm": 70.43635347849056, "learning_rate": 3.8530612244897957e-07, "logits/chosen": -5.037467956542969, "logits/rejected": -3.364009141921997, "logps/chosen": -1.0652697086334229, "logps/rejected": -1.746490240097046, "loss": 4.5528, "rewards/accuracies": 1.0, "rewards/chosen": -10.652698516845703, "rewards/margins": 6.812203407287598, "rewards/rejected": -17.464900970458984, "step": 354 }, { "epoch": 0.04833877995642701, "grad_norm": 59.65530580634134, "learning_rate": 3.863945578231292e-07, "logits/chosen": -4.721744537353516, "logits/rejected": -4.3982648849487305, "logps/chosen": -1.0750972032546997, "logps/rejected": -1.1355454921722412, "loss": 5.0939, "rewards/accuracies": 0.75, "rewards/chosen": -10.750971794128418, "rewards/margins": 0.6044834852218628, "rewards/rejected": -11.35545539855957, "step": 355 }, { "epoch": 0.04847494553376906, "grad_norm": 46.770668291367954, "learning_rate": 3.8748299319727893e-07, "logits/chosen": -5.162968635559082, "logits/rejected": -5.444433689117432, "logps/chosen": -0.8656356334686279, "logps/rejected": -0.8235973119735718, "loss": 4.9787, "rewards/accuracies": 0.5, "rewards/chosen": -8.656355857849121, "rewards/margins": -0.4203832149505615, "rewards/rejected": -8.235973358154297, "step": 356 }, { "epoch": 0.04861111111111111, "grad_norm": 66.49217632232175, "learning_rate": 3.8857142857142853e-07, "logits/chosen": -4.034242153167725, "logits/rejected": -3.3635754585266113, "logps/chosen": -1.3328701257705688, "logps/rejected": -1.2513222694396973, "loss": 4.941, "rewards/accuracies": 0.5, "rewards/chosen": -13.32870101928711, "rewards/margins": -0.8154784440994263, "rewards/rejected": -12.513222694396973, "step": 357 }, { "epoch": 0.04874727668845316, "grad_norm": 50.072997862043174, "learning_rate": 3.896598639455782e-07, "logits/chosen": -2.774782657623291, "logits/rejected": -2.750211000442505, "logps/chosen": -1.333456039428711, "logps/rejected": -2.016745090484619, "loss": 4.3693, "rewards/accuracies": 0.5, "rewards/chosen": -13.33456039428711, "rewards/margins": 6.832892417907715, "rewards/rejected": -20.167451858520508, "step": 358 }, { "epoch": 0.048883442265795204, "grad_norm": 48.626430561867856, "learning_rate": 3.907482993197279e-07, "logits/chosen": -4.561434745788574, "logits/rejected": -3.3318333625793457, "logps/chosen": -1.0798548460006714, "logps/rejected": -1.0834531784057617, "loss": 4.0742, "rewards/accuracies": 0.5, "rewards/chosen": -10.79854965209961, "rewards/margins": 0.03598320484161377, "rewards/rejected": -10.834531784057617, "step": 359 }, { "epoch": 0.049019607843137254, "grad_norm": 71.1432026917616, "learning_rate": 3.9183673469387754e-07, "logits/chosen": -3.937601089477539, "logits/rejected": -5.394858360290527, "logps/chosen": -1.0354994535446167, "logps/rejected": -0.7686932682991028, "loss": 4.7626, "rewards/accuracies": 0.25, "rewards/chosen": -10.354995727539062, "rewards/margins": -2.668062686920166, "rewards/rejected": -7.68693208694458, "step": 360 }, { "epoch": 0.049155773420479304, "grad_norm": 57.589488422016544, "learning_rate": 3.929251700680272e-07, "logits/chosen": -4.250298023223877, "logits/rejected": -3.810303211212158, "logps/chosen": -0.9952207207679749, "logps/rejected": -1.0889822244644165, "loss": 4.7041, "rewards/accuracies": 0.75, "rewards/chosen": -9.952207565307617, "rewards/margins": 0.937615156173706, "rewards/rejected": -10.889822006225586, "step": 361 }, { "epoch": 0.049291938997821354, "grad_norm": 63.333065680523944, "learning_rate": 3.9401360544217684e-07, "logits/chosen": -5.630011558532715, "logits/rejected": -5.683361530303955, "logps/chosen": -1.3218655586242676, "logps/rejected": -0.7940860986709595, "loss": 4.863, "rewards/accuracies": 0.0, "rewards/chosen": -13.218655586242676, "rewards/margins": -5.27779483795166, "rewards/rejected": -7.940860748291016, "step": 362 }, { "epoch": 0.0494281045751634, "grad_norm": 49.946540758737406, "learning_rate": 3.951020408163265e-07, "logits/chosen": -4.430416107177734, "logits/rejected": -4.594329357147217, "logps/chosen": -1.14668869972229, "logps/rejected": -1.1622904539108276, "loss": 4.715, "rewards/accuracies": 0.25, "rewards/chosen": -11.466887474060059, "rewards/margins": 0.15601706504821777, "rewards/rejected": -11.622903823852539, "step": 363 }, { "epoch": 0.049564270152505446, "grad_norm": 75.65677594959018, "learning_rate": 3.961904761904762e-07, "logits/chosen": -4.715834617614746, "logits/rejected": -4.765722274780273, "logps/chosen": -2.1950907707214355, "logps/rejected": -1.3353166580200195, "loss": 5.1503, "rewards/accuracies": 0.25, "rewards/chosen": -21.950904846191406, "rewards/margins": -8.597740173339844, "rewards/rejected": -13.353166580200195, "step": 364 }, { "epoch": 0.049700435729847496, "grad_norm": 59.49474022139297, "learning_rate": 3.9727891156462585e-07, "logits/chosen": -4.238061428070068, "logits/rejected": -3.64263916015625, "logps/chosen": -1.2017842531204224, "logps/rejected": -1.4413776397705078, "loss": 4.5683, "rewards/accuracies": 0.75, "rewards/chosen": -12.017842292785645, "rewards/margins": 2.3959343433380127, "rewards/rejected": -14.413776397705078, "step": 365 }, { "epoch": 0.049836601307189546, "grad_norm": 61.31130797899487, "learning_rate": 3.9836734693877545e-07, "logits/chosen": -4.462696075439453, "logits/rejected": -2.9877586364746094, "logps/chosen": -0.9624910354614258, "logps/rejected": -1.4527757167816162, "loss": 4.2574, "rewards/accuracies": 1.0, "rewards/chosen": -9.624910354614258, "rewards/margins": 4.902846336364746, "rewards/rejected": -14.52775764465332, "step": 366 }, { "epoch": 0.04997276688453159, "grad_norm": 35.89373304338028, "learning_rate": 3.9945578231292516e-07, "logits/chosen": -5.073393821716309, "logits/rejected": -4.809358596801758, "logps/chosen": -0.9978084564208984, "logps/rejected": -1.0534707307815552, "loss": 4.8495, "rewards/accuracies": 0.5, "rewards/chosen": -9.978084564208984, "rewards/margins": 0.556623101234436, "rewards/rejected": -10.534708023071289, "step": 367 }, { "epoch": 0.05010893246187364, "grad_norm": 70.47804626430614, "learning_rate": 4.0054421768707486e-07, "logits/chosen": -4.609557151794434, "logits/rejected": -3.538032293319702, "logps/chosen": -1.2886557579040527, "logps/rejected": -1.4615461826324463, "loss": 4.8944, "rewards/accuracies": 0.5, "rewards/chosen": -12.886558532714844, "rewards/margins": 1.7289037704467773, "rewards/rejected": -14.615461349487305, "step": 368 }, { "epoch": 0.05024509803921569, "grad_norm": 62.440594736931246, "learning_rate": 4.016326530612245e-07, "logits/chosen": -5.322005271911621, "logits/rejected": -4.109156608581543, "logps/chosen": -1.0093988180160522, "logps/rejected": -1.3403376340866089, "loss": 5.0942, "rewards/accuracies": 0.75, "rewards/chosen": -10.093987464904785, "rewards/margins": 3.309389114379883, "rewards/rejected": -13.403376579284668, "step": 369 }, { "epoch": 0.05038126361655773, "grad_norm": 48.339595778900176, "learning_rate": 4.027210884353741e-07, "logits/chosen": -4.733573913574219, "logits/rejected": -4.270490646362305, "logps/chosen": -1.0108896493911743, "logps/rejected": -1.106526255607605, "loss": 4.6791, "rewards/accuracies": 0.25, "rewards/chosen": -10.108896255493164, "rewards/margins": 0.9563665390014648, "rewards/rejected": -11.065262794494629, "step": 370 }, { "epoch": 0.05051742919389978, "grad_norm": 64.31593728825284, "learning_rate": 4.0380952380952377e-07, "logits/chosen": -4.218809127807617, "logits/rejected": -3.940826177597046, "logps/chosen": -0.7416388988494873, "logps/rejected": -0.8213395476341248, "loss": 5.1248, "rewards/accuracies": 0.75, "rewards/chosen": -7.416388511657715, "rewards/margins": 0.7970068454742432, "rewards/rejected": -8.213395118713379, "step": 371 }, { "epoch": 0.05065359477124183, "grad_norm": 50.62888602434962, "learning_rate": 4.048979591836734e-07, "logits/chosen": -4.656181335449219, "logits/rejected": -3.1050233840942383, "logps/chosen": -1.013460636138916, "logps/rejected": -1.09527587890625, "loss": 4.3055, "rewards/accuracies": 0.25, "rewards/chosen": -10.13460636138916, "rewards/margins": 0.8181521892547607, "rewards/rejected": -10.9527587890625, "step": 372 }, { "epoch": 0.05078976034858388, "grad_norm": 50.01530705316231, "learning_rate": 4.059863945578232e-07, "logits/chosen": -4.433231830596924, "logits/rejected": -5.109742164611816, "logps/chosen": -1.093780517578125, "logps/rejected": -0.8204267024993896, "loss": 3.7494, "rewards/accuracies": 0.25, "rewards/chosen": -10.93780517578125, "rewards/margins": -2.7335383892059326, "rewards/rejected": -8.204266548156738, "step": 373 }, { "epoch": 0.05092592592592592, "grad_norm": 49.319680041038744, "learning_rate": 4.070748299319728e-07, "logits/chosen": -5.030124664306641, "logits/rejected": -4.256789207458496, "logps/chosen": -1.061401128768921, "logps/rejected": -1.315832257270813, "loss": 4.6776, "rewards/accuracies": 0.75, "rewards/chosen": -10.614011764526367, "rewards/margins": 2.544311285018921, "rewards/rejected": -13.158323287963867, "step": 374 }, { "epoch": 0.05106209150326797, "grad_norm": 43.68441813473087, "learning_rate": 4.0816326530612243e-07, "logits/chosen": -5.896186828613281, "logits/rejected": -4.19079065322876, "logps/chosen": -0.7971887588500977, "logps/rejected": -0.9658806324005127, "loss": 4.7059, "rewards/accuracies": 0.75, "rewards/chosen": -7.971887588500977, "rewards/margins": 1.6869187355041504, "rewards/rejected": -9.658805847167969, "step": 375 }, { "epoch": 0.05119825708061002, "grad_norm": 52.760022756046794, "learning_rate": 4.092517006802721e-07, "logits/chosen": -4.487117767333984, "logits/rejected": -6.3478593826293945, "logps/chosen": -1.1551398038864136, "logps/rejected": -1.0892930030822754, "loss": 4.5358, "rewards/accuracies": 0.25, "rewards/chosen": -11.551397323608398, "rewards/margins": -0.6584687232971191, "rewards/rejected": -10.892929077148438, "step": 376 }, { "epoch": 0.05133442265795207, "grad_norm": 59.55413225205095, "learning_rate": 4.1034013605442173e-07, "logits/chosen": -3.759427547454834, "logits/rejected": -5.074109077453613, "logps/chosen": -1.3292748928070068, "logps/rejected": -1.1516237258911133, "loss": 4.7997, "rewards/accuracies": 0.5, "rewards/chosen": -13.292749404907227, "rewards/margins": -1.7765127420425415, "rewards/rejected": -11.516237258911133, "step": 377 }, { "epoch": 0.051470588235294115, "grad_norm": 50.03941503449033, "learning_rate": 4.114285714285714e-07, "logits/chosen": -4.392102241516113, "logits/rejected": -4.103697299957275, "logps/chosen": -1.0755290985107422, "logps/rejected": -1.002875804901123, "loss": 4.898, "rewards/accuracies": 0.25, "rewards/chosen": -10.755290985107422, "rewards/margins": -0.7265328168869019, "rewards/rejected": -10.028759002685547, "step": 378 }, { "epoch": 0.051606753812636165, "grad_norm": 58.57907231887316, "learning_rate": 4.125170068027211e-07, "logits/chosen": -5.784555435180664, "logits/rejected": -4.2676873207092285, "logps/chosen": -1.0702967643737793, "logps/rejected": -1.447671890258789, "loss": 5.0389, "rewards/accuracies": 1.0, "rewards/chosen": -10.702967643737793, "rewards/margins": 3.7737512588500977, "rewards/rejected": -14.47671890258789, "step": 379 }, { "epoch": 0.051742919389978215, "grad_norm": 56.84761649791471, "learning_rate": 4.1360544217687074e-07, "logits/chosen": -4.891299247741699, "logits/rejected": -4.225244998931885, "logps/chosen": -0.942592203617096, "logps/rejected": -1.2216744422912598, "loss": 4.9448, "rewards/accuracies": 0.75, "rewards/chosen": -9.425922393798828, "rewards/margins": 2.790821075439453, "rewards/rejected": -12.216743469238281, "step": 380 }, { "epoch": 0.051879084967320264, "grad_norm": 39.86424248709267, "learning_rate": 4.146938775510204e-07, "logits/chosen": -3.847580909729004, "logits/rejected": -4.6515302658081055, "logps/chosen": -1.4245232343673706, "logps/rejected": -1.3031435012817383, "loss": 4.1825, "rewards/accuracies": 0.25, "rewards/chosen": -14.245231628417969, "rewards/margins": -1.2137963771820068, "rewards/rejected": -13.031435012817383, "step": 381 }, { "epoch": 0.05201525054466231, "grad_norm": 65.88429283558246, "learning_rate": 4.1578231292517005e-07, "logits/chosen": -4.4065656661987305, "logits/rejected": -3.639303207397461, "logps/chosen": -1.1241507530212402, "logps/rejected": -1.0169777870178223, "loss": 4.8693, "rewards/accuracies": 0.25, "rewards/chosen": -11.241507530212402, "rewards/margins": -1.0717291831970215, "rewards/rejected": -10.169777870178223, "step": 382 }, { "epoch": 0.05215141612200436, "grad_norm": 63.549812385286906, "learning_rate": 4.168707482993197e-07, "logits/chosen": -4.33826208114624, "logits/rejected": -3.186058282852173, "logps/chosen": -1.1618280410766602, "logps/rejected": -1.484189510345459, "loss": 4.7735, "rewards/accuracies": 0.75, "rewards/chosen": -11.618281364440918, "rewards/margins": 3.223614454269409, "rewards/rejected": -14.84189510345459, "step": 383 }, { "epoch": 0.05228758169934641, "grad_norm": 43.18810535380108, "learning_rate": 4.179591836734694e-07, "logits/chosen": -4.7782697677612305, "logits/rejected": -4.969653129577637, "logps/chosen": -0.9616748094558716, "logps/rejected": -0.7381795644760132, "loss": 4.7153, "rewards/accuracies": 0.25, "rewards/chosen": -9.616748809814453, "rewards/margins": -2.234952926635742, "rewards/rejected": -7.381795406341553, "step": 384 }, { "epoch": 0.05242374727668846, "grad_norm": 47.09229290299882, "learning_rate": 4.1904761904761906e-07, "logits/chosen": -2.751847267150879, "logits/rejected": -3.1571831703186035, "logps/chosen": -1.4500106573104858, "logps/rejected": -1.4934954643249512, "loss": 4.7225, "rewards/accuracies": 0.75, "rewards/chosen": -14.500106811523438, "rewards/margins": 0.4348485469818115, "rewards/rejected": -14.934954643249512, "step": 385 }, { "epoch": 0.0525599128540305, "grad_norm": 86.12620138624588, "learning_rate": 4.201360544217687e-07, "logits/chosen": -3.4587931632995605, "logits/rejected": -3.0349366664886475, "logps/chosen": -1.2382431030273438, "logps/rejected": -1.494152307510376, "loss": 4.4886, "rewards/accuracies": 1.0, "rewards/chosen": -12.382431983947754, "rewards/margins": 2.5590903759002686, "rewards/rejected": -14.941522598266602, "step": 386 }, { "epoch": 0.05269607843137255, "grad_norm": 38.8893282253715, "learning_rate": 4.2122448979591836e-07, "logits/chosen": -3.719339609146118, "logits/rejected": -3.216487407684326, "logps/chosen": -1.2164249420166016, "logps/rejected": -1.5033557415008545, "loss": 4.5824, "rewards/accuracies": 0.75, "rewards/chosen": -12.1642484664917, "rewards/margins": 2.869309186935425, "rewards/rejected": -15.033557891845703, "step": 387 }, { "epoch": 0.0528322440087146, "grad_norm": 48.35433424876795, "learning_rate": 4.2231292517006796e-07, "logits/chosen": -3.389918565750122, "logits/rejected": -2.6068432331085205, "logps/chosen": -1.2881076335906982, "logps/rejected": -1.9020192623138428, "loss": 4.3053, "rewards/accuracies": 1.0, "rewards/chosen": -12.881075859069824, "rewards/margins": 6.1391167640686035, "rewards/rejected": -19.020193099975586, "step": 388 }, { "epoch": 0.05296840958605664, "grad_norm": 50.91057344692353, "learning_rate": 4.234013605442177e-07, "logits/chosen": -4.003451347351074, "logits/rejected": -3.210225820541382, "logps/chosen": -1.6003930568695068, "logps/rejected": -1.6900222301483154, "loss": 4.66, "rewards/accuracies": 0.5, "rewards/chosen": -16.003931045532227, "rewards/margins": 0.896291971206665, "rewards/rejected": -16.900222778320312, "step": 389 }, { "epoch": 0.05310457516339869, "grad_norm": 44.746153392733916, "learning_rate": 4.2448979591836737e-07, "logits/chosen": -4.248198509216309, "logits/rejected": -4.6256232261657715, "logps/chosen": -0.9387425184249878, "logps/rejected": -1.111342191696167, "loss": 4.2304, "rewards/accuracies": 0.25, "rewards/chosen": -9.387425422668457, "rewards/margins": 1.725996494293213, "rewards/rejected": -11.113421440124512, "step": 390 }, { "epoch": 0.05324074074074074, "grad_norm": 56.45366498519587, "learning_rate": 4.25578231292517e-07, "logits/chosen": -3.3149781227111816, "logits/rejected": -4.226804733276367, "logps/chosen": -1.2171587944030762, "logps/rejected": -1.3357594013214111, "loss": 4.8798, "rewards/accuracies": 0.5, "rewards/chosen": -12.171586990356445, "rewards/margins": 1.1860058307647705, "rewards/rejected": -13.357593536376953, "step": 391 }, { "epoch": 0.05337690631808279, "grad_norm": 55.36836792578739, "learning_rate": 4.266666666666666e-07, "logits/chosen": -4.612475395202637, "logits/rejected": -4.072447299957275, "logps/chosen": -1.1316940784454346, "logps/rejected": -1.74626886844635, "loss": 4.3281, "rewards/accuracies": 1.0, "rewards/chosen": -11.316941261291504, "rewards/margins": 6.145747661590576, "rewards/rejected": -17.462688446044922, "step": 392 }, { "epoch": 0.053513071895424834, "grad_norm": 65.47139386378133, "learning_rate": 4.277551020408163e-07, "logits/chosen": -4.157782554626465, "logits/rejected": -3.5690855979919434, "logps/chosen": -1.3194139003753662, "logps/rejected": -1.1529537439346313, "loss": 4.8992, "rewards/accuracies": 0.5, "rewards/chosen": -13.194137573242188, "rewards/margins": -1.6646009683609009, "rewards/rejected": -11.529537200927734, "step": 393 }, { "epoch": 0.053649237472766884, "grad_norm": 57.63009003555696, "learning_rate": 4.2884353741496593e-07, "logits/chosen": -2.6153717041015625, "logits/rejected": -3.171003818511963, "logps/chosen": -1.7021191120147705, "logps/rejected": -1.4870145320892334, "loss": 5.1292, "rewards/accuracies": 0.25, "rewards/chosen": -17.021190643310547, "rewards/margins": -2.151045799255371, "rewards/rejected": -14.870145797729492, "step": 394 }, { "epoch": 0.05378540305010893, "grad_norm": 53.772765354701356, "learning_rate": 4.299319727891157e-07, "logits/chosen": -5.377692699432373, "logits/rejected": -4.610713958740234, "logps/chosen": -1.076084017753601, "logps/rejected": -1.31722891330719, "loss": 4.8871, "rewards/accuracies": 0.75, "rewards/chosen": -10.760839462280273, "rewards/margins": 2.4114491939544678, "rewards/rejected": -13.17228889465332, "step": 395 }, { "epoch": 0.05392156862745098, "grad_norm": 60.45137974302771, "learning_rate": 4.310204081632653e-07, "logits/chosen": -3.5165205001831055, "logits/rejected": -2.8365631103515625, "logps/chosen": -1.6021865606307983, "logps/rejected": -1.4897844791412354, "loss": 5.1689, "rewards/accuracies": 0.75, "rewards/chosen": -16.021865844726562, "rewards/margins": -1.1240203380584717, "rewards/rejected": -14.897844314575195, "step": 396 }, { "epoch": 0.054057734204793026, "grad_norm": 61.94147790998974, "learning_rate": 4.3210884353741494e-07, "logits/chosen": -4.007455348968506, "logits/rejected": -3.336472511291504, "logps/chosen": -1.0130470991134644, "logps/rejected": -1.2166389226913452, "loss": 5.0902, "rewards/accuracies": 0.75, "rewards/chosen": -10.130471229553223, "rewards/margins": 2.0359179973602295, "rewards/rejected": -12.166389465332031, "step": 397 }, { "epoch": 0.054193899782135076, "grad_norm": 42.11986098676664, "learning_rate": 4.331972789115646e-07, "logits/chosen": -4.578699111938477, "logits/rejected": -4.8498311042785645, "logps/chosen": -0.8790339231491089, "logps/rejected": -0.9052631855010986, "loss": 4.3585, "rewards/accuracies": 0.5, "rewards/chosen": -8.790338516235352, "rewards/margins": 0.2622934579849243, "rewards/rejected": -9.052632331848145, "step": 398 }, { "epoch": 0.054330065359477125, "grad_norm": 49.2787181808803, "learning_rate": 4.3428571428571424e-07, "logits/chosen": -3.4915337562561035, "logits/rejected": -3.350977897644043, "logps/chosen": -1.5776159763336182, "logps/rejected": -1.8437435626983643, "loss": 4.6383, "rewards/accuracies": 0.5, "rewards/chosen": -15.77616024017334, "rewards/margins": 2.6612753868103027, "rewards/rejected": -18.437435150146484, "step": 399 }, { "epoch": 0.054466230936819175, "grad_norm": 52.92816465153872, "learning_rate": 4.3537414965986395e-07, "logits/chosen": -3.761528968811035, "logits/rejected": -4.591896057128906, "logps/chosen": -0.9632505178451538, "logps/rejected": -1.0243831872940063, "loss": 5.3516, "rewards/accuracies": 0.5, "rewards/chosen": -9.632505416870117, "rewards/margins": 0.6113263368606567, "rewards/rejected": -10.243831634521484, "step": 400 }, { "epoch": 0.05460239651416122, "grad_norm": 45.68669367097855, "learning_rate": 4.364625850340136e-07, "logits/chosen": -4.95993709564209, "logits/rejected": -3.418145179748535, "logps/chosen": -0.9764912128448486, "logps/rejected": -1.1860324144363403, "loss": 4.5043, "rewards/accuracies": 0.75, "rewards/chosen": -9.764911651611328, "rewards/margins": 2.095412492752075, "rewards/rejected": -11.860323905944824, "step": 401 }, { "epoch": 0.05473856209150327, "grad_norm": 61.88048242463245, "learning_rate": 4.3755102040816325e-07, "logits/chosen": -3.989450216293335, "logits/rejected": -2.737484931945801, "logps/chosen": -1.0540305376052856, "logps/rejected": -1.4780402183532715, "loss": 4.3024, "rewards/accuracies": 1.0, "rewards/chosen": -10.540304183959961, "rewards/margins": 4.2400970458984375, "rewards/rejected": -14.780402183532715, "step": 402 }, { "epoch": 0.05487472766884532, "grad_norm": 44.8862226916202, "learning_rate": 4.386394557823129e-07, "logits/chosen": -3.970062732696533, "logits/rejected": -4.168905258178711, "logps/chosen": -0.8776034712791443, "logps/rejected": -0.9131577610969543, "loss": 3.8417, "rewards/accuracies": 0.25, "rewards/chosen": -8.776034355163574, "rewards/margins": 0.3555431365966797, "rewards/rejected": -9.13157844543457, "step": 403 }, { "epoch": 0.05501089324618736, "grad_norm": 50.74687637635371, "learning_rate": 4.3972789115646256e-07, "logits/chosen": -4.007328510284424, "logits/rejected": -4.83897066116333, "logps/chosen": -0.7331565618515015, "logps/rejected": -0.6233339309692383, "loss": 4.7121, "rewards/accuracies": 0.25, "rewards/chosen": -7.331565856933594, "rewards/margins": -1.09822678565979, "rewards/rejected": -6.233338832855225, "step": 404 }, { "epoch": 0.05514705882352941, "grad_norm": 46.81590078438447, "learning_rate": 4.4081632653061216e-07, "logits/chosen": -3.5568270683288574, "logits/rejected": -2.8460776805877686, "logps/chosen": -0.8345403671264648, "logps/rejected": -1.0704164505004883, "loss": 5.2342, "rewards/accuracies": 0.75, "rewards/chosen": -8.345403671264648, "rewards/margins": 2.358760118484497, "rewards/rejected": -10.704164505004883, "step": 405 }, { "epoch": 0.05528322440087146, "grad_norm": 58.56844435701675, "learning_rate": 4.419047619047619e-07, "logits/chosen": -4.017667293548584, "logits/rejected": -1.9660165309906006, "logps/chosen": -0.7921370267868042, "logps/rejected": -2.1258630752563477, "loss": 4.9128, "rewards/accuracies": 1.0, "rewards/chosen": -7.921370506286621, "rewards/margins": 13.337261199951172, "rewards/rejected": -21.258630752563477, "step": 406 }, { "epoch": 0.05541938997821351, "grad_norm": 55.490557231435055, "learning_rate": 4.4299319727891157e-07, "logits/chosen": -3.3159987926483154, "logits/rejected": -3.43422269821167, "logps/chosen": -0.9569330215454102, "logps/rejected": -1.2971683740615845, "loss": 5.2842, "rewards/accuracies": 0.5, "rewards/chosen": -9.569331169128418, "rewards/margins": 3.402353525161743, "rewards/rejected": -12.971684455871582, "step": 407 }, { "epoch": 0.05555555555555555, "grad_norm": 42.9670271230426, "learning_rate": 4.440816326530612e-07, "logits/chosen": -3.3089730739593506, "logits/rejected": -3.762237071990967, "logps/chosen": -0.8744107484817505, "logps/rejected": -1.3445937633514404, "loss": 4.8553, "rewards/accuracies": 0.5, "rewards/chosen": -8.744108200073242, "rewards/margins": 4.70182991027832, "rewards/rejected": -13.445937156677246, "step": 408 }, { "epoch": 0.0556917211328976, "grad_norm": 60.44621646457349, "learning_rate": 4.451700680272108e-07, "logits/chosen": -3.933250904083252, "logits/rejected": -3.399423837661743, "logps/chosen": -1.0699964761734009, "logps/rejected": -1.0447535514831543, "loss": 4.8824, "rewards/accuracies": 0.25, "rewards/chosen": -10.69996452331543, "rewards/margins": -0.2524292469024658, "rewards/rejected": -10.447535514831543, "step": 409 }, { "epoch": 0.05582788671023965, "grad_norm": 69.99590233280637, "learning_rate": 4.4625850340136047e-07, "logits/chosen": -4.385718822479248, "logits/rejected": -4.861325740814209, "logps/chosen": -1.31895112991333, "logps/rejected": -1.0323377847671509, "loss": 4.8869, "rewards/accuracies": 0.0, "rewards/chosen": -13.189512252807617, "rewards/margins": -2.866133689880371, "rewards/rejected": -10.323378562927246, "step": 410 }, { "epoch": 0.0559640522875817, "grad_norm": 48.5989541910103, "learning_rate": 4.4734693877551023e-07, "logits/chosen": -2.8287084102630615, "logits/rejected": -2.548316717147827, "logps/chosen": -1.1642886400222778, "logps/rejected": -1.5006481409072876, "loss": 4.738, "rewards/accuracies": 0.75, "rewards/chosen": -11.642887115478516, "rewards/margins": 3.3635945320129395, "rewards/rejected": -15.006481170654297, "step": 411 }, { "epoch": 0.056100217864923745, "grad_norm": 63.727842686277626, "learning_rate": 4.484353741496599e-07, "logits/chosen": -4.365447998046875, "logits/rejected": -3.498565673828125, "logps/chosen": -0.9711335897445679, "logps/rejected": -0.8645371198654175, "loss": 4.6315, "rewards/accuracies": 0.25, "rewards/chosen": -9.711336135864258, "rewards/margins": -1.0659648180007935, "rewards/rejected": -8.645370483398438, "step": 412 }, { "epoch": 0.056236383442265794, "grad_norm": 44.7639665818356, "learning_rate": 4.495238095238095e-07, "logits/chosen": -3.38047456741333, "logits/rejected": -3.778137683868408, "logps/chosen": -1.2060441970825195, "logps/rejected": -1.2078670263290405, "loss": 4.4809, "rewards/accuracies": 0.5, "rewards/chosen": -12.060441017150879, "rewards/margins": 0.018228888511657715, "rewards/rejected": -12.078670501708984, "step": 413 }, { "epoch": 0.056372549019607844, "grad_norm": 50.34038629148576, "learning_rate": 4.5061224489795913e-07, "logits/chosen": -3.9107115268707275, "logits/rejected": -3.606355667114258, "logps/chosen": -1.11201012134552, "logps/rejected": -1.0204873085021973, "loss": 4.327, "rewards/accuracies": 0.25, "rewards/chosen": -11.120100975036621, "rewards/margins": -0.9152282476425171, "rewards/rejected": -10.204872131347656, "step": 414 }, { "epoch": 0.056508714596949894, "grad_norm": 59.891614677366846, "learning_rate": 4.517006802721088e-07, "logits/chosen": -3.925548553466797, "logits/rejected": -3.6692562103271484, "logps/chosen": -1.0378334522247314, "logps/rejected": -1.129392385482788, "loss": 4.3964, "rewards/accuracies": 0.5, "rewards/chosen": -10.378334045410156, "rewards/margins": 0.9155896902084351, "rewards/rejected": -11.293924331665039, "step": 415 }, { "epoch": 0.05664488017429194, "grad_norm": 53.506946702027584, "learning_rate": 4.5278911564625854e-07, "logits/chosen": -4.348017692565918, "logits/rejected": -4.019487380981445, "logps/chosen": -1.0946455001831055, "logps/rejected": -0.9418354034423828, "loss": 4.4053, "rewards/accuracies": 0.5, "rewards/chosen": -10.946455001831055, "rewards/margins": -1.5281000137329102, "rewards/rejected": -9.418354034423828, "step": 416 }, { "epoch": 0.056781045751633986, "grad_norm": 45.855944218796346, "learning_rate": 4.5387755102040814e-07, "logits/chosen": -2.7912869453430176, "logits/rejected": -3.9245877265930176, "logps/chosen": -1.4037315845489502, "logps/rejected": -1.0789549350738525, "loss": 4.8322, "rewards/accuracies": 0.0, "rewards/chosen": -14.037315368652344, "rewards/margins": -3.2477667331695557, "rewards/rejected": -10.789548873901367, "step": 417 }, { "epoch": 0.056917211328976036, "grad_norm": 53.15335621757715, "learning_rate": 4.549659863945578e-07, "logits/chosen": -3.7729601860046387, "logits/rejected": -3.731234312057495, "logps/chosen": -1.2479910850524902, "logps/rejected": -1.5364620685577393, "loss": 4.685, "rewards/accuracies": 0.75, "rewards/chosen": -12.479910850524902, "rewards/margins": 2.884709596633911, "rewards/rejected": -15.364620208740234, "step": 418 }, { "epoch": 0.057053376906318086, "grad_norm": 44.23034612914322, "learning_rate": 4.5605442176870745e-07, "logits/chosen": -2.6709446907043457, "logits/rejected": -3.309422492980957, "logps/chosen": -1.3592281341552734, "logps/rejected": -1.4277238845825195, "loss": 4.0537, "rewards/accuracies": 0.5, "rewards/chosen": -13.592281341552734, "rewards/margins": 0.6849575042724609, "rewards/rejected": -14.277238845825195, "step": 419 }, { "epoch": 0.05718954248366013, "grad_norm": 46.14710204673767, "learning_rate": 4.571428571428571e-07, "logits/chosen": -3.9485347270965576, "logits/rejected": -4.034809112548828, "logps/chosen": -1.2876791954040527, "logps/rejected": -0.9872628450393677, "loss": 5.019, "rewards/accuracies": 0.25, "rewards/chosen": -12.876791000366211, "rewards/margins": -3.0041627883911133, "rewards/rejected": -9.872628211975098, "step": 420 }, { "epoch": 0.05732570806100218, "grad_norm": 87.55713279833216, "learning_rate": 4.5823129251700675e-07, "logits/chosen": -4.348410606384277, "logits/rejected": -4.340848922729492, "logps/chosen": -1.6238884925842285, "logps/rejected": -1.380614995956421, "loss": 5.5685, "rewards/accuracies": 0.5, "rewards/chosen": -16.23888397216797, "rewards/margins": -2.432734966278076, "rewards/rejected": -13.80614948272705, "step": 421 }, { "epoch": 0.05746187363834423, "grad_norm": 50.677974000199214, "learning_rate": 4.5931972789115646e-07, "logits/chosen": -4.455499649047852, "logits/rejected": -5.983485221862793, "logps/chosen": -1.112761378288269, "logps/rejected": -0.8814716339111328, "loss": 4.6696, "rewards/accuracies": 0.25, "rewards/chosen": -11.127613067626953, "rewards/margins": -2.3128976821899414, "rewards/rejected": -8.814716339111328, "step": 422 }, { "epoch": 0.05759803921568627, "grad_norm": 57.866129051929605, "learning_rate": 4.604081632653061e-07, "logits/chosen": -2.8621087074279785, "logits/rejected": -3.2761898040771484, "logps/chosen": -1.648512601852417, "logps/rejected": -1.3260743618011475, "loss": 4.4992, "rewards/accuracies": 0.25, "rewards/chosen": -16.485126495361328, "rewards/margins": -3.2243828773498535, "rewards/rejected": -13.260743141174316, "step": 423 }, { "epoch": 0.05773420479302832, "grad_norm": 73.70093797528712, "learning_rate": 4.6149659863945576e-07, "logits/chosen": -3.3768510818481445, "logits/rejected": -3.6393320560455322, "logps/chosen": -1.107578992843628, "logps/rejected": -1.1483232975006104, "loss": 4.5815, "rewards/accuracies": 0.5, "rewards/chosen": -11.075788497924805, "rewards/margins": 0.4074440002441406, "rewards/rejected": -11.483232498168945, "step": 424 }, { "epoch": 0.05787037037037037, "grad_norm": 60.92409832826714, "learning_rate": 4.625850340136054e-07, "logits/chosen": -4.6054840087890625, "logits/rejected": -2.7485876083374023, "logps/chosen": -0.8664693236351013, "logps/rejected": -1.6634843349456787, "loss": 4.8066, "rewards/accuracies": 1.0, "rewards/chosen": -8.664692878723145, "rewards/margins": 7.970150947570801, "rewards/rejected": -16.634843826293945, "step": 425 }, { "epoch": 0.05800653594771242, "grad_norm": 46.6272340091888, "learning_rate": 4.6367346938775507e-07, "logits/chosen": -3.728294849395752, "logits/rejected": -3.700622081756592, "logps/chosen": -1.316657543182373, "logps/rejected": -1.0281788110733032, "loss": 4.358, "rewards/accuracies": 0.5, "rewards/chosen": -13.16657543182373, "rewards/margins": -2.884787082672119, "rewards/rejected": -10.281787872314453, "step": 426 }, { "epoch": 0.05814270152505446, "grad_norm": 47.414764438906474, "learning_rate": 4.6476190476190477e-07, "logits/chosen": -3.9443864822387695, "logits/rejected": -3.8702392578125, "logps/chosen": -0.7735919952392578, "logps/rejected": -0.8687381744384766, "loss": 4.4073, "rewards/accuracies": 0.25, "rewards/chosen": -7.735919952392578, "rewards/margins": 0.9514614343643188, "rewards/rejected": -8.687381744384766, "step": 427 }, { "epoch": 0.05827886710239651, "grad_norm": 50.53715178388315, "learning_rate": 4.658503401360544e-07, "logits/chosen": -3.7760205268859863, "logits/rejected": -2.8079514503479004, "logps/chosen": -0.9912483096122742, "logps/rejected": -1.0823636054992676, "loss": 5.4885, "rewards/accuracies": 0.5, "rewards/chosen": -9.912483215332031, "rewards/margins": 0.9111528396606445, "rewards/rejected": -10.823636054992676, "step": 428 }, { "epoch": 0.05841503267973856, "grad_norm": 74.98241239798743, "learning_rate": 4.669387755102041e-07, "logits/chosen": -2.4879651069641113, "logits/rejected": -2.4984042644500732, "logps/chosen": -1.1001005172729492, "logps/rejected": -1.2046961784362793, "loss": 4.3753, "rewards/accuracies": 0.75, "rewards/chosen": -11.001005172729492, "rewards/margins": 1.045956015586853, "rewards/rejected": -12.046960830688477, "step": 429 }, { "epoch": 0.05855119825708061, "grad_norm": 53.790096389606994, "learning_rate": 4.6802721088435373e-07, "logits/chosen": -3.5369386672973633, "logits/rejected": -3.8102762699127197, "logps/chosen": -0.9720373749732971, "logps/rejected": -0.9755396842956543, "loss": 5.2722, "rewards/accuracies": 0.75, "rewards/chosen": -9.72037410736084, "rewards/margins": 0.03502213954925537, "rewards/rejected": -9.755395889282227, "step": 430 }, { "epoch": 0.058687363834422655, "grad_norm": 67.00063378767558, "learning_rate": 4.6911564625850333e-07, "logits/chosen": -4.447195053100586, "logits/rejected": -4.0393877029418945, "logps/chosen": -0.817156970500946, "logps/rejected": -0.9682495594024658, "loss": 5.7689, "rewards/accuracies": 0.75, "rewards/chosen": -8.17156982421875, "rewards/margins": 1.510925531387329, "rewards/rejected": -9.6824951171875, "step": 431 }, { "epoch": 0.058823529411764705, "grad_norm": 58.691391362644204, "learning_rate": 4.702040816326531e-07, "logits/chosen": -3.0873851776123047, "logits/rejected": -3.8270740509033203, "logps/chosen": -1.0501163005828857, "logps/rejected": -1.7744768857955933, "loss": 5.6094, "rewards/accuracies": 0.5, "rewards/chosen": -10.501163482666016, "rewards/margins": 7.243605613708496, "rewards/rejected": -17.744770050048828, "step": 432 }, { "epoch": 0.058959694989106755, "grad_norm": 53.43824286738626, "learning_rate": 4.7129251700680274e-07, "logits/chosen": -4.134598731994629, "logits/rejected": -2.519047260284424, "logps/chosen": -0.9638064503669739, "logps/rejected": -2.3212029933929443, "loss": 5.1956, "rewards/accuracies": 0.75, "rewards/chosen": -9.638065338134766, "rewards/margins": 13.573965072631836, "rewards/rejected": -23.2120304107666, "step": 433 }, { "epoch": 0.059095860566448805, "grad_norm": 41.68834330275679, "learning_rate": 4.723809523809524e-07, "logits/chosen": -3.853814125061035, "logits/rejected": -3.050342082977295, "logps/chosen": -1.1811935901641846, "logps/rejected": -1.1955151557922363, "loss": 4.1439, "rewards/accuracies": 0.5, "rewards/chosen": -11.811936378479004, "rewards/margins": 0.14321565628051758, "rewards/rejected": -11.95515251159668, "step": 434 }, { "epoch": 0.05923202614379085, "grad_norm": 50.743349072712306, "learning_rate": 4.73469387755102e-07, "logits/chosen": -3.979781150817871, "logits/rejected": -2.5384795665740967, "logps/chosen": -1.0466687679290771, "logps/rejected": -1.5874942541122437, "loss": 4.3178, "rewards/accuracies": 0.75, "rewards/chosen": -10.46668815612793, "rewards/margins": 5.408254623413086, "rewards/rejected": -15.874943733215332, "step": 435 }, { "epoch": 0.0593681917211329, "grad_norm": 58.653100377337914, "learning_rate": 4.7455782312925164e-07, "logits/chosen": -3.9931976795196533, "logits/rejected": -3.668680191040039, "logps/chosen": -1.0595136880874634, "logps/rejected": -0.9401049613952637, "loss": 5.1119, "rewards/accuracies": 0.25, "rewards/chosen": -10.595136642456055, "rewards/margins": -1.1940879821777344, "rewards/rejected": -9.40104866027832, "step": 436 }, { "epoch": 0.05950435729847495, "grad_norm": 51.73253208013592, "learning_rate": 4.756462585034013e-07, "logits/chosen": -4.125868797302246, "logits/rejected": -3.47910737991333, "logps/chosen": -0.9183812737464905, "logps/rejected": -1.1499073505401611, "loss": 4.5405, "rewards/accuracies": 0.75, "rewards/chosen": -9.183812141418457, "rewards/margins": 2.315260887145996, "rewards/rejected": -11.49907398223877, "step": 437 }, { "epoch": 0.059640522875817, "grad_norm": 57.95226933065235, "learning_rate": 4.7673469387755105e-07, "logits/chosen": -3.3008499145507812, "logits/rejected": -2.1456799507141113, "logps/chosen": -1.211259365081787, "logps/rejected": -1.2761374711990356, "loss": 4.8281, "rewards/accuracies": 0.5, "rewards/chosen": -12.112593650817871, "rewards/margins": 0.6487799882888794, "rewards/rejected": -12.761373519897461, "step": 438 }, { "epoch": 0.05977668845315904, "grad_norm": 46.7468864196759, "learning_rate": 4.778231292517007e-07, "logits/chosen": -3.258173942565918, "logits/rejected": -2.0370960235595703, "logps/chosen": -1.163620948791504, "logps/rejected": -1.6498100757598877, "loss": 4.5403, "rewards/accuracies": 0.75, "rewards/chosen": -11.636210441589355, "rewards/margins": 4.86189079284668, "rewards/rejected": -16.49810218811035, "step": 439 }, { "epoch": 0.05991285403050109, "grad_norm": 51.96817124645959, "learning_rate": 4.789115646258503e-07, "logits/chosen": -4.159543037414551, "logits/rejected": -4.327371120452881, "logps/chosen": -1.0910027027130127, "logps/rejected": -1.162089467048645, "loss": 4.5716, "rewards/accuracies": 0.75, "rewards/chosen": -10.910026550292969, "rewards/margins": 0.710867166519165, "rewards/rejected": -11.620894432067871, "step": 440 }, { "epoch": 0.06004901960784314, "grad_norm": 52.38519181045346, "learning_rate": 4.8e-07, "logits/chosen": -4.739843368530273, "logits/rejected": -4.272103309631348, "logps/chosen": -1.0090323686599731, "logps/rejected": -1.314993977546692, "loss": 4.721, "rewards/accuracies": 1.0, "rewards/chosen": -10.090323448181152, "rewards/margins": 3.0596163272857666, "rewards/rejected": -13.149940490722656, "step": 441 }, { "epoch": 0.06018518518518518, "grad_norm": 52.496628501263885, "learning_rate": 4.810884353741496e-07, "logits/chosen": -4.138759613037109, "logits/rejected": -4.138718128204346, "logps/chosen": -0.8354073762893677, "logps/rejected": -1.1500244140625, "loss": 4.2424, "rewards/accuracies": 0.75, "rewards/chosen": -8.354073524475098, "rewards/margins": 3.1461710929870605, "rewards/rejected": -11.500244140625, "step": 442 }, { "epoch": 0.06032135076252723, "grad_norm": 59.10361177680367, "learning_rate": 4.821768707482994e-07, "logits/chosen": -4.489090442657471, "logits/rejected": -4.223858833312988, "logps/chosen": -0.8937998414039612, "logps/rejected": -1.2671387195587158, "loss": 4.492, "rewards/accuracies": 0.75, "rewards/chosen": -8.93799877166748, "rewards/margins": 3.733388900756836, "rewards/rejected": -12.67138671875, "step": 443 }, { "epoch": 0.06045751633986928, "grad_norm": 39.80622864306376, "learning_rate": 4.83265306122449e-07, "logits/chosen": -4.973862648010254, "logits/rejected": -4.001169204711914, "logps/chosen": -0.7247560620307922, "logps/rejected": -0.9546008706092834, "loss": 4.3455, "rewards/accuracies": 1.0, "rewards/chosen": -7.247560501098633, "rewards/margins": 2.298448085784912, "rewards/rejected": -9.546009063720703, "step": 444 }, { "epoch": 0.06059368191721133, "grad_norm": 46.02458082986114, "learning_rate": 4.843537414965987e-07, "logits/chosen": -2.9560928344726562, "logits/rejected": -3.513561725616455, "logps/chosen": -1.3659014701843262, "logps/rejected": -1.6342802047729492, "loss": 4.3847, "rewards/accuracies": 0.5, "rewards/chosen": -13.659013748168945, "rewards/margins": 2.683788537979126, "rewards/rejected": -16.342802047729492, "step": 445 }, { "epoch": 0.060729847494553374, "grad_norm": 68.47259209659465, "learning_rate": 4.854421768707482e-07, "logits/chosen": -4.805624485015869, "logits/rejected": -3.6315250396728516, "logps/chosen": -1.0689924955368042, "logps/rejected": -1.1455459594726562, "loss": 4.8904, "rewards/accuracies": 0.5, "rewards/chosen": -10.689924240112305, "rewards/margins": 0.7655353546142578, "rewards/rejected": -11.455459594726562, "step": 446 }, { "epoch": 0.060866013071895424, "grad_norm": 67.71471245145474, "learning_rate": 4.865306122448979e-07, "logits/chosen": -2.8034379482269287, "logits/rejected": -2.122114896774292, "logps/chosen": -1.1339224576950073, "logps/rejected": -1.473841667175293, "loss": 5.0206, "rewards/accuracies": 1.0, "rewards/chosen": -11.339224815368652, "rewards/margins": 3.3991920948028564, "rewards/rejected": -14.73841667175293, "step": 447 }, { "epoch": 0.06100217864923747, "grad_norm": 52.2491678652555, "learning_rate": 4.876190476190476e-07, "logits/chosen": -4.422534942626953, "logits/rejected": -3.429605484008789, "logps/chosen": -1.192283034324646, "logps/rejected": -0.9996863603591919, "loss": 4.7247, "rewards/accuracies": 0.5, "rewards/chosen": -11.922830581665039, "rewards/margins": -1.9259672164916992, "rewards/rejected": -9.99686336517334, "step": 448 }, { "epoch": 0.06113834422657952, "grad_norm": 58.0851861358179, "learning_rate": 4.887074829931973e-07, "logits/chosen": -3.5463151931762695, "logits/rejected": -2.7396655082702637, "logps/chosen": -1.245950698852539, "logps/rejected": -1.6209993362426758, "loss": 4.8211, "rewards/accuracies": 0.75, "rewards/chosen": -12.45950698852539, "rewards/margins": 3.750485897064209, "rewards/rejected": -16.209993362426758, "step": 449 }, { "epoch": 0.061274509803921566, "grad_norm": 46.514902946733045, "learning_rate": 4.897959183673469e-07, "logits/chosen": -3.8762245178222656, "logits/rejected": -2.6820287704467773, "logps/chosen": -1.0636796951293945, "logps/rejected": -1.3907639980316162, "loss": 4.6935, "rewards/accuracies": 0.75, "rewards/chosen": -10.636796951293945, "rewards/margins": 3.270843029022217, "rewards/rejected": -13.90764045715332, "step": 450 }, { "epoch": 0.061410675381263616, "grad_norm": 50.76247721982234, "learning_rate": 4.908843537414966e-07, "logits/chosen": -3.5851612091064453, "logits/rejected": -3.2284152507781982, "logps/chosen": -0.7779825329780579, "logps/rejected": -0.9971481561660767, "loss": 4.2899, "rewards/accuracies": 0.75, "rewards/chosen": -7.779825210571289, "rewards/margins": 2.1916558742523193, "rewards/rejected": -9.971481323242188, "step": 451 }, { "epoch": 0.061546840958605666, "grad_norm": 45.828198777715656, "learning_rate": 4.919727891156462e-07, "logits/chosen": -3.9039225578308105, "logits/rejected": -3.265124797821045, "logps/chosen": -1.0069327354431152, "logps/rejected": -1.3279647827148438, "loss": 4.1619, "rewards/accuracies": 0.75, "rewards/chosen": -10.069328308105469, "rewards/margins": 3.2103207111358643, "rewards/rejected": -13.279648780822754, "step": 452 }, { "epoch": 0.061683006535947715, "grad_norm": 69.43580505718566, "learning_rate": 4.930612244897959e-07, "logits/chosen": -3.110739231109619, "logits/rejected": -4.229578495025635, "logps/chosen": -1.0588157176971436, "logps/rejected": -0.8863184452056885, "loss": 5.177, "rewards/accuracies": 0.25, "rewards/chosen": -10.588157653808594, "rewards/margins": -1.724973440170288, "rewards/rejected": -8.863183975219727, "step": 453 }, { "epoch": 0.06181917211328976, "grad_norm": 45.379611367260125, "learning_rate": 4.941496598639455e-07, "logits/chosen": -2.806413412094116, "logits/rejected": -3.8524351119995117, "logps/chosen": -1.17108154296875, "logps/rejected": -0.9822512269020081, "loss": 3.9489, "rewards/accuracies": 0.5, "rewards/chosen": -11.7108154296875, "rewards/margins": -1.8883037567138672, "rewards/rejected": -9.822511672973633, "step": 454 }, { "epoch": 0.06195533769063181, "grad_norm": 51.93873353275732, "learning_rate": 4.952380952380952e-07, "logits/chosen": -2.795647144317627, "logits/rejected": -4.194257736206055, "logps/chosen": -1.1835383176803589, "logps/rejected": -1.1161210536956787, "loss": 4.7971, "rewards/accuracies": 0.25, "rewards/chosen": -11.835383415222168, "rewards/margins": -0.67417311668396, "rewards/rejected": -11.161210060119629, "step": 455 }, { "epoch": 0.06209150326797386, "grad_norm": 51.01682721535262, "learning_rate": 4.963265306122448e-07, "logits/chosen": -4.68477725982666, "logits/rejected": -4.566251754760742, "logps/chosen": -0.8473321795463562, "logps/rejected": -0.8451428413391113, "loss": 4.5951, "rewards/accuracies": 0.5, "rewards/chosen": -8.473321914672852, "rewards/margins": -0.02189415693283081, "rewards/rejected": -8.451428413391113, "step": 456 }, { "epoch": 0.06222766884531591, "grad_norm": 90.23549687809697, "learning_rate": 4.974149659863945e-07, "logits/chosen": -3.1310272216796875, "logits/rejected": -2.846999168395996, "logps/chosen": -1.9583301544189453, "logps/rejected": -2.040076732635498, "loss": 5.0545, "rewards/accuracies": 0.75, "rewards/chosen": -19.583301544189453, "rewards/margins": 0.8174660205841064, "rewards/rejected": -20.400768280029297, "step": 457 }, { "epoch": 0.06236383442265795, "grad_norm": 45.269285754337446, "learning_rate": 4.985034013605442e-07, "logits/chosen": -3.829638957977295, "logits/rejected": -2.588376998901367, "logps/chosen": -0.9150474071502686, "logps/rejected": -1.184133529663086, "loss": 5.0787, "rewards/accuracies": 0.5, "rewards/chosen": -9.150474548339844, "rewards/margins": 2.6908602714538574, "rewards/rejected": -11.841334342956543, "step": 458 }, { "epoch": 0.0625, "grad_norm": 52.420884384630206, "learning_rate": 4.995918367346939e-07, "logits/chosen": -3.354618549346924, "logits/rejected": -3.17584228515625, "logps/chosen": -1.1467227935791016, "logps/rejected": -1.1778664588928223, "loss": 4.9923, "rewards/accuracies": 0.75, "rewards/chosen": -11.467228889465332, "rewards/margins": 0.3114355802536011, "rewards/rejected": -11.778663635253906, "step": 459 }, { "epoch": 0.06263616557734204, "grad_norm": 68.242007034197, "learning_rate": 5.006802721088436e-07, "logits/chosen": -4.154873847961426, "logits/rejected": -2.9726951122283936, "logps/chosen": -1.3183729648590088, "logps/rejected": -1.7081583738327026, "loss": 4.8621, "rewards/accuracies": 1.0, "rewards/chosen": -13.183730125427246, "rewards/margins": 3.8978543281555176, "rewards/rejected": -17.081584930419922, "step": 460 }, { "epoch": 0.0627723311546841, "grad_norm": 56.68175274044204, "learning_rate": 5.017687074829932e-07, "logits/chosen": -4.292179107666016, "logits/rejected": -3.23134708404541, "logps/chosen": -0.9359563589096069, "logps/rejected": -1.007216215133667, "loss": 4.7817, "rewards/accuracies": 0.75, "rewards/chosen": -9.359562873840332, "rewards/margins": 0.7125993967056274, "rewards/rejected": -10.072162628173828, "step": 461 }, { "epoch": 0.06290849673202614, "grad_norm": 45.26985405773494, "learning_rate": 5.028571428571429e-07, "logits/chosen": -3.4874684810638428, "logits/rejected": -3.455533027648926, "logps/chosen": -0.804561972618103, "logps/rejected": -0.9082500338554382, "loss": 5.0331, "rewards/accuracies": 0.75, "rewards/chosen": -8.04561996459961, "rewards/margins": 1.0368801355361938, "rewards/rejected": -9.082500457763672, "step": 462 }, { "epoch": 0.06304466230936819, "grad_norm": 40.862119201517714, "learning_rate": 5.039455782312925e-07, "logits/chosen": -1.9846842288970947, "logits/rejected": -2.6712048053741455, "logps/chosen": -1.4726109504699707, "logps/rejected": -1.25473952293396, "loss": 4.8576, "rewards/accuracies": 0.25, "rewards/chosen": -14.726110458374023, "rewards/margins": -2.1787147521972656, "rewards/rejected": -12.547395706176758, "step": 463 }, { "epoch": 0.06318082788671024, "grad_norm": 40.68208220830565, "learning_rate": 5.050340136054421e-07, "logits/chosen": -2.2969954013824463, "logits/rejected": -1.9334020614624023, "logps/chosen": -1.2705832719802856, "logps/rejected": -1.5885778665542603, "loss": 4.1497, "rewards/accuracies": 0.75, "rewards/chosen": -12.705833435058594, "rewards/margins": 3.179945945739746, "rewards/rejected": -15.88577938079834, "step": 464 }, { "epoch": 0.06331699346405228, "grad_norm": 58.27344586656203, "learning_rate": 5.061224489795918e-07, "logits/chosen": -2.9570345878601074, "logits/rejected": -2.241210699081421, "logps/chosen": -1.7653484344482422, "logps/rejected": -2.335049629211426, "loss": 4.944, "rewards/accuracies": 0.75, "rewards/chosen": -17.653484344482422, "rewards/margins": 5.6970109939575195, "rewards/rejected": -23.350496292114258, "step": 465 }, { "epoch": 0.06345315904139434, "grad_norm": 56.322285633281865, "learning_rate": 5.072108843537415e-07, "logits/chosen": -4.360490322113037, "logits/rejected": -3.2545666694641113, "logps/chosen": -0.8597768545150757, "logps/rejected": -1.1643985509872437, "loss": 4.616, "rewards/accuracies": 0.5, "rewards/chosen": -8.597768783569336, "rewards/margins": 3.046217441558838, "rewards/rejected": -11.643985748291016, "step": 466 }, { "epoch": 0.06358932461873638, "grad_norm": 49.95174957039178, "learning_rate": 5.082993197278911e-07, "logits/chosen": -2.1089706420898438, "logits/rejected": -1.9942501783370972, "logps/chosen": -1.1448436975479126, "logps/rejected": -1.2309846878051758, "loss": 4.425, "rewards/accuracies": 0.75, "rewards/chosen": -11.448436737060547, "rewards/margins": 0.86141037940979, "rewards/rejected": -12.309846878051758, "step": 467 }, { "epoch": 0.06372549019607843, "grad_norm": 44.16774580077828, "learning_rate": 5.093877551020408e-07, "logits/chosen": -2.653045654296875, "logits/rejected": -1.8875985145568848, "logps/chosen": -1.2760251760482788, "logps/rejected": -1.6877803802490234, "loss": 4.4469, "rewards/accuracies": 0.5, "rewards/chosen": -12.760251998901367, "rewards/margins": 4.117550849914551, "rewards/rejected": -16.877803802490234, "step": 468 }, { "epoch": 0.06386165577342048, "grad_norm": 59.941774330662945, "learning_rate": 5.104761904761904e-07, "logits/chosen": -2.060730457305908, "logits/rejected": -2.7613019943237305, "logps/chosen": -1.6004455089569092, "logps/rejected": -1.51969313621521, "loss": 5.1016, "rewards/accuracies": 0.5, "rewards/chosen": -16.004453659057617, "rewards/margins": -0.8075222969055176, "rewards/rejected": -15.196931838989258, "step": 469 }, { "epoch": 0.06399782135076253, "grad_norm": 41.94586428850396, "learning_rate": 5.115646258503402e-07, "logits/chosen": -2.015530824661255, "logits/rejected": -2.0496065616607666, "logps/chosen": -1.333191156387329, "logps/rejected": -1.5202157497406006, "loss": 4.7789, "rewards/accuracies": 0.5, "rewards/chosen": -13.331911087036133, "rewards/margins": 1.8702468872070312, "rewards/rejected": -15.202157974243164, "step": 470 }, { "epoch": 0.06413398692810457, "grad_norm": 46.51777142933087, "learning_rate": 5.126530612244897e-07, "logits/chosen": -3.4222207069396973, "logits/rejected": -3.055570125579834, "logps/chosen": -0.8962790966033936, "logps/rejected": -1.0016157627105713, "loss": 4.4768, "rewards/accuracies": 0.75, "rewards/chosen": -8.962791442871094, "rewards/margins": 1.053367018699646, "rewards/rejected": -10.016158103942871, "step": 471 }, { "epoch": 0.06427015250544663, "grad_norm": 52.3872862876935, "learning_rate": 5.137414965986394e-07, "logits/chosen": -3.448495864868164, "logits/rejected": -2.6980953216552734, "logps/chosen": -0.9410432577133179, "logps/rejected": -0.9795297980308533, "loss": 4.6091, "rewards/accuracies": 0.5, "rewards/chosen": -9.410432815551758, "rewards/margins": 0.384865403175354, "rewards/rejected": -9.795297622680664, "step": 472 }, { "epoch": 0.06440631808278867, "grad_norm": 45.11120379902936, "learning_rate": 5.14829931972789e-07, "logits/chosen": -4.022689342498779, "logits/rejected": -3.196662664413452, "logps/chosen": -0.8039114475250244, "logps/rejected": -1.2090609073638916, "loss": 4.4187, "rewards/accuracies": 0.75, "rewards/chosen": -8.039113998413086, "rewards/margins": 4.051494598388672, "rewards/rejected": -12.090608596801758, "step": 473 }, { "epoch": 0.06454248366013073, "grad_norm": 36.8173256798753, "learning_rate": 5.159183673469387e-07, "logits/chosen": -2.3641560077667236, "logits/rejected": -2.9541401863098145, "logps/chosen": -0.7087069749832153, "logps/rejected": -1.0835912227630615, "loss": 4.7348, "rewards/accuracies": 1.0, "rewards/chosen": -7.087069511413574, "rewards/margins": 3.7488441467285156, "rewards/rejected": -10.83591365814209, "step": 474 }, { "epoch": 0.06467864923747277, "grad_norm": 45.139410342142156, "learning_rate": 5.170068027210885e-07, "logits/chosen": -3.845733642578125, "logits/rejected": -3.827660083770752, "logps/chosen": -0.9940779805183411, "logps/rejected": -0.8782870769500732, "loss": 3.8387, "rewards/accuracies": 0.25, "rewards/chosen": -9.940779685974121, "rewards/margins": -1.1579089164733887, "rewards/rejected": -8.78287124633789, "step": 475 }, { "epoch": 0.06481481481481481, "grad_norm": 57.51805531890884, "learning_rate": 5.180952380952381e-07, "logits/chosen": -3.4426534175872803, "logits/rejected": -3.0436577796936035, "logps/chosen": -1.7406007051467896, "logps/rejected": -1.2587840557098389, "loss": 4.1414, "rewards/accuracies": 0.25, "rewards/chosen": -17.406007766723633, "rewards/margins": -4.8181657791137695, "rewards/rejected": -12.587841987609863, "step": 476 }, { "epoch": 0.06495098039215687, "grad_norm": 55.86005498731564, "learning_rate": 5.191836734693878e-07, "logits/chosen": -3.0768518447875977, "logits/rejected": -1.910510778427124, "logps/chosen": -1.1477091312408447, "logps/rejected": -1.3497154712677002, "loss": 5.1364, "rewards/accuracies": 0.25, "rewards/chosen": -11.477090835571289, "rewards/margins": 2.020064353942871, "rewards/rejected": -13.49715518951416, "step": 477 }, { "epoch": 0.06508714596949891, "grad_norm": 47.65826170947195, "learning_rate": 5.202721088435374e-07, "logits/chosen": -2.8557732105255127, "logits/rejected": -2.014413833618164, "logps/chosen": -0.9643675684928894, "logps/rejected": -1.2137538194656372, "loss": 4.5681, "rewards/accuracies": 0.75, "rewards/chosen": -9.643675804138184, "rewards/margins": 2.4938621520996094, "rewards/rejected": -12.137537956237793, "step": 478 }, { "epoch": 0.06522331154684095, "grad_norm": 95.11470108394312, "learning_rate": 5.213605442176871e-07, "logits/chosen": -2.961052417755127, "logits/rejected": -2.1055870056152344, "logps/chosen": -1.0963523387908936, "logps/rejected": -1.3360427618026733, "loss": 5.7739, "rewards/accuracies": 0.5, "rewards/chosen": -10.963523864746094, "rewards/margins": 2.396904230117798, "rewards/rejected": -13.360427856445312, "step": 479 }, { "epoch": 0.06535947712418301, "grad_norm": 46.34763022098749, "learning_rate": 5.224489795918367e-07, "logits/chosen": -3.3133623600006104, "logits/rejected": -2.7986373901367188, "logps/chosen": -1.0011088848114014, "logps/rejected": -1.1091097593307495, "loss": 4.7753, "rewards/accuracies": 0.5, "rewards/chosen": -10.011088371276855, "rewards/margins": 1.0800093412399292, "rewards/rejected": -11.091097831726074, "step": 480 }, { "epoch": 0.06549564270152505, "grad_norm": 51.90832644874036, "learning_rate": 5.235374149659864e-07, "logits/chosen": -3.126180648803711, "logits/rejected": -2.4339141845703125, "logps/chosen": -1.3860628604888916, "logps/rejected": -1.6059900522232056, "loss": 4.4939, "rewards/accuracies": 0.75, "rewards/chosen": -13.86063003540039, "rewards/margins": 2.1992714405059814, "rewards/rejected": -16.059900283813477, "step": 481 }, { "epoch": 0.0656318082788671, "grad_norm": 67.70441558603748, "learning_rate": 5.24625850340136e-07, "logits/chosen": -3.0055291652679443, "logits/rejected": -1.5121899843215942, "logps/chosen": -1.1013017892837524, "logps/rejected": -1.4546605348587036, "loss": 5.1027, "rewards/accuracies": 0.75, "rewards/chosen": -11.013017654418945, "rewards/margins": 3.53358793258667, "rewards/rejected": -14.546605110168457, "step": 482 }, { "epoch": 0.06576797385620915, "grad_norm": 48.709839379173935, "learning_rate": 5.257142857142857e-07, "logits/chosen": -3.293050765991211, "logits/rejected": -3.4682796001434326, "logps/chosen": -1.2368773221969604, "logps/rejected": -1.2902576923370361, "loss": 4.7356, "rewards/accuracies": 0.5, "rewards/chosen": -12.368772506713867, "rewards/margins": 0.5338044166564941, "rewards/rejected": -12.902578353881836, "step": 483 }, { "epoch": 0.0659041394335512, "grad_norm": 51.855017890333194, "learning_rate": 5.268027210884353e-07, "logits/chosen": -3.149684190750122, "logits/rejected": -2.3304836750030518, "logps/chosen": -1.2580931186676025, "logps/rejected": -1.484892725944519, "loss": 4.0661, "rewards/accuracies": 0.75, "rewards/chosen": -12.580930709838867, "rewards/margins": 2.2679967880249023, "rewards/rejected": -14.848928451538086, "step": 484 }, { "epoch": 0.06604030501089325, "grad_norm": 42.11105117371229, "learning_rate": 5.27891156462585e-07, "logits/chosen": -3.6756911277770996, "logits/rejected": -2.7144665718078613, "logps/chosen": -0.8044530153274536, "logps/rejected": -1.265647053718567, "loss": 4.7366, "rewards/accuracies": 0.75, "rewards/chosen": -8.044529914855957, "rewards/margins": 4.611940860748291, "rewards/rejected": -12.656471252441406, "step": 485 }, { "epoch": 0.0661764705882353, "grad_norm": 44.19972393323689, "learning_rate": 5.289795918367347e-07, "logits/chosen": -3.7175045013427734, "logits/rejected": -3.3369345664978027, "logps/chosen": -1.16758131980896, "logps/rejected": -1.6672298908233643, "loss": 4.6451, "rewards/accuracies": 1.0, "rewards/chosen": -11.675813674926758, "rewards/margins": 4.996485710144043, "rewards/rejected": -16.672298431396484, "step": 486 }, { "epoch": 0.06631263616557734, "grad_norm": 44.60575363357122, "learning_rate": 5.300680272108844e-07, "logits/chosen": -3.1067185401916504, "logits/rejected": -2.461934804916382, "logps/chosen": -1.1379724740982056, "logps/rejected": -1.180034875869751, "loss": 5.0355, "rewards/accuracies": 0.5, "rewards/chosen": -11.379724502563477, "rewards/margins": 0.420623779296875, "rewards/rejected": -11.800348281860352, "step": 487 }, { "epoch": 0.0664488017429194, "grad_norm": 41.481422053555406, "learning_rate": 5.31156462585034e-07, "logits/chosen": -2.4957308769226074, "logits/rejected": -1.2822606563568115, "logps/chosen": -1.122103214263916, "logps/rejected": -1.4508724212646484, "loss": 4.1279, "rewards/accuracies": 0.75, "rewards/chosen": -11.221031188964844, "rewards/margins": 3.2876923084259033, "rewards/rejected": -14.508724212646484, "step": 488 }, { "epoch": 0.06658496732026144, "grad_norm": 38.71093517168456, "learning_rate": 5.322448979591836e-07, "logits/chosen": -3.5945520401000977, "logits/rejected": -2.4668335914611816, "logps/chosen": -0.9126185178756714, "logps/rejected": -1.1714720726013184, "loss": 4.807, "rewards/accuracies": 1.0, "rewards/chosen": -9.126185417175293, "rewards/margins": 2.588536262512207, "rewards/rejected": -11.7147216796875, "step": 489 }, { "epoch": 0.06672113289760348, "grad_norm": 47.18965668609306, "learning_rate": 5.333333333333332e-07, "logits/chosen": -2.7456259727478027, "logits/rejected": -2.0691704750061035, "logps/chosen": -1.014132022857666, "logps/rejected": -1.235891342163086, "loss": 4.5097, "rewards/accuracies": 0.75, "rewards/chosen": -10.141321182250977, "rewards/margins": 2.217592239379883, "rewards/rejected": -12.35891342163086, "step": 490 }, { "epoch": 0.06685729847494554, "grad_norm": 52.153670358366185, "learning_rate": 5.34421768707483e-07, "logits/chosen": -3.6028807163238525, "logits/rejected": -2.541273593902588, "logps/chosen": -1.0086430311203003, "logps/rejected": -1.4665465354919434, "loss": 4.6214, "rewards/accuracies": 1.0, "rewards/chosen": -10.086430549621582, "rewards/margins": 4.579034805297852, "rewards/rejected": -14.665465354919434, "step": 491 }, { "epoch": 0.06699346405228758, "grad_norm": 37.376698486992574, "learning_rate": 5.355102040816326e-07, "logits/chosen": -3.2186760902404785, "logits/rejected": -2.133409023284912, "logps/chosen": -0.9959384202957153, "logps/rejected": -1.642189383506775, "loss": 4.0908, "rewards/accuracies": 0.75, "rewards/chosen": -9.95938491821289, "rewards/margins": 6.462509632110596, "rewards/rejected": -16.421894073486328, "step": 492 }, { "epoch": 0.06712962962962964, "grad_norm": 44.87996803857128, "learning_rate": 5.365986394557823e-07, "logits/chosen": -1.896686315536499, "logits/rejected": -2.94346284866333, "logps/chosen": -1.224096417427063, "logps/rejected": -1.396283745765686, "loss": 4.5427, "rewards/accuracies": 0.75, "rewards/chosen": -12.24096393585205, "rewards/margins": 1.7218732833862305, "rewards/rejected": -13.962837219238281, "step": 493 }, { "epoch": 0.06726579520697168, "grad_norm": 41.366735401096946, "learning_rate": 5.37687074829932e-07, "logits/chosen": -3.654369354248047, "logits/rejected": -3.5314693450927734, "logps/chosen": -0.8405681252479553, "logps/rejected": -1.011076807975769, "loss": 4.2304, "rewards/accuracies": 0.75, "rewards/chosen": -8.405680656433105, "rewards/margins": 1.7050867080688477, "rewards/rejected": -10.110767364501953, "step": 494 }, { "epoch": 0.06740196078431372, "grad_norm": 44.578231432092714, "learning_rate": 5.387755102040816e-07, "logits/chosen": -4.657895565032959, "logits/rejected": -4.162392616271973, "logps/chosen": -0.7328702211380005, "logps/rejected": -0.6419887542724609, "loss": 4.1759, "rewards/accuracies": 0.5, "rewards/chosen": -7.328701972961426, "rewards/margins": -0.9088144898414612, "rewards/rejected": -6.419887542724609, "step": 495 }, { "epoch": 0.06753812636165578, "grad_norm": 51.25700539685679, "learning_rate": 5.398639455782313e-07, "logits/chosen": -1.7686305046081543, "logits/rejected": -2.0772883892059326, "logps/chosen": -1.3809216022491455, "logps/rejected": -1.2834855318069458, "loss": 4.473, "rewards/accuracies": 0.5, "rewards/chosen": -13.809215545654297, "rewards/margins": -0.9743602275848389, "rewards/rejected": -12.834854125976562, "step": 496 }, { "epoch": 0.06767429193899782, "grad_norm": 43.589241431765515, "learning_rate": 5.409523809523809e-07, "logits/chosen": -3.0764927864074707, "logits/rejected": -3.3913044929504395, "logps/chosen": -0.9625169038772583, "logps/rejected": -0.9080711007118225, "loss": 4.9556, "rewards/accuracies": 0.5, "rewards/chosen": -9.625168800354004, "rewards/margins": -0.5444581508636475, "rewards/rejected": -9.080710411071777, "step": 497 }, { "epoch": 0.06781045751633986, "grad_norm": 39.427502764115815, "learning_rate": 5.420408163265306e-07, "logits/chosen": -2.9082350730895996, "logits/rejected": -2.7891600131988525, "logps/chosen": -0.9149742722511292, "logps/rejected": -0.9554600119590759, "loss": 4.2116, "rewards/accuracies": 0.5, "rewards/chosen": -9.14974308013916, "rewards/margins": 0.40485715866088867, "rewards/rejected": -9.55459976196289, "step": 498 }, { "epoch": 0.06794662309368192, "grad_norm": 60.75620114474602, "learning_rate": 5.431292517006802e-07, "logits/chosen": -2.4197402000427246, "logits/rejected": -3.090038537979126, "logps/chosen": -1.2086396217346191, "logps/rejected": -1.6045942306518555, "loss": 5.1078, "rewards/accuracies": 0.75, "rewards/chosen": -12.086397171020508, "rewards/margins": 3.959545850753784, "rewards/rejected": -16.045944213867188, "step": 499 }, { "epoch": 0.06808278867102396, "grad_norm": 52.473377232195986, "learning_rate": 5.442176870748299e-07, "logits/chosen": -2.913633346557617, "logits/rejected": -2.9762306213378906, "logps/chosen": -1.3501558303833008, "logps/rejected": -1.1746515035629272, "loss": 4.8186, "rewards/accuracies": 0.25, "rewards/chosen": -13.501558303833008, "rewards/margins": -1.7550432682037354, "rewards/rejected": -11.746515274047852, "step": 500 }, { "epoch": 0.068218954248366, "grad_norm": 47.144644839472456, "learning_rate": 5.453061224489795e-07, "logits/chosen": -3.4964828491210938, "logits/rejected": -2.0828115940093994, "logps/chosen": -1.1088865995407104, "logps/rejected": -1.478776216506958, "loss": 4.4622, "rewards/accuracies": 0.75, "rewards/chosen": -11.088866233825684, "rewards/margins": 3.698896884918213, "rewards/rejected": -14.787763595581055, "step": 501 }, { "epoch": 0.06835511982570806, "grad_norm": 43.35049634924928, "learning_rate": 5.463945578231293e-07, "logits/chosen": -2.50521183013916, "logits/rejected": -3.187676429748535, "logps/chosen": -1.210740566253662, "logps/rejected": -1.5217219591140747, "loss": 4.1781, "rewards/accuracies": 0.25, "rewards/chosen": -12.107404708862305, "rewards/margins": 3.109813928604126, "rewards/rejected": -15.217220306396484, "step": 502 }, { "epoch": 0.0684912854030501, "grad_norm": 50.70604258402021, "learning_rate": 5.474829931972789e-07, "logits/chosen": -3.090226888656616, "logits/rejected": -2.554786443710327, "logps/chosen": -1.463890552520752, "logps/rejected": -0.9991621971130371, "loss": 5.3208, "rewards/accuracies": 0.0, "rewards/chosen": -14.63890552520752, "rewards/margins": -4.647283554077148, "rewards/rejected": -9.991622924804688, "step": 503 }, { "epoch": 0.06862745098039216, "grad_norm": 53.36944902718577, "learning_rate": 5.485714285714286e-07, "logits/chosen": -2.662991762161255, "logits/rejected": -3.659628391265869, "logps/chosen": -1.0153424739837646, "logps/rejected": -0.7369922399520874, "loss": 4.3493, "rewards/accuracies": 0.25, "rewards/chosen": -10.153424263000488, "rewards/margins": -2.7835021018981934, "rewards/rejected": -7.369922637939453, "step": 504 }, { "epoch": 0.0687636165577342, "grad_norm": 48.368488553618874, "learning_rate": 5.496598639455782e-07, "logits/chosen": -3.414267063140869, "logits/rejected": -3.5098934173583984, "logps/chosen": -0.7415063381195068, "logps/rejected": -0.7837628126144409, "loss": 4.7981, "rewards/accuracies": 0.75, "rewards/chosen": -7.415063381195068, "rewards/margins": 0.4225647449493408, "rewards/rejected": -7.837628364562988, "step": 505 }, { "epoch": 0.06889978213507625, "grad_norm": 51.045266924631726, "learning_rate": 5.507482993197279e-07, "logits/chosen": -1.731513500213623, "logits/rejected": -1.9554202556610107, "logps/chosen": -1.1760766506195068, "logps/rejected": -1.191620945930481, "loss": 4.7553, "rewards/accuracies": 0.5, "rewards/chosen": -11.760766983032227, "rewards/margins": 0.1554419994354248, "rewards/rejected": -11.916208267211914, "step": 506 }, { "epoch": 0.0690359477124183, "grad_norm": 41.18712871190175, "learning_rate": 5.518367346938775e-07, "logits/chosen": -1.7039432525634766, "logits/rejected": -1.5498898029327393, "logps/chosen": -1.1091399192810059, "logps/rejected": -1.1720014810562134, "loss": 4.2984, "rewards/accuracies": 0.5, "rewards/chosen": -11.091400146484375, "rewards/margins": 0.6286147832870483, "rewards/rejected": -11.720014572143555, "step": 507 }, { "epoch": 0.06917211328976035, "grad_norm": 55.84554693286532, "learning_rate": 5.529251700680272e-07, "logits/chosen": -3.3508152961730957, "logits/rejected": -2.218747854232788, "logps/chosen": -0.7237839698791504, "logps/rejected": -1.0864360332489014, "loss": 4.1906, "rewards/accuracies": 0.75, "rewards/chosen": -7.237840175628662, "rewards/margins": 3.626521110534668, "rewards/rejected": -10.864360809326172, "step": 508 }, { "epoch": 0.06930827886710239, "grad_norm": 41.870617958988475, "learning_rate": 5.540136054421768e-07, "logits/chosen": -2.134795665740967, "logits/rejected": -1.8932857513427734, "logps/chosen": -1.2204103469848633, "logps/rejected": -1.014575481414795, "loss": 4.6491, "rewards/accuracies": 0.25, "rewards/chosen": -12.204102516174316, "rewards/margins": -2.058347702026367, "rewards/rejected": -10.145753860473633, "step": 509 }, { "epoch": 0.06944444444444445, "grad_norm": 49.92718294780802, "learning_rate": 5.551020408163265e-07, "logits/chosen": -3.2463297843933105, "logits/rejected": -3.077113151550293, "logps/chosen": -1.016869068145752, "logps/rejected": -0.9979915022850037, "loss": 4.7337, "rewards/accuracies": 0.5, "rewards/chosen": -10.168691635131836, "rewards/margins": -0.1887761354446411, "rewards/rejected": -9.979915618896484, "step": 510 }, { "epoch": 0.06958061002178649, "grad_norm": 40.50143052656478, "learning_rate": 5.561904761904761e-07, "logits/chosen": -3.2331511974334717, "logits/rejected": -4.212170124053955, "logps/chosen": -0.9817467927932739, "logps/rejected": -1.025583267211914, "loss": 4.5087, "rewards/accuracies": 0.5, "rewards/chosen": -9.81746768951416, "rewards/margins": 0.4383646249771118, "rewards/rejected": -10.255831718444824, "step": 511 }, { "epoch": 0.06971677559912855, "grad_norm": 52.689756479351395, "learning_rate": 5.572789115646258e-07, "logits/chosen": -1.9035272598266602, "logits/rejected": -2.2654736042022705, "logps/chosen": -1.2345178127288818, "logps/rejected": -1.2163217067718506, "loss": 4.5482, "rewards/accuracies": 0.25, "rewards/chosen": -12.34517765045166, "rewards/margins": -0.1819608211517334, "rewards/rejected": -12.163217544555664, "step": 512 }, { "epoch": 0.06985294117647059, "grad_norm": 50.422250838369045, "learning_rate": 5.583673469387756e-07, "logits/chosen": -3.0864222049713135, "logits/rejected": -3.5359435081481934, "logps/chosen": -1.1029174327850342, "logps/rejected": -1.2551933526992798, "loss": 4.7619, "rewards/accuracies": 0.75, "rewards/chosen": -11.0291748046875, "rewards/margins": 1.5227595567703247, "rewards/rejected": -12.551933288574219, "step": 513 }, { "epoch": 0.06998910675381263, "grad_norm": 44.74728718345034, "learning_rate": 5.594557823129252e-07, "logits/chosen": -2.4888052940368652, "logits/rejected": -2.4695749282836914, "logps/chosen": -1.2744624614715576, "logps/rejected": -1.3882547616958618, "loss": 5.2098, "rewards/accuracies": 0.5, "rewards/chosen": -12.744624137878418, "rewards/margins": 1.1379234790802002, "rewards/rejected": -13.882547378540039, "step": 514 }, { "epoch": 0.07012527233115469, "grad_norm": 49.355634403628386, "learning_rate": 5.605442176870748e-07, "logits/chosen": -2.4740874767303467, "logits/rejected": -2.750011920928955, "logps/chosen": -1.1767381429672241, "logps/rejected": -0.9742264747619629, "loss": 4.7938, "rewards/accuracies": 0.25, "rewards/chosen": -11.76738166809082, "rewards/margins": -2.0251169204711914, "rewards/rejected": -9.742264747619629, "step": 515 }, { "epoch": 0.07026143790849673, "grad_norm": 65.16800352810574, "learning_rate": 5.616326530612244e-07, "logits/chosen": -4.649001598358154, "logits/rejected": -4.046987056732178, "logps/chosen": -1.0259004831314087, "logps/rejected": -0.9934563636779785, "loss": 5.2561, "rewards/accuracies": 0.5, "rewards/chosen": -10.259004592895508, "rewards/margins": -0.32444095611572266, "rewards/rejected": -9.934563636779785, "step": 516 }, { "epoch": 0.07039760348583878, "grad_norm": 49.5260790343194, "learning_rate": 5.627210884353741e-07, "logits/chosen": -1.4050493240356445, "logits/rejected": -2.377441167831421, "logps/chosen": -1.6689602136611938, "logps/rejected": -1.4586232900619507, "loss": 4.3706, "rewards/accuracies": 0.25, "rewards/chosen": -16.68960189819336, "rewards/margins": -2.103367805480957, "rewards/rejected": -14.586233139038086, "step": 517 }, { "epoch": 0.07053376906318083, "grad_norm": 46.91340984567876, "learning_rate": 5.638095238095238e-07, "logits/chosen": -3.289820909500122, "logits/rejected": -2.098113536834717, "logps/chosen": -1.3252105712890625, "logps/rejected": -1.071479082107544, "loss": 4.5651, "rewards/accuracies": 0.5, "rewards/chosen": -13.252105712890625, "rewards/margins": -2.5373141765594482, "rewards/rejected": -10.714791297912598, "step": 518 }, { "epoch": 0.07066993464052287, "grad_norm": 53.06561860798649, "learning_rate": 5.648979591836735e-07, "logits/chosen": -4.081661224365234, "logits/rejected": -4.0660481452941895, "logps/chosen": -0.7771832346916199, "logps/rejected": -0.6609522104263306, "loss": 4.9005, "rewards/accuracies": 0.0, "rewards/chosen": -7.771832466125488, "rewards/margins": -1.1623101234436035, "rewards/rejected": -6.609521865844727, "step": 519 }, { "epoch": 0.07080610021786492, "grad_norm": 51.51219186906303, "learning_rate": 5.659863945578231e-07, "logits/chosen": -3.4427413940429688, "logits/rejected": -3.5143649578094482, "logps/chosen": -1.3564233779907227, "logps/rejected": -0.9794079065322876, "loss": 4.6772, "rewards/accuracies": 0.25, "rewards/chosen": -13.564233779907227, "rewards/margins": -3.7701544761657715, "rewards/rejected": -9.794078826904297, "step": 520 }, { "epoch": 0.07094226579520697, "grad_norm": 46.51724277554875, "learning_rate": 5.670748299319728e-07, "logits/chosen": -3.457324981689453, "logits/rejected": -2.623541831970215, "logps/chosen": -1.4199084043502808, "logps/rejected": -1.7119513750076294, "loss": 4.3889, "rewards/accuracies": 1.0, "rewards/chosen": -14.19908332824707, "rewards/margins": 2.9204297065734863, "rewards/rejected": -17.11951446533203, "step": 521 }, { "epoch": 0.07107843137254902, "grad_norm": 39.95632616170753, "learning_rate": 5.681632653061224e-07, "logits/chosen": -3.9152753353118896, "logits/rejected": -2.3829188346862793, "logps/chosen": -0.7618710398674011, "logps/rejected": -1.1294097900390625, "loss": 4.054, "rewards/accuracies": 1.0, "rewards/chosen": -7.618710517883301, "rewards/margins": 3.6753876209259033, "rewards/rejected": -11.294097900390625, "step": 522 }, { "epoch": 0.07121459694989107, "grad_norm": 44.70616759534069, "learning_rate": 5.692517006802721e-07, "logits/chosen": -3.6877918243408203, "logits/rejected": -3.5540385246276855, "logps/chosen": -1.012211799621582, "logps/rejected": -0.8393704891204834, "loss": 4.4969, "rewards/accuracies": 0.0, "rewards/chosen": -10.122118949890137, "rewards/margins": -1.7284140586853027, "rewards/rejected": -8.393705368041992, "step": 523 }, { "epoch": 0.07135076252723312, "grad_norm": 43.225918184122804, "learning_rate": 5.703401360544217e-07, "logits/chosen": -2.839071750640869, "logits/rejected": -1.4154413938522339, "logps/chosen": -1.377930998802185, "logps/rejected": -2.352097988128662, "loss": 4.3511, "rewards/accuracies": 0.75, "rewards/chosen": -13.77931022644043, "rewards/margins": 9.741668701171875, "rewards/rejected": -23.520978927612305, "step": 524 }, { "epoch": 0.07148692810457516, "grad_norm": 62.490231102963136, "learning_rate": 5.714285714285714e-07, "logits/chosen": -2.6947526931762695, "logits/rejected": -2.789160966873169, "logps/chosen": -1.223914623260498, "logps/rejected": -1.331604242324829, "loss": 5.0487, "rewards/accuracies": 0.5, "rewards/chosen": -12.239145278930664, "rewards/margins": 1.0768976211547852, "rewards/rejected": -13.316041946411133, "step": 525 }, { "epoch": 0.07162309368191722, "grad_norm": 44.67154897255365, "learning_rate": 5.72517006802721e-07, "logits/chosen": -2.8423728942871094, "logits/rejected": -1.5813877582550049, "logps/chosen": -1.1929619312286377, "logps/rejected": -1.857546329498291, "loss": 4.2751, "rewards/accuracies": 0.75, "rewards/chosen": -11.929619789123535, "rewards/margins": 6.645843029022217, "rewards/rejected": -18.575462341308594, "step": 526 }, { "epoch": 0.07175925925925926, "grad_norm": 79.92268852900168, "learning_rate": 5.736054421768707e-07, "logits/chosen": -2.058612585067749, "logits/rejected": -2.615536689758301, "logps/chosen": -1.1030551195144653, "logps/rejected": -1.2500829696655273, "loss": 4.0152, "rewards/accuracies": 1.0, "rewards/chosen": -11.03055191040039, "rewards/margins": 1.470278024673462, "rewards/rejected": -12.500829696655273, "step": 527 }, { "epoch": 0.0718954248366013, "grad_norm": 44.963338341796714, "learning_rate": 5.746938775510203e-07, "logits/chosen": -2.6014933586120605, "logits/rejected": -2.2794928550720215, "logps/chosen": -1.198171615600586, "logps/rejected": -1.0466035604476929, "loss": 4.7378, "rewards/accuracies": 0.25, "rewards/chosen": -11.981717109680176, "rewards/margins": -1.515681505203247, "rewards/rejected": -10.466035842895508, "step": 528 }, { "epoch": 0.07203159041394336, "grad_norm": 47.92824558063916, "learning_rate": 5.757823129251701e-07, "logits/chosen": -3.646101474761963, "logits/rejected": -3.3631434440612793, "logps/chosen": -1.053350567817688, "logps/rejected": -1.3500213623046875, "loss": 4.381, "rewards/accuracies": 0.75, "rewards/chosen": -10.533504486083984, "rewards/margins": 2.966708183288574, "rewards/rejected": -13.500213623046875, "step": 529 }, { "epoch": 0.0721677559912854, "grad_norm": 36.048506653129024, "learning_rate": 5.768707482993198e-07, "logits/chosen": -3.035130739212036, "logits/rejected": -2.2416820526123047, "logps/chosen": -1.0438287258148193, "logps/rejected": -1.5621064901351929, "loss": 4.1823, "rewards/accuracies": 1.0, "rewards/chosen": -10.438286781311035, "rewards/margins": 5.182777404785156, "rewards/rejected": -15.621064186096191, "step": 530 }, { "epoch": 0.07230392156862746, "grad_norm": 53.86633254864284, "learning_rate": 5.779591836734694e-07, "logits/chosen": -3.0320847034454346, "logits/rejected": -3.443748950958252, "logps/chosen": -0.9660254120826721, "logps/rejected": -1.2665331363677979, "loss": 4.5416, "rewards/accuracies": 1.0, "rewards/chosen": -9.66025447845459, "rewards/margins": 3.0050759315490723, "rewards/rejected": -12.665329933166504, "step": 531 }, { "epoch": 0.0724400871459695, "grad_norm": 43.15018843647521, "learning_rate": 5.790476190476191e-07, "logits/chosen": -2.207063674926758, "logits/rejected": -1.217820644378662, "logps/chosen": -1.4524285793304443, "logps/rejected": -2.0840392112731934, "loss": 4.0872, "rewards/accuracies": 0.75, "rewards/chosen": -14.524286270141602, "rewards/margins": 6.316103935241699, "rewards/rejected": -20.840389251708984, "step": 532 }, { "epoch": 0.07257625272331154, "grad_norm": 55.499304864912304, "learning_rate": 5.801360544217686e-07, "logits/chosen": -2.9308176040649414, "logits/rejected": -2.8453803062438965, "logps/chosen": -1.2383687496185303, "logps/rejected": -1.5449168682098389, "loss": 5.325, "rewards/accuracies": 0.75, "rewards/chosen": -12.383687973022461, "rewards/margins": 3.0654807090759277, "rewards/rejected": -15.449169158935547, "step": 533 }, { "epoch": 0.0727124183006536, "grad_norm": 45.97165400763556, "learning_rate": 5.812244897959184e-07, "logits/chosen": -3.1713709831237793, "logits/rejected": -3.1586499214172363, "logps/chosen": -0.9073365926742554, "logps/rejected": -1.1602027416229248, "loss": 4.6098, "rewards/accuracies": 0.75, "rewards/chosen": -9.073366165161133, "rewards/margins": 2.52866268157959, "rewards/rejected": -11.602027893066406, "step": 534 }, { "epoch": 0.07284858387799564, "grad_norm": 44.97305253913826, "learning_rate": 5.82312925170068e-07, "logits/chosen": -3.080355405807495, "logits/rejected": -2.0168638229370117, "logps/chosen": -1.0038633346557617, "logps/rejected": -1.5835446119308472, "loss": 4.3305, "rewards/accuracies": 0.75, "rewards/chosen": -10.038633346557617, "rewards/margins": 5.796812057495117, "rewards/rejected": -15.835445404052734, "step": 535 }, { "epoch": 0.07298474945533769, "grad_norm": 47.22996755317874, "learning_rate": 5.834013605442177e-07, "logits/chosen": -2.9205572605133057, "logits/rejected": -2.1790900230407715, "logps/chosen": -1.1314074993133545, "logps/rejected": -1.3616652488708496, "loss": 4.8553, "rewards/accuracies": 0.75, "rewards/chosen": -11.314075469970703, "rewards/margins": 2.3025777339935303, "rewards/rejected": -13.616652488708496, "step": 536 }, { "epoch": 0.07312091503267974, "grad_norm": 40.352035081411785, "learning_rate": 5.844897959183673e-07, "logits/chosen": -2.6462063789367676, "logits/rejected": -1.6492867469787598, "logps/chosen": -1.0278992652893066, "logps/rejected": -1.1781549453735352, "loss": 4.5672, "rewards/accuracies": 0.75, "rewards/chosen": -10.27899169921875, "rewards/margins": 1.5025584697723389, "rewards/rejected": -11.781550407409668, "step": 537 }, { "epoch": 0.07325708061002179, "grad_norm": 49.31922539447483, "learning_rate": 5.85578231292517e-07, "logits/chosen": -1.876529574394226, "logits/rejected": -1.8933525085449219, "logps/chosen": -0.904739499092102, "logps/rejected": -1.2518165111541748, "loss": 4.5193, "rewards/accuracies": 1.0, "rewards/chosen": -9.047395706176758, "rewards/margins": 3.470768928527832, "rewards/rejected": -12.51816463470459, "step": 538 }, { "epoch": 0.07339324618736383, "grad_norm": 43.6996347121619, "learning_rate": 5.866666666666666e-07, "logits/chosen": -3.69486665725708, "logits/rejected": -3.232206106185913, "logps/chosen": -1.1310230493545532, "logps/rejected": -1.4437167644500732, "loss": 4.714, "rewards/accuracies": 1.0, "rewards/chosen": -11.310230255126953, "rewards/margins": 3.1269376277923584, "rewards/rejected": -14.43716812133789, "step": 539 }, { "epoch": 0.07352941176470588, "grad_norm": 59.79551736918519, "learning_rate": 5.877551020408163e-07, "logits/chosen": -2.5714402198791504, "logits/rejected": -3.1539134979248047, "logps/chosen": -1.3617265224456787, "logps/rejected": -1.1483540534973145, "loss": 5.1944, "rewards/accuracies": 0.25, "rewards/chosen": -13.617264747619629, "rewards/margins": -2.133725166320801, "rewards/rejected": -11.483539581298828, "step": 540 }, { "epoch": 0.07366557734204793, "grad_norm": 43.45499362321002, "learning_rate": 5.888435374149659e-07, "logits/chosen": -2.391265630722046, "logits/rejected": -2.4516372680664062, "logps/chosen": -1.2022141218185425, "logps/rejected": -1.1716055870056152, "loss": 4.4594, "rewards/accuracies": 0.25, "rewards/chosen": -12.022141456604004, "rewards/margins": -0.30608558654785156, "rewards/rejected": -11.716055870056152, "step": 541 }, { "epoch": 0.07380174291938998, "grad_norm": 43.00770363655113, "learning_rate": 5.899319727891156e-07, "logits/chosen": -2.4321303367614746, "logits/rejected": -1.5889924764633179, "logps/chosen": -1.0465128421783447, "logps/rejected": -1.282304286956787, "loss": 4.2964, "rewards/accuracies": 0.75, "rewards/chosen": -10.465129852294922, "rewards/margins": 2.357914447784424, "rewards/rejected": -12.823043823242188, "step": 542 }, { "epoch": 0.07393790849673203, "grad_norm": 42.23491645291864, "learning_rate": 5.910204081632652e-07, "logits/chosen": -2.280940532684326, "logits/rejected": -2.4187138080596924, "logps/chosen": -0.9433045387268066, "logps/rejected": -0.9905986189842224, "loss": 4.764, "rewards/accuracies": 0.5, "rewards/chosen": -9.433045387268066, "rewards/margins": 0.4729405641555786, "rewards/rejected": -9.905985832214355, "step": 543 }, { "epoch": 0.07407407407407407, "grad_norm": 48.614783948478504, "learning_rate": 5.921088435374149e-07, "logits/chosen": -2.1121151447296143, "logits/rejected": -3.0325875282287598, "logps/chosen": -1.6516271829605103, "logps/rejected": -1.1242382526397705, "loss": 4.2102, "rewards/accuracies": 0.25, "rewards/chosen": -16.516273498535156, "rewards/margins": -5.273890495300293, "rewards/rejected": -11.24238109588623, "step": 544 }, { "epoch": 0.07421023965141613, "grad_norm": 72.39186338916265, "learning_rate": 5.931972789115646e-07, "logits/chosen": -3.5062475204467773, "logits/rejected": -2.5739920139312744, "logps/chosen": -0.9131498336791992, "logps/rejected": -1.052865982055664, "loss": 4.8164, "rewards/accuracies": 0.5, "rewards/chosen": -9.131498336791992, "rewards/margins": 1.3971623182296753, "rewards/rejected": -10.528661727905273, "step": 545 }, { "epoch": 0.07434640522875817, "grad_norm": 53.49290009904152, "learning_rate": 5.942857142857143e-07, "logits/chosen": -2.6550002098083496, "logits/rejected": -2.224186897277832, "logps/chosen": -1.3134297132492065, "logps/rejected": -1.4414632320404053, "loss": 4.7498, "rewards/accuracies": 0.5, "rewards/chosen": -13.134296417236328, "rewards/margins": 1.2803348302841187, "rewards/rejected": -14.414631843566895, "step": 546 }, { "epoch": 0.07448257080610021, "grad_norm": 53.22831664342922, "learning_rate": 5.95374149659864e-07, "logits/chosen": -1.2901287078857422, "logits/rejected": -2.2904341220855713, "logps/chosen": -1.6091017723083496, "logps/rejected": -1.909346103668213, "loss": 5.1644, "rewards/accuracies": 0.5, "rewards/chosen": -16.091018676757812, "rewards/margins": 3.0024428367614746, "rewards/rejected": -19.093461990356445, "step": 547 }, { "epoch": 0.07461873638344227, "grad_norm": 42.29362065131642, "learning_rate": 5.964625850340136e-07, "logits/chosen": -3.273163080215454, "logits/rejected": -1.853790283203125, "logps/chosen": -0.8754934072494507, "logps/rejected": -1.2393321990966797, "loss": 4.7809, "rewards/accuracies": 1.0, "rewards/chosen": -8.754934310913086, "rewards/margins": 3.6383872032165527, "rewards/rejected": -12.393321990966797, "step": 548 }, { "epoch": 0.07475490196078431, "grad_norm": 41.726413389780184, "learning_rate": 5.975510204081633e-07, "logits/chosen": -3.261287212371826, "logits/rejected": -2.034339189529419, "logps/chosen": -1.0834790468215942, "logps/rejected": -1.6411304473876953, "loss": 4.368, "rewards/accuracies": 0.5, "rewards/chosen": -10.834790229797363, "rewards/margins": 5.576514720916748, "rewards/rejected": -16.411304473876953, "step": 549 }, { "epoch": 0.07489106753812637, "grad_norm": 48.806022702547764, "learning_rate": 5.986394557823129e-07, "logits/chosen": -3.3904733657836914, "logits/rejected": -2.7350518703460693, "logps/chosen": -1.1337281465530396, "logps/rejected": -1.2400124073028564, "loss": 4.121, "rewards/accuracies": 0.25, "rewards/chosen": -11.337282180786133, "rewards/margins": 1.0628423690795898, "rewards/rejected": -12.400124549865723, "step": 550 }, { "epoch": 0.07502723311546841, "grad_norm": 48.866394233926556, "learning_rate": 5.997278911564626e-07, "logits/chosen": -3.9995265007019043, "logits/rejected": -4.3908491134643555, "logps/chosen": -0.9303375482559204, "logps/rejected": -1.1935251951217651, "loss": 4.8463, "rewards/accuracies": 0.5, "rewards/chosen": -9.303375244140625, "rewards/margins": 2.6318764686584473, "rewards/rejected": -11.935251235961914, "step": 551 }, { "epoch": 0.07516339869281045, "grad_norm": 49.79440147480309, "learning_rate": 6.008163265306122e-07, "logits/chosen": -3.5048816204071045, "logits/rejected": -2.3747317790985107, "logps/chosen": -0.9576873183250427, "logps/rejected": -1.0743203163146973, "loss": 4.8257, "rewards/accuracies": 0.5, "rewards/chosen": -9.576873779296875, "rewards/margins": 1.166330099105835, "rewards/rejected": -10.743204116821289, "step": 552 }, { "epoch": 0.07529956427015251, "grad_norm": 47.82685069854109, "learning_rate": 6.019047619047619e-07, "logits/chosen": -2.6381888389587402, "logits/rejected": -2.448133945465088, "logps/chosen": -1.0726618766784668, "logps/rejected": -1.1739044189453125, "loss": 4.6313, "rewards/accuracies": 0.5, "rewards/chosen": -10.726617813110352, "rewards/margins": 1.012425422668457, "rewards/rejected": -11.739044189453125, "step": 553 }, { "epoch": 0.07543572984749455, "grad_norm": 42.72266035138078, "learning_rate": 6.029931972789115e-07, "logits/chosen": -1.9112746715545654, "logits/rejected": -1.5995330810546875, "logps/chosen": -1.121203064918518, "logps/rejected": -1.1550216674804688, "loss": 3.9763, "rewards/accuracies": 0.5, "rewards/chosen": -11.212030410766602, "rewards/margins": 0.3381870985031128, "rewards/rejected": -11.550216674804688, "step": 554 }, { "epoch": 0.0755718954248366, "grad_norm": 45.58326007219861, "learning_rate": 6.040816326530612e-07, "logits/chosen": -2.1588876247406006, "logits/rejected": -1.8733437061309814, "logps/chosen": -1.071306586265564, "logps/rejected": -1.0362670421600342, "loss": 4.8055, "rewards/accuracies": 0.25, "rewards/chosen": -10.713066101074219, "rewards/margins": -0.3503960371017456, "rewards/rejected": -10.3626708984375, "step": 555 }, { "epoch": 0.07570806100217865, "grad_norm": 43.26670847463388, "learning_rate": 6.051700680272109e-07, "logits/chosen": -3.659604549407959, "logits/rejected": -3.186384439468384, "logps/chosen": -0.9348289966583252, "logps/rejected": -1.136042833328247, "loss": 4.5073, "rewards/accuracies": 0.75, "rewards/chosen": -9.348289489746094, "rewards/margins": 2.0121378898620605, "rewards/rejected": -11.360427856445312, "step": 556 }, { "epoch": 0.0758442265795207, "grad_norm": 44.432622980073816, "learning_rate": 6.062585034013606e-07, "logits/chosen": -1.6850087642669678, "logits/rejected": -2.7763800621032715, "logps/chosen": -1.629197120666504, "logps/rejected": -1.0491318702697754, "loss": 4.0865, "rewards/accuracies": 0.25, "rewards/chosen": -16.291969299316406, "rewards/margins": -5.8006510734558105, "rewards/rejected": -10.491318702697754, "step": 557 }, { "epoch": 0.07598039215686274, "grad_norm": 43.265031629175354, "learning_rate": 6.073469387755101e-07, "logits/chosen": -4.587276935577393, "logits/rejected": -2.6791720390319824, "logps/chosen": -1.0295172929763794, "logps/rejected": -1.3791284561157227, "loss": 4.4328, "rewards/accuracies": 0.75, "rewards/chosen": -10.295172691345215, "rewards/margins": 3.4961116313934326, "rewards/rejected": -13.791284561157227, "step": 558 }, { "epoch": 0.0761165577342048, "grad_norm": 44.089275496266545, "learning_rate": 6.084353741496598e-07, "logits/chosen": -1.9902973175048828, "logits/rejected": -3.7541661262512207, "logps/chosen": -0.9154926538467407, "logps/rejected": -0.9361611008644104, "loss": 4.6754, "rewards/accuracies": 0.5, "rewards/chosen": -9.154926300048828, "rewards/margins": 0.20668423175811768, "rewards/rejected": -9.361610412597656, "step": 559 }, { "epoch": 0.07625272331154684, "grad_norm": 56.37478528928272, "learning_rate": 6.095238095238094e-07, "logits/chosen": -3.3105673789978027, "logits/rejected": -2.01082444190979, "logps/chosen": -1.0109097957611084, "logps/rejected": -1.4130735397338867, "loss": 4.6345, "rewards/accuracies": 0.75, "rewards/chosen": -10.10909652709961, "rewards/margins": 4.021637916564941, "rewards/rejected": -14.130735397338867, "step": 560 }, { "epoch": 0.0763888888888889, "grad_norm": 43.324909496030955, "learning_rate": 6.106122448979592e-07, "logits/chosen": -1.7329590320587158, "logits/rejected": -2.520401954650879, "logps/chosen": -1.274024248123169, "logps/rejected": -1.1403762102127075, "loss": 4.9596, "rewards/accuracies": 0.25, "rewards/chosen": -12.740242004394531, "rewards/margins": -1.3364806175231934, "rewards/rejected": -11.40376091003418, "step": 561 }, { "epoch": 0.07652505446623094, "grad_norm": 38.92199735157995, "learning_rate": 6.117006802721088e-07, "logits/chosen": -2.1596486568450928, "logits/rejected": -3.9795422554016113, "logps/chosen": -1.1261370182037354, "logps/rejected": -0.8986576795578003, "loss": 4.2819, "rewards/accuracies": 0.25, "rewards/chosen": -11.261369705200195, "rewards/margins": -2.2747929096221924, "rewards/rejected": -8.986577033996582, "step": 562 }, { "epoch": 0.07666122004357298, "grad_norm": 47.15287974354056, "learning_rate": 6.127891156462585e-07, "logits/chosen": -2.7447500228881836, "logits/rejected": -3.341521739959717, "logps/chosen": -0.9428697824478149, "logps/rejected": -1.1998918056488037, "loss": 4.6211, "rewards/accuracies": 0.75, "rewards/chosen": -9.42869758605957, "rewards/margins": 2.5702202320098877, "rewards/rejected": -11.998918533325195, "step": 563 }, { "epoch": 0.07679738562091504, "grad_norm": 41.19933967224391, "learning_rate": 6.138775510204081e-07, "logits/chosen": -3.7809364795684814, "logits/rejected": -2.886810302734375, "logps/chosen": -0.8676085472106934, "logps/rejected": -0.9626082181930542, "loss": 4.0796, "rewards/accuracies": 0.5, "rewards/chosen": -8.676085472106934, "rewards/margins": 0.9499971866607666, "rewards/rejected": -9.626083374023438, "step": 564 }, { "epoch": 0.07693355119825708, "grad_norm": 45.02583646130901, "learning_rate": 6.149659863945578e-07, "logits/chosen": -1.7909882068634033, "logits/rejected": -1.2828783988952637, "logps/chosen": -1.2743408679962158, "logps/rejected": -1.4393922090530396, "loss": 4.7076, "rewards/accuracies": 0.75, "rewards/chosen": -12.743409156799316, "rewards/margins": 1.6505131721496582, "rewards/rejected": -14.393922805786133, "step": 565 }, { "epoch": 0.07706971677559912, "grad_norm": 51.861948601224206, "learning_rate": 6.160544217687075e-07, "logits/chosen": -2.8729019165039062, "logits/rejected": -1.7449029684066772, "logps/chosen": -1.0965144634246826, "logps/rejected": -1.4338178634643555, "loss": 4.7573, "rewards/accuracies": 0.75, "rewards/chosen": -10.965144157409668, "rewards/margins": 3.373035192489624, "rewards/rejected": -14.338178634643555, "step": 566 }, { "epoch": 0.07720588235294118, "grad_norm": 43.90607491874017, "learning_rate": 6.171428571428571e-07, "logits/chosen": -2.2494189739227295, "logits/rejected": -2.3272876739501953, "logps/chosen": -0.9908394813537598, "logps/rejected": -1.3577537536621094, "loss": 4.21, "rewards/accuracies": 0.75, "rewards/chosen": -9.908393859863281, "rewards/margins": 3.669142007827759, "rewards/rejected": -13.577536582946777, "step": 567 }, { "epoch": 0.07734204793028322, "grad_norm": 38.60968739530749, "learning_rate": 6.182312925170068e-07, "logits/chosen": -2.495932102203369, "logits/rejected": -2.8028132915496826, "logps/chosen": -1.5257453918457031, "logps/rejected": -1.2761623859405518, "loss": 4.9291, "rewards/accuracies": 0.5, "rewards/chosen": -15.257452964782715, "rewards/margins": -2.4958300590515137, "rewards/rejected": -12.76162338256836, "step": 568 }, { "epoch": 0.07747821350762528, "grad_norm": 44.76392899101536, "learning_rate": 6.193197278911564e-07, "logits/chosen": -1.7351999282836914, "logits/rejected": -2.1211867332458496, "logps/chosen": -0.9518553614616394, "logps/rejected": -1.4849989414215088, "loss": 4.0753, "rewards/accuracies": 0.75, "rewards/chosen": -9.518553733825684, "rewards/margins": 5.331435203552246, "rewards/rejected": -14.84998893737793, "step": 569 }, { "epoch": 0.07761437908496732, "grad_norm": 49.24750840084276, "learning_rate": 6.204081632653061e-07, "logits/chosen": -2.2526161670684814, "logits/rejected": -2.737415075302124, "logps/chosen": -0.944683313369751, "logps/rejected": -1.511658787727356, "loss": 4.6694, "rewards/accuracies": 0.5, "rewards/chosen": -9.446832656860352, "rewards/margins": 5.6697540283203125, "rewards/rejected": -15.11658763885498, "step": 570 }, { "epoch": 0.07775054466230936, "grad_norm": 47.64854557168102, "learning_rate": 6.214965986394557e-07, "logits/chosen": -2.4334957599639893, "logits/rejected": -2.3210089206695557, "logps/chosen": -1.1833598613739014, "logps/rejected": -1.1799216270446777, "loss": 4.7346, "rewards/accuracies": 0.5, "rewards/chosen": -11.833599090576172, "rewards/margins": -0.03438234329223633, "rewards/rejected": -11.799216270446777, "step": 571 }, { "epoch": 0.07788671023965142, "grad_norm": 46.95504058136107, "learning_rate": 6.225850340136055e-07, "logits/chosen": -3.913431167602539, "logits/rejected": -2.710848808288574, "logps/chosen": -0.8266744613647461, "logps/rejected": -0.9752511978149414, "loss": 4.1318, "rewards/accuracies": 0.5, "rewards/chosen": -8.266744613647461, "rewards/margins": 1.4857673645019531, "rewards/rejected": -9.752511978149414, "step": 572 }, { "epoch": 0.07802287581699346, "grad_norm": 44.25229302236887, "learning_rate": 6.236734693877551e-07, "logits/chosen": -2.158151626586914, "logits/rejected": -2.300163984298706, "logps/chosen": -1.1931536197662354, "logps/rejected": -1.120074987411499, "loss": 4.9073, "rewards/accuracies": 0.25, "rewards/chosen": -11.931535720825195, "rewards/margins": -0.7307850122451782, "rewards/rejected": -11.200750350952148, "step": 573 }, { "epoch": 0.07815904139433551, "grad_norm": 46.26762758914474, "learning_rate": 6.247619047619048e-07, "logits/chosen": -2.0157198905944824, "logits/rejected": -1.9624749422073364, "logps/chosen": -1.2574255466461182, "logps/rejected": -1.5994433164596558, "loss": 4.6701, "rewards/accuracies": 1.0, "rewards/chosen": -12.574254989624023, "rewards/margins": 3.4201760292053223, "rewards/rejected": -15.99443244934082, "step": 574 }, { "epoch": 0.07829520697167756, "grad_norm": 41.549840923414195, "learning_rate": 6.258503401360544e-07, "logits/chosen": -2.1139156818389893, "logits/rejected": -1.3261111974716187, "logps/chosen": -1.5256156921386719, "logps/rejected": -1.4514455795288086, "loss": 4.2793, "rewards/accuracies": 0.5, "rewards/chosen": -15.256157875061035, "rewards/margins": -0.7417013645172119, "rewards/rejected": -14.514455795288086, "step": 575 }, { "epoch": 0.0784313725490196, "grad_norm": 47.898580359267235, "learning_rate": 6.26938775510204e-07, "logits/chosen": -3.3864259719848633, "logits/rejected": -2.2437126636505127, "logps/chosen": -1.1890993118286133, "logps/rejected": -1.5027236938476562, "loss": 4.1718, "rewards/accuracies": 0.75, "rewards/chosen": -11.89099407196045, "rewards/margins": 3.1362433433532715, "rewards/rejected": -15.027237892150879, "step": 576 }, { "epoch": 0.07856753812636165, "grad_norm": 49.26718081190389, "learning_rate": 6.280272108843537e-07, "logits/chosen": -2.1808981895446777, "logits/rejected": -2.3677072525024414, "logps/chosen": -1.836155652999878, "logps/rejected": -1.3988218307495117, "loss": 5.0321, "rewards/accuracies": 0.5, "rewards/chosen": -18.361557006835938, "rewards/margins": -4.373337745666504, "rewards/rejected": -13.988218307495117, "step": 577 }, { "epoch": 0.0787037037037037, "grad_norm": 47.77213206827258, "learning_rate": 6.291156462585034e-07, "logits/chosen": -2.8635387420654297, "logits/rejected": -2.7442731857299805, "logps/chosen": -1.0204731225967407, "logps/rejected": -1.0178407430648804, "loss": 4.4259, "rewards/accuracies": 0.5, "rewards/chosen": -10.204730987548828, "rewards/margins": -0.026323795318603516, "rewards/rejected": -10.178407669067383, "step": 578 }, { "epoch": 0.07883986928104575, "grad_norm": 52.04718888679204, "learning_rate": 6.30204081632653e-07, "logits/chosen": -3.5573525428771973, "logits/rejected": -1.1939268112182617, "logps/chosen": -1.4744977951049805, "logps/rejected": -1.7643961906433105, "loss": 4.5295, "rewards/accuracies": 0.75, "rewards/chosen": -14.744976997375488, "rewards/margins": 2.898984909057617, "rewards/rejected": -17.64396095275879, "step": 579 }, { "epoch": 0.0789760348583878, "grad_norm": 58.555431617162306, "learning_rate": 6.312925170068027e-07, "logits/chosen": -1.8089094161987305, "logits/rejected": -2.8016138076782227, "logps/chosen": -1.368967890739441, "logps/rejected": -1.249255895614624, "loss": 5.1639, "rewards/accuracies": 0.25, "rewards/chosen": -13.689678192138672, "rewards/margins": -1.197120189666748, "rewards/rejected": -12.492559432983398, "step": 580 }, { "epoch": 0.07911220043572985, "grad_norm": 38.358431878376386, "learning_rate": 6.323809523809523e-07, "logits/chosen": -1.9675347805023193, "logits/rejected": -2.484963893890381, "logps/chosen": -1.08917236328125, "logps/rejected": -1.1390619277954102, "loss": 4.8824, "rewards/accuracies": 0.75, "rewards/chosen": -10.891722679138184, "rewards/margins": 0.4988975524902344, "rewards/rejected": -11.390620231628418, "step": 581 }, { "epoch": 0.07924836601307189, "grad_norm": 44.55500963867729, "learning_rate": 6.33469387755102e-07, "logits/chosen": -2.9388372898101807, "logits/rejected": -3.0146048069000244, "logps/chosen": -1.2043007612228394, "logps/rejected": -1.4577360153198242, "loss": 5.0832, "rewards/accuracies": 0.75, "rewards/chosen": -12.043006896972656, "rewards/margins": 2.534353256225586, "rewards/rejected": -14.577360153198242, "step": 582 }, { "epoch": 0.07938453159041395, "grad_norm": 45.70782815332849, "learning_rate": 6.345578231292518e-07, "logits/chosen": -3.199193000793457, "logits/rejected": -2.526639699935913, "logps/chosen": -1.025562047958374, "logps/rejected": -1.135059118270874, "loss": 4.4536, "rewards/accuracies": 0.5, "rewards/chosen": -10.255620956420898, "rewards/margins": 1.0949699878692627, "rewards/rejected": -11.350591659545898, "step": 583 }, { "epoch": 0.07952069716775599, "grad_norm": 44.656121661074394, "learning_rate": 6.356462585034013e-07, "logits/chosen": -3.217808723449707, "logits/rejected": -2.5175414085388184, "logps/chosen": -1.358670949935913, "logps/rejected": -1.632150411605835, "loss": 4.0938, "rewards/accuracies": 0.75, "rewards/chosen": -13.586708068847656, "rewards/margins": 2.7347943782806396, "rewards/rejected": -16.321502685546875, "step": 584 }, { "epoch": 0.07965686274509803, "grad_norm": 44.71447093222853, "learning_rate": 6.36734693877551e-07, "logits/chosen": -2.9487404823303223, "logits/rejected": -3.7863709926605225, "logps/chosen": -1.0904741287231445, "logps/rejected": -1.1545336246490479, "loss": 4.3006, "rewards/accuracies": 0.5, "rewards/chosen": -10.904741287231445, "rewards/margins": 0.640594482421875, "rewards/rejected": -11.54533576965332, "step": 585 }, { "epoch": 0.07979302832244009, "grad_norm": 47.53885163224002, "learning_rate": 6.378231292517006e-07, "logits/chosen": -2.972482919692993, "logits/rejected": -2.6537866592407227, "logps/chosen": -1.2612535953521729, "logps/rejected": -1.3217425346374512, "loss": 4.7153, "rewards/accuracies": 0.75, "rewards/chosen": -12.61253547668457, "rewards/margins": 0.6048893928527832, "rewards/rejected": -13.217424392700195, "step": 586 }, { "epoch": 0.07992919389978213, "grad_norm": 50.1921557399397, "learning_rate": 6.389115646258503e-07, "logits/chosen": -3.9929189682006836, "logits/rejected": -3.3344078063964844, "logps/chosen": -0.838044285774231, "logps/rejected": -0.9100294709205627, "loss": 4.1318, "rewards/accuracies": 0.75, "rewards/chosen": -8.38044261932373, "rewards/margins": 0.7198518514633179, "rewards/rejected": -9.10029411315918, "step": 587 }, { "epoch": 0.08006535947712418, "grad_norm": 52.46648681788607, "learning_rate": 6.4e-07, "logits/chosen": -3.163931131362915, "logits/rejected": -2.065485715866089, "logps/chosen": -1.2787175178527832, "logps/rejected": -1.3588215112686157, "loss": 4.5739, "rewards/accuracies": 0.75, "rewards/chosen": -12.787176132202148, "rewards/margins": 0.8010390996932983, "rewards/rejected": -13.588214874267578, "step": 588 }, { "epoch": 0.08020152505446623, "grad_norm": 51.76556780347119, "learning_rate": 6.410884353741497e-07, "logits/chosen": -2.660893440246582, "logits/rejected": -2.2553257942199707, "logps/chosen": -1.3350601196289062, "logps/rejected": -1.3890564441680908, "loss": 4.9354, "rewards/accuracies": 0.5, "rewards/chosen": -13.350601196289062, "rewards/margins": 0.5399630069732666, "rewards/rejected": -13.89056396484375, "step": 589 }, { "epoch": 0.08033769063180828, "grad_norm": 46.677386663813905, "learning_rate": 6.421768707482993e-07, "logits/chosen": -1.819617509841919, "logits/rejected": -2.1603853702545166, "logps/chosen": -1.150843620300293, "logps/rejected": -1.3328871726989746, "loss": 4.4276, "rewards/accuracies": 0.75, "rewards/chosen": -11.508437156677246, "rewards/margins": 1.8204336166381836, "rewards/rejected": -13.32887077331543, "step": 590 }, { "epoch": 0.08047385620915033, "grad_norm": 43.93789666280034, "learning_rate": 6.43265306122449e-07, "logits/chosen": -1.3214337825775146, "logits/rejected": -0.9992637634277344, "logps/chosen": -1.4156913757324219, "logps/rejected": -1.372721791267395, "loss": 5.0024, "rewards/accuracies": 0.5, "rewards/chosen": -14.156913757324219, "rewards/margins": -0.42969703674316406, "rewards/rejected": -13.727216720581055, "step": 591 }, { "epoch": 0.08061002178649238, "grad_norm": 48.24101475645529, "learning_rate": 6.443537414965986e-07, "logits/chosen": -3.402017116546631, "logits/rejected": -1.672685146331787, "logps/chosen": -0.7914307713508606, "logps/rejected": -1.1708588600158691, "loss": 4.486, "rewards/accuracies": 1.0, "rewards/chosen": -7.914308071136475, "rewards/margins": 3.794281244277954, "rewards/rejected": -11.708589553833008, "step": 592 }, { "epoch": 0.08074618736383442, "grad_norm": 41.08466792142108, "learning_rate": 6.454421768707483e-07, "logits/chosen": -3.398608684539795, "logits/rejected": -2.1733527183532715, "logps/chosen": -0.7917782068252563, "logps/rejected": -1.0878217220306396, "loss": 4.4737, "rewards/accuracies": 0.5, "rewards/chosen": -7.917782783508301, "rewards/margins": 2.9604344367980957, "rewards/rejected": -10.878216743469238, "step": 593 }, { "epoch": 0.08088235294117647, "grad_norm": 43.42051480973539, "learning_rate": 6.465306122448979e-07, "logits/chosen": -3.752169370651245, "logits/rejected": -1.8819544315338135, "logps/chosen": -1.4572197198867798, "logps/rejected": -1.9496443271636963, "loss": 4.2292, "rewards/accuracies": 1.0, "rewards/chosen": -14.572196960449219, "rewards/margins": 4.924245834350586, "rewards/rejected": -19.496442794799805, "step": 594 }, { "epoch": 0.08101851851851852, "grad_norm": 42.68230253287607, "learning_rate": 6.476190476190476e-07, "logits/chosen": -3.5202174186706543, "logits/rejected": -3.0683584213256836, "logps/chosen": -0.9971708059310913, "logps/rejected": -1.2722337245941162, "loss": 4.4184, "rewards/accuracies": 0.5, "rewards/chosen": -9.971708297729492, "rewards/margins": 2.75062894821167, "rewards/rejected": -12.722336769104004, "step": 595 }, { "epoch": 0.08115468409586056, "grad_norm": 44.523482342349645, "learning_rate": 6.487074829931972e-07, "logits/chosen": -3.24003529548645, "logits/rejected": -2.746913433074951, "logps/chosen": -1.2214171886444092, "logps/rejected": -1.248082160949707, "loss": 4.6059, "rewards/accuracies": 0.5, "rewards/chosen": -12.214170455932617, "rewards/margins": 0.266650915145874, "rewards/rejected": -12.48082160949707, "step": 596 }, { "epoch": 0.08129084967320262, "grad_norm": 43.57450195862737, "learning_rate": 6.497959183673469e-07, "logits/chosen": -3.8560643196105957, "logits/rejected": -3.3901658058166504, "logps/chosen": -0.9318897128105164, "logps/rejected": -1.048673152923584, "loss": 4.5376, "rewards/accuracies": 0.75, "rewards/chosen": -9.318897247314453, "rewards/margins": 1.1678344011306763, "rewards/rejected": -10.48673152923584, "step": 597 }, { "epoch": 0.08142701525054466, "grad_norm": 44.60396311755034, "learning_rate": 6.508843537414965e-07, "logits/chosen": -2.798079013824463, "logits/rejected": -1.4954564571380615, "logps/chosen": -1.1066312789916992, "logps/rejected": -1.3064115047454834, "loss": 4.7387, "rewards/accuracies": 0.75, "rewards/chosen": -11.066312789916992, "rewards/margins": 1.997802734375, "rewards/rejected": -13.064115524291992, "step": 598 }, { "epoch": 0.08156318082788672, "grad_norm": 51.93731256539819, "learning_rate": 6.519727891156463e-07, "logits/chosen": -1.6544690132141113, "logits/rejected": -1.780668020248413, "logps/chosen": -1.5816271305084229, "logps/rejected": -1.486692190170288, "loss": 4.5535, "rewards/accuracies": 0.25, "rewards/chosen": -15.816271781921387, "rewards/margins": -0.9493494033813477, "rewards/rejected": -14.866922378540039, "step": 599 }, { "epoch": 0.08169934640522876, "grad_norm": 41.70303014520611, "learning_rate": 6.53061224489796e-07, "logits/chosen": -3.1737027168273926, "logits/rejected": -1.239342212677002, "logps/chosen": -1.1764155626296997, "logps/rejected": -1.2854150533676147, "loss": 4.5559, "rewards/accuracies": 0.5, "rewards/chosen": -11.764154434204102, "rewards/margins": 1.0899956226348877, "rewards/rejected": -12.854150772094727, "step": 600 }, { "epoch": 0.0818355119825708, "grad_norm": 55.75244068911868, "learning_rate": 6.541496598639456e-07, "logits/chosen": -2.6803536415100098, "logits/rejected": -2.564791202545166, "logps/chosen": -0.9335293173789978, "logps/rejected": -1.0475046634674072, "loss": 4.1141, "rewards/accuracies": 0.75, "rewards/chosen": -9.335293769836426, "rewards/margins": 1.1397532224655151, "rewards/rejected": -10.475046157836914, "step": 601 }, { "epoch": 0.08197167755991286, "grad_norm": 46.29840245598669, "learning_rate": 6.552380952380951e-07, "logits/chosen": -3.312419891357422, "logits/rejected": -1.0645296573638916, "logps/chosen": -1.0440183877944946, "logps/rejected": -1.610776424407959, "loss": 4.0352, "rewards/accuracies": 1.0, "rewards/chosen": -10.440183639526367, "rewards/margins": 5.667580604553223, "rewards/rejected": -16.107763290405273, "step": 602 }, { "epoch": 0.0821078431372549, "grad_norm": 65.22828665220507, "learning_rate": 6.563265306122448e-07, "logits/chosen": -3.3894295692443848, "logits/rejected": -1.3778702020645142, "logps/chosen": -1.358232855796814, "logps/rejected": -1.7412601709365845, "loss": 4.4171, "rewards/accuracies": 0.75, "rewards/chosen": -13.582328796386719, "rewards/margins": 3.830273389816284, "rewards/rejected": -17.412601470947266, "step": 603 }, { "epoch": 0.08224400871459694, "grad_norm": 43.744685358370305, "learning_rate": 6.574149659863946e-07, "logits/chosen": -3.222620725631714, "logits/rejected": -3.824906826019287, "logps/chosen": -1.0739129781723022, "logps/rejected": -1.001055121421814, "loss": 4.5834, "rewards/accuracies": 0.5, "rewards/chosen": -10.739130020141602, "rewards/margins": -0.7285788059234619, "rewards/rejected": -10.010551452636719, "step": 604 }, { "epoch": 0.082380174291939, "grad_norm": 39.91073401258962, "learning_rate": 6.585034013605442e-07, "logits/chosen": -0.6425571441650391, "logits/rejected": -1.154647946357727, "logps/chosen": -1.4799199104309082, "logps/rejected": -1.5946712493896484, "loss": 3.975, "rewards/accuracies": 0.5, "rewards/chosen": -14.799200057983398, "rewards/margins": 1.1475119590759277, "rewards/rejected": -15.946712493896484, "step": 605 }, { "epoch": 0.08251633986928104, "grad_norm": 47.68017934983074, "learning_rate": 6.595918367346939e-07, "logits/chosen": -1.2562389373779297, "logits/rejected": -0.5284585952758789, "logps/chosen": -1.4649722576141357, "logps/rejected": -1.6942086219787598, "loss": 3.9768, "rewards/accuracies": 0.75, "rewards/chosen": -14.649723052978516, "rewards/margins": 2.292363405227661, "rewards/rejected": -16.94208526611328, "step": 606 }, { "epoch": 0.08265250544662309, "grad_norm": 71.64038874791503, "learning_rate": 6.606802721088435e-07, "logits/chosen": -2.010120153427124, "logits/rejected": -1.620278000831604, "logps/chosen": -2.126923084259033, "logps/rejected": -1.6744815111160278, "loss": 5.2516, "rewards/accuracies": 0.5, "rewards/chosen": -21.26923179626465, "rewards/margins": -4.524415969848633, "rewards/rejected": -16.744815826416016, "step": 607 }, { "epoch": 0.08278867102396514, "grad_norm": 40.21709771160562, "learning_rate": 6.617687074829932e-07, "logits/chosen": -3.199326515197754, "logits/rejected": -2.153519868850708, "logps/chosen": -0.9460991621017456, "logps/rejected": -1.4973745346069336, "loss": 4.8447, "rewards/accuracies": 1.0, "rewards/chosen": -9.460990905761719, "rewards/margins": 5.512754440307617, "rewards/rejected": -14.973745346069336, "step": 608 }, { "epoch": 0.08292483660130719, "grad_norm": 54.656597852574684, "learning_rate": 6.628571428571428e-07, "logits/chosen": -4.089111804962158, "logits/rejected": -2.3725574016571045, "logps/chosen": -0.9458285570144653, "logps/rejected": -1.6011548042297363, "loss": 4.2798, "rewards/accuracies": 0.75, "rewards/chosen": -9.45828628540039, "rewards/margins": 6.553261756896973, "rewards/rejected": -16.011547088623047, "step": 609 }, { "epoch": 0.08306100217864924, "grad_norm": 41.34706923166125, "learning_rate": 6.639455782312925e-07, "logits/chosen": -1.5226633548736572, "logits/rejected": -2.4405086040496826, "logps/chosen": -1.5521851778030396, "logps/rejected": -1.4588794708251953, "loss": 4.5934, "rewards/accuracies": 0.5, "rewards/chosen": -15.521851539611816, "rewards/margins": -0.9330577850341797, "rewards/rejected": -14.58879280090332, "step": 610 }, { "epoch": 0.08319716775599129, "grad_norm": 42.776184733989865, "learning_rate": 6.650340136054421e-07, "logits/chosen": -1.7152584791183472, "logits/rejected": -3.2261605262756348, "logps/chosen": -1.4794158935546875, "logps/rejected": -1.0178847312927246, "loss": 5.2167, "rewards/accuracies": 0.25, "rewards/chosen": -14.794159889221191, "rewards/margins": -4.615312576293945, "rewards/rejected": -10.178847312927246, "step": 611 }, { "epoch": 0.08333333333333333, "grad_norm": 42.58563976050089, "learning_rate": 6.661224489795918e-07, "logits/chosen": -1.1541788578033447, "logits/rejected": -0.7441282272338867, "logps/chosen": -1.3330185413360596, "logps/rejected": -1.4743231534957886, "loss": 4.5323, "rewards/accuracies": 1.0, "rewards/chosen": -13.330185890197754, "rewards/margins": 1.4130456447601318, "rewards/rejected": -14.743231773376465, "step": 612 }, { "epoch": 0.08346949891067539, "grad_norm": 48.89848855337731, "learning_rate": 6.672108843537414e-07, "logits/chosen": -1.5427765846252441, "logits/rejected": -3.4249117374420166, "logps/chosen": -1.3020484447479248, "logps/rejected": -1.105483055114746, "loss": 4.9366, "rewards/accuracies": 0.5, "rewards/chosen": -13.02048397064209, "rewards/margins": -1.965653896331787, "rewards/rejected": -11.054830551147461, "step": 613 }, { "epoch": 0.08360566448801743, "grad_norm": 48.13864153119173, "learning_rate": 6.682993197278911e-07, "logits/chosen": -2.3720149993896484, "logits/rejected": -0.8136985898017883, "logps/chosen": -1.1791075468063354, "logps/rejected": -1.391645908355713, "loss": 4.5877, "rewards/accuracies": 0.75, "rewards/chosen": -11.791074752807617, "rewards/margins": 2.1253843307495117, "rewards/rejected": -13.916460037231445, "step": 614 }, { "epoch": 0.08374183006535947, "grad_norm": 39.32196689184507, "learning_rate": 6.693877551020408e-07, "logits/chosen": -3.7478904724121094, "logits/rejected": -2.8227896690368652, "logps/chosen": -1.477585792541504, "logps/rejected": -1.9945253133773804, "loss": 4.6647, "rewards/accuracies": 0.75, "rewards/chosen": -14.775857925415039, "rewards/margins": 5.169394493103027, "rewards/rejected": -19.94525146484375, "step": 615 }, { "epoch": 0.08387799564270153, "grad_norm": 35.87164796360243, "learning_rate": 6.704761904761905e-07, "logits/chosen": -1.7957749366760254, "logits/rejected": -2.230074405670166, "logps/chosen": -1.2284047603607178, "logps/rejected": -1.0496501922607422, "loss": 4.3916, "rewards/accuracies": 0.25, "rewards/chosen": -12.284048080444336, "rewards/margins": -1.7875460386276245, "rewards/rejected": -10.496501922607422, "step": 616 }, { "epoch": 0.08401416122004357, "grad_norm": 45.747246543221756, "learning_rate": 6.715646258503401e-07, "logits/chosen": -1.834326148033142, "logits/rejected": -2.3603262901306152, "logps/chosen": -1.2375255823135376, "logps/rejected": -1.3946058750152588, "loss": 4.4806, "rewards/accuracies": 0.5, "rewards/chosen": -12.375255584716797, "rewards/margins": 1.570803165435791, "rewards/rejected": -13.946059226989746, "step": 617 }, { "epoch": 0.08415032679738563, "grad_norm": 66.42991309893881, "learning_rate": 6.726530612244898e-07, "logits/chosen": -3.4441351890563965, "logits/rejected": -4.131948947906494, "logps/chosen": -1.0145514011383057, "logps/rejected": -0.8655078411102295, "loss": 4.7994, "rewards/accuracies": 0.25, "rewards/chosen": -10.145513534545898, "rewards/margins": -1.4904344081878662, "rewards/rejected": -8.655078887939453, "step": 618 }, { "epoch": 0.08428649237472767, "grad_norm": 47.4983893126731, "learning_rate": 6.737414965986393e-07, "logits/chosen": -2.117906332015991, "logits/rejected": -0.7682610750198364, "logps/chosen": -1.102469801902771, "logps/rejected": -1.4337544441223145, "loss": 4.453, "rewards/accuracies": 0.75, "rewards/chosen": -11.024697303771973, "rewards/margins": 3.3128464221954346, "rewards/rejected": -14.337544441223145, "step": 619 }, { "epoch": 0.08442265795206971, "grad_norm": 46.15048200981663, "learning_rate": 6.748299319727891e-07, "logits/chosen": -0.9899232387542725, "logits/rejected": -1.327868103981018, "logps/chosen": -1.3401645421981812, "logps/rejected": -1.2669577598571777, "loss": 5.4372, "rewards/accuracies": 0.25, "rewards/chosen": -13.40164566040039, "rewards/margins": -0.7320680618286133, "rewards/rejected": -12.669577598571777, "step": 620 }, { "epoch": 0.08455882352941177, "grad_norm": 48.46029038130455, "learning_rate": 6.759183673469388e-07, "logits/chosen": -2.699263572692871, "logits/rejected": -2.3598735332489014, "logps/chosen": -1.4129489660263062, "logps/rejected": -1.0753196477890015, "loss": 4.5493, "rewards/accuracies": 0.25, "rewards/chosen": -14.12948989868164, "rewards/margins": -3.3762929439544678, "rewards/rejected": -10.753196716308594, "step": 621 }, { "epoch": 0.08469498910675381, "grad_norm": 54.1937026946401, "learning_rate": 6.770068027210884e-07, "logits/chosen": -1.0762853622436523, "logits/rejected": -1.4107470512390137, "logps/chosen": -1.2827301025390625, "logps/rejected": -1.5352598428726196, "loss": 4.6808, "rewards/accuracies": 0.75, "rewards/chosen": -12.827301025390625, "rewards/margins": 2.5252976417541504, "rewards/rejected": -15.352598190307617, "step": 622 }, { "epoch": 0.08483115468409586, "grad_norm": 44.687663793056906, "learning_rate": 6.780952380952381e-07, "logits/chosen": -2.968900442123413, "logits/rejected": -1.9511373043060303, "logps/chosen": -1.0685451030731201, "logps/rejected": -1.3705370426177979, "loss": 4.0792, "rewards/accuracies": 0.75, "rewards/chosen": -10.685450553894043, "rewards/margins": 3.0199196338653564, "rewards/rejected": -13.70536994934082, "step": 623 }, { "epoch": 0.08496732026143791, "grad_norm": 38.57810155205115, "learning_rate": 6.791836734693877e-07, "logits/chosen": -2.3258557319641113, "logits/rejected": -1.235877513885498, "logps/chosen": -0.9512310028076172, "logps/rejected": -2.0438671112060547, "loss": 4.2014, "rewards/accuracies": 1.0, "rewards/chosen": -9.512310028076172, "rewards/margins": 10.926361083984375, "rewards/rejected": -20.438671112060547, "step": 624 }, { "epoch": 0.08510348583877995, "grad_norm": 49.94512067490684, "learning_rate": 6.802721088435374e-07, "logits/chosen": -0.7289698123931885, "logits/rejected": -0.8181058168411255, "logps/chosen": -1.2990211248397827, "logps/rejected": -1.2978870868682861, "loss": 5.0008, "rewards/accuracies": 0.5, "rewards/chosen": -12.99021053314209, "rewards/margins": -0.011340856552124023, "rewards/rejected": -12.978870391845703, "step": 625 }, { "epoch": 0.085239651416122, "grad_norm": 49.89634669374292, "learning_rate": 6.813605442176871e-07, "logits/chosen": -1.0884077548980713, "logits/rejected": -1.2764735221862793, "logps/chosen": -1.2880243062973022, "logps/rejected": -1.5589125156402588, "loss": 4.5594, "rewards/accuracies": 0.75, "rewards/chosen": -12.880243301391602, "rewards/margins": 2.708881378173828, "rewards/rejected": -15.58912467956543, "step": 626 }, { "epoch": 0.08537581699346405, "grad_norm": 50.08079826504838, "learning_rate": 6.824489795918367e-07, "logits/chosen": -2.806659460067749, "logits/rejected": -2.5479955673217773, "logps/chosen": -0.8170806169509888, "logps/rejected": -1.159200668334961, "loss": 4.2059, "rewards/accuracies": 0.75, "rewards/chosen": -8.170805931091309, "rewards/margins": 3.4212005138397217, "rewards/rejected": -11.59200668334961, "step": 627 }, { "epoch": 0.0855119825708061, "grad_norm": 65.59697175510009, "learning_rate": 6.835374149659863e-07, "logits/chosen": 0.18612173199653625, "logits/rejected": -0.20317226648330688, "logps/chosen": -1.638903260231018, "logps/rejected": -1.476873517036438, "loss": 4.8192, "rewards/accuracies": 0.25, "rewards/chosen": -16.389034271240234, "rewards/margins": -1.620297908782959, "rewards/rejected": -14.7687349319458, "step": 628 }, { "epoch": 0.08564814814814815, "grad_norm": 47.501984634654995, "learning_rate": 6.84625850340136e-07, "logits/chosen": -0.4922381341457367, "logits/rejected": -0.17096449434757233, "logps/chosen": -1.2086262702941895, "logps/rejected": -1.9490710496902466, "loss": 4.3245, "rewards/accuracies": 1.0, "rewards/chosen": -12.086261749267578, "rewards/margins": 7.40444803237915, "rewards/rejected": -19.490711212158203, "step": 629 }, { "epoch": 0.0857843137254902, "grad_norm": 45.406473032229876, "learning_rate": 6.857142857142856e-07, "logits/chosen": -0.4664459824562073, "logits/rejected": -0.23163071274757385, "logps/chosen": -1.5701396465301514, "logps/rejected": -1.7693395614624023, "loss": 4.4195, "rewards/accuracies": 0.75, "rewards/chosen": -15.701395988464355, "rewards/margins": 1.9920005798339844, "rewards/rejected": -17.693397521972656, "step": 630 }, { "epoch": 0.08592047930283224, "grad_norm": 42.15965869755041, "learning_rate": 6.868027210884354e-07, "logits/chosen": -2.0629868507385254, "logits/rejected": -2.38601016998291, "logps/chosen": -1.2306432723999023, "logps/rejected": -1.400719165802002, "loss": 4.2453, "rewards/accuracies": 0.75, "rewards/chosen": -12.306434631347656, "rewards/margins": 1.7007567882537842, "rewards/rejected": -14.007190704345703, "step": 631 }, { "epoch": 0.0860566448801743, "grad_norm": 46.64230547050454, "learning_rate": 6.87891156462585e-07, "logits/chosen": -2.85886287689209, "logits/rejected": -4.367680549621582, "logps/chosen": -1.2726919651031494, "logps/rejected": -0.7012374401092529, "loss": 4.4051, "rewards/accuracies": 0.0, "rewards/chosen": -12.726919174194336, "rewards/margins": -5.714544773101807, "rewards/rejected": -7.0123748779296875, "step": 632 }, { "epoch": 0.08619281045751634, "grad_norm": 39.10195850502057, "learning_rate": 6.889795918367347e-07, "logits/chosen": -1.7505921125411987, "logits/rejected": -1.7901312112808228, "logps/chosen": -1.155066967010498, "logps/rejected": -1.2549517154693604, "loss": 4.4558, "rewards/accuracies": 0.5, "rewards/chosen": -11.55066967010498, "rewards/margins": 0.9988484382629395, "rewards/rejected": -12.549517631530762, "step": 633 }, { "epoch": 0.08632897603485838, "grad_norm": 53.18887935915557, "learning_rate": 6.900680272108843e-07, "logits/chosen": -1.9251489639282227, "logits/rejected": -1.2560462951660156, "logps/chosen": -1.0711355209350586, "logps/rejected": -1.7133415937423706, "loss": 4.4433, "rewards/accuracies": 1.0, "rewards/chosen": -10.711355209350586, "rewards/margins": 6.422060966491699, "rewards/rejected": -17.1334171295166, "step": 634 }, { "epoch": 0.08646514161220044, "grad_norm": 43.49845956334472, "learning_rate": 6.91156462585034e-07, "logits/chosen": -0.03264153003692627, "logits/rejected": -0.47154051065444946, "logps/chosen": -1.5368471145629883, "logps/rejected": -1.8416309356689453, "loss": 4.1901, "rewards/accuracies": 0.5, "rewards/chosen": -15.368471145629883, "rewards/margins": 3.0478382110595703, "rewards/rejected": -18.416309356689453, "step": 635 }, { "epoch": 0.08660130718954248, "grad_norm": 48.45046903731199, "learning_rate": 6.922448979591836e-07, "logits/chosen": -1.6819252967834473, "logits/rejected": -1.3621867895126343, "logps/chosen": -1.3532620668411255, "logps/rejected": -1.4377219676971436, "loss": 4.9706, "rewards/accuracies": 0.5, "rewards/chosen": -13.532620429992676, "rewards/margins": 0.8445987701416016, "rewards/rejected": -14.377219200134277, "step": 636 }, { "epoch": 0.08673747276688454, "grad_norm": 47.43223746076681, "learning_rate": 6.933333333333333e-07, "logits/chosen": -1.9323481321334839, "logits/rejected": -1.747902274131775, "logps/chosen": -1.1981849670410156, "logps/rejected": -1.4899052381515503, "loss": 4.6391, "rewards/accuracies": 0.75, "rewards/chosen": -11.98184871673584, "rewards/margins": 2.917203664779663, "rewards/rejected": -14.899052619934082, "step": 637 }, { "epoch": 0.08687363834422658, "grad_norm": 39.26303020718586, "learning_rate": 6.94421768707483e-07, "logits/chosen": -2.46567964553833, "logits/rejected": -1.5781235694885254, "logps/chosen": -1.151689887046814, "logps/rejected": -1.1769484281539917, "loss": 4.1909, "rewards/accuracies": 0.75, "rewards/chosen": -11.516899108886719, "rewards/margins": 0.2525848150253296, "rewards/rejected": -11.76948356628418, "step": 638 }, { "epoch": 0.08700980392156862, "grad_norm": 43.303207739714935, "learning_rate": 6.955102040816326e-07, "logits/chosen": -2.8542420864105225, "logits/rejected": -2.8627543449401855, "logps/chosen": -0.9697030186653137, "logps/rejected": -0.919002890586853, "loss": 4.4339, "rewards/accuracies": 0.25, "rewards/chosen": -9.697030067443848, "rewards/margins": -0.5070009231567383, "rewards/rejected": -9.19002914428711, "step": 639 }, { "epoch": 0.08714596949891068, "grad_norm": 47.41425909720812, "learning_rate": 6.965986394557823e-07, "logits/chosen": -2.7692649364471436, "logits/rejected": -1.6215736865997314, "logps/chosen": -1.3777852058410645, "logps/rejected": -1.9478729963302612, "loss": 4.3663, "rewards/accuracies": 0.5, "rewards/chosen": -13.777852058410645, "rewards/margins": 5.700878620147705, "rewards/rejected": -19.478729248046875, "step": 640 }, { "epoch": 0.08728213507625272, "grad_norm": 48.91826654778275, "learning_rate": 6.976870748299319e-07, "logits/chosen": -1.765453815460205, "logits/rejected": -0.45665574073791504, "logps/chosen": -1.5376968383789062, "logps/rejected": -1.6747403144836426, "loss": 5.3506, "rewards/accuracies": 0.75, "rewards/chosen": -15.376968383789062, "rewards/margins": 1.3704347610473633, "rewards/rejected": -16.74740219116211, "step": 641 }, { "epoch": 0.08741830065359477, "grad_norm": 40.70931933861876, "learning_rate": 6.987755102040817e-07, "logits/chosen": -1.4545269012451172, "logits/rejected": -1.6968895196914673, "logps/chosen": -1.4915120601654053, "logps/rejected": -1.398620843887329, "loss": 4.1323, "rewards/accuracies": 0.25, "rewards/chosen": -14.915122032165527, "rewards/margins": -0.928912878036499, "rewards/rejected": -13.986207962036133, "step": 642 }, { "epoch": 0.08755446623093682, "grad_norm": 41.81166085472726, "learning_rate": 6.998639455782313e-07, "logits/chosen": -1.4033504724502563, "logits/rejected": -0.8503090739250183, "logps/chosen": -1.3826708793640137, "logps/rejected": -1.171365737915039, "loss": 4.5687, "rewards/accuracies": 0.25, "rewards/chosen": -13.826708793640137, "rewards/margins": -2.1130526065826416, "rewards/rejected": -11.713656425476074, "step": 643 }, { "epoch": 0.08769063180827887, "grad_norm": 46.93254188126062, "learning_rate": 7.00952380952381e-07, "logits/chosen": -3.137544631958008, "logits/rejected": -2.6746675968170166, "logps/chosen": -1.151672601699829, "logps/rejected": -1.074217677116394, "loss": 4.4278, "rewards/accuracies": 0.25, "rewards/chosen": -11.516725540161133, "rewards/margins": -0.7745497226715088, "rewards/rejected": -10.742176055908203, "step": 644 }, { "epoch": 0.08782679738562091, "grad_norm": 41.43692513516883, "learning_rate": 7.020408163265305e-07, "logits/chosen": -1.4805817604064941, "logits/rejected": -2.1193528175354004, "logps/chosen": -1.1210222244262695, "logps/rejected": -1.2383766174316406, "loss": 4.9476, "rewards/accuracies": 0.75, "rewards/chosen": -11.210222244262695, "rewards/margins": 1.173543930053711, "rewards/rejected": -12.383766174316406, "step": 645 }, { "epoch": 0.08796296296296297, "grad_norm": 43.2630244669903, "learning_rate": 7.031292517006802e-07, "logits/chosen": -2.734353542327881, "logits/rejected": -2.295260429382324, "logps/chosen": -0.8614736795425415, "logps/rejected": -1.020436406135559, "loss": 4.4396, "rewards/accuracies": 1.0, "rewards/chosen": -8.614736557006836, "rewards/margins": 1.589627742767334, "rewards/rejected": -10.204364776611328, "step": 646 }, { "epoch": 0.08809912854030501, "grad_norm": 41.284968994083485, "learning_rate": 7.042176870748299e-07, "logits/chosen": 0.5754787921905518, "logits/rejected": -0.6587244272232056, "logps/chosen": -1.6230111122131348, "logps/rejected": -1.932807207107544, "loss": 4.5795, "rewards/accuracies": 0.75, "rewards/chosen": -16.230112075805664, "rewards/margins": 3.097960948944092, "rewards/rejected": -19.32807159423828, "step": 647 }, { "epoch": 0.08823529411764706, "grad_norm": 40.69197391347, "learning_rate": 7.053061224489796e-07, "logits/chosen": -0.978300929069519, "logits/rejected": -2.260051727294922, "logps/chosen": -1.2529600858688354, "logps/rejected": -0.9860125184059143, "loss": 4.8506, "rewards/accuracies": 0.0, "rewards/chosen": -12.529601097106934, "rewards/margins": -2.669475555419922, "rewards/rejected": -9.860125541687012, "step": 648 }, { "epoch": 0.08837145969498911, "grad_norm": 51.685897833056714, "learning_rate": 7.063945578231292e-07, "logits/chosen": -1.2813022136688232, "logits/rejected": -0.09345835447311401, "logps/chosen": -1.5688502788543701, "logps/rejected": -1.4916841983795166, "loss": 4.5248, "rewards/accuracies": 0.5, "rewards/chosen": -15.688502311706543, "rewards/margins": -0.7716600894927979, "rewards/rejected": -14.916842460632324, "step": 649 }, { "epoch": 0.08850762527233115, "grad_norm": 40.83163126059889, "learning_rate": 7.074829931972789e-07, "logits/chosen": -2.5911810398101807, "logits/rejected": -0.4755602478981018, "logps/chosen": -1.469336986541748, "logps/rejected": -1.4867868423461914, "loss": 4.2896, "rewards/accuracies": 0.5, "rewards/chosen": -14.693368911743164, "rewards/margins": 0.17449951171875, "rewards/rejected": -14.867868423461914, "step": 650 }, { "epoch": 0.0886437908496732, "grad_norm": 41.25728452449899, "learning_rate": 7.085714285714285e-07, "logits/chosen": -1.2515945434570312, "logits/rejected": -0.8685569167137146, "logps/chosen": -1.0592890977859497, "logps/rejected": -0.9196414351463318, "loss": 4.7366, "rewards/accuracies": 0.25, "rewards/chosen": -10.592891693115234, "rewards/margins": -1.3964768648147583, "rewards/rejected": -9.19641399383545, "step": 651 }, { "epoch": 0.08877995642701525, "grad_norm": 46.765062263540756, "learning_rate": 7.096598639455783e-07, "logits/chosen": -1.4840779304504395, "logits/rejected": -1.4411932229995728, "logps/chosen": -1.5243773460388184, "logps/rejected": -1.302971363067627, "loss": 4.1497, "rewards/accuracies": 0.25, "rewards/chosen": -15.243772506713867, "rewards/margins": -2.214059829711914, "rewards/rejected": -13.029712677001953, "step": 652 }, { "epoch": 0.08891612200435729, "grad_norm": 50.170660680117955, "learning_rate": 7.107482993197278e-07, "logits/chosen": -2.3905181884765625, "logits/rejected": -1.249734878540039, "logps/chosen": -1.1275136470794678, "logps/rejected": -1.2667349576950073, "loss": 3.7803, "rewards/accuracies": 0.75, "rewards/chosen": -11.275136947631836, "rewards/margins": 1.3922125101089478, "rewards/rejected": -12.667348861694336, "step": 653 }, { "epoch": 0.08905228758169935, "grad_norm": 38.27003785423984, "learning_rate": 7.118367346938775e-07, "logits/chosen": -2.8680341243743896, "logits/rejected": -0.5584972500801086, "logps/chosen": -0.9507551789283752, "logps/rejected": -1.0714600086212158, "loss": 4.2238, "rewards/accuracies": 0.75, "rewards/chosen": -9.507551193237305, "rewards/margins": 1.2070486545562744, "rewards/rejected": -10.714600563049316, "step": 654 }, { "epoch": 0.08918845315904139, "grad_norm": 40.58379829505822, "learning_rate": 7.129251700680271e-07, "logits/chosen": -2.1814892292022705, "logits/rejected": -1.6123849153518677, "logps/chosen": -1.1625759601593018, "logps/rejected": -1.3998236656188965, "loss": 4.7784, "rewards/accuracies": 0.75, "rewards/chosen": -11.625758171081543, "rewards/margins": 2.372478485107422, "rewards/rejected": -13.998237609863281, "step": 655 }, { "epoch": 0.08932461873638345, "grad_norm": 50.59574919711516, "learning_rate": 7.140136054421768e-07, "logits/chosen": -0.82741379737854, "logits/rejected": -1.7154357433319092, "logps/chosen": -1.158742904663086, "logps/rejected": -1.3245470523834229, "loss": 4.43, "rewards/accuracies": 0.75, "rewards/chosen": -11.58742904663086, "rewards/margins": 1.658041000366211, "rewards/rejected": -13.245469093322754, "step": 656 }, { "epoch": 0.08946078431372549, "grad_norm": 46.06946269875832, "learning_rate": 7.151020408163264e-07, "logits/chosen": 0.45199865102767944, "logits/rejected": 1.1383434534072876, "logps/chosen": -1.4263839721679688, "logps/rejected": -1.5267257690429688, "loss": 4.5181, "rewards/accuracies": 0.75, "rewards/chosen": -14.263839721679688, "rewards/margins": 1.0034186840057373, "rewards/rejected": -15.267257690429688, "step": 657 }, { "epoch": 0.08959694989106753, "grad_norm": 51.23360267292443, "learning_rate": 7.161904761904762e-07, "logits/chosen": -2.280205726623535, "logits/rejected": -1.036294937133789, "logps/chosen": -1.1109986305236816, "logps/rejected": -1.6445038318634033, "loss": 4.3957, "rewards/accuracies": 0.75, "rewards/chosen": -11.1099853515625, "rewards/margins": 5.335053443908691, "rewards/rejected": -16.445039749145508, "step": 658 }, { "epoch": 0.08973311546840959, "grad_norm": 47.04321913734578, "learning_rate": 7.172789115646259e-07, "logits/chosen": -1.0761737823486328, "logits/rejected": 0.02260369062423706, "logps/chosen": -1.2378181219100952, "logps/rejected": -1.3720680475234985, "loss": 4.9019, "rewards/accuracies": 0.5, "rewards/chosen": -12.378181457519531, "rewards/margins": 1.3424994945526123, "rewards/rejected": -13.720680236816406, "step": 659 }, { "epoch": 0.08986928104575163, "grad_norm": 48.653879427710365, "learning_rate": 7.183673469387755e-07, "logits/chosen": -2.8847060203552246, "logits/rejected": -1.8982551097869873, "logps/chosen": -1.0577784776687622, "logps/rejected": -1.0660207271575928, "loss": 4.3166, "rewards/accuracies": 0.5, "rewards/chosen": -10.577784538269043, "rewards/margins": 0.08242261409759521, "rewards/rejected": -10.660207748413086, "step": 660 }, { "epoch": 0.09000544662309368, "grad_norm": 48.550154826612776, "learning_rate": 7.194557823129252e-07, "logits/chosen": -2.855888843536377, "logits/rejected": -1.5934865474700928, "logps/chosen": -1.1706957817077637, "logps/rejected": -1.373837947845459, "loss": 4.783, "rewards/accuracies": 0.5, "rewards/chosen": -11.706958770751953, "rewards/margins": 2.0314202308654785, "rewards/rejected": -13.738378524780273, "step": 661 }, { "epoch": 0.09014161220043573, "grad_norm": 58.4026347251471, "learning_rate": 7.205442176870748e-07, "logits/chosen": -0.5893259048461914, "logits/rejected": 0.2952589988708496, "logps/chosen": -1.4463276863098145, "logps/rejected": -1.3339684009552002, "loss": 4.8638, "rewards/accuracies": 0.5, "rewards/chosen": -14.463277816772461, "rewards/margins": -1.123594045639038, "rewards/rejected": -13.339683532714844, "step": 662 }, { "epoch": 0.09027777777777778, "grad_norm": 48.77935968386407, "learning_rate": 7.216326530612245e-07, "logits/chosen": -1.1749669313430786, "logits/rejected": 0.7338775396347046, "logps/chosen": -1.2327874898910522, "logps/rejected": -1.8573276996612549, "loss": 4.1797, "rewards/accuracies": 1.0, "rewards/chosen": -12.327875137329102, "rewards/margins": 6.245402812957764, "rewards/rejected": -18.57327651977539, "step": 663 }, { "epoch": 0.09041394335511982, "grad_norm": 48.534618162986234, "learning_rate": 7.227210884353741e-07, "logits/chosen": -0.9152378439903259, "logits/rejected": -0.8537068963050842, "logps/chosen": -1.5478107929229736, "logps/rejected": -1.5021013021469116, "loss": 4.7771, "rewards/accuracies": 0.5, "rewards/chosen": -15.478107452392578, "rewards/margins": -0.4570949077606201, "rewards/rejected": -15.021012306213379, "step": 664 }, { "epoch": 0.09055010893246188, "grad_norm": 39.54220902092579, "learning_rate": 7.238095238095238e-07, "logits/chosen": -2.9923999309539795, "logits/rejected": -1.2824888229370117, "logps/chosen": -1.0448970794677734, "logps/rejected": -1.5155349969863892, "loss": 4.7409, "rewards/accuracies": 0.75, "rewards/chosen": -10.448970794677734, "rewards/margins": 4.706378936767578, "rewards/rejected": -15.155349731445312, "step": 665 }, { "epoch": 0.09068627450980392, "grad_norm": 42.00791107697011, "learning_rate": 7.248979591836734e-07, "logits/chosen": -0.5962499976158142, "logits/rejected": -0.756334662437439, "logps/chosen": -1.7285890579223633, "logps/rejected": -1.6300365924835205, "loss": 4.5764, "rewards/accuracies": 0.25, "rewards/chosen": -17.285890579223633, "rewards/margins": -0.9855251312255859, "rewards/rejected": -16.300365447998047, "step": 666 }, { "epoch": 0.09082244008714598, "grad_norm": 42.374216078343736, "learning_rate": 7.259863945578231e-07, "logits/chosen": -1.9879686832427979, "logits/rejected": -2.0401968955993652, "logps/chosen": -1.2647435665130615, "logps/rejected": -1.161630392074585, "loss": 4.5837, "rewards/accuracies": 0.0, "rewards/chosen": -12.647435188293457, "rewards/margins": -1.0311315059661865, "rewards/rejected": -11.616303443908691, "step": 667 }, { "epoch": 0.09095860566448802, "grad_norm": 43.58263541288087, "learning_rate": 7.270748299319728e-07, "logits/chosen": -0.04843667149543762, "logits/rejected": 0.580354630947113, "logps/chosen": -1.7613463401794434, "logps/rejected": -1.7951139211654663, "loss": 4.2503, "rewards/accuracies": 0.5, "rewards/chosen": -17.613462448120117, "rewards/margins": 0.3376760482788086, "rewards/rejected": -17.95113754272461, "step": 668 }, { "epoch": 0.09109477124183006, "grad_norm": 55.3168561999338, "learning_rate": 7.281632653061225e-07, "logits/chosen": -0.44252675771713257, "logits/rejected": -0.13974586129188538, "logps/chosen": -1.4606740474700928, "logps/rejected": -1.7403895854949951, "loss": 4.6739, "rewards/accuracies": 1.0, "rewards/chosen": -14.606740951538086, "rewards/margins": 2.7971556186676025, "rewards/rejected": -17.40389633178711, "step": 669 }, { "epoch": 0.09123093681917212, "grad_norm": 48.78267642852759, "learning_rate": 7.29251700680272e-07, "logits/chosen": -0.9513230323791504, "logits/rejected": -0.9863169193267822, "logps/chosen": -1.2756454944610596, "logps/rejected": -1.534219741821289, "loss": 4.5532, "rewards/accuracies": 0.75, "rewards/chosen": -12.756455421447754, "rewards/margins": 2.5857419967651367, "rewards/rejected": -15.34219741821289, "step": 670 }, { "epoch": 0.09136710239651416, "grad_norm": 44.92576167146359, "learning_rate": 7.303401360544217e-07, "logits/chosen": 0.4303511381149292, "logits/rejected": -0.23084700107574463, "logps/chosen": -1.6245568990707397, "logps/rejected": -1.7984977960586548, "loss": 4.6694, "rewards/accuracies": 0.5, "rewards/chosen": -16.245569229125977, "rewards/margins": 1.7394092082977295, "rewards/rejected": -17.98497772216797, "step": 671 }, { "epoch": 0.0915032679738562, "grad_norm": 40.84514456257058, "learning_rate": 7.314285714285713e-07, "logits/chosen": -0.7317081093788147, "logits/rejected": -0.5217989087104797, "logps/chosen": -1.2046802043914795, "logps/rejected": -1.5380363464355469, "loss": 4.7876, "rewards/accuracies": 0.75, "rewards/chosen": -12.046802520751953, "rewards/margins": 3.333559989929199, "rewards/rejected": -15.380362510681152, "step": 672 }, { "epoch": 0.09163943355119826, "grad_norm": 45.221666701637616, "learning_rate": 7.32517006802721e-07, "logits/chosen": -2.714024066925049, "logits/rejected": -1.6860437393188477, "logps/chosen": -0.9929487705230713, "logps/rejected": -1.2894072532653809, "loss": 4.105, "rewards/accuracies": 1.0, "rewards/chosen": -9.929487228393555, "rewards/margins": 2.9645838737487793, "rewards/rejected": -12.894071578979492, "step": 673 }, { "epoch": 0.0917755991285403, "grad_norm": 42.804179794124856, "learning_rate": 7.336054421768707e-07, "logits/chosen": -1.2533427476882935, "logits/rejected": 0.491943359375, "logps/chosen": -1.2432630062103271, "logps/rejected": -1.5159964561462402, "loss": 4.7413, "rewards/accuracies": 0.75, "rewards/chosen": -12.432629585266113, "rewards/margins": 2.7273359298706055, "rewards/rejected": -15.159965515136719, "step": 674 }, { "epoch": 0.09191176470588236, "grad_norm": 42.0543638627975, "learning_rate": 7.346938775510204e-07, "logits/chosen": -0.7690948247909546, "logits/rejected": -0.2596537470817566, "logps/chosen": -1.5548579692840576, "logps/rejected": -1.737626552581787, "loss": 4.4195, "rewards/accuracies": 0.75, "rewards/chosen": -15.548579216003418, "rewards/margins": 1.8276851177215576, "rewards/rejected": -17.376266479492188, "step": 675 }, { "epoch": 0.0920479302832244, "grad_norm": 43.05742689447013, "learning_rate": 7.357823129251701e-07, "logits/chosen": -1.6259522438049316, "logits/rejected": -1.3253235816955566, "logps/chosen": -1.0475354194641113, "logps/rejected": -1.1794662475585938, "loss": 4.861, "rewards/accuracies": 0.5, "rewards/chosen": -10.475353240966797, "rewards/margins": 1.3193087577819824, "rewards/rejected": -11.794662475585938, "step": 676 }, { "epoch": 0.09218409586056645, "grad_norm": 72.90762523266828, "learning_rate": 7.368707482993197e-07, "logits/chosen": -0.9490575194358826, "logits/rejected": -2.504819393157959, "logps/chosen": -1.1682517528533936, "logps/rejected": -0.9658427238464355, "loss": 4.615, "rewards/accuracies": 0.0, "rewards/chosen": -11.682518005371094, "rewards/margins": -2.02409029006958, "rewards/rejected": -9.658427238464355, "step": 677 }, { "epoch": 0.0923202614379085, "grad_norm": 43.85897461588643, "learning_rate": 7.379591836734694e-07, "logits/chosen": 0.05861362814903259, "logits/rejected": 0.08623175323009491, "logps/chosen": -1.541240930557251, "logps/rejected": -1.6925530433654785, "loss": 4.0946, "rewards/accuracies": 0.75, "rewards/chosen": -15.412408828735352, "rewards/margins": 1.5131218433380127, "rewards/rejected": -16.92552947998047, "step": 678 }, { "epoch": 0.09245642701525054, "grad_norm": 37.74205863684378, "learning_rate": 7.39047619047619e-07, "logits/chosen": -1.031054139137268, "logits/rejected": 1.4145876169204712, "logps/chosen": -1.2959513664245605, "logps/rejected": -1.5400187969207764, "loss": 3.9677, "rewards/accuracies": 0.75, "rewards/chosen": -12.959513664245605, "rewards/margins": 2.4406750202178955, "rewards/rejected": -15.400188446044922, "step": 679 }, { "epoch": 0.09259259259259259, "grad_norm": 52.078868973119725, "learning_rate": 7.401360544217687e-07, "logits/chosen": -1.3145700693130493, "logits/rejected": -0.6749060750007629, "logps/chosen": -1.7240591049194336, "logps/rejected": -1.6701761484146118, "loss": 5.2664, "rewards/accuracies": 0.75, "rewards/chosen": -17.240591049194336, "rewards/margins": -0.5388295650482178, "rewards/rejected": -16.70176124572754, "step": 680 }, { "epoch": 0.09272875816993464, "grad_norm": 41.35700396870533, "learning_rate": 7.412244897959183e-07, "logits/chosen": -2.1758508682250977, "logits/rejected": -0.5502135753631592, "logps/chosen": -1.1998624801635742, "logps/rejected": -1.328154444694519, "loss": 4.0955, "rewards/accuracies": 0.5, "rewards/chosen": -11.998624801635742, "rewards/margins": 1.2829186916351318, "rewards/rejected": -13.281543731689453, "step": 681 }, { "epoch": 0.09286492374727669, "grad_norm": 46.34218045236385, "learning_rate": 7.42312925170068e-07, "logits/chosen": -2.4190151691436768, "logits/rejected": -1.8049378395080566, "logps/chosen": -1.000082015991211, "logps/rejected": -1.0164884328842163, "loss": 4.7107, "rewards/accuracies": 0.75, "rewards/chosen": -10.00082015991211, "rewards/margins": 0.16406488418579102, "rewards/rejected": -10.164884567260742, "step": 682 }, { "epoch": 0.09300108932461873, "grad_norm": 47.68272653218517, "learning_rate": 7.434013605442176e-07, "logits/chosen": -0.33062997460365295, "logits/rejected": -0.8949194550514221, "logps/chosen": -1.2932116985321045, "logps/rejected": -1.070622205734253, "loss": 4.0331, "rewards/accuracies": 0.25, "rewards/chosen": -12.932116508483887, "rewards/margins": -2.225893974304199, "rewards/rejected": -10.706222534179688, "step": 683 }, { "epoch": 0.09313725490196079, "grad_norm": 43.18441552698691, "learning_rate": 7.444897959183673e-07, "logits/chosen": -1.2744708061218262, "logits/rejected": -0.030560612678527832, "logps/chosen": -0.9943463802337646, "logps/rejected": -1.5822784900665283, "loss": 4.6468, "rewards/accuracies": 1.0, "rewards/chosen": -9.943464279174805, "rewards/margins": 5.87932014465332, "rewards/rejected": -15.822784423828125, "step": 684 }, { "epoch": 0.09327342047930283, "grad_norm": 47.253714441827945, "learning_rate": 7.45578231292517e-07, "logits/chosen": 0.06063342094421387, "logits/rejected": 0.8169047236442566, "logps/chosen": -1.603643774986267, "logps/rejected": -1.8753246068954468, "loss": 4.8619, "rewards/accuracies": 0.75, "rewards/chosen": -16.03643798828125, "rewards/margins": 2.7168092727661133, "rewards/rejected": -18.753246307373047, "step": 685 }, { "epoch": 0.09340958605664489, "grad_norm": 49.87074850454892, "learning_rate": 7.466666666666667e-07, "logits/chosen": -1.04884934425354, "logits/rejected": -0.5650853514671326, "logps/chosen": -1.600728988647461, "logps/rejected": -1.6812005043029785, "loss": 4.7817, "rewards/accuracies": 0.5, "rewards/chosen": -16.00728988647461, "rewards/margins": 0.8047158718109131, "rewards/rejected": -16.8120059967041, "step": 686 }, { "epoch": 0.09354575163398693, "grad_norm": 42.147213222565384, "learning_rate": 7.477551020408163e-07, "logits/chosen": -1.1583398580551147, "logits/rejected": -0.3938123881816864, "logps/chosen": -1.4318757057189941, "logps/rejected": -1.429311990737915, "loss": 4.3734, "rewards/accuracies": 0.25, "rewards/chosen": -14.318757057189941, "rewards/margins": -0.02563798427581787, "rewards/rejected": -14.293119430541992, "step": 687 }, { "epoch": 0.09368191721132897, "grad_norm": 42.66354735267599, "learning_rate": 7.488435374149659e-07, "logits/chosen": -0.6309993863105774, "logits/rejected": 1.679006814956665, "logps/chosen": -1.359928846359253, "logps/rejected": -1.8345006704330444, "loss": 4.7409, "rewards/accuracies": 1.0, "rewards/chosen": -13.599288940429688, "rewards/margins": 4.745716571807861, "rewards/rejected": -18.34500503540039, "step": 688 }, { "epoch": 0.09381808278867103, "grad_norm": 44.40668935969842, "learning_rate": 7.499319727891155e-07, "logits/chosen": 0.14752139151096344, "logits/rejected": 0.6450062394142151, "logps/chosen": -1.9288934469223022, "logps/rejected": -2.0186924934387207, "loss": 4.5942, "rewards/accuracies": 0.75, "rewards/chosen": -19.28893280029297, "rewards/margins": 0.8979921340942383, "rewards/rejected": -20.186925888061523, "step": 689 }, { "epoch": 0.09395424836601307, "grad_norm": 38.189844024081324, "learning_rate": 7.510204081632653e-07, "logits/chosen": -2.0135703086853027, "logits/rejected": 0.3571273982524872, "logps/chosen": -1.0978765487670898, "logps/rejected": -1.2632924318313599, "loss": 4.463, "rewards/accuracies": 0.75, "rewards/chosen": -10.978765487670898, "rewards/margins": 1.6541599035263062, "rewards/rejected": -12.632925033569336, "step": 690 }, { "epoch": 0.09409041394335511, "grad_norm": 45.59350134315056, "learning_rate": 7.521088435374149e-07, "logits/chosen": 0.11071091145277023, "logits/rejected": -0.06653738021850586, "logps/chosen": -1.4514433145523071, "logps/rejected": -1.6425246000289917, "loss": 4.747, "rewards/accuracies": 0.5, "rewards/chosen": -14.514433860778809, "rewards/margins": 1.9108123779296875, "rewards/rejected": -16.425247192382812, "step": 691 }, { "epoch": 0.09422657952069717, "grad_norm": 41.83839847445653, "learning_rate": 7.531972789115646e-07, "logits/chosen": -1.5491693019866943, "logits/rejected": -1.4817986488342285, "logps/chosen": -1.2849147319793701, "logps/rejected": -1.111653208732605, "loss": 4.6189, "rewards/accuracies": 0.25, "rewards/chosen": -12.84914779663086, "rewards/margins": -1.7326157093048096, "rewards/rejected": -11.116532325744629, "step": 692 }, { "epoch": 0.09436274509803921, "grad_norm": 55.672755709933476, "learning_rate": 7.542857142857142e-07, "logits/chosen": -2.134584903717041, "logits/rejected": -1.4737731218338013, "logps/chosen": -1.1639045476913452, "logps/rejected": -1.3551857471466064, "loss": 4.9761, "rewards/accuracies": 0.75, "rewards/chosen": -11.639045715332031, "rewards/margins": 1.9128122329711914, "rewards/rejected": -13.551857948303223, "step": 693 }, { "epoch": 0.09449891067538127, "grad_norm": 47.366641512040964, "learning_rate": 7.553741496598639e-07, "logits/chosen": -1.5003392696380615, "logits/rejected": -1.180270791053772, "logps/chosen": -1.1718944311141968, "logps/rejected": -1.2431176900863647, "loss": 4.8825, "rewards/accuracies": 0.5, "rewards/chosen": -11.718944549560547, "rewards/margins": 0.7122328281402588, "rewards/rejected": -12.431177139282227, "step": 694 }, { "epoch": 0.09463507625272331, "grad_norm": 43.97797279578993, "learning_rate": 7.564625850340137e-07, "logits/chosen": 0.7685288190841675, "logits/rejected": 0.14603550732135773, "logps/chosen": -1.192243218421936, "logps/rejected": -1.2198100090026855, "loss": 4.4566, "rewards/accuracies": 0.5, "rewards/chosen": -11.922432899475098, "rewards/margins": 0.2756669521331787, "rewards/rejected": -12.198099136352539, "step": 695 }, { "epoch": 0.09477124183006536, "grad_norm": 58.287840041720365, "learning_rate": 7.575510204081632e-07, "logits/chosen": -0.5064850449562073, "logits/rejected": -0.13354772329330444, "logps/chosen": -1.5847620964050293, "logps/rejected": -1.5697126388549805, "loss": 5.2932, "rewards/accuracies": 0.25, "rewards/chosen": -15.847620964050293, "rewards/margins": -0.15049338340759277, "rewards/rejected": -15.697127342224121, "step": 696 }, { "epoch": 0.09490740740740741, "grad_norm": 347.7893054569063, "learning_rate": 7.586394557823129e-07, "logits/chosen": -0.7942438125610352, "logits/rejected": -0.7327542901039124, "logps/chosen": -1.2537939548492432, "logps/rejected": -1.209998369216919, "loss": 5.4353, "rewards/accuracies": 0.5, "rewards/chosen": -12.53794002532959, "rewards/margins": -0.43795621395111084, "rewards/rejected": -12.099983215332031, "step": 697 }, { "epoch": 0.09504357298474946, "grad_norm": 43.10772451159401, "learning_rate": 7.597278911564625e-07, "logits/chosen": -1.030556559562683, "logits/rejected": -1.3950252532958984, "logps/chosen": -1.3147428035736084, "logps/rejected": -1.2890074253082275, "loss": 4.1629, "rewards/accuracies": 0.5, "rewards/chosen": -13.147427558898926, "rewards/margins": -0.2573525905609131, "rewards/rejected": -12.890074729919434, "step": 698 }, { "epoch": 0.0951797385620915, "grad_norm": 56.64001364482097, "learning_rate": 7.608163265306122e-07, "logits/chosen": 0.6216972470283508, "logits/rejected": 1.512524127960205, "logps/chosen": -1.44383704662323, "logps/rejected": -1.606933355331421, "loss": 4.7419, "rewards/accuracies": 0.75, "rewards/chosen": -14.438369750976562, "rewards/margins": 1.63096284866333, "rewards/rejected": -16.069332122802734, "step": 699 }, { "epoch": 0.09531590413943355, "grad_norm": 38.68057681480692, "learning_rate": 7.619047619047618e-07, "logits/chosen": -0.9229204058647156, "logits/rejected": -1.8337852954864502, "logps/chosen": -1.0434694290161133, "logps/rejected": -1.2594497203826904, "loss": 4.0755, "rewards/accuracies": 0.75, "rewards/chosen": -10.434694290161133, "rewards/margins": 2.1598033905029297, "rewards/rejected": -12.594496726989746, "step": 700 }, { "epoch": 0.0954520697167756, "grad_norm": 47.655387265753355, "learning_rate": 7.629931972789116e-07, "logits/chosen": 0.5379080772399902, "logits/rejected": 0.2412363439798355, "logps/chosen": -1.7797297239303589, "logps/rejected": -1.674952507019043, "loss": 4.489, "rewards/accuracies": 0.25, "rewards/chosen": -17.797298431396484, "rewards/margins": -1.0477724075317383, "rewards/rejected": -16.74952507019043, "step": 701 }, { "epoch": 0.09558823529411764, "grad_norm": 44.1408606966101, "learning_rate": 7.640816326530612e-07, "logits/chosen": -0.35111042857170105, "logits/rejected": -0.6477332711219788, "logps/chosen": -1.3295586109161377, "logps/rejected": -1.354958415031433, "loss": 4.0497, "rewards/accuracies": 0.5, "rewards/chosen": -13.295585632324219, "rewards/margins": 0.25399839878082275, "rewards/rejected": -13.54958438873291, "step": 702 }, { "epoch": 0.0957244008714597, "grad_norm": 40.87502478380578, "learning_rate": 7.651700680272109e-07, "logits/chosen": -3.777395248413086, "logits/rejected": -1.1470290422439575, "logps/chosen": -1.120322346687317, "logps/rejected": -1.3320046663284302, "loss": 4.1665, "rewards/accuracies": 0.75, "rewards/chosen": -11.203224182128906, "rewards/margins": 2.1168227195739746, "rewards/rejected": -13.320046424865723, "step": 703 }, { "epoch": 0.09586056644880174, "grad_norm": 47.21168441193202, "learning_rate": 7.662585034013605e-07, "logits/chosen": -0.23166656494140625, "logits/rejected": -0.3906424045562744, "logps/chosen": -1.2068699598312378, "logps/rejected": -1.3883442878723145, "loss": 4.5059, "rewards/accuracies": 0.5, "rewards/chosen": -12.068700790405273, "rewards/margins": 1.814742088317871, "rewards/rejected": -13.883441925048828, "step": 704 }, { "epoch": 0.0959967320261438, "grad_norm": 46.42672815253593, "learning_rate": 7.673469387755102e-07, "logits/chosen": -0.9172815084457397, "logits/rejected": -1.1027836799621582, "logps/chosen": -1.3285033702850342, "logps/rejected": -1.7970869541168213, "loss": 4.5843, "rewards/accuracies": 1.0, "rewards/chosen": -13.2850341796875, "rewards/margins": 4.685833930969238, "rewards/rejected": -17.970869064331055, "step": 705 }, { "epoch": 0.09613289760348584, "grad_norm": 40.97934407973209, "learning_rate": 7.684353741496598e-07, "logits/chosen": -1.729881763458252, "logits/rejected": -0.3515084981918335, "logps/chosen": -1.401078701019287, "logps/rejected": -1.5389922857284546, "loss": 3.797, "rewards/accuracies": 0.5, "rewards/chosen": -14.010787963867188, "rewards/margins": 1.3791353702545166, "rewards/rejected": -15.389923095703125, "step": 706 }, { "epoch": 0.09626906318082788, "grad_norm": 41.767185397072296, "learning_rate": 7.695238095238095e-07, "logits/chosen": -2.299805164337158, "logits/rejected": -0.9581261277198792, "logps/chosen": -0.9740153551101685, "logps/rejected": -1.0846879482269287, "loss": 4.5112, "rewards/accuracies": 0.75, "rewards/chosen": -9.740153312683105, "rewards/margins": 1.1067256927490234, "rewards/rejected": -10.846879005432129, "step": 707 }, { "epoch": 0.09640522875816994, "grad_norm": 42.109904014738355, "learning_rate": 7.706122448979591e-07, "logits/chosen": -1.385081171989441, "logits/rejected": -1.5946451425552368, "logps/chosen": -1.132194995880127, "logps/rejected": -1.2908332347869873, "loss": 4.529, "rewards/accuracies": 0.75, "rewards/chosen": -11.321950912475586, "rewards/margins": 1.586382269859314, "rewards/rejected": -12.908332824707031, "step": 708 }, { "epoch": 0.09654139433551198, "grad_norm": 40.38333341460613, "learning_rate": 7.717006802721088e-07, "logits/chosen": 0.7435685992240906, "logits/rejected": 1.004119873046875, "logps/chosen": -1.397862434387207, "logps/rejected": -1.4029738903045654, "loss": 3.8461, "rewards/accuracies": 0.5, "rewards/chosen": -13.978625297546387, "rewards/margins": 0.05111360549926758, "rewards/rejected": -14.029738426208496, "step": 709 }, { "epoch": 0.09667755991285402, "grad_norm": 51.71017779077159, "learning_rate": 7.727891156462584e-07, "logits/chosen": -1.259523868560791, "logits/rejected": -1.1074053049087524, "logps/chosen": -1.0922147035598755, "logps/rejected": -1.203784704208374, "loss": 4.6965, "rewards/accuracies": 0.5, "rewards/chosen": -10.922147750854492, "rewards/margins": 1.1156994104385376, "rewards/rejected": -12.037845611572266, "step": 710 }, { "epoch": 0.09681372549019608, "grad_norm": 41.32812375977817, "learning_rate": 7.738775510204082e-07, "logits/chosen": 0.4901611804962158, "logits/rejected": 1.1959195137023926, "logps/chosen": -1.33061683177948, "logps/rejected": -1.642816185951233, "loss": 4.6183, "rewards/accuracies": 0.75, "rewards/chosen": -13.306168556213379, "rewards/margins": 3.1219935417175293, "rewards/rejected": -16.42816162109375, "step": 711 }, { "epoch": 0.09694989106753812, "grad_norm": 40.97603519999412, "learning_rate": 7.749659863945579e-07, "logits/chosen": 0.8455914258956909, "logits/rejected": -0.024904191493988037, "logps/chosen": -1.8398375511169434, "logps/rejected": -1.8176478147506714, "loss": 4.761, "rewards/accuracies": 0.5, "rewards/chosen": -18.39837646484375, "rewards/margins": -0.22189736366271973, "rewards/rejected": -18.17647933959961, "step": 712 }, { "epoch": 0.09708605664488018, "grad_norm": 52.30757935189555, "learning_rate": 7.760544217687075e-07, "logits/chosen": -0.833655059337616, "logits/rejected": -1.2728029489517212, "logps/chosen": -1.4459762573242188, "logps/rejected": -1.3748979568481445, "loss": 5.0931, "rewards/accuracies": 0.25, "rewards/chosen": -14.459763526916504, "rewards/margins": -0.7107841968536377, "rewards/rejected": -13.748979568481445, "step": 713 }, { "epoch": 0.09722222222222222, "grad_norm": 42.20639540328432, "learning_rate": 7.771428571428571e-07, "logits/chosen": -1.9750702381134033, "logits/rejected": 0.7031581997871399, "logps/chosen": -0.9978775978088379, "logps/rejected": -1.40965735912323, "loss": 4.3077, "rewards/accuracies": 0.5, "rewards/chosen": -9.978775024414062, "rewards/margins": 4.117798328399658, "rewards/rejected": -14.096573829650879, "step": 714 }, { "epoch": 0.09735838779956427, "grad_norm": 37.97718761930537, "learning_rate": 7.782312925170067e-07, "logits/chosen": 0.2282334566116333, "logits/rejected": 0.215992733836174, "logps/chosen": -2.1538262367248535, "logps/rejected": -2.6747748851776123, "loss": 3.9216, "rewards/accuracies": 1.0, "rewards/chosen": -21.53826141357422, "rewards/margins": 5.209488391876221, "rewards/rejected": -26.74774932861328, "step": 715 }, { "epoch": 0.09749455337690632, "grad_norm": 55.42878582357541, "learning_rate": 7.793197278911564e-07, "logits/chosen": -0.6256116628646851, "logits/rejected": 0.4783474802970886, "logps/chosen": -1.6903719902038574, "logps/rejected": -1.8479328155517578, "loss": 4.5622, "rewards/accuracies": 0.75, "rewards/chosen": -16.90372085571289, "rewards/margins": 1.575608253479004, "rewards/rejected": -18.479328155517578, "step": 716 }, { "epoch": 0.09763071895424837, "grad_norm": 42.04225550008642, "learning_rate": 7.804081632653061e-07, "logits/chosen": 1.4558359384536743, "logits/rejected": 0.25389865040779114, "logps/chosen": -1.819035291671753, "logps/rejected": -2.338430643081665, "loss": 4.2802, "rewards/accuracies": 0.5, "rewards/chosen": -18.190353393554688, "rewards/margins": 5.193953037261963, "rewards/rejected": -23.384307861328125, "step": 717 }, { "epoch": 0.09776688453159041, "grad_norm": 43.70514445806336, "learning_rate": 7.814965986394558e-07, "logits/chosen": 0.8398693799972534, "logits/rejected": 1.1826547384262085, "logps/chosen": -1.7252662181854248, "logps/rejected": -1.8363432884216309, "loss": 4.3858, "rewards/accuracies": 0.75, "rewards/chosen": -17.252662658691406, "rewards/margins": 1.1107711791992188, "rewards/rejected": -18.363433837890625, "step": 718 }, { "epoch": 0.09790305010893247, "grad_norm": 43.45182537956577, "learning_rate": 7.825850340136054e-07, "logits/chosen": 0.910040557384491, "logits/rejected": 2.4864728450775146, "logps/chosen": -1.811880111694336, "logps/rejected": -1.514055609703064, "loss": 4.6364, "rewards/accuracies": 0.5, "rewards/chosen": -18.118799209594727, "rewards/margins": -2.978243827819824, "rewards/rejected": -15.140555381774902, "step": 719 }, { "epoch": 0.09803921568627451, "grad_norm": 36.83145316590201, "learning_rate": 7.836734693877551e-07, "logits/chosen": -0.4177202582359314, "logits/rejected": -0.6483035087585449, "logps/chosen": -1.066969633102417, "logps/rejected": -1.2961382865905762, "loss": 3.8601, "rewards/accuracies": 0.75, "rewards/chosen": -10.669695854187012, "rewards/margins": 2.2916862964630127, "rewards/rejected": -12.961382865905762, "step": 720 }, { "epoch": 0.09817538126361655, "grad_norm": 59.555923917508494, "learning_rate": 7.847619047619047e-07, "logits/chosen": 0.22509491443634033, "logits/rejected": 1.9675440788269043, "logps/chosen": -1.3469066619873047, "logps/rejected": -1.8185796737670898, "loss": 4.8717, "rewards/accuracies": 1.0, "rewards/chosen": -13.469066619873047, "rewards/margins": 4.71673059463501, "rewards/rejected": -18.18579864501953, "step": 721 }, { "epoch": 0.09831154684095861, "grad_norm": 53.11347433843598, "learning_rate": 7.858503401360544e-07, "logits/chosen": -0.4427299201488495, "logits/rejected": 0.5891492366790771, "logps/chosen": -1.3105250597000122, "logps/rejected": -1.413138508796692, "loss": 4.4221, "rewards/accuracies": 0.5, "rewards/chosen": -13.105250358581543, "rewards/margins": 1.0261344909667969, "rewards/rejected": -14.13138484954834, "step": 722 }, { "epoch": 0.09844771241830065, "grad_norm": 41.450645218923064, "learning_rate": 7.86938775510204e-07, "logits/chosen": -0.7939389944076538, "logits/rejected": 0.9260754585266113, "logps/chosen": -1.2432875633239746, "logps/rejected": -1.7621334791183472, "loss": 4.0935, "rewards/accuracies": 0.75, "rewards/chosen": -12.43287467956543, "rewards/margins": 5.188460826873779, "rewards/rejected": -17.621335983276367, "step": 723 }, { "epoch": 0.09858387799564271, "grad_norm": 44.164834438654964, "learning_rate": 7.880272108843537e-07, "logits/chosen": 0.10087406635284424, "logits/rejected": -0.5849605798721313, "logps/chosen": -2.2590010166168213, "logps/rejected": -1.8150582313537598, "loss": 5.3102, "rewards/accuracies": 0.25, "rewards/chosen": -22.590007781982422, "rewards/margins": -4.439427375793457, "rewards/rejected": -18.15058135986328, "step": 724 }, { "epoch": 0.09872004357298475, "grad_norm": 43.93032001238438, "learning_rate": 7.891156462585033e-07, "logits/chosen": -1.126882791519165, "logits/rejected": -0.41325342655181885, "logps/chosen": -1.073030948638916, "logps/rejected": -1.486212968826294, "loss": 4.2333, "rewards/accuracies": 1.0, "rewards/chosen": -10.730310440063477, "rewards/margins": 4.131819248199463, "rewards/rejected": -14.862129211425781, "step": 725 }, { "epoch": 0.0988562091503268, "grad_norm": 40.342274932160365, "learning_rate": 7.90204081632653e-07, "logits/chosen": -1.37852144241333, "logits/rejected": -1.4632079601287842, "logps/chosen": -1.1414504051208496, "logps/rejected": -1.0059678554534912, "loss": 4.7522, "rewards/accuracies": 0.25, "rewards/chosen": -11.414505004882812, "rewards/margins": -1.3548264503479004, "rewards/rejected": -10.059678077697754, "step": 726 }, { "epoch": 0.09899237472766885, "grad_norm": 43.30857186666193, "learning_rate": 7.912925170068027e-07, "logits/chosen": -0.9609636664390564, "logits/rejected": 1.4349071979522705, "logps/chosen": -1.3441052436828613, "logps/rejected": -1.7313222885131836, "loss": 4.7158, "rewards/accuracies": 1.0, "rewards/chosen": -13.44105339050293, "rewards/margins": 3.8721699714660645, "rewards/rejected": -17.313222885131836, "step": 727 }, { "epoch": 0.09912854030501089, "grad_norm": 46.39864074645033, "learning_rate": 7.923809523809524e-07, "logits/chosen": -2.091214656829834, "logits/rejected": 0.2565019428730011, "logps/chosen": -1.4338600635528564, "logps/rejected": -1.8346482515335083, "loss": 4.462, "rewards/accuracies": 0.5, "rewards/chosen": -14.338600158691406, "rewards/margins": 4.007882118225098, "rewards/rejected": -18.34648323059082, "step": 728 }, { "epoch": 0.09926470588235294, "grad_norm": 41.69325817237784, "learning_rate": 7.93469387755102e-07, "logits/chosen": -0.23673617839813232, "logits/rejected": -0.785517692565918, "logps/chosen": -1.2609608173370361, "logps/rejected": -1.1649501323699951, "loss": 4.6284, "rewards/accuracies": 0.5, "rewards/chosen": -12.609609603881836, "rewards/margins": -0.960107684135437, "rewards/rejected": -11.64950180053711, "step": 729 }, { "epoch": 0.09940087145969499, "grad_norm": 39.59603050119525, "learning_rate": 7.945578231292517e-07, "logits/chosen": -0.5889722108840942, "logits/rejected": -0.2944960594177246, "logps/chosen": -1.47526216506958, "logps/rejected": -1.4231247901916504, "loss": 4.1048, "rewards/accuracies": 0.5, "rewards/chosen": -14.752622604370117, "rewards/margins": -0.5213744640350342, "rewards/rejected": -14.231247901916504, "step": 730 }, { "epoch": 0.09953703703703703, "grad_norm": 37.47730525481829, "learning_rate": 7.956462585034014e-07, "logits/chosen": -1.0991027355194092, "logits/rejected": 0.545995831489563, "logps/chosen": -1.3544883728027344, "logps/rejected": -1.9763755798339844, "loss": 4.5534, "rewards/accuracies": 0.75, "rewards/chosen": -13.544883728027344, "rewards/margins": 6.218873023986816, "rewards/rejected": -19.763755798339844, "step": 731 }, { "epoch": 0.09967320261437909, "grad_norm": 39.710231287709924, "learning_rate": 7.967346938775509e-07, "logits/chosen": -0.7110096216201782, "logits/rejected": -1.9309161901474, "logps/chosen": -1.4411211013793945, "logps/rejected": -1.3180830478668213, "loss": 4.1867, "rewards/accuracies": 0.5, "rewards/chosen": -14.411211013793945, "rewards/margins": -1.2303802967071533, "rewards/rejected": -13.180830001831055, "step": 732 }, { "epoch": 0.09980936819172113, "grad_norm": 44.25928377820453, "learning_rate": 7.978231292517007e-07, "logits/chosen": 1.1769523620605469, "logits/rejected": 2.0462234020233154, "logps/chosen": -1.5601530075073242, "logps/rejected": -1.9634521007537842, "loss": 4.295, "rewards/accuracies": 0.75, "rewards/chosen": -15.601529121398926, "rewards/margins": 4.032991409301758, "rewards/rejected": -19.634521484375, "step": 733 }, { "epoch": 0.09994553376906318, "grad_norm": 46.299023391405115, "learning_rate": 7.989115646258503e-07, "logits/chosen": 0.7978740930557251, "logits/rejected": 1.215343713760376, "logps/chosen": -1.6659742593765259, "logps/rejected": -1.4225976467132568, "loss": 4.2359, "rewards/accuracies": 0.0, "rewards/chosen": -16.65974235534668, "rewards/margins": -2.4337658882141113, "rewards/rejected": -14.225976943969727, "step": 734 }, { "epoch": 0.10008169934640523, "grad_norm": 47.18545336718434, "learning_rate": 8e-07, "logits/chosen": -0.4410151243209839, "logits/rejected": 2.273383140563965, "logps/chosen": -1.3254457712173462, "logps/rejected": -2.0175886154174805, "loss": 4.2451, "rewards/accuracies": 1.0, "rewards/chosen": -13.254457473754883, "rewards/margins": 6.921428680419922, "rewards/rejected": -20.175886154174805, "step": 735 }, { "epoch": 0.10021786492374728, "grad_norm": 45.614604359313965, "learning_rate": 7.999999548083467e-07, "logits/chosen": -1.210679531097412, "logits/rejected": 1.4144299030303955, "logps/chosen": -1.4769377708435059, "logps/rejected": -1.7815994024276733, "loss": 4.0785, "rewards/accuracies": 1.0, "rewards/chosen": -14.769377708435059, "rewards/margins": 3.0466158390045166, "rewards/rejected": -17.815994262695312, "step": 736 }, { "epoch": 0.10035403050108932, "grad_norm": 44.23198599144205, "learning_rate": 7.999998192333973e-07, "logits/chosen": -0.8116459846496582, "logits/rejected": -0.07048946619033813, "logps/chosen": -1.0515270233154297, "logps/rejected": -1.2474240064620972, "loss": 4.2717, "rewards/accuracies": 0.75, "rewards/chosen": -10.515270233154297, "rewards/margins": 1.958970069885254, "rewards/rejected": -12.47424030303955, "step": 737 }, { "epoch": 0.10049019607843138, "grad_norm": 40.19890102603443, "learning_rate": 7.999995932751822e-07, "logits/chosen": 1.408050775527954, "logits/rejected": 0.2488189935684204, "logps/chosen": -2.1084983348846436, "logps/rejected": -2.011385917663574, "loss": 5.0056, "rewards/accuracies": 0.25, "rewards/chosen": -21.084983825683594, "rewards/margins": -0.9711227416992188, "rewards/rejected": -20.113861083984375, "step": 738 }, { "epoch": 0.10062636165577342, "grad_norm": 45.67121265219109, "learning_rate": 7.999992769337527e-07, "logits/chosen": 1.1698172092437744, "logits/rejected": 2.561275005340576, "logps/chosen": -1.365849494934082, "logps/rejected": -2.031737804412842, "loss": 4.6044, "rewards/accuracies": 1.0, "rewards/chosen": -13.65849494934082, "rewards/margins": 6.658884048461914, "rewards/rejected": -20.317378997802734, "step": 739 }, { "epoch": 0.10076252723311546, "grad_norm": 46.05393646820464, "learning_rate": 7.999988702091802e-07, "logits/chosen": -0.021095767617225647, "logits/rejected": 0.24443399906158447, "logps/chosen": -1.2320833206176758, "logps/rejected": -1.9033678770065308, "loss": 4.5238, "rewards/accuracies": 0.75, "rewards/chosen": -12.320833206176758, "rewards/margins": 6.712845802307129, "rewards/rejected": -19.033679962158203, "step": 740 }, { "epoch": 0.10089869281045752, "grad_norm": 48.286881878587415, "learning_rate": 7.999983731015564e-07, "logits/chosen": -0.3931344747543335, "logits/rejected": 0.1912919282913208, "logps/chosen": -1.346359372138977, "logps/rejected": -1.5095915794372559, "loss": 4.4928, "rewards/accuracies": 0.5, "rewards/chosen": -13.463593482971191, "rewards/margins": 1.6323225498199463, "rewards/rejected": -15.095916748046875, "step": 741 }, { "epoch": 0.10103485838779956, "grad_norm": 40.48962439921178, "learning_rate": 7.999977856109937e-07, "logits/chosen": 0.39493605494499207, "logits/rejected": 0.48169851303100586, "logps/chosen": -1.589890956878662, "logps/rejected": -1.6529561281204224, "loss": 4.0055, "rewards/accuracies": 0.75, "rewards/chosen": -15.898910522460938, "rewards/margins": 0.6306512355804443, "rewards/rejected": -16.52956199645996, "step": 742 }, { "epoch": 0.10117102396514162, "grad_norm": 44.00336980606346, "learning_rate": 7.999971077376251e-07, "logits/chosen": -0.14405620098114014, "logits/rejected": 0.4904278516769409, "logps/chosen": -1.2833818197250366, "logps/rejected": -1.5799874067306519, "loss": 4.0103, "rewards/accuracies": 0.75, "rewards/chosen": -12.833817481994629, "rewards/margins": 2.966055393218994, "rewards/rejected": -15.799873352050781, "step": 743 }, { "epoch": 0.10130718954248366, "grad_norm": 38.50956855083683, "learning_rate": 7.999963394816036e-07, "logits/chosen": -0.8616921901702881, "logits/rejected": 0.4801185429096222, "logps/chosen": -1.256547451019287, "logps/rejected": -1.5807321071624756, "loss": 4.2985, "rewards/accuracies": 0.75, "rewards/chosen": -12.565474510192871, "rewards/margins": 3.2418458461761475, "rewards/rejected": -15.807319641113281, "step": 744 }, { "epoch": 0.1014433551198257, "grad_norm": 51.32785416306338, "learning_rate": 7.999954808431027e-07, "logits/chosen": 0.2825924754142761, "logits/rejected": 1.8369059562683105, "logps/chosen": -1.5344984531402588, "logps/rejected": -1.9440052509307861, "loss": 4.6861, "rewards/accuracies": 0.75, "rewards/chosen": -15.34498405456543, "rewards/margins": 4.095068454742432, "rewards/rejected": -19.440052032470703, "step": 745 }, { "epoch": 0.10157952069716776, "grad_norm": 43.3006093364911, "learning_rate": 7.999945318223166e-07, "logits/chosen": 0.3547249436378479, "logits/rejected": 0.9866666793823242, "logps/chosen": -1.6175650358200073, "logps/rejected": -1.833793044090271, "loss": 4.345, "rewards/accuracies": 0.75, "rewards/chosen": -16.17565155029297, "rewards/margins": 2.1622793674468994, "rewards/rejected": -18.337928771972656, "step": 746 }, { "epoch": 0.1017156862745098, "grad_norm": 47.38785756979575, "learning_rate": 7.999934924194596e-07, "logits/chosen": 0.9028383493423462, "logits/rejected": 0.16222332417964935, "logps/chosen": -1.479372501373291, "logps/rejected": -1.4947645664215088, "loss": 4.1818, "rewards/accuracies": 0.5, "rewards/chosen": -14.793724060058594, "rewards/margins": 0.15392136573791504, "rewards/rejected": -14.947646141052246, "step": 747 }, { "epoch": 0.10185185185185185, "grad_norm": 51.429171342481425, "learning_rate": 7.999923626347666e-07, "logits/chosen": 0.6375131607055664, "logits/rejected": 1.025119662284851, "logps/chosen": -1.8949167728424072, "logps/rejected": -1.5858170986175537, "loss": 4.9348, "rewards/accuracies": 0.5, "rewards/chosen": -18.949167251586914, "rewards/margins": -3.0909957885742188, "rewards/rejected": -15.858171463012695, "step": 748 }, { "epoch": 0.1019880174291939, "grad_norm": 51.42313385877123, "learning_rate": 7.999911424684928e-07, "logits/chosen": 0.06881570816040039, "logits/rejected": -0.10504263639450073, "logps/chosen": -1.735964298248291, "logps/rejected": -1.8346107006072998, "loss": 4.6165, "rewards/accuracies": 0.25, "rewards/chosen": -17.359643936157227, "rewards/margins": 0.9864635467529297, "rewards/rejected": -18.346107482910156, "step": 749 }, { "epoch": 0.10212418300653595, "grad_norm": 52.39219749797928, "learning_rate": 7.99989831920914e-07, "logits/chosen": 1.7105737924575806, "logits/rejected": -0.07358014583587646, "logps/chosen": -1.3396865129470825, "logps/rejected": -1.5803020000457764, "loss": 4.5804, "rewards/accuracies": 0.75, "rewards/chosen": -13.396864891052246, "rewards/margins": 2.406153678894043, "rewards/rejected": -15.803019523620605, "step": 750 }, { "epoch": 0.102260348583878, "grad_norm": 42.2345492696032, "learning_rate": 7.999884309923265e-07, "logits/chosen": 0.19208914041519165, "logits/rejected": 1.714714765548706, "logps/chosen": -1.6224769353866577, "logps/rejected": -1.721709966659546, "loss": 3.7866, "rewards/accuracies": 0.75, "rewards/chosen": -16.224769592285156, "rewards/margins": 0.9923286437988281, "rewards/rejected": -17.217098236083984, "step": 751 }, { "epoch": 0.10239651416122005, "grad_norm": 51.95362774692042, "learning_rate": 7.999869396830466e-07, "logits/chosen": 0.665439248085022, "logits/rejected": 0.8813046216964722, "logps/chosen": -2.0490691661834717, "logps/rejected": -2.067579746246338, "loss": 4.651, "rewards/accuracies": 0.5, "rewards/chosen": -20.490692138671875, "rewards/margins": 0.1851062774658203, "rewards/rejected": -20.675796508789062, "step": 752 }, { "epoch": 0.10253267973856209, "grad_norm": 38.58734240253835, "learning_rate": 7.999853579934114e-07, "logits/chosen": 0.7750709056854248, "logits/rejected": 1.8525831699371338, "logps/chosen": -1.741391897201538, "logps/rejected": -2.063436508178711, "loss": 4.4047, "rewards/accuracies": 0.75, "rewards/chosen": -17.41391944885254, "rewards/margins": 3.220444679260254, "rewards/rejected": -20.63436508178711, "step": 753 }, { "epoch": 0.10266884531590414, "grad_norm": 41.71065624500089, "learning_rate": 7.999836859237781e-07, "logits/chosen": -0.33100125193595886, "logits/rejected": 0.3829704821109772, "logps/chosen": -1.521706223487854, "logps/rejected": -1.7050172090530396, "loss": 4.2152, "rewards/accuracies": 0.5, "rewards/chosen": -15.217061996459961, "rewards/margins": 1.833108901977539, "rewards/rejected": -17.0501708984375, "step": 754 }, { "epoch": 0.10280501089324619, "grad_norm": 41.55357964683521, "learning_rate": 7.999819234745248e-07, "logits/chosen": 0.035284340381622314, "logits/rejected": 1.0627695322036743, "logps/chosen": -1.3855576515197754, "logps/rejected": -1.5702134370803833, "loss": 4.2487, "rewards/accuracies": 0.75, "rewards/chosen": -13.855576515197754, "rewards/margins": 1.8465583324432373, "rewards/rejected": -15.702134132385254, "step": 755 }, { "epoch": 0.10294117647058823, "grad_norm": 44.39262021946132, "learning_rate": 7.999800706460496e-07, "logits/chosen": 0.39929521083831787, "logits/rejected": 1.628351092338562, "logps/chosen": -1.396173357963562, "logps/rejected": -1.4980230331420898, "loss": 4.5639, "rewards/accuracies": 0.5, "rewards/chosen": -13.961732864379883, "rewards/margins": 1.0184969902038574, "rewards/rejected": -14.980231285095215, "step": 756 }, { "epoch": 0.10307734204793029, "grad_norm": 38.70749328131499, "learning_rate": 7.99978127438771e-07, "logits/chosen": 0.620746374130249, "logits/rejected": 1.3903729915618896, "logps/chosen": -1.5968000888824463, "logps/rejected": -1.817156434059143, "loss": 4.3037, "rewards/accuracies": 0.75, "rewards/chosen": -15.968001365661621, "rewards/margins": 2.2035634517669678, "rewards/rejected": -18.17156410217285, "step": 757 }, { "epoch": 0.10321350762527233, "grad_norm": 45.89713753409492, "learning_rate": 7.999760938531286e-07, "logits/chosen": -1.9423983097076416, "logits/rejected": -1.2320376634597778, "logps/chosen": -1.3688420057296753, "logps/rejected": -1.4056966304779053, "loss": 4.7158, "rewards/accuracies": 0.75, "rewards/chosen": -13.688419342041016, "rewards/margins": 0.3685474395751953, "rewards/rejected": -14.056966781616211, "step": 758 }, { "epoch": 0.10334967320261437, "grad_norm": 44.4050145937326, "learning_rate": 7.999739698895813e-07, "logits/chosen": 1.4521764516830444, "logits/rejected": 1.195876121520996, "logps/chosen": -1.4207777976989746, "logps/rejected": -1.4205923080444336, "loss": 4.3087, "rewards/accuracies": 0.5, "rewards/chosen": -14.207778930664062, "rewards/margins": -0.0018553733825683594, "rewards/rejected": -14.205923080444336, "step": 759 }, { "epoch": 0.10348583877995643, "grad_norm": 39.80846551253932, "learning_rate": 7.999717555486093e-07, "logits/chosen": 1.1555609703063965, "logits/rejected": 0.609862208366394, "logps/chosen": -1.5089918375015259, "logps/rejected": -1.8511998653411865, "loss": 4.5633, "rewards/accuracies": 0.75, "rewards/chosen": -15.08991813659668, "rewards/margins": 3.4220802783966064, "rewards/rejected": -18.51199722290039, "step": 760 }, { "epoch": 0.10362200435729847, "grad_norm": 50.72777459511233, "learning_rate": 7.99969450830713e-07, "logits/chosen": 1.8486884832382202, "logits/rejected": 2.4359445571899414, "logps/chosen": -2.055424690246582, "logps/rejected": -1.8613698482513428, "loss": 4.5434, "rewards/accuracies": 0.25, "rewards/chosen": -20.554244995117188, "rewards/margins": -1.940546989440918, "rewards/rejected": -18.613698959350586, "step": 761 }, { "epoch": 0.10375816993464053, "grad_norm": 49.71008855710028, "learning_rate": 7.999670557364131e-07, "logits/chosen": -1.082577109336853, "logits/rejected": 2.46382474899292, "logps/chosen": -1.4801744222640991, "logps/rejected": -1.985170602798462, "loss": 4.2872, "rewards/accuracies": 0.75, "rewards/chosen": -14.80174446105957, "rewards/margins": 5.049961566925049, "rewards/rejected": -19.85170555114746, "step": 762 }, { "epoch": 0.10389433551198257, "grad_norm": 38.97889727601398, "learning_rate": 7.999645702662507e-07, "logits/chosen": -0.6254681348800659, "logits/rejected": 1.8390403985977173, "logps/chosen": -1.0403552055358887, "logps/rejected": -1.3084906339645386, "loss": 4.4142, "rewards/accuracies": 0.75, "rewards/chosen": -10.40355110168457, "rewards/margins": 2.6813549995422363, "rewards/rejected": -13.084906578063965, "step": 763 }, { "epoch": 0.10403050108932461, "grad_norm": 45.489926420888764, "learning_rate": 7.999619944207876e-07, "logits/chosen": -0.8354950547218323, "logits/rejected": -0.3669109344482422, "logps/chosen": -1.8151025772094727, "logps/rejected": -1.5263116359710693, "loss": 5.0458, "rewards/accuracies": 0.25, "rewards/chosen": -18.151025772094727, "rewards/margins": -2.887909412384033, "rewards/rejected": -15.263116836547852, "step": 764 }, { "epoch": 0.10416666666666667, "grad_norm": 48.77790023956561, "learning_rate": 7.999593282006057e-07, "logits/chosen": 1.1495563983917236, "logits/rejected": 2.688265800476074, "logps/chosen": -1.6578179597854614, "logps/rejected": -2.2497756481170654, "loss": 4.4001, "rewards/accuracies": 1.0, "rewards/chosen": -16.57817840576172, "rewards/margins": 5.919576644897461, "rewards/rejected": -22.497756958007812, "step": 765 }, { "epoch": 0.10430283224400871, "grad_norm": 41.88109080756215, "learning_rate": 7.999565716063075e-07, "logits/chosen": 1.778949499130249, "logits/rejected": 0.5361418128013611, "logps/chosen": -1.9050710201263428, "logps/rejected": -2.00811505317688, "loss": 4.249, "rewards/accuracies": 0.5, "rewards/chosen": -19.050710678100586, "rewards/margins": 1.0304396152496338, "rewards/rejected": -20.08115005493164, "step": 766 }, { "epoch": 0.10443899782135076, "grad_norm": 45.27971382322474, "learning_rate": 7.999537246385159e-07, "logits/chosen": 1.4859033823013306, "logits/rejected": 2.0653905868530273, "logps/chosen": -1.3333009481430054, "logps/rejected": -1.7175750732421875, "loss": 4.8872, "rewards/accuracies": 1.0, "rewards/chosen": -13.333009719848633, "rewards/margins": 3.842740535736084, "rewards/rejected": -17.175750732421875, "step": 767 }, { "epoch": 0.10457516339869281, "grad_norm": 43.266505254997725, "learning_rate": 7.999507872978741e-07, "logits/chosen": 0.15767845511436462, "logits/rejected": 1.5301482677459717, "logps/chosen": -1.3417859077453613, "logps/rejected": -1.4690945148468018, "loss": 4.6232, "rewards/accuracies": 0.75, "rewards/chosen": -13.41786003112793, "rewards/margins": 1.273085594177246, "rewards/rejected": -14.69094467163086, "step": 768 }, { "epoch": 0.10471132897603486, "grad_norm": 44.20094785681242, "learning_rate": 7.99947759585046e-07, "logits/chosen": 0.1425793170928955, "logits/rejected": 0.8546615839004517, "logps/chosen": -1.315858244895935, "logps/rejected": -1.3919997215270996, "loss": 4.8573, "rewards/accuracies": 0.5, "rewards/chosen": -13.158581733703613, "rewards/margins": 0.761415958404541, "rewards/rejected": -13.919998168945312, "step": 769 }, { "epoch": 0.10484749455337691, "grad_norm": 42.68503475643858, "learning_rate": 7.999446415007156e-07, "logits/chosen": 1.627928376197815, "logits/rejected": 0.015999257564544678, "logps/chosen": -1.437565565109253, "logps/rejected": -1.1845749616622925, "loss": 4.399, "rewards/accuracies": 0.25, "rewards/chosen": -14.375657081604004, "rewards/margins": -2.529907464981079, "rewards/rejected": -11.845748901367188, "step": 770 }, { "epoch": 0.10498366013071896, "grad_norm": 47.94045246384138, "learning_rate": 7.999414330455873e-07, "logits/chosen": 2.193402051925659, "logits/rejected": 2.954232692718506, "logps/chosen": -1.5745937824249268, "logps/rejected": -1.7298146486282349, "loss": 4.7846, "rewards/accuracies": 1.0, "rewards/chosen": -15.74593734741211, "rewards/margins": 1.5522077083587646, "rewards/rejected": -17.298145294189453, "step": 771 }, { "epoch": 0.105119825708061, "grad_norm": 40.901415767696236, "learning_rate": 7.999381342203864e-07, "logits/chosen": 2.4137072563171387, "logits/rejected": 3.311873435974121, "logps/chosen": -1.9217844009399414, "logps/rejected": -2.409714698791504, "loss": 4.1141, "rewards/accuracies": 1.0, "rewards/chosen": -19.217844009399414, "rewards/margins": 4.879302501678467, "rewards/rejected": -24.09714698791504, "step": 772 }, { "epoch": 0.10525599128540306, "grad_norm": 55.028522710848954, "learning_rate": 7.999347450258582e-07, "logits/chosen": 0.10101759433746338, "logits/rejected": 1.1604301929473877, "logps/chosen": -1.5210280418395996, "logps/rejected": -1.8105673789978027, "loss": 4.4808, "rewards/accuracies": 0.75, "rewards/chosen": -15.210280418395996, "rewards/margins": 2.895393133163452, "rewards/rejected": -18.10567283630371, "step": 773 }, { "epoch": 0.1053921568627451, "grad_norm": 42.03220434080206, "learning_rate": 7.999312654627684e-07, "logits/chosen": 1.2980345487594604, "logits/rejected": 1.1059099435806274, "logps/chosen": -1.62300443649292, "logps/rejected": -1.5634139776229858, "loss": 4.1684, "rewards/accuracies": 0.75, "rewards/chosen": -16.230045318603516, "rewards/margins": -0.5959049463272095, "rewards/rejected": -15.634140014648438, "step": 774 }, { "epoch": 0.10552832244008714, "grad_norm": 45.581953959442814, "learning_rate": 7.999276955319033e-07, "logits/chosen": 1.4043192863464355, "logits/rejected": -0.3233621120452881, "logps/chosen": -2.0439395904541016, "logps/rejected": -1.6491154432296753, "loss": 5.1672, "rewards/accuracies": 0.0, "rewards/chosen": -20.43939781188965, "rewards/margins": -3.948242664337158, "rewards/rejected": -16.49115562438965, "step": 775 }, { "epoch": 0.1056644880174292, "grad_norm": 43.840481308719475, "learning_rate": 7.999240352340695e-07, "logits/chosen": -1.2009888887405396, "logits/rejected": 0.3306981325149536, "logps/chosen": -1.4617528915405273, "logps/rejected": -1.932098388671875, "loss": 4.6187, "rewards/accuracies": 1.0, "rewards/chosen": -14.617528915405273, "rewards/margins": 4.703454971313477, "rewards/rejected": -19.32098388671875, "step": 776 }, { "epoch": 0.10580065359477124, "grad_norm": 44.380930726903316, "learning_rate": 7.999202845700942e-07, "logits/chosen": -0.29295387864112854, "logits/rejected": -0.1686018705368042, "logps/chosen": -1.2567007541656494, "logps/rejected": -1.5483498573303223, "loss": 3.8253, "rewards/accuracies": 0.75, "rewards/chosen": -12.567007064819336, "rewards/margins": 2.9164910316467285, "rewards/rejected": -15.483498573303223, "step": 777 }, { "epoch": 0.10593681917211328, "grad_norm": 51.14088770324271, "learning_rate": 7.999164435408249e-07, "logits/chosen": 1.7298741340637207, "logits/rejected": 0.15213525295257568, "logps/chosen": -1.9578964710235596, "logps/rejected": -1.6905760765075684, "loss": 5.0417, "rewards/accuracies": 0.25, "rewards/chosen": -19.578964233398438, "rewards/margins": -2.673203945159912, "rewards/rejected": -16.90576171875, "step": 778 }, { "epoch": 0.10607298474945534, "grad_norm": 45.34468491157733, "learning_rate": 7.999125121471293e-07, "logits/chosen": 3.4303455352783203, "logits/rejected": 3.5849618911743164, "logps/chosen": -1.956491231918335, "logps/rejected": -2.41819429397583, "loss": 4.8967, "rewards/accuracies": 1.0, "rewards/chosen": -19.56491470336914, "rewards/margins": 4.617029666900635, "rewards/rejected": -24.181943893432617, "step": 779 }, { "epoch": 0.10620915032679738, "grad_norm": 45.700232923537236, "learning_rate": 7.99908490389896e-07, "logits/chosen": 2.4437692165374756, "logits/rejected": 1.6017787456512451, "logps/chosen": -1.608868956565857, "logps/rejected": -1.6798582077026367, "loss": 4.6893, "rewards/accuracies": 0.5, "rewards/chosen": -16.088687896728516, "rewards/margins": 0.7098945379257202, "rewards/rejected": -16.798583984375, "step": 780 }, { "epoch": 0.10634531590413944, "grad_norm": 47.112784761465754, "learning_rate": 7.999043782700334e-07, "logits/chosen": 1.9845503568649292, "logits/rejected": 2.3197145462036133, "logps/chosen": -1.930969476699829, "logps/rejected": -2.0229830741882324, "loss": 4.1985, "rewards/accuracies": 0.5, "rewards/chosen": -19.309696197509766, "rewards/margins": 0.9201333522796631, "rewards/rejected": -20.229827880859375, "step": 781 }, { "epoch": 0.10648148148148148, "grad_norm": 49.74426032377798, "learning_rate": 7.999001757884712e-07, "logits/chosen": 1.1080516576766968, "logits/rejected": 1.4778143167495728, "logps/chosen": -1.608581304550171, "logps/rejected": -1.768815517425537, "loss": 4.3115, "rewards/accuracies": 0.75, "rewards/chosen": -16.085813522338867, "rewards/margins": 1.6023411750793457, "rewards/rejected": -17.688156127929688, "step": 782 }, { "epoch": 0.10661764705882353, "grad_norm": 51.689837367389245, "learning_rate": 7.998958829461585e-07, "logits/chosen": 1.281875729560852, "logits/rejected": 1.7213735580444336, "logps/chosen": -1.7470237016677856, "logps/rejected": -1.644425392150879, "loss": 4.5392, "rewards/accuracies": 0.5, "rewards/chosen": -17.470237731933594, "rewards/margins": -1.0259826183319092, "rewards/rejected": -16.44425392150879, "step": 783 }, { "epoch": 0.10675381263616558, "grad_norm": 42.93964364445003, "learning_rate": 7.998914997440655e-07, "logits/chosen": 0.7287387847900391, "logits/rejected": 1.4198493957519531, "logps/chosen": -1.51877760887146, "logps/rejected": -1.701560139656067, "loss": 3.9812, "rewards/accuracies": 0.75, "rewards/chosen": -15.187776565551758, "rewards/margins": 1.8278248310089111, "rewards/rejected": -17.015600204467773, "step": 784 }, { "epoch": 0.10688997821350762, "grad_norm": 46.68021492352761, "learning_rate": 7.998870261831825e-07, "logits/chosen": 1.7686176300048828, "logits/rejected": 0.579914927482605, "logps/chosen": -1.283801794052124, "logps/rejected": -1.2337303161621094, "loss": 4.3271, "rewards/accuracies": 0.25, "rewards/chosen": -12.838016510009766, "rewards/margins": -0.5007138252258301, "rewards/rejected": -12.337303161621094, "step": 785 }, { "epoch": 0.10702614379084967, "grad_norm": 44.111687056495256, "learning_rate": 7.998824622645205e-07, "logits/chosen": 0.025328367948532104, "logits/rejected": -0.0822637677192688, "logps/chosen": -1.1412020921707153, "logps/rejected": -1.444559097290039, "loss": 4.3425, "rewards/accuracies": 0.75, "rewards/chosen": -11.412020683288574, "rewards/margins": 3.033569574356079, "rewards/rejected": -14.44559097290039, "step": 786 }, { "epoch": 0.10716230936819172, "grad_norm": 49.99256830159619, "learning_rate": 7.998778079891108e-07, "logits/chosen": 1.554835319519043, "logits/rejected": 2.7827296257019043, "logps/chosen": -1.5441763401031494, "logps/rejected": -1.5858314037322998, "loss": 4.9536, "rewards/accuracies": 0.75, "rewards/chosen": -15.441762924194336, "rewards/margins": 0.4165511131286621, "rewards/rejected": -15.85831356048584, "step": 787 }, { "epoch": 0.10729847494553377, "grad_norm": 46.7069293085837, "learning_rate": 7.998730633580049e-07, "logits/chosen": 1.7797095775604248, "logits/rejected": 1.9218792915344238, "logps/chosen": -1.4529622793197632, "logps/rejected": -1.452394723892212, "loss": 4.5445, "rewards/accuracies": 0.5, "rewards/chosen": -14.529623985290527, "rewards/margins": -0.005675315856933594, "rewards/rejected": -14.523947715759277, "step": 788 }, { "epoch": 0.10743464052287582, "grad_norm": 50.40072212930707, "learning_rate": 7.998682283722749e-07, "logits/chosen": -0.35996755957603455, "logits/rejected": -0.25329524278640747, "logps/chosen": -1.5995252132415771, "logps/rejected": -1.632236123085022, "loss": 4.6162, "rewards/accuracies": 0.5, "rewards/chosen": -15.99525260925293, "rewards/margins": 0.32710862159729004, "rewards/rejected": -16.32236099243164, "step": 789 }, { "epoch": 0.10757080610021787, "grad_norm": 41.94468389355299, "learning_rate": 7.998633030330134e-07, "logits/chosen": 0.7753897309303284, "logits/rejected": 1.173697829246521, "logps/chosen": -1.5638926029205322, "logps/rejected": -1.850898265838623, "loss": 3.5535, "rewards/accuracies": 1.0, "rewards/chosen": -15.638925552368164, "rewards/margins": 2.870058298110962, "rewards/rejected": -18.508983612060547, "step": 790 }, { "epoch": 0.10770697167755991, "grad_norm": 52.079414987452644, "learning_rate": 7.998582873413332e-07, "logits/chosen": 2.30661678314209, "logits/rejected": 2.7405154705047607, "logps/chosen": -2.329056978225708, "logps/rejected": -2.1419105529785156, "loss": 4.5619, "rewards/accuracies": 0.75, "rewards/chosen": -23.290569305419922, "rewards/margins": -1.8714632987976074, "rewards/rejected": -21.41910743713379, "step": 791 }, { "epoch": 0.10784313725490197, "grad_norm": 54.88247268176086, "learning_rate": 7.998531812983677e-07, "logits/chosen": 2.5414929389953613, "logits/rejected": 2.1460349559783936, "logps/chosen": -1.7746572494506836, "logps/rejected": -1.6183476448059082, "loss": 4.9437, "rewards/accuracies": 0.25, "rewards/chosen": -17.746572494506836, "rewards/margins": -1.5630955696105957, "rewards/rejected": -16.1834774017334, "step": 792 }, { "epoch": 0.10797930283224401, "grad_norm": 47.40509162424588, "learning_rate": 7.998479849052709e-07, "logits/chosen": 1.382943868637085, "logits/rejected": 2.196470260620117, "logps/chosen": -1.4620976448059082, "logps/rejected": -1.9145565032958984, "loss": 4.6329, "rewards/accuracies": 0.75, "rewards/chosen": -14.620977401733398, "rewards/margins": 4.5245890617370605, "rewards/rejected": -19.145566940307617, "step": 793 }, { "epoch": 0.10811546840958605, "grad_norm": 39.826711825019935, "learning_rate": 7.998426981632164e-07, "logits/chosen": 1.9330472946166992, "logits/rejected": 3.3441638946533203, "logps/chosen": -1.7426728010177612, "logps/rejected": -1.9802806377410889, "loss": 4.3463, "rewards/accuracies": 0.75, "rewards/chosen": -17.426727294921875, "rewards/margins": 2.3760781288146973, "rewards/rejected": -19.802806854248047, "step": 794 }, { "epoch": 0.10825163398692811, "grad_norm": 42.083922247068024, "learning_rate": 7.998373210733992e-07, "logits/chosen": 1.5107004642486572, "logits/rejected": 3.354304075241089, "logps/chosen": -1.6760627031326294, "logps/rejected": -2.181931495666504, "loss": 4.077, "rewards/accuracies": 1.0, "rewards/chosen": -16.76062774658203, "rewards/margins": 5.058686256408691, "rewards/rejected": -21.819313049316406, "step": 795 }, { "epoch": 0.10838779956427015, "grad_norm": 48.45861462199401, "learning_rate": 7.998318536370344e-07, "logits/chosen": 1.3337500095367432, "logits/rejected": 1.541146159172058, "logps/chosen": -2.0364580154418945, "logps/rejected": -2.0074164867401123, "loss": 4.8426, "rewards/accuracies": 0.25, "rewards/chosen": -20.364580154418945, "rewards/margins": -0.29041600227355957, "rewards/rejected": -20.07416534423828, "step": 796 }, { "epoch": 0.1085239651416122, "grad_norm": 45.84221124453613, "learning_rate": 7.998262958553571e-07, "logits/chosen": -0.9901857972145081, "logits/rejected": 0.8132500648498535, "logps/chosen": -1.3358142375946045, "logps/rejected": -1.4973361492156982, "loss": 4.6811, "rewards/accuracies": 1.0, "rewards/chosen": -13.358142852783203, "rewards/margins": 1.6152195930480957, "rewards/rejected": -14.97336196899414, "step": 797 }, { "epoch": 0.10866013071895425, "grad_norm": 46.44879553595845, "learning_rate": 7.998206477296233e-07, "logits/chosen": -0.6073671579360962, "logits/rejected": -0.526842474937439, "logps/chosen": -1.3112778663635254, "logps/rejected": -1.188691258430481, "loss": 5.18, "rewards/accuracies": 0.5, "rewards/chosen": -13.11277961730957, "rewards/margins": -1.2258665561676025, "rewards/rejected": -11.88691234588623, "step": 798 }, { "epoch": 0.1087962962962963, "grad_norm": 50.06515248467984, "learning_rate": 7.998149092611092e-07, "logits/chosen": 2.6759729385375977, "logits/rejected": 2.716930627822876, "logps/chosen": -2.0148584842681885, "logps/rejected": -1.8663893938064575, "loss": 4.3055, "rewards/accuracies": 0.5, "rewards/chosen": -20.14858627319336, "rewards/margins": -1.4846904277801514, "rewards/rejected": -18.663894653320312, "step": 799 }, { "epoch": 0.10893246187363835, "grad_norm": 41.720457134710124, "learning_rate": 7.998090804511114e-07, "logits/chosen": 1.2223092317581177, "logits/rejected": 1.610391616821289, "logps/chosen": -1.4216381311416626, "logps/rejected": -1.6042468547821045, "loss": 5.0682, "rewards/accuracies": 0.75, "rewards/chosen": -14.21638011932373, "rewards/margins": 1.8260881900787354, "rewards/rejected": -16.042469024658203, "step": 800 }, { "epoch": 0.1090686274509804, "grad_norm": 45.73547990435087, "learning_rate": 7.99803161300947e-07, "logits/chosen": 3.1126341819763184, "logits/rejected": 2.3621530532836914, "logps/chosen": -1.7287466526031494, "logps/rejected": -1.6734702587127686, "loss": 4.5176, "rewards/accuracies": 0.5, "rewards/chosen": -17.287466049194336, "rewards/margins": -0.5527634620666504, "rewards/rejected": -16.734703063964844, "step": 801 }, { "epoch": 0.10920479302832244, "grad_norm": 48.07283840778601, "learning_rate": 7.997971518119536e-07, "logits/chosen": -0.16622626781463623, "logits/rejected": 1.5247889757156372, "logps/chosen": -1.2025394439697266, "logps/rejected": -1.5813252925872803, "loss": 4.9843, "rewards/accuracies": 1.0, "rewards/chosen": -12.025394439697266, "rewards/margins": 3.787858486175537, "rewards/rejected": -15.813253402709961, "step": 802 }, { "epoch": 0.10934095860566449, "grad_norm": 47.34247469390269, "learning_rate": 7.997910519854888e-07, "logits/chosen": -1.6399832963943481, "logits/rejected": -0.8416985869407654, "logps/chosen": -1.0410175323486328, "logps/rejected": -1.0998342037200928, "loss": 4.4384, "rewards/accuracies": 0.5, "rewards/chosen": -10.410175323486328, "rewards/margins": 0.5881668329238892, "rewards/rejected": -10.99834156036377, "step": 803 }, { "epoch": 0.10947712418300654, "grad_norm": 46.64697333399379, "learning_rate": 7.997848618229312e-07, "logits/chosen": -1.8696999549865723, "logits/rejected": -0.08830299973487854, "logps/chosen": -1.1017017364501953, "logps/rejected": -1.2299284934997559, "loss": 4.4798, "rewards/accuracies": 0.75, "rewards/chosen": -11.017017364501953, "rewards/margins": 1.2822673320770264, "rewards/rejected": -12.299284934997559, "step": 804 }, { "epoch": 0.10961328976034858, "grad_norm": 38.965510772603885, "learning_rate": 7.997785813256795e-07, "logits/chosen": 0.8911816477775574, "logits/rejected": 1.9969193935394287, "logps/chosen": -1.4009978771209717, "logps/rejected": -1.6616787910461426, "loss": 4.2221, "rewards/accuracies": 1.0, "rewards/chosen": -14.009977340698242, "rewards/margins": 2.606809616088867, "rewards/rejected": -16.61678695678711, "step": 805 }, { "epoch": 0.10974945533769064, "grad_norm": 43.116058125362386, "learning_rate": 7.997722104951527e-07, "logits/chosen": 1.0840072631835938, "logits/rejected": 1.3189767599105835, "logps/chosen": -1.7338677644729614, "logps/rejected": -1.6661049127578735, "loss": 4.5106, "rewards/accuracies": 0.0, "rewards/chosen": -17.33867835998535, "rewards/margins": -0.677628755569458, "rewards/rejected": -16.661048889160156, "step": 806 }, { "epoch": 0.10988562091503268, "grad_norm": 35.44266634674197, "learning_rate": 7.997657493327904e-07, "logits/chosen": -0.14621293544769287, "logits/rejected": 1.510076880455017, "logps/chosen": -1.3104679584503174, "logps/rejected": -1.5537261962890625, "loss": 4.2905, "rewards/accuracies": 0.75, "rewards/chosen": -13.104681015014648, "rewards/margins": 2.432581901550293, "rewards/rejected": -15.537261962890625, "step": 807 }, { "epoch": 0.11002178649237472, "grad_norm": 41.06641974848438, "learning_rate": 7.997591978400525e-07, "logits/chosen": 1.1518406867980957, "logits/rejected": -0.5534327030181885, "logps/chosen": -1.3765920400619507, "logps/rejected": -1.347874402999878, "loss": 4.5968, "rewards/accuracies": 0.5, "rewards/chosen": -13.76591968536377, "rewards/margins": -0.28717637062072754, "rewards/rejected": -13.478743553161621, "step": 808 }, { "epoch": 0.11015795206971678, "grad_norm": 49.19202268985815, "learning_rate": 7.997525560184194e-07, "logits/chosen": -0.6464723944664001, "logits/rejected": -0.44014400243759155, "logps/chosen": -1.636777400970459, "logps/rejected": -1.4684593677520752, "loss": 4.2776, "rewards/accuracies": 0.25, "rewards/chosen": -16.367773056030273, "rewards/margins": -1.6831793785095215, "rewards/rejected": -14.68459415435791, "step": 809 }, { "epoch": 0.11029411764705882, "grad_norm": 43.9156722469725, "learning_rate": 7.997458238693919e-07, "logits/chosen": 1.6188347339630127, "logits/rejected": 2.407925605773926, "logps/chosen": -1.5214686393737793, "logps/rejected": -1.5777535438537598, "loss": 4.5389, "rewards/accuracies": 0.75, "rewards/chosen": -15.21468734741211, "rewards/margins": 0.5628478527069092, "rewards/rejected": -15.777535438537598, "step": 810 }, { "epoch": 0.11043028322440088, "grad_norm": 41.41665366025796, "learning_rate": 7.997390013944912e-07, "logits/chosen": -0.3010925054550171, "logits/rejected": 2.282715082168579, "logps/chosen": -1.3222005367279053, "logps/rejected": -1.620559811592102, "loss": 4.9247, "rewards/accuracies": 0.75, "rewards/chosen": -13.222005844116211, "rewards/margins": 2.983593463897705, "rewards/rejected": -16.205598831176758, "step": 811 }, { "epoch": 0.11056644880174292, "grad_norm": 42.790312156556354, "learning_rate": 7.997320885952587e-07, "logits/chosen": 2.603745937347412, "logits/rejected": 2.1851820945739746, "logps/chosen": -1.9110941886901855, "logps/rejected": -1.9279760122299194, "loss": 4.3189, "rewards/accuracies": 0.5, "rewards/chosen": -19.110942840576172, "rewards/margins": 0.16881632804870605, "rewards/rejected": -19.27975845336914, "step": 812 }, { "epoch": 0.11070261437908496, "grad_norm": 51.76299627325181, "learning_rate": 7.997250854732567e-07, "logits/chosen": 0.9644882678985596, "logits/rejected": 2.3229942321777344, "logps/chosen": -1.69218111038208, "logps/rejected": -2.0131049156188965, "loss": 4.2593, "rewards/accuracies": 1.0, "rewards/chosen": -16.921810150146484, "rewards/margins": 3.2092411518096924, "rewards/rejected": -20.131052017211914, "step": 813 }, { "epoch": 0.11083877995642702, "grad_norm": 37.501196945355886, "learning_rate": 7.997179920300675e-07, "logits/chosen": 0.5582494139671326, "logits/rejected": -0.6090248823165894, "logps/chosen": -1.355931043624878, "logps/rejected": -1.2185312509536743, "loss": 3.8601, "rewards/accuracies": 0.5, "rewards/chosen": -13.559310913085938, "rewards/margins": -1.373998761177063, "rewards/rejected": -12.185312271118164, "step": 814 }, { "epoch": 0.11097494553376906, "grad_norm": 71.0437445659698, "learning_rate": 7.997108082672939e-07, "logits/chosen": -1.1441336870193481, "logits/rejected": -0.1964048147201538, "logps/chosen": -1.1839947700500488, "logps/rejected": -1.3360567092895508, "loss": 5.2116, "rewards/accuracies": 0.75, "rewards/chosen": -11.839947700500488, "rewards/margins": 1.5206185579299927, "rewards/rejected": -13.360566139221191, "step": 815 }, { "epoch": 0.1111111111111111, "grad_norm": 43.771222045605526, "learning_rate": 7.99703534186559e-07, "logits/chosen": 1.3186039924621582, "logits/rejected": 3.1654396057128906, "logps/chosen": -1.442265272140503, "logps/rejected": -1.7048871517181396, "loss": 4.1997, "rewards/accuracies": 0.75, "rewards/chosen": -14.422653198242188, "rewards/margins": 2.626218795776367, "rewards/rejected": -17.048871994018555, "step": 816 }, { "epoch": 0.11124727668845316, "grad_norm": 43.41591891016607, "learning_rate": 7.996961697895066e-07, "logits/chosen": 1.0838921070098877, "logits/rejected": 3.1694538593292236, "logps/chosen": -1.3795793056488037, "logps/rejected": -1.6778197288513184, "loss": 4.5432, "rewards/accuracies": 1.0, "rewards/chosen": -13.795793533325195, "rewards/margins": 2.9824044704437256, "rewards/rejected": -16.7781982421875, "step": 817 }, { "epoch": 0.1113834422657952, "grad_norm": 42.2111580504543, "learning_rate": 7.996887150778008e-07, "logits/chosen": 1.1598048210144043, "logits/rejected": 2.7199158668518066, "logps/chosen": -1.6308882236480713, "logps/rejected": -1.85318922996521, "loss": 4.1506, "rewards/accuracies": 0.5, "rewards/chosen": -16.308883666992188, "rewards/margins": 2.223010301589966, "rewards/rejected": -18.531892776489258, "step": 818 }, { "epoch": 0.11151960784313726, "grad_norm": 40.40092802673446, "learning_rate": 7.99681170053126e-07, "logits/chosen": 2.2066895961761475, "logits/rejected": 3.886895179748535, "logps/chosen": -1.5765347480773926, "logps/rejected": -1.733697533607483, "loss": 4.4335, "rewards/accuracies": 0.75, "rewards/chosen": -15.765348434448242, "rewards/margins": 1.5716273784637451, "rewards/rejected": -17.33697509765625, "step": 819 }, { "epoch": 0.1116557734204793, "grad_norm": 42.64521730671627, "learning_rate": 7.996735347171869e-07, "logits/chosen": -1.499160647392273, "logits/rejected": -0.30113399028778076, "logps/chosen": -1.5323439836502075, "logps/rejected": -1.9027698040008545, "loss": 4.3736, "rewards/accuracies": 0.75, "rewards/chosen": -15.323440551757812, "rewards/margins": 3.7042579650878906, "rewards/rejected": -19.027698516845703, "step": 820 }, { "epoch": 0.11179193899782135, "grad_norm": 38.922970184335554, "learning_rate": 7.996658090717091e-07, "logits/chosen": 2.8273892402648926, "logits/rejected": 2.414011240005493, "logps/chosen": -1.5999360084533691, "logps/rejected": -1.631941556930542, "loss": 4.1244, "rewards/accuracies": 0.5, "rewards/chosen": -15.999360084533691, "rewards/margins": 0.3200569152832031, "rewards/rejected": -16.319416046142578, "step": 821 }, { "epoch": 0.1119281045751634, "grad_norm": 44.92357982268903, "learning_rate": 7.996579931184378e-07, "logits/chosen": 2.1838557720184326, "logits/rejected": 2.206143617630005, "logps/chosen": -1.6106412410736084, "logps/rejected": -1.8121957778930664, "loss": 4.7605, "rewards/accuracies": 0.75, "rewards/chosen": -16.106412887573242, "rewards/margins": 2.0155444145202637, "rewards/rejected": -18.121957778930664, "step": 822 }, { "epoch": 0.11206427015250545, "grad_norm": 40.2067763403575, "learning_rate": 7.996500868591395e-07, "logits/chosen": -0.2827523648738861, "logits/rejected": 1.9213664531707764, "logps/chosen": -1.22501540184021, "logps/rejected": -1.5472135543823242, "loss": 4.3096, "rewards/accuracies": 1.0, "rewards/chosen": -12.250154495239258, "rewards/margins": 3.22198224067688, "rewards/rejected": -15.472136497497559, "step": 823 }, { "epoch": 0.11220043572984749, "grad_norm": 45.18179496209624, "learning_rate": 7.996420902956006e-07, "logits/chosen": 0.9126839637756348, "logits/rejected": 2.1977109909057617, "logps/chosen": -1.3611631393432617, "logps/rejected": -1.7155568599700928, "loss": 4.1662, "rewards/accuracies": 1.0, "rewards/chosen": -13.611631393432617, "rewards/margins": 3.5439364910125732, "rewards/rejected": -17.155567169189453, "step": 824 }, { "epoch": 0.11233660130718955, "grad_norm": 48.90236597205194, "learning_rate": 7.996340034296277e-07, "logits/chosen": 1.7769758701324463, "logits/rejected": 2.4535775184631348, "logps/chosen": -1.4856864213943481, "logps/rejected": -1.503831148147583, "loss": 4.3948, "rewards/accuracies": 0.75, "rewards/chosen": -14.856863975524902, "rewards/margins": 0.18144726753234863, "rewards/rejected": -15.038311004638672, "step": 825 }, { "epoch": 0.11247276688453159, "grad_norm": 43.010241751736245, "learning_rate": 7.996258262630485e-07, "logits/chosen": 2.399486780166626, "logits/rejected": 3.373544931411743, "logps/chosen": -1.674178123474121, "logps/rejected": -1.8692243099212646, "loss": 3.9348, "rewards/accuracies": 0.5, "rewards/chosen": -16.74178123474121, "rewards/margins": 1.9504618644714355, "rewards/rejected": -18.692243576049805, "step": 826 }, { "epoch": 0.11260893246187363, "grad_norm": 43.79937026723613, "learning_rate": 7.996175587977104e-07, "logits/chosen": 1.6525425910949707, "logits/rejected": 0.7102622985839844, "logps/chosen": -1.4823822975158691, "logps/rejected": -1.2895011901855469, "loss": 4.463, "rewards/accuracies": 0.25, "rewards/chosen": -14.823822975158691, "rewards/margins": -1.92881178855896, "rewards/rejected": -12.895011901855469, "step": 827 }, { "epoch": 0.11274509803921569, "grad_norm": 48.205696352996604, "learning_rate": 7.996092010354817e-07, "logits/chosen": 1.981593132019043, "logits/rejected": 2.024764060974121, "logps/chosen": -1.5150176286697388, "logps/rejected": -1.5545960664749146, "loss": 4.6519, "rewards/accuracies": 0.75, "rewards/chosen": -15.150177001953125, "rewards/margins": 0.3957836627960205, "rewards/rejected": -15.545960426330566, "step": 828 }, { "epoch": 0.11288126361655773, "grad_norm": 41.15111552512683, "learning_rate": 7.996007529782508e-07, "logits/chosen": 3.0032753944396973, "logits/rejected": 3.287769317626953, "logps/chosen": -2.0617880821228027, "logps/rejected": -2.3594627380371094, "loss": 3.8501, "rewards/accuracies": 0.75, "rewards/chosen": -20.617881774902344, "rewards/margins": 2.9767463207244873, "rewards/rejected": -23.594627380371094, "step": 829 }, { "epoch": 0.11301742919389979, "grad_norm": 45.95761526175978, "learning_rate": 7.995922146279267e-07, "logits/chosen": 1.5732563734054565, "logits/rejected": 2.136934757232666, "logps/chosen": -1.9805256128311157, "logps/rejected": -2.023333787918091, "loss": 4.3442, "rewards/accuracies": 0.5, "rewards/chosen": -19.805255889892578, "rewards/margins": 0.4280812740325928, "rewards/rejected": -20.23333740234375, "step": 830 }, { "epoch": 0.11315359477124183, "grad_norm": 48.072765368240674, "learning_rate": 7.995835859864385e-07, "logits/chosen": 2.952587366104126, "logits/rejected": 0.32962629199028015, "logps/chosen": -1.399166226387024, "logps/rejected": -1.2559670209884644, "loss": 4.9053, "rewards/accuracies": 0.25, "rewards/chosen": -13.99166202545166, "rewards/margins": -1.4319922924041748, "rewards/rejected": -12.559669494628906, "step": 831 }, { "epoch": 0.11328976034858387, "grad_norm": 59.4758766836427, "learning_rate": 7.995748670557361e-07, "logits/chosen": 4.099056243896484, "logits/rejected": 3.422488212585449, "logps/chosen": -2.5500011444091797, "logps/rejected": -2.2328479290008545, "loss": 5.0373, "rewards/accuracies": 0.5, "rewards/chosen": -25.50001335144043, "rewards/margins": -3.171535015106201, "rewards/rejected": -22.328479766845703, "step": 832 }, { "epoch": 0.11342592592592593, "grad_norm": 47.96955180759823, "learning_rate": 7.995660578377897e-07, "logits/chosen": 2.7724385261535645, "logits/rejected": 2.3566336631774902, "logps/chosen": -1.3520514965057373, "logps/rejected": -1.3691229820251465, "loss": 4.4039, "rewards/accuracies": 0.5, "rewards/chosen": -13.520515441894531, "rewards/margins": 0.1707148551940918, "rewards/rejected": -13.691230773925781, "step": 833 }, { "epoch": 0.11356209150326797, "grad_norm": 44.37271955461852, "learning_rate": 7.995571583345896e-07, "logits/chosen": 2.704946756362915, "logits/rejected": 4.200160980224609, "logps/chosen": -1.9050039052963257, "logps/rejected": -1.5701004266738892, "loss": 4.63, "rewards/accuracies": 0.0, "rewards/chosen": -19.050039291381836, "rewards/margins": -3.3490347862243652, "rewards/rejected": -15.701004028320312, "step": 834 }, { "epoch": 0.11369825708061002, "grad_norm": 44.80056171153922, "learning_rate": 7.995481685481467e-07, "logits/chosen": 1.2476739883422852, "logits/rejected": 2.4849250316619873, "logps/chosen": -1.4875773191452026, "logps/rejected": -1.7272416353225708, "loss": 4.185, "rewards/accuracies": 1.0, "rewards/chosen": -14.875772476196289, "rewards/margins": 2.3966426849365234, "rewards/rejected": -17.272415161132812, "step": 835 }, { "epoch": 0.11383442265795207, "grad_norm": 45.56390729426428, "learning_rate": 7.995390884804925e-07, "logits/chosen": 2.8282270431518555, "logits/rejected": 3.4526987075805664, "logps/chosen": -1.8931866884231567, "logps/rejected": -1.9238853454589844, "loss": 4.384, "rewards/accuracies": 0.75, "rewards/chosen": -18.931867599487305, "rewards/margins": 0.30698513984680176, "rewards/rejected": -19.238853454589844, "step": 836 }, { "epoch": 0.11397058823529412, "grad_norm": 46.69729344504733, "learning_rate": 7.995299181336787e-07, "logits/chosen": 2.0192503929138184, "logits/rejected": 3.453049659729004, "logps/chosen": -1.5260149240493774, "logps/rejected": -1.8976690769195557, "loss": 4.4829, "rewards/accuracies": 1.0, "rewards/chosen": -15.260149002075195, "rewards/margins": 3.7165420055389404, "rewards/rejected": -18.97669219970703, "step": 837 }, { "epoch": 0.11410675381263617, "grad_norm": 50.6415286798642, "learning_rate": 7.995206575097774e-07, "logits/chosen": 2.5289466381073, "logits/rejected": 3.0025687217712402, "logps/chosen": -1.7541215419769287, "logps/rejected": -1.8788098096847534, "loss": 4.5422, "rewards/accuracies": 0.5, "rewards/chosen": -17.541215896606445, "rewards/margins": 1.2468812465667725, "rewards/rejected": -18.788097381591797, "step": 838 }, { "epoch": 0.11424291938997821, "grad_norm": 63.45579087328, "learning_rate": 7.995113066108809e-07, "logits/chosen": 0.9508804082870483, "logits/rejected": 2.5107579231262207, "logps/chosen": -1.547587513923645, "logps/rejected": -1.65601646900177, "loss": 4.4804, "rewards/accuracies": 0.75, "rewards/chosen": -15.475874900817871, "rewards/margins": 1.0842900276184082, "rewards/rejected": -16.560165405273438, "step": 839 }, { "epoch": 0.11437908496732026, "grad_norm": 46.98610508455081, "learning_rate": 7.995018654391023e-07, "logits/chosen": 2.2774386405944824, "logits/rejected": 3.077622890472412, "logps/chosen": -1.2566163539886475, "logps/rejected": -1.7297171354293823, "loss": 4.2073, "rewards/accuracies": 1.0, "rewards/chosen": -12.566163063049316, "rewards/margins": 4.731008529663086, "rewards/rejected": -17.297170639038086, "step": 840 }, { "epoch": 0.11451525054466231, "grad_norm": 41.92354206394137, "learning_rate": 7.99492333996575e-07, "logits/chosen": 1.1424217224121094, "logits/rejected": 3.6814727783203125, "logps/chosen": -1.4910902976989746, "logps/rejected": -1.8798474073410034, "loss": 4.2421, "rewards/accuracies": 1.0, "rewards/chosen": -14.910902976989746, "rewards/margins": 3.887571096420288, "rewards/rejected": -18.798473358154297, "step": 841 }, { "epoch": 0.11465141612200436, "grad_norm": 46.881984626477774, "learning_rate": 7.994827122854523e-07, "logits/chosen": 1.1174691915512085, "logits/rejected": 2.587496042251587, "logps/chosen": -1.3951618671417236, "logps/rejected": -1.650969386100769, "loss": 4.5022, "rewards/accuracies": 0.75, "rewards/chosen": -13.951618194580078, "rewards/margins": 2.558075428009033, "rewards/rejected": -16.509693145751953, "step": 842 }, { "epoch": 0.1147875816993464, "grad_norm": 46.406244624209776, "learning_rate": 7.994730003079089e-07, "logits/chosen": 2.2886931896209717, "logits/rejected": 0.310746431350708, "logps/chosen": -1.9798873662948608, "logps/rejected": -1.4224189519882202, "loss": 4.4827, "rewards/accuracies": 0.0, "rewards/chosen": -19.798873901367188, "rewards/margins": -5.574683666229248, "rewards/rejected": -14.224189758300781, "step": 843 }, { "epoch": 0.11492374727668846, "grad_norm": 47.37419257219609, "learning_rate": 7.994631980661389e-07, "logits/chosen": 1.629844307899475, "logits/rejected": 2.855994701385498, "logps/chosen": -1.7131924629211426, "logps/rejected": -2.3272547721862793, "loss": 4.6089, "rewards/accuracies": 0.75, "rewards/chosen": -17.131925582885742, "rewards/margins": 6.140622138977051, "rewards/rejected": -23.27254867553711, "step": 844 }, { "epoch": 0.1150599128540305, "grad_norm": 46.968529423649635, "learning_rate": 7.994533055623573e-07, "logits/chosen": 0.8650320768356323, "logits/rejected": 3.723538398742676, "logps/chosen": -1.4823113679885864, "logps/rejected": -1.8010408878326416, "loss": 4.13, "rewards/accuracies": 0.75, "rewards/chosen": -14.823114395141602, "rewards/margins": 3.1872940063476562, "rewards/rejected": -18.010408401489258, "step": 845 }, { "epoch": 0.11519607843137254, "grad_norm": 54.7766132777569, "learning_rate": 7.994433227987996e-07, "logits/chosen": -0.5167635679244995, "logits/rejected": -0.3210301995277405, "logps/chosen": -1.4584416151046753, "logps/rejected": -1.7351486682891846, "loss": 3.8521, "rewards/accuracies": 1.0, "rewards/chosen": -14.584417343139648, "rewards/margins": 2.767068862915039, "rewards/rejected": -17.351486206054688, "step": 846 }, { "epoch": 0.1153322440087146, "grad_norm": 40.199902120895146, "learning_rate": 7.994332497777209e-07, "logits/chosen": 0.6127530336380005, "logits/rejected": 4.240593910217285, "logps/chosen": -1.7447566986083984, "logps/rejected": -1.9363104104995728, "loss": 4.3514, "rewards/accuracies": 0.75, "rewards/chosen": -17.447566986083984, "rewards/margins": 1.9155373573303223, "rewards/rejected": -19.36310386657715, "step": 847 }, { "epoch": 0.11546840958605664, "grad_norm": 49.16399410717227, "learning_rate": 7.994230865013979e-07, "logits/chosen": 1.4192800521850586, "logits/rejected": 1.7304315567016602, "logps/chosen": -1.7894847393035889, "logps/rejected": -1.9903833866119385, "loss": 3.5759, "rewards/accuracies": 0.75, "rewards/chosen": -17.894847869873047, "rewards/margins": 2.0089850425720215, "rewards/rejected": -19.903831481933594, "step": 848 }, { "epoch": 0.1156045751633987, "grad_norm": 54.63451697949813, "learning_rate": 7.994128329721269e-07, "logits/chosen": 0.4251525402069092, "logits/rejected": 1.5633599758148193, "logps/chosen": -1.1839582920074463, "logps/rejected": -1.267134428024292, "loss": 4.0145, "rewards/accuracies": 0.75, "rewards/chosen": -11.839582443237305, "rewards/margins": 0.8317608833312988, "rewards/rejected": -12.671342849731445, "step": 849 }, { "epoch": 0.11574074074074074, "grad_norm": 54.58671631430967, "learning_rate": 7.994024891922245e-07, "logits/chosen": 2.5375373363494873, "logits/rejected": 1.9217723608016968, "logps/chosen": -1.7868391275405884, "logps/rejected": -1.7867859601974487, "loss": 4.6416, "rewards/accuracies": 0.5, "rewards/chosen": -17.868391036987305, "rewards/margins": -0.0005316734313964844, "rewards/rejected": -17.86785888671875, "step": 850 }, { "epoch": 0.11587690631808278, "grad_norm": 48.343758124513805, "learning_rate": 7.993920551640283e-07, "logits/chosen": 2.201683282852173, "logits/rejected": 3.807379722595215, "logps/chosen": -2.0116477012634277, "logps/rejected": -2.4347832202911377, "loss": 3.6726, "rewards/accuracies": 0.75, "rewards/chosen": -20.11647605895996, "rewards/margins": 4.231355667114258, "rewards/rejected": -24.34783172607422, "step": 851 }, { "epoch": 0.11601307189542484, "grad_norm": 53.55420333872477, "learning_rate": 7.993815308898958e-07, "logits/chosen": 1.8952603340148926, "logits/rejected": 2.046891689300537, "logps/chosen": -1.9970299005508423, "logps/rejected": -2.026341438293457, "loss": 4.8522, "rewards/accuracies": 0.5, "rewards/chosen": -19.970298767089844, "rewards/margins": 0.29311490058898926, "rewards/rejected": -20.263412475585938, "step": 852 }, { "epoch": 0.11614923747276688, "grad_norm": 49.45265641061821, "learning_rate": 7.993709163722051e-07, "logits/chosen": 2.6542701721191406, "logits/rejected": 2.2075979709625244, "logps/chosen": -1.859556794166565, "logps/rejected": -1.4654427766799927, "loss": 4.689, "rewards/accuracies": 0.25, "rewards/chosen": -18.59556770324707, "rewards/margins": -3.9411399364471436, "rewards/rejected": -14.654428482055664, "step": 853 }, { "epoch": 0.11628540305010893, "grad_norm": 51.081619348704166, "learning_rate": 7.993602116133546e-07, "logits/chosen": 3.510411500930786, "logits/rejected": 1.9202117919921875, "logps/chosen": -1.8506027460098267, "logps/rejected": -1.8032580614089966, "loss": 5.0647, "rewards/accuracies": 0.5, "rewards/chosen": -18.506027221679688, "rewards/margins": -0.4734470844268799, "rewards/rejected": -18.032581329345703, "step": 854 }, { "epoch": 0.11642156862745098, "grad_norm": 50.62543547913867, "learning_rate": 7.993494166157631e-07, "logits/chosen": 2.438410997390747, "logits/rejected": 1.9896981716156006, "logps/chosen": -1.949419617652893, "logps/rejected": -1.811435580253601, "loss": 4.6582, "rewards/accuracies": 0.25, "rewards/chosen": -19.49419593811035, "rewards/margins": -1.3798408508300781, "rewards/rejected": -18.114355087280273, "step": 855 }, { "epoch": 0.11655773420479303, "grad_norm": 53.567834480397416, "learning_rate": 7.993385313818699e-07, "logits/chosen": 2.441056489944458, "logits/rejected": 3.5582034587860107, "logps/chosen": -1.7859982252120972, "logps/rejected": -2.0205092430114746, "loss": 4.23, "rewards/accuracies": 0.75, "rewards/chosen": -17.859981536865234, "rewards/margins": 2.345109462738037, "rewards/rejected": -20.20509147644043, "step": 856 }, { "epoch": 0.11669389978213508, "grad_norm": 44.13996029656647, "learning_rate": 7.993275559141346e-07, "logits/chosen": 0.22133344411849976, "logits/rejected": 1.6047887802124023, "logps/chosen": -1.6458743810653687, "logps/rejected": -1.9428167343139648, "loss": 3.7823, "rewards/accuracies": 1.0, "rewards/chosen": -16.458742141723633, "rewards/margins": 2.969424247741699, "rewards/rejected": -19.42816734313965, "step": 857 }, { "epoch": 0.11683006535947713, "grad_norm": 48.722243588710846, "learning_rate": 7.993164902150371e-07, "logits/chosen": 3.120360851287842, "logits/rejected": 3.247420310974121, "logps/chosen": -2.173720121383667, "logps/rejected": -2.120820999145508, "loss": 4.7006, "rewards/accuracies": 0.25, "rewards/chosen": -21.737201690673828, "rewards/margins": -0.5289902687072754, "rewards/rejected": -21.208209991455078, "step": 858 }, { "epoch": 0.11696623093681917, "grad_norm": 68.87544979298674, "learning_rate": 7.993053342870779e-07, "logits/chosen": 3.980652093887329, "logits/rejected": 3.9197211265563965, "logps/chosen": -2.266096591949463, "logps/rejected": -2.4181833267211914, "loss": 4.9572, "rewards/accuracies": 0.75, "rewards/chosen": -22.660966873168945, "rewards/margins": 1.5208678245544434, "rewards/rejected": -24.181835174560547, "step": 859 }, { "epoch": 0.11710239651416122, "grad_norm": 51.330990483892904, "learning_rate": 7.992940881327778e-07, "logits/chosen": 4.339508533477783, "logits/rejected": 4.806326866149902, "logps/chosen": -2.201201915740967, "logps/rejected": -2.1861722469329834, "loss": 4.2055, "rewards/accuracies": 0.5, "rewards/chosen": -22.01201820373535, "rewards/margins": -0.15029573440551758, "rewards/rejected": -21.861722946166992, "step": 860 }, { "epoch": 0.11723856209150327, "grad_norm": 53.27101785747058, "learning_rate": 7.992827517546777e-07, "logits/chosen": 2.8333237171173096, "logits/rejected": 4.645842552185059, "logps/chosen": -2.2139110565185547, "logps/rejected": -2.4793238639831543, "loss": 4.7255, "rewards/accuracies": 1.0, "rewards/chosen": -22.139110565185547, "rewards/margins": 2.6541295051574707, "rewards/rejected": -24.79323959350586, "step": 861 }, { "epoch": 0.11737472766884531, "grad_norm": 45.53254189261093, "learning_rate": 7.992713251553395e-07, "logits/chosen": 1.385441541671753, "logits/rejected": 2.9354054927825928, "logps/chosen": -1.6334688663482666, "logps/rejected": -1.712799072265625, "loss": 4.5661, "rewards/accuracies": 0.5, "rewards/chosen": -16.334688186645508, "rewards/margins": 0.7933023571968079, "rewards/rejected": -17.12799072265625, "step": 862 }, { "epoch": 0.11751089324618737, "grad_norm": 49.58093045012216, "learning_rate": 7.992598083373449e-07, "logits/chosen": -0.12410475313663483, "logits/rejected": 2.752866268157959, "logps/chosen": -1.4814844131469727, "logps/rejected": -2.0705432891845703, "loss": 4.6331, "rewards/accuracies": 1.0, "rewards/chosen": -14.814844131469727, "rewards/margins": 5.89058780670166, "rewards/rejected": -20.705432891845703, "step": 863 }, { "epoch": 0.11764705882352941, "grad_norm": 52.54638285848401, "learning_rate": 7.992482013032963e-07, "logits/chosen": 4.0326128005981445, "logits/rejected": 4.245574474334717, "logps/chosen": -2.1645009517669678, "logps/rejected": -2.6763851642608643, "loss": 5.2994, "rewards/accuracies": 0.5, "rewards/chosen": -21.645009994506836, "rewards/margins": 5.118844032287598, "rewards/rejected": -26.76385498046875, "step": 864 }, { "epoch": 0.11778322440087145, "grad_norm": 50.725361892984765, "learning_rate": 7.992365040558164e-07, "logits/chosen": 2.423882484436035, "logits/rejected": 3.5195913314819336, "logps/chosen": -1.7876449823379517, "logps/rejected": -1.9292962551116943, "loss": 4.4913, "rewards/accuracies": 0.5, "rewards/chosen": -17.876449584960938, "rewards/margins": 1.4165124893188477, "rewards/rejected": -19.2929630279541, "step": 865 }, { "epoch": 0.11791938997821351, "grad_norm": 54.78196695953161, "learning_rate": 7.992247165975483e-07, "logits/chosen": 1.8908677101135254, "logits/rejected": 1.2041738033294678, "logps/chosen": -1.8776276111602783, "logps/rejected": -1.8243515491485596, "loss": 4.7911, "rewards/accuracies": 0.5, "rewards/chosen": -18.776275634765625, "rewards/margins": -0.5327606201171875, "rewards/rejected": -18.243515014648438, "step": 866 }, { "epoch": 0.11805555555555555, "grad_norm": 54.582903511910985, "learning_rate": 7.992128389311554e-07, "logits/chosen": 2.0120766162872314, "logits/rejected": 1.5648332834243774, "logps/chosen": -1.8362517356872559, "logps/rejected": -1.792034387588501, "loss": 5.2277, "rewards/accuracies": 0.25, "rewards/chosen": -18.362518310546875, "rewards/margins": -0.44217371940612793, "rewards/rejected": -17.92034339904785, "step": 867 }, { "epoch": 0.11819172113289761, "grad_norm": 44.37086720057941, "learning_rate": 7.992008710593216e-07, "logits/chosen": 2.1598846912384033, "logits/rejected": 1.2868808507919312, "logps/chosen": -1.6502948999404907, "logps/rejected": -1.7930500507354736, "loss": 4.6189, "rewards/accuracies": 0.5, "rewards/chosen": -16.502948760986328, "rewards/margins": 1.4275517463684082, "rewards/rejected": -17.930500030517578, "step": 868 }, { "epoch": 0.11832788671023965, "grad_norm": 52.714659770974784, "learning_rate": 7.991888129847513e-07, "logits/chosen": 4.4493608474731445, "logits/rejected": 3.8399298191070557, "logps/chosen": -2.5757923126220703, "logps/rejected": -2.1237781047821045, "loss": 4.599, "rewards/accuracies": 0.25, "rewards/chosen": -25.757923126220703, "rewards/margins": -4.520142555236816, "rewards/rejected": -21.237781524658203, "step": 869 }, { "epoch": 0.1184640522875817, "grad_norm": 48.564790144503476, "learning_rate": 7.991766647101688e-07, "logits/chosen": 3.215428113937378, "logits/rejected": 3.270777940750122, "logps/chosen": -1.8084893226623535, "logps/rejected": -1.7332772016525269, "loss": 4.1048, "rewards/accuracies": 0.25, "rewards/chosen": -18.08489227294922, "rewards/margins": -0.7521212100982666, "rewards/rejected": -17.33277130126953, "step": 870 }, { "epoch": 0.11860021786492375, "grad_norm": 52.12580122712689, "learning_rate": 7.991644262383194e-07, "logits/chosen": 1.5108394622802734, "logits/rejected": 3.3123583793640137, "logps/chosen": -1.4959354400634766, "logps/rejected": -1.6827478408813477, "loss": 4.6875, "rewards/accuracies": 0.75, "rewards/chosen": -14.959354400634766, "rewards/margins": 1.8681232929229736, "rewards/rejected": -16.827478408813477, "step": 871 }, { "epoch": 0.1187363834422658, "grad_norm": 53.29165778622577, "learning_rate": 7.991520975719684e-07, "logits/chosen": 5.400932312011719, "logits/rejected": 3.800661087036133, "logps/chosen": -2.185899496078491, "logps/rejected": -2.310715675354004, "loss": 4.5488, "rewards/accuracies": 0.25, "rewards/chosen": -21.858993530273438, "rewards/margins": 1.2481637001037598, "rewards/rejected": -23.10715675354004, "step": 872 }, { "epoch": 0.11887254901960784, "grad_norm": 51.25220548897344, "learning_rate": 7.991396787139013e-07, "logits/chosen": 4.243049621582031, "logits/rejected": 5.633801460266113, "logps/chosen": -2.0541582107543945, "logps/rejected": -2.3270456790924072, "loss": 4.0915, "rewards/accuracies": 0.75, "rewards/chosen": -20.541580200195312, "rewards/margins": 2.728874921798706, "rewards/rejected": -23.270456314086914, "step": 873 }, { "epoch": 0.1190087145969499, "grad_norm": 50.62391219690929, "learning_rate": 7.991271696669247e-07, "logits/chosen": 3.5271315574645996, "logits/rejected": 2.5851893424987793, "logps/chosen": -2.0719523429870605, "logps/rejected": -2.024388313293457, "loss": 3.8494, "rewards/accuracies": 0.75, "rewards/chosen": -20.719524383544922, "rewards/margins": -0.47563958168029785, "rewards/rejected": -20.243885040283203, "step": 874 }, { "epoch": 0.11914488017429194, "grad_norm": 52.701977519744716, "learning_rate": 7.991145704338649e-07, "logits/chosen": 2.2306771278381348, "logits/rejected": 3.6684458255767822, "logps/chosen": -1.6128427982330322, "logps/rejected": -1.5204486846923828, "loss": 4.8848, "rewards/accuracies": 0.5, "rewards/chosen": -16.128427505493164, "rewards/margins": -0.9239418506622314, "rewards/rejected": -15.204485893249512, "step": 875 }, { "epoch": 0.119281045751634, "grad_norm": 45.034882321817896, "learning_rate": 7.991018810175687e-07, "logits/chosen": 2.3071792125701904, "logits/rejected": 4.761366844177246, "logps/chosen": -1.7150706052780151, "logps/rejected": -1.935694694519043, "loss": 3.7907, "rewards/accuracies": 1.0, "rewards/chosen": -17.150705337524414, "rewards/margins": 2.20624041557312, "rewards/rejected": -19.356945037841797, "step": 876 }, { "epoch": 0.11941721132897604, "grad_norm": 51.69503389908951, "learning_rate": 7.990891014209034e-07, "logits/chosen": 2.0511887073516846, "logits/rejected": 2.9388327598571777, "logps/chosen": -1.626375675201416, "logps/rejected": -1.6925098896026611, "loss": 4.0066, "rewards/accuracies": 0.5, "rewards/chosen": -16.263755798339844, "rewards/margins": 0.6613430976867676, "rewards/rejected": -16.925098419189453, "step": 877 }, { "epoch": 0.11955337690631808, "grad_norm": 43.628107045198526, "learning_rate": 7.990762316467568e-07, "logits/chosen": 5.323932647705078, "logits/rejected": 4.204380989074707, "logps/chosen": -2.0348165035247803, "logps/rejected": -2.4408843517303467, "loss": 4.5968, "rewards/accuracies": 1.0, "rewards/chosen": -20.348163604736328, "rewards/margins": 4.060677528381348, "rewards/rejected": -24.408842086791992, "step": 878 }, { "epoch": 0.11968954248366014, "grad_norm": 60.1506029446996, "learning_rate": 7.99063271698037e-07, "logits/chosen": 3.0431742668151855, "logits/rejected": 4.738241195678711, "logps/chosen": -1.845768928527832, "logps/rejected": -2.125215768814087, "loss": 3.4606, "rewards/accuracies": 1.0, "rewards/chosen": -18.45768928527832, "rewards/margins": 2.7944679260253906, "rewards/rejected": -21.252159118652344, "step": 879 }, { "epoch": 0.11982570806100218, "grad_norm": 49.726279613434585, "learning_rate": 7.990502215776722e-07, "logits/chosen": 2.364562511444092, "logits/rejected": 3.814422845840454, "logps/chosen": -2.229569435119629, "logps/rejected": -2.138154983520508, "loss": 4.2295, "rewards/accuracies": 0.25, "rewards/chosen": -22.295692443847656, "rewards/margins": -0.9141418933868408, "rewards/rejected": -21.381549835205078, "step": 880 }, { "epoch": 0.11996187363834422, "grad_norm": 47.62254485640418, "learning_rate": 7.990370812886113e-07, "logits/chosen": 3.9903767108917236, "logits/rejected": 3.561342239379883, "logps/chosen": -2.2625503540039062, "logps/rejected": -2.050830602645874, "loss": 4.811, "rewards/accuracies": 0.5, "rewards/chosen": -22.625505447387695, "rewards/margins": -2.1171998977661133, "rewards/rejected": -20.5083065032959, "step": 881 }, { "epoch": 0.12009803921568628, "grad_norm": 45.31488670372731, "learning_rate": 7.990238508338232e-07, "logits/chosen": 4.295554161071777, "logits/rejected": 2.507777214050293, "logps/chosen": -1.6545426845550537, "logps/rejected": -1.713692307472229, "loss": 4.7762, "rewards/accuracies": 0.25, "rewards/chosen": -16.545425415039062, "rewards/margins": 0.5914962291717529, "rewards/rejected": -17.13692283630371, "step": 882 }, { "epoch": 0.12023420479302832, "grad_norm": 51.4288333908726, "learning_rate": 7.990105302162978e-07, "logits/chosen": 2.3356525897979736, "logits/rejected": 4.896504878997803, "logps/chosen": -1.383535385131836, "logps/rejected": -1.8402224779129028, "loss": 4.3857, "rewards/accuracies": 0.75, "rewards/chosen": -13.83535385131836, "rewards/margins": 4.56687068939209, "rewards/rejected": -18.402225494384766, "step": 883 }, { "epoch": 0.12037037037037036, "grad_norm": 43.40893607266728, "learning_rate": 7.989971194390447e-07, "logits/chosen": 2.3620147705078125, "logits/rejected": 3.925027847290039, "logps/chosen": -2.0039594173431396, "logps/rejected": -2.0436177253723145, "loss": 4.7649, "rewards/accuracies": 0.25, "rewards/chosen": -20.039592742919922, "rewards/margins": 0.39658498764038086, "rewards/rejected": -20.43617820739746, "step": 884 }, { "epoch": 0.12050653594771242, "grad_norm": 44.98425278088978, "learning_rate": 7.989836185050945e-07, "logits/chosen": 2.06748628616333, "logits/rejected": 2.3972461223602295, "logps/chosen": -1.5501110553741455, "logps/rejected": -1.6493635177612305, "loss": 4.2867, "rewards/accuracies": 0.5, "rewards/chosen": -15.501111030578613, "rewards/margins": 0.9925248622894287, "rewards/rejected": -16.493637084960938, "step": 885 }, { "epoch": 0.12064270152505446, "grad_norm": 54.499245004877935, "learning_rate": 7.989700274174976e-07, "logits/chosen": 3.6672072410583496, "logits/rejected": 3.6248669624328613, "logps/chosen": -1.8138240575790405, "logps/rejected": -1.950298547744751, "loss": 4.0277, "rewards/accuracies": 0.75, "rewards/chosen": -18.138240814208984, "rewards/margins": 1.3647444248199463, "rewards/rejected": -19.502986907958984, "step": 886 }, { "epoch": 0.12077886710239652, "grad_norm": 43.94788530775465, "learning_rate": 7.989563461793251e-07, "logits/chosen": 2.957564353942871, "logits/rejected": 3.4561147689819336, "logps/chosen": -1.7467494010925293, "logps/rejected": -1.8997581005096436, "loss": 4.0799, "rewards/accuracies": 0.75, "rewards/chosen": -17.46749496459961, "rewards/margins": 1.530085802078247, "rewards/rejected": -18.99757957458496, "step": 887 }, { "epoch": 0.12091503267973856, "grad_norm": 47.963870997381825, "learning_rate": 7.989425747936683e-07, "logits/chosen": 2.9467356204986572, "logits/rejected": 4.6088547706604, "logps/chosen": -2.3541157245635986, "logps/rejected": -2.185776472091675, "loss": 4.0556, "rewards/accuracies": 0.5, "rewards/chosen": -23.541156768798828, "rewards/margins": -1.6833930015563965, "rewards/rejected": -21.857765197753906, "step": 888 }, { "epoch": 0.1210511982570806, "grad_norm": 45.23078066566835, "learning_rate": 7.989287132636392e-07, "logits/chosen": 2.640075206756592, "logits/rejected": 4.25126838684082, "logps/chosen": -1.7079917192459106, "logps/rejected": -2.127403736114502, "loss": 3.9907, "rewards/accuracies": 0.75, "rewards/chosen": -17.079917907714844, "rewards/margins": 4.194119453430176, "rewards/rejected": -21.274036407470703, "step": 889 }, { "epoch": 0.12118736383442266, "grad_norm": 60.38725861822943, "learning_rate": 7.989147615923695e-07, "logits/chosen": 2.7251880168914795, "logits/rejected": 2.33418607711792, "logps/chosen": -1.8869001865386963, "logps/rejected": -1.9755947589874268, "loss": 4.5297, "rewards/accuracies": 0.5, "rewards/chosen": -18.869001388549805, "rewards/margins": 0.8869457244873047, "rewards/rejected": -19.75594711303711, "step": 890 }, { "epoch": 0.1213235294117647, "grad_norm": 39.78367029497921, "learning_rate": 7.98900719783012e-07, "logits/chosen": 3.592390537261963, "logits/rejected": 3.4704785346984863, "logps/chosen": -1.9145240783691406, "logps/rejected": -1.9831550121307373, "loss": 4.263, "rewards/accuracies": 0.5, "rewards/chosen": -19.145240783691406, "rewards/margins": 0.6863094568252563, "rewards/rejected": -19.8315486907959, "step": 891 }, { "epoch": 0.12145969498910675, "grad_norm": 39.98600859499776, "learning_rate": 7.988865878387398e-07, "logits/chosen": 4.986907482147217, "logits/rejected": 3.9844894409179688, "logps/chosen": -1.8658912181854248, "logps/rejected": -1.820509672164917, "loss": 3.8988, "rewards/accuracies": 0.25, "rewards/chosen": -18.658912658691406, "rewards/margins": -0.45381617546081543, "rewards/rejected": -18.205097198486328, "step": 892 }, { "epoch": 0.1215958605664488, "grad_norm": 47.93114141413785, "learning_rate": 7.988723657627457e-07, "logits/chosen": 2.828723430633545, "logits/rejected": 3.1478519439697266, "logps/chosen": -1.7641884088516235, "logps/rejected": -2.167163133621216, "loss": 4.5463, "rewards/accuracies": 0.75, "rewards/chosen": -17.641883850097656, "rewards/margins": 4.02974796295166, "rewards/rejected": -21.671632766723633, "step": 893 }, { "epoch": 0.12173202614379085, "grad_norm": 43.96547445286468, "learning_rate": 7.988580535582434e-07, "logits/chosen": 1.5343694686889648, "logits/rejected": 2.131925582885742, "logps/chosen": -1.510658621788025, "logps/rejected": -1.760756492614746, "loss": 4.4418, "rewards/accuracies": 1.0, "rewards/chosen": -15.106585502624512, "rewards/margins": 2.500978946685791, "rewards/rejected": -17.60756492614746, "step": 894 }, { "epoch": 0.1218681917211329, "grad_norm": 47.17399407085669, "learning_rate": 7.988436512284667e-07, "logits/chosen": 4.421229839324951, "logits/rejected": 4.899317741394043, "logps/chosen": -2.006418228149414, "logps/rejected": -2.0009467601776123, "loss": 4.7293, "rewards/accuracies": 0.5, "rewards/chosen": -20.06418228149414, "rewards/margins": -0.05471348762512207, "rewards/rejected": -20.00946807861328, "step": 895 }, { "epoch": 0.12200435729847495, "grad_norm": 43.107856955478454, "learning_rate": 7.988291587766704e-07, "logits/chosen": 0.8289044499397278, "logits/rejected": 2.702223539352417, "logps/chosen": -1.587628960609436, "logps/rejected": -1.7312891483306885, "loss": 4.8402, "rewards/accuracies": 0.5, "rewards/chosen": -15.876289367675781, "rewards/margins": 1.43660306930542, "rewards/rejected": -17.31289291381836, "step": 896 }, { "epoch": 0.12214052287581699, "grad_norm": 48.80560255353635, "learning_rate": 7.98814576206129e-07, "logits/chosen": 3.487823963165283, "logits/rejected": 4.3467488288879395, "logps/chosen": -2.316397190093994, "logps/rejected": -2.3016953468322754, "loss": 4.67, "rewards/accuracies": 0.5, "rewards/chosen": -23.163970947265625, "rewards/margins": -0.14701604843139648, "rewards/rejected": -23.016956329345703, "step": 897 }, { "epoch": 0.12227668845315905, "grad_norm": 51.94487224199201, "learning_rate": 7.987999035201373e-07, "logits/chosen": 3.246192216873169, "logits/rejected": 3.2712979316711426, "logps/chosen": -1.5077824592590332, "logps/rejected": -1.714011788368225, "loss": 4.3118, "rewards/accuracies": 0.5, "rewards/chosen": -15.077825546264648, "rewards/margins": 2.0622920989990234, "rewards/rejected": -17.140117645263672, "step": 898 }, { "epoch": 0.12241285403050109, "grad_norm": 40.745920143751306, "learning_rate": 7.987851407220109e-07, "logits/chosen": 2.261122226715088, "logits/rejected": 3.705681562423706, "logps/chosen": -1.9126046895980835, "logps/rejected": -2.072704792022705, "loss": 4.2735, "rewards/accuracies": 0.5, "rewards/chosen": -19.126047134399414, "rewards/margins": 1.6010000705718994, "rewards/rejected": -20.727046966552734, "step": 899 }, { "epoch": 0.12254901960784313, "grad_norm": 46.92729038709749, "learning_rate": 7.987702878150855e-07, "logits/chosen": 2.420598030090332, "logits/rejected": 3.9819130897521973, "logps/chosen": -1.8165498971939087, "logps/rejected": -1.8223271369934082, "loss": 4.457, "rewards/accuracies": 0.5, "rewards/chosen": -18.165498733520508, "rewards/margins": 0.057772159576416016, "rewards/rejected": -18.223270416259766, "step": 900 }, { "epoch": 0.12268518518518519, "grad_norm": 40.254809916916535, "learning_rate": 7.987553448027174e-07, "logits/chosen": 3.497283935546875, "logits/rejected": 2.517294406890869, "logps/chosen": -1.8074856996536255, "logps/rejected": -1.5914708375930786, "loss": 4.4078, "rewards/accuracies": 0.5, "rewards/chosen": -18.074857711791992, "rewards/margins": -2.1601479053497314, "rewards/rejected": -15.914709091186523, "step": 901 }, { "epoch": 0.12282135076252723, "grad_norm": 46.211637156703866, "learning_rate": 7.987403116882831e-07, "logits/chosen": 5.898731231689453, "logits/rejected": 3.39202880859375, "logps/chosen": -2.1485395431518555, "logps/rejected": -2.0711123943328857, "loss": 5.0794, "rewards/accuracies": 0.75, "rewards/chosen": -21.485395431518555, "rewards/margins": -0.7742719650268555, "rewards/rejected": -20.711124420166016, "step": 902 }, { "epoch": 0.12295751633986927, "grad_norm": 40.47768813378599, "learning_rate": 7.987251884751792e-07, "logits/chosen": 1.2656118869781494, "logits/rejected": 3.010359764099121, "logps/chosen": -1.5878299474716187, "logps/rejected": -2.1297402381896973, "loss": 4.3612, "rewards/accuracies": 1.0, "rewards/chosen": -15.878299713134766, "rewards/margins": 5.419101715087891, "rewards/rejected": -21.297401428222656, "step": 903 }, { "epoch": 0.12309368191721133, "grad_norm": 45.002176119070654, "learning_rate": 7.98709975166823e-07, "logits/chosen": 3.1371729373931885, "logits/rejected": 5.0243964195251465, "logps/chosen": -1.9846304655075073, "logps/rejected": -2.1264407634735107, "loss": 4.9665, "rewards/accuracies": 0.75, "rewards/chosen": -19.846303939819336, "rewards/margins": 1.4181022644042969, "rewards/rejected": -21.264408111572266, "step": 904 }, { "epoch": 0.12322984749455337, "grad_norm": 41.9371879041104, "learning_rate": 7.986946717666523e-07, "logits/chosen": 1.9815075397491455, "logits/rejected": 2.9157814979553223, "logps/chosen": -1.7494208812713623, "logps/rejected": -1.8174506425857544, "loss": 4.2174, "rewards/accuracies": 0.75, "rewards/chosen": -17.49420928955078, "rewards/margins": 0.6802983283996582, "rewards/rejected": -18.17450714111328, "step": 905 }, { "epoch": 0.12336601307189543, "grad_norm": 51.144513611598924, "learning_rate": 7.986792782781248e-07, "logits/chosen": 3.584714412689209, "logits/rejected": 3.4769034385681152, "logps/chosen": -1.8228222131729126, "logps/rejected": -1.7441591024398804, "loss": 4.9674, "rewards/accuracies": 0.5, "rewards/chosen": -18.228221893310547, "rewards/margins": -0.7866318225860596, "rewards/rejected": -17.44158935546875, "step": 906 }, { "epoch": 0.12350217864923747, "grad_norm": 39.00588308356759, "learning_rate": 7.986637947047188e-07, "logits/chosen": 0.6035721302032471, "logits/rejected": 2.180741310119629, "logps/chosen": -1.422387957572937, "logps/rejected": -1.5311262607574463, "loss": 4.3254, "rewards/accuracies": 0.5, "rewards/chosen": -14.223878860473633, "rewards/margins": 1.0873827934265137, "rewards/rejected": -15.311262130737305, "step": 907 }, { "epoch": 0.12363834422657952, "grad_norm": 43.24576843305702, "learning_rate": 7.986482210499332e-07, "logits/chosen": 4.407856464385986, "logits/rejected": 4.617403984069824, "logps/chosen": -1.7517787218093872, "logps/rejected": -1.9689725637435913, "loss": 4.4325, "rewards/accuracies": 1.0, "rewards/chosen": -17.51778793334961, "rewards/margins": 2.171938419342041, "rewards/rejected": -19.68972396850586, "step": 908 }, { "epoch": 0.12377450980392157, "grad_norm": 39.8597891799477, "learning_rate": 7.986325573172866e-07, "logits/chosen": 3.188600540161133, "logits/rejected": 3.3822360038757324, "logps/chosen": -1.8077504634857178, "logps/rejected": -2.0854079723358154, "loss": 4.3261, "rewards/accuracies": 1.0, "rewards/chosen": -18.077503204345703, "rewards/margins": 2.776573657989502, "rewards/rejected": -20.85407829284668, "step": 909 }, { "epoch": 0.12391067538126362, "grad_norm": 40.48649777596271, "learning_rate": 7.986168035103185e-07, "logits/chosen": 4.318376541137695, "logits/rejected": 4.441105365753174, "logps/chosen": -2.050438404083252, "logps/rejected": -2.065145492553711, "loss": 4.6948, "rewards/accuracies": 0.25, "rewards/chosen": -20.504384994506836, "rewards/margins": 0.14707112312316895, "rewards/rejected": -20.65145492553711, "step": 910 }, { "epoch": 0.12404684095860566, "grad_norm": 39.34427592505673, "learning_rate": 7.986009596325889e-07, "logits/chosen": 4.235972881317139, "logits/rejected": 3.971723794937134, "logps/chosen": -2.0144314765930176, "logps/rejected": -1.9402859210968018, "loss": 4.0947, "rewards/accuracies": 0.25, "rewards/chosen": -20.14431381225586, "rewards/margins": -0.7414541244506836, "rewards/rejected": -19.402860641479492, "step": 911 }, { "epoch": 0.12418300653594772, "grad_norm": 55.13086746594179, "learning_rate": 7.985850256876774e-07, "logits/chosen": 2.611689805984497, "logits/rejected": 4.877657890319824, "logps/chosen": -1.6150108575820923, "logps/rejected": -1.7778851985931396, "loss": 4.8756, "rewards/accuracies": 0.75, "rewards/chosen": -16.150108337402344, "rewards/margins": 1.6287431716918945, "rewards/rejected": -17.778850555419922, "step": 912 }, { "epoch": 0.12431917211328976, "grad_norm": 41.77063340125889, "learning_rate": 7.985690016791846e-07, "logits/chosen": 2.863447666168213, "logits/rejected": 4.869321823120117, "logps/chosen": -1.51405668258667, "logps/rejected": -1.8145978450775146, "loss": 3.9938, "rewards/accuracies": 0.75, "rewards/chosen": -15.140567779541016, "rewards/margins": 3.0054123401641846, "rewards/rejected": -18.145980834960938, "step": 913 }, { "epoch": 0.12445533769063181, "grad_norm": 41.3773080372825, "learning_rate": 7.985528876107314e-07, "logits/chosen": 4.1706862449646, "logits/rejected": 5.78816556930542, "logps/chosen": -1.9079742431640625, "logps/rejected": -2.2198243141174316, "loss": 4.5301, "rewards/accuracies": 1.0, "rewards/chosen": -19.079742431640625, "rewards/margins": 3.1184990406036377, "rewards/rejected": -22.1982421875, "step": 914 }, { "epoch": 0.12459150326797386, "grad_norm": 44.8235391820467, "learning_rate": 7.985366834859586e-07, "logits/chosen": 3.2147250175476074, "logits/rejected": 4.555718421936035, "logps/chosen": -2.053485631942749, "logps/rejected": -2.31988263130188, "loss": 4.5558, "rewards/accuracies": 0.75, "rewards/chosen": -20.534854888916016, "rewards/margins": 2.663970470428467, "rewards/rejected": -23.198827743530273, "step": 915 }, { "epoch": 0.1247276688453159, "grad_norm": 51.80056019710519, "learning_rate": 7.985203893085281e-07, "logits/chosen": 3.9187111854553223, "logits/rejected": 4.752150535583496, "logps/chosen": -2.083672046661377, "logps/rejected": -2.2120864391326904, "loss": 5.2339, "rewards/accuracies": 0.75, "rewards/chosen": -20.83671760559082, "rewards/margins": 1.2841455936431885, "rewards/rejected": -22.120864868164062, "step": 916 }, { "epoch": 0.12486383442265796, "grad_norm": 45.808721478793274, "learning_rate": 7.985040050821211e-07, "logits/chosen": 2.3678431510925293, "logits/rejected": 5.106697082519531, "logps/chosen": -1.841052770614624, "logps/rejected": -2.1922996044158936, "loss": 4.1853, "rewards/accuracies": 0.75, "rewards/chosen": -18.410526275634766, "rewards/margins": 3.5124692916870117, "rewards/rejected": -21.922996520996094, "step": 917 }, { "epoch": 0.125, "grad_norm": 52.193693729904226, "learning_rate": 7.984875308104403e-07, "logits/chosen": 3.818605899810791, "logits/rejected": 4.436585426330566, "logps/chosen": -1.8361883163452148, "logps/rejected": -2.098642349243164, "loss": 4.5985, "rewards/accuracies": 0.75, "rewards/chosen": -18.36188316345215, "rewards/margins": 2.624539613723755, "rewards/rejected": -20.98642349243164, "step": 918 }, { "epoch": 0.12513616557734206, "grad_norm": 40.68653663176752, "learning_rate": 7.984709664972079e-07, "logits/chosen": 4.49832010269165, "logits/rejected": 5.177220344543457, "logps/chosen": -1.807134985923767, "logps/rejected": -2.1265640258789062, "loss": 4.427, "rewards/accuracies": 1.0, "rewards/chosen": -18.07135009765625, "rewards/margins": 3.194289445877075, "rewards/rejected": -21.265640258789062, "step": 919 }, { "epoch": 0.12527233115468409, "grad_norm": 39.8036608579622, "learning_rate": 7.984543121461669e-07, "logits/chosen": 1.7482304573059082, "logits/rejected": 3.028883934020996, "logps/chosen": -1.8290069103240967, "logps/rejected": -1.993873119354248, "loss": 4.3224, "rewards/accuracies": 0.25, "rewards/chosen": -18.290069580078125, "rewards/margins": 1.6486623287200928, "rewards/rejected": -19.938732147216797, "step": 920 }, { "epoch": 0.12540849673202614, "grad_norm": 43.56477264237012, "learning_rate": 7.984375677610804e-07, "logits/chosen": 2.5465946197509766, "logits/rejected": 2.466539144515991, "logps/chosen": -1.4865622520446777, "logps/rejected": -1.6256344318389893, "loss": 4.5329, "rewards/accuracies": 0.75, "rewards/chosen": -14.865623474121094, "rewards/margins": 1.3907198905944824, "rewards/rejected": -16.256343841552734, "step": 921 }, { "epoch": 0.1255446623093682, "grad_norm": 46.080835568121124, "learning_rate": 7.984207333457318e-07, "logits/chosen": 4.878488540649414, "logits/rejected": 4.020928382873535, "logps/chosen": -2.3201558589935303, "logps/rejected": -2.229597806930542, "loss": 4.5935, "rewards/accuracies": 0.5, "rewards/chosen": -23.201557159423828, "rewards/margins": -0.9055802822113037, "rewards/rejected": -22.295978546142578, "step": 922 }, { "epoch": 0.12568082788671023, "grad_norm": 44.54751850669088, "learning_rate": 7.984038089039254e-07, "logits/chosen": 3.4787421226501465, "logits/rejected": 4.01495885848999, "logps/chosen": -2.3273308277130127, "logps/rejected": -2.4329044818878174, "loss": 4.9413, "rewards/accuracies": 0.5, "rewards/chosen": -23.27330780029297, "rewards/margins": 1.0557360649108887, "rewards/rejected": -24.329044342041016, "step": 923 }, { "epoch": 0.12581699346405228, "grad_norm": 43.969334879073976, "learning_rate": 7.98386794439485e-07, "logits/chosen": 3.795699119567871, "logits/rejected": 5.928686618804932, "logps/chosen": -1.6254379749298096, "logps/rejected": -2.1006834506988525, "loss": 4.5266, "rewards/accuracies": 0.75, "rewards/chosen": -16.254379272460938, "rewards/margins": 4.752453327178955, "rewards/rejected": -21.006832122802734, "step": 924 }, { "epoch": 0.12595315904139434, "grad_norm": 41.28367830069464, "learning_rate": 7.983696899562552e-07, "logits/chosen": 3.8273301124572754, "logits/rejected": 5.454450607299805, "logps/chosen": -2.059460163116455, "logps/rejected": -2.334733247756958, "loss": 4.2254, "rewards/accuracies": 0.75, "rewards/chosen": -20.5946044921875, "rewards/margins": 2.7527294158935547, "rewards/rejected": -23.347332000732422, "step": 925 }, { "epoch": 0.12608932461873637, "grad_norm": 40.13116693654219, "learning_rate": 7.98352495458101e-07, "logits/chosen": 5.558241844177246, "logits/rejected": 5.219942092895508, "logps/chosen": -2.000541925430298, "logps/rejected": -1.7785676717758179, "loss": 4.7046, "rewards/accuracies": 0.0, "rewards/chosen": -20.00541877746582, "rewards/margins": -2.2197418212890625, "rewards/rejected": -17.785676956176758, "step": 926 }, { "epoch": 0.12622549019607843, "grad_norm": 43.43238779886385, "learning_rate": 7.983352109489077e-07, "logits/chosen": 5.6935930252075195, "logits/rejected": 5.867504119873047, "logps/chosen": -2.1132583618164062, "logps/rejected": -2.3592963218688965, "loss": 4.2935, "rewards/accuracies": 0.75, "rewards/chosen": -21.132583618164062, "rewards/margins": 2.460378646850586, "rewards/rejected": -23.59296417236328, "step": 927 }, { "epoch": 0.12636165577342048, "grad_norm": 43.363857894811375, "learning_rate": 7.983178364325808e-07, "logits/chosen": 5.620109558105469, "logits/rejected": 4.6452202796936035, "logps/chosen": -2.075416088104248, "logps/rejected": -1.973883032798767, "loss": 4.5654, "rewards/accuracies": 0.5, "rewards/chosen": -20.754159927368164, "rewards/margins": -1.015329360961914, "rewards/rejected": -19.73883056640625, "step": 928 }, { "epoch": 0.12649782135076254, "grad_norm": 43.45478803438351, "learning_rate": 7.983003719130464e-07, "logits/chosen": 2.998201608657837, "logits/rejected": 3.2976837158203125, "logps/chosen": -1.57574462890625, "logps/rejected": -1.3838376998901367, "loss": 4.5803, "rewards/accuracies": 0.0, "rewards/chosen": -15.7574462890625, "rewards/margins": -1.9190688133239746, "rewards/rejected": -13.838376998901367, "step": 929 }, { "epoch": 0.12663398692810457, "grad_norm": 45.52702401939743, "learning_rate": 7.982828173942503e-07, "logits/chosen": 5.243517875671387, "logits/rejected": 6.2054033279418945, "logps/chosen": -2.2617807388305664, "logps/rejected": -2.1677300930023193, "loss": 4.6753, "rewards/accuracies": 0.5, "rewards/chosen": -22.61780548095703, "rewards/margins": -0.9405059814453125, "rewards/rejected": -21.67729949951172, "step": 930 }, { "epoch": 0.12677015250544663, "grad_norm": 55.602880005005574, "learning_rate": 7.982651728801596e-07, "logits/chosen": 5.1115827560424805, "logits/rejected": 5.937126159667969, "logps/chosen": -2.171280860900879, "logps/rejected": -2.199911594390869, "loss": 5.3685, "rewards/accuracies": 0.75, "rewards/chosen": -21.712806701660156, "rewards/margins": 0.28630876541137695, "rewards/rejected": -21.999114990234375, "step": 931 }, { "epoch": 0.12690631808278868, "grad_norm": 49.97586496425484, "learning_rate": 7.982474383747608e-07, "logits/chosen": 4.251694202423096, "logits/rejected": 4.240933418273926, "logps/chosen": -1.3672776222229004, "logps/rejected": -1.4467191696166992, "loss": 4.624, "rewards/accuracies": 0.75, "rewards/chosen": -13.67277717590332, "rewards/margins": 0.7944140434265137, "rewards/rejected": -14.467191696166992, "step": 932 }, { "epoch": 0.1270424836601307, "grad_norm": 48.298495592496906, "learning_rate": 7.982296138820615e-07, "logits/chosen": 2.5226478576660156, "logits/rejected": 3.680156707763672, "logps/chosen": -1.9695088863372803, "logps/rejected": -2.5283637046813965, "loss": 4.1732, "rewards/accuracies": 0.75, "rewards/chosen": -19.69508934020996, "rewards/margins": 5.588547229766846, "rewards/rejected": -25.28363800048828, "step": 933 }, { "epoch": 0.12717864923747277, "grad_norm": 49.50186183725674, "learning_rate": 7.982116994060891e-07, "logits/chosen": 3.31354022026062, "logits/rejected": 3.6397509574890137, "logps/chosen": -1.701838731765747, "logps/rejected": -1.8171958923339844, "loss": 4.9189, "rewards/accuracies": 0.75, "rewards/chosen": -17.018386840820312, "rewards/margins": 1.153571367263794, "rewards/rejected": -18.171958923339844, "step": 934 }, { "epoch": 0.12731481481481483, "grad_norm": 44.0086513060537, "learning_rate": 7.981936949508915e-07, "logits/chosen": 3.585862159729004, "logits/rejected": 4.452330112457275, "logps/chosen": -1.7846643924713135, "logps/rejected": -1.908537745475769, "loss": 4.3801, "rewards/accuracies": 0.75, "rewards/chosen": -17.846643447875977, "rewards/margins": 1.2387340068817139, "rewards/rejected": -19.085376739501953, "step": 935 }, { "epoch": 0.12745098039215685, "grad_norm": 45.9578736384635, "learning_rate": 7.98175600520537e-07, "logits/chosen": 5.182056427001953, "logits/rejected": 4.152080535888672, "logps/chosen": -2.092437744140625, "logps/rejected": -1.805800199508667, "loss": 3.9388, "rewards/accuracies": 0.25, "rewards/chosen": -20.92437744140625, "rewards/margins": -2.866375684738159, "rewards/rejected": -18.058002471923828, "step": 936 }, { "epoch": 0.1275871459694989, "grad_norm": 91.76648028964395, "learning_rate": 7.981574161191144e-07, "logits/chosen": 4.5943193435668945, "logits/rejected": 4.335433006286621, "logps/chosen": -2.0250701904296875, "logps/rejected": -1.8691189289093018, "loss": 4.5923, "rewards/accuracies": 0.25, "rewards/chosen": -20.250699996948242, "rewards/margins": -1.5595126152038574, "rewards/rejected": -18.69118881225586, "step": 937 }, { "epoch": 0.12772331154684097, "grad_norm": 45.191995501105374, "learning_rate": 7.981391417507323e-07, "logits/chosen": 4.406839847564697, "logits/rejected": 5.74362850189209, "logps/chosen": -1.6569294929504395, "logps/rejected": -2.0100255012512207, "loss": 3.8023, "rewards/accuracies": 0.75, "rewards/chosen": -16.569297790527344, "rewards/margins": 3.530959367752075, "rewards/rejected": -20.10025405883789, "step": 938 }, { "epoch": 0.127859477124183, "grad_norm": 41.52191317249439, "learning_rate": 7.981207774195201e-07, "logits/chosen": 2.5387444496154785, "logits/rejected": 2.8645801544189453, "logps/chosen": -1.424180269241333, "logps/rejected": -1.4311084747314453, "loss": 4.2696, "rewards/accuracies": 0.5, "rewards/chosen": -14.241802215576172, "rewards/margins": 0.06928277015686035, "rewards/rejected": -14.311084747314453, "step": 939 }, { "epoch": 0.12799564270152505, "grad_norm": 43.03524480871868, "learning_rate": 7.981023231296273e-07, "logits/chosen": 4.692812919616699, "logits/rejected": 5.1078386306762695, "logps/chosen": -1.7576051950454712, "logps/rejected": -2.0710296630859375, "loss": 4.2642, "rewards/accuracies": 0.75, "rewards/chosen": -17.576053619384766, "rewards/margins": 3.134244203567505, "rewards/rejected": -20.710296630859375, "step": 940 }, { "epoch": 0.1281318082788671, "grad_norm": 43.41556955512253, "learning_rate": 7.980837788852239e-07, "logits/chosen": 5.880429744720459, "logits/rejected": 6.466599464416504, "logps/chosen": -2.024789810180664, "logps/rejected": -2.380061626434326, "loss": 4.51, "rewards/accuracies": 0.5, "rewards/chosen": -20.24789810180664, "rewards/margins": 3.552718162536621, "rewards/rejected": -23.800617218017578, "step": 941 }, { "epoch": 0.12826797385620914, "grad_norm": 44.89124119131034, "learning_rate": 7.980651446905e-07, "logits/chosen": 3.2019145488739014, "logits/rejected": 4.520216941833496, "logps/chosen": -1.3334492444992065, "logps/rejected": -1.7090873718261719, "loss": 4.3457, "rewards/accuracies": 1.0, "rewards/chosen": -13.334492683410645, "rewards/margins": 3.756381034851074, "rewards/rejected": -17.09087371826172, "step": 942 }, { "epoch": 0.1284041394335512, "grad_norm": 46.10949579874916, "learning_rate": 7.980464205496662e-07, "logits/chosen": 5.869545936584473, "logits/rejected": 5.99006462097168, "logps/chosen": -2.4524755477905273, "logps/rejected": -2.578752040863037, "loss": 4.2213, "rewards/accuracies": 0.75, "rewards/chosen": -24.524757385253906, "rewards/margins": 1.2627654075622559, "rewards/rejected": -25.787521362304688, "step": 943 }, { "epoch": 0.12854030501089325, "grad_norm": 46.775878079726425, "learning_rate": 7.980276064669535e-07, "logits/chosen": 6.008179664611816, "logits/rejected": 6.277617454528809, "logps/chosen": -2.305604934692383, "logps/rejected": -2.331799030303955, "loss": 4.0479, "rewards/accuracies": 0.25, "rewards/chosen": -23.056049346923828, "rewards/margins": 0.26194190979003906, "rewards/rejected": -23.3179931640625, "step": 944 }, { "epoch": 0.12867647058823528, "grad_norm": 48.01439790738306, "learning_rate": 7.98008702446613e-07, "logits/chosen": 4.046601295471191, "logits/rejected": 4.3200297355651855, "logps/chosen": -1.9345022439956665, "logps/rejected": -1.7892529964447021, "loss": 3.7715, "rewards/accuracies": 0.25, "rewards/chosen": -19.34502410888672, "rewards/margins": -1.4524927139282227, "rewards/rejected": -17.89253044128418, "step": 945 }, { "epoch": 0.12881263616557734, "grad_norm": 45.43557234537793, "learning_rate": 7.979897084929162e-07, "logits/chosen": 3.7595040798187256, "logits/rejected": 4.544129371643066, "logps/chosen": -1.8869317770004272, "logps/rejected": -2.1857967376708984, "loss": 4.0272, "rewards/accuracies": 0.75, "rewards/chosen": -18.86931800842285, "rewards/margins": 2.9886474609375, "rewards/rejected": -21.85796546936035, "step": 946 }, { "epoch": 0.1289488017429194, "grad_norm": 45.99342066923114, "learning_rate": 7.979706246101548e-07, "logits/chosen": 5.219947338104248, "logits/rejected": 3.8915762901306152, "logps/chosen": -2.103365421295166, "logps/rejected": -2.0413331985473633, "loss": 4.2068, "rewards/accuracies": 0.25, "rewards/chosen": -21.033653259277344, "rewards/margins": -0.6203231811523438, "rewards/rejected": -20.413330078125, "step": 947 }, { "epoch": 0.12908496732026145, "grad_norm": 46.354711166877955, "learning_rate": 7.979514508026412e-07, "logits/chosen": 4.930499076843262, "logits/rejected": 3.7916998863220215, "logps/chosen": -1.9827927350997925, "logps/rejected": -1.7542588710784912, "loss": 4.3218, "rewards/accuracies": 0.25, "rewards/chosen": -19.827926635742188, "rewards/margins": -2.2853381633758545, "rewards/rejected": -17.542587280273438, "step": 948 }, { "epoch": 0.12922113289760348, "grad_norm": 40.62778395942098, "learning_rate": 7.979321870747078e-07, "logits/chosen": 5.177513122558594, "logits/rejected": 4.911441802978516, "logps/chosen": -1.7997159957885742, "logps/rejected": -1.9357613325119019, "loss": 4.3303, "rewards/accuracies": 0.75, "rewards/chosen": -17.997161865234375, "rewards/margins": 1.36045241355896, "rewards/rejected": -19.35761260986328, "step": 949 }, { "epoch": 0.12935729847494554, "grad_norm": 49.818720557752194, "learning_rate": 7.979128334307073e-07, "logits/chosen": 4.689702033996582, "logits/rejected": 5.211864471435547, "logps/chosen": -2.0964255332946777, "logps/rejected": -2.24532413482666, "loss": 4.3023, "rewards/accuracies": 0.75, "rewards/chosen": -20.96425437927246, "rewards/margins": 1.488987922668457, "rewards/rejected": -22.453243255615234, "step": 950 }, { "epoch": 0.1294934640522876, "grad_norm": 46.66545171433313, "learning_rate": 7.978933898750132e-07, "logits/chosen": 4.868618011474609, "logits/rejected": 4.534048080444336, "logps/chosen": -1.9763551950454712, "logps/rejected": -2.1047744750976562, "loss": 4.0862, "rewards/accuracies": 0.5, "rewards/chosen": -19.763551712036133, "rewards/margins": 1.2841925621032715, "rewards/rejected": -21.047744750976562, "step": 951 }, { "epoch": 0.12962962962962962, "grad_norm": 45.63231868057853, "learning_rate": 7.978738564120183e-07, "logits/chosen": 5.538141250610352, "logits/rejected": 5.93363094329834, "logps/chosen": -2.303807258605957, "logps/rejected": -2.4818382263183594, "loss": 4.0067, "rewards/accuracies": 0.75, "rewards/chosen": -23.03807258605957, "rewards/margins": 1.780311107635498, "rewards/rejected": -24.818384170532227, "step": 952 }, { "epoch": 0.12976579520697168, "grad_norm": 47.75635945158951, "learning_rate": 7.978542330461368e-07, "logits/chosen": 2.6477859020233154, "logits/rejected": 3.9663517475128174, "logps/chosen": -1.6019757986068726, "logps/rejected": -1.6432515382766724, "loss": 4.5896, "rewards/accuracies": 0.5, "rewards/chosen": -16.019758224487305, "rewards/margins": 0.41275787353515625, "rewards/rejected": -16.43251609802246, "step": 953 }, { "epoch": 0.12990196078431374, "grad_norm": 45.495396180687806, "learning_rate": 7.978345197818027e-07, "logits/chosen": 3.7314252853393555, "logits/rejected": 3.780731439590454, "logps/chosen": -1.5377405881881714, "logps/rejected": -1.9862622022628784, "loss": 4.1295, "rewards/accuracies": 0.5, "rewards/chosen": -15.377406120300293, "rewards/margins": 4.485215663909912, "rewards/rejected": -19.862621307373047, "step": 954 }, { "epoch": 0.13003812636165576, "grad_norm": 50.60720221522497, "learning_rate": 7.978147166234702e-07, "logits/chosen": 3.30387544631958, "logits/rejected": 5.157296657562256, "logps/chosen": -1.7173078060150146, "logps/rejected": -2.1454110145568848, "loss": 4.5674, "rewards/accuracies": 0.75, "rewards/chosen": -17.173080444335938, "rewards/margins": 4.281033039093018, "rewards/rejected": -21.454113006591797, "step": 955 }, { "epoch": 0.13017429193899782, "grad_norm": 47.26497484918179, "learning_rate": 7.977948235756142e-07, "logits/chosen": 4.833887100219727, "logits/rejected": 4.927692890167236, "logps/chosen": -2.187930107116699, "logps/rejected": -2.016284465789795, "loss": 4.3951, "rewards/accuracies": 0.5, "rewards/chosen": -21.879301071166992, "rewards/margins": -1.716456651687622, "rewards/rejected": -20.162845611572266, "step": 956 }, { "epoch": 0.13031045751633988, "grad_norm": 40.707954200841264, "learning_rate": 7.977748406427297e-07, "logits/chosen": 3.9976699352264404, "logits/rejected": 4.288434028625488, "logps/chosen": -2.0935511589050293, "logps/rejected": -2.191582202911377, "loss": 4.1338, "rewards/accuracies": 0.5, "rewards/chosen": -20.93551254272461, "rewards/margins": 0.9803080558776855, "rewards/rejected": -21.915821075439453, "step": 957 }, { "epoch": 0.1304466230936819, "grad_norm": 41.012991966761604, "learning_rate": 7.977547678293318e-07, "logits/chosen": 4.611617088317871, "logits/rejected": 3.5973215103149414, "logps/chosen": -2.004378318786621, "logps/rejected": -2.043058395385742, "loss": 4.6332, "rewards/accuracies": 0.25, "rewards/chosen": -20.043785095214844, "rewards/margins": 0.38680100440979004, "rewards/rejected": -20.430583953857422, "step": 958 }, { "epoch": 0.13058278867102396, "grad_norm": 51.735758425295444, "learning_rate": 7.977346051399563e-07, "logits/chosen": 5.4543609619140625, "logits/rejected": 5.555315971374512, "logps/chosen": -1.9710004329681396, "logps/rejected": -2.177668571472168, "loss": 4.2072, "rewards/accuracies": 1.0, "rewards/chosen": -19.710002899169922, "rewards/margins": 2.0666825771331787, "rewards/rejected": -21.776687622070312, "step": 959 }, { "epoch": 0.13071895424836602, "grad_norm": 47.658182003936105, "learning_rate": 7.97714352579159e-07, "logits/chosen": 3.243781566619873, "logits/rejected": 2.8925232887268066, "logps/chosen": -2.1940836906433105, "logps/rejected": -1.6191627979278564, "loss": 4.5916, "rewards/accuracies": 0.0, "rewards/chosen": -21.940834045410156, "rewards/margins": -5.749205589294434, "rewards/rejected": -16.191627502441406, "step": 960 }, { "epoch": 0.13085511982570805, "grad_norm": 50.48586049171019, "learning_rate": 7.976940101515161e-07, "logits/chosen": 5.040128231048584, "logits/rejected": 6.722963333129883, "logps/chosen": -2.321722984313965, "logps/rejected": -2.433382511138916, "loss": 4.3638, "rewards/accuracies": 0.75, "rewards/chosen": -23.217227935791016, "rewards/margins": 1.1165974140167236, "rewards/rejected": -24.333826065063477, "step": 961 }, { "epoch": 0.1309912854030501, "grad_norm": 39.837876134717966, "learning_rate": 7.976735778616243e-07, "logits/chosen": 4.384061336517334, "logits/rejected": 4.706863880157471, "logps/chosen": -2.0620226860046387, "logps/rejected": -2.229872941970825, "loss": 3.639, "rewards/accuracies": 0.75, "rewards/chosen": -20.620227813720703, "rewards/margins": 1.678502082824707, "rewards/rejected": -22.298728942871094, "step": 962 }, { "epoch": 0.13112745098039216, "grad_norm": 46.60570308674728, "learning_rate": 7.976530557141005e-07, "logits/chosen": 2.316159248352051, "logits/rejected": 2.9466326236724854, "logps/chosen": -2.061793565750122, "logps/rejected": -2.5574495792388916, "loss": 4.2877, "rewards/accuracies": 0.5, "rewards/chosen": -20.617935180664062, "rewards/margins": 4.956560134887695, "rewards/rejected": -25.57449722290039, "step": 963 }, { "epoch": 0.1312636165577342, "grad_norm": 49.467535132137385, "learning_rate": 7.976324437135816e-07, "logits/chosen": 3.949324607849121, "logits/rejected": 4.663497447967529, "logps/chosen": -1.5650334358215332, "logps/rejected": -1.8772802352905273, "loss": 5.2575, "rewards/accuracies": 0.75, "rewards/chosen": -15.650335311889648, "rewards/margins": 3.1224677562713623, "rewards/rejected": -18.772802352905273, "step": 964 }, { "epoch": 0.13139978213507625, "grad_norm": 49.483592336127785, "learning_rate": 7.976117418647252e-07, "logits/chosen": 3.696624755859375, "logits/rejected": 4.311827659606934, "logps/chosen": -1.4631876945495605, "logps/rejected": -1.7357325553894043, "loss": 3.9165, "rewards/accuracies": 0.75, "rewards/chosen": -14.631875991821289, "rewards/margins": 2.725449562072754, "rewards/rejected": -17.35732650756836, "step": 965 }, { "epoch": 0.1315359477124183, "grad_norm": 53.72298543498963, "learning_rate": 7.975909501722091e-07, "logits/chosen": 2.930861473083496, "logits/rejected": 3.987672805786133, "logps/chosen": -1.8782322406768799, "logps/rejected": -1.7090808153152466, "loss": 4.8616, "rewards/accuracies": 0.5, "rewards/chosen": -18.78232192993164, "rewards/margins": -1.691514253616333, "rewards/rejected": -17.090808868408203, "step": 966 }, { "epoch": 0.13167211328976036, "grad_norm": 50.23596698027922, "learning_rate": 7.975700686407312e-07, "logits/chosen": 3.5127878189086914, "logits/rejected": 4.916624069213867, "logps/chosen": -1.8425971269607544, "logps/rejected": -2.251166820526123, "loss": 4.5731, "rewards/accuracies": 1.0, "rewards/chosen": -18.42597198486328, "rewards/margins": 4.085698127746582, "rewards/rejected": -22.511669158935547, "step": 967 }, { "epoch": 0.1318082788671024, "grad_norm": 55.198274892397286, "learning_rate": 7.9754909727501e-07, "logits/chosen": 3.573599338531494, "logits/rejected": 5.496377944946289, "logps/chosen": -2.186345100402832, "logps/rejected": -2.286656379699707, "loss": 4.277, "rewards/accuracies": 0.75, "rewards/chosen": -21.863452911376953, "rewards/margins": 1.003110408782959, "rewards/rejected": -22.866561889648438, "step": 968 }, { "epoch": 0.13194444444444445, "grad_norm": 44.130745641160004, "learning_rate": 7.975280360797841e-07, "logits/chosen": 4.4539265632629395, "logits/rejected": 4.652978897094727, "logps/chosen": -2.3425192832946777, "logps/rejected": -2.359921932220459, "loss": 4.4661, "rewards/accuracies": 0.75, "rewards/chosen": -23.425193786621094, "rewards/margins": 0.1740264892578125, "rewards/rejected": -23.599220275878906, "step": 969 }, { "epoch": 0.1320806100217865, "grad_norm": 49.181775785195434, "learning_rate": 7.975068850598125e-07, "logits/chosen": 4.4732666015625, "logits/rejected": 3.2213997840881348, "logps/chosen": -2.1240692138671875, "logps/rejected": -1.8698011636734009, "loss": 4.2158, "rewards/accuracies": 0.25, "rewards/chosen": -21.240692138671875, "rewards/margins": -2.5426812171936035, "rewards/rejected": -18.698009490966797, "step": 970 }, { "epoch": 0.13221677559912853, "grad_norm": 55.903040639611774, "learning_rate": 7.974856442198743e-07, "logits/chosen": 4.404035568237305, "logits/rejected": 4.816661834716797, "logps/chosen": -1.9597232341766357, "logps/rejected": -1.8745677471160889, "loss": 4.4329, "rewards/accuracies": 0.25, "rewards/chosen": -19.597232818603516, "rewards/margins": -0.8515546321868896, "rewards/rejected": -18.745677947998047, "step": 971 }, { "epoch": 0.1323529411764706, "grad_norm": 46.960434460929214, "learning_rate": 7.974643135647692e-07, "logits/chosen": 5.587034225463867, "logits/rejected": 2.855776071548462, "logps/chosen": -2.040330171585083, "logps/rejected": -1.7123420238494873, "loss": 4.5843, "rewards/accuracies": 0.25, "rewards/chosen": -20.403301239013672, "rewards/margins": -3.2798800468444824, "rewards/rejected": -17.12342071533203, "step": 972 }, { "epoch": 0.13248910675381265, "grad_norm": 48.649040260413464, "learning_rate": 7.97442893099317e-07, "logits/chosen": 3.6958534717559814, "logits/rejected": 4.014970302581787, "logps/chosen": -1.847307562828064, "logps/rejected": -1.8692615032196045, "loss": 4.1667, "rewards/accuracies": 0.5, "rewards/chosen": -18.47307586669922, "rewards/margins": 0.21953773498535156, "rewards/rejected": -18.692615509033203, "step": 973 }, { "epoch": 0.13262527233115468, "grad_norm": 49.515036013996294, "learning_rate": 7.974213828283577e-07, "logits/chosen": 4.788330554962158, "logits/rejected": 4.94857931137085, "logps/chosen": -2.1730380058288574, "logps/rejected": -2.216825008392334, "loss": 4.6354, "rewards/accuracies": 0.5, "rewards/chosen": -21.73038101196289, "rewards/margins": 0.4378688335418701, "rewards/rejected": -22.168251037597656, "step": 974 }, { "epoch": 0.13276143790849673, "grad_norm": 48.877946185359235, "learning_rate": 7.973997827567519e-07, "logits/chosen": 4.3404645919799805, "logits/rejected": 6.075716972351074, "logps/chosen": -1.9075777530670166, "logps/rejected": -2.4737398624420166, "loss": 4.2369, "rewards/accuracies": 1.0, "rewards/chosen": -19.075777053833008, "rewards/margins": 5.661621570587158, "rewards/rejected": -24.737398147583008, "step": 975 }, { "epoch": 0.1328976034858388, "grad_norm": 46.176025665974166, "learning_rate": 7.973780928893802e-07, "logits/chosen": 4.068933486938477, "logits/rejected": 5.557577133178711, "logps/chosen": -1.9583613872528076, "logps/rejected": -2.585409164428711, "loss": 3.9077, "rewards/accuracies": 1.0, "rewards/chosen": -19.583614349365234, "rewards/margins": 6.270476818084717, "rewards/rejected": -25.85409164428711, "step": 976 }, { "epoch": 0.13303376906318082, "grad_norm": 55.24952958482162, "learning_rate": 7.973563132311437e-07, "logits/chosen": 3.186720609664917, "logits/rejected": 4.006548881530762, "logps/chosen": -2.4185166358947754, "logps/rejected": -2.288719892501831, "loss": 4.3626, "rewards/accuracies": 0.5, "rewards/chosen": -24.185165405273438, "rewards/margins": -1.2979679107666016, "rewards/rejected": -22.88719940185547, "step": 977 }, { "epoch": 0.13316993464052287, "grad_norm": 52.23682827639788, "learning_rate": 7.973344437869636e-07, "logits/chosen": 4.381330490112305, "logits/rejected": 4.771608829498291, "logps/chosen": -1.7256782054901123, "logps/rejected": -2.391085624694824, "loss": 4.7022, "rewards/accuracies": 0.75, "rewards/chosen": -17.25678253173828, "rewards/margins": 6.654074668884277, "rewards/rejected": -23.910856246948242, "step": 978 }, { "epoch": 0.13330610021786493, "grad_norm": 49.52623606388777, "learning_rate": 7.973124845617815e-07, "logits/chosen": 4.71274471282959, "logits/rejected": 5.981463432312012, "logps/chosen": -2.406320571899414, "logps/rejected": -2.525867223739624, "loss": 4.7223, "rewards/accuracies": 0.75, "rewards/chosen": -24.063203811645508, "rewards/margins": 1.1954689025878906, "rewards/rejected": -25.2586727142334, "step": 979 }, { "epoch": 0.13344226579520696, "grad_norm": 299.53750140318766, "learning_rate": 7.972904355605594e-07, "logits/chosen": 6.071537017822266, "logits/rejected": 3.836822986602783, "logps/chosen": -2.0733251571655273, "logps/rejected": -1.9927624464035034, "loss": 3.863, "rewards/accuracies": 0.25, "rewards/chosen": -20.733251571655273, "rewards/margins": -0.8056271076202393, "rewards/rejected": -19.927623748779297, "step": 980 }, { "epoch": 0.13357843137254902, "grad_norm": 41.16848785544294, "learning_rate": 7.972682967882793e-07, "logits/chosen": 4.249267578125, "logits/rejected": 3.8540520668029785, "logps/chosen": -2.076010227203369, "logps/rejected": -1.9619109630584717, "loss": 4.2563, "rewards/accuracies": 0.5, "rewards/chosen": -20.760103225708008, "rewards/margins": -1.1409940719604492, "rewards/rejected": -19.619110107421875, "step": 981 }, { "epoch": 0.13371459694989107, "grad_norm": 43.35489874980458, "learning_rate": 7.972460682499436e-07, "logits/chosen": 3.0209426879882812, "logits/rejected": 6.416609764099121, "logps/chosen": -2.15079665184021, "logps/rejected": -2.4145703315734863, "loss": 4.3814, "rewards/accuracies": 0.5, "rewards/chosen": -21.507965087890625, "rewards/margins": 2.6377358436584473, "rewards/rejected": -24.145702362060547, "step": 982 }, { "epoch": 0.1338507625272331, "grad_norm": 50.30201573386523, "learning_rate": 7.972237499505752e-07, "logits/chosen": 4.87807559967041, "logits/rejected": 3.782625198364258, "logps/chosen": -2.173731803894043, "logps/rejected": -2.1201155185699463, "loss": 4.2023, "rewards/accuracies": 0.5, "rewards/chosen": -21.73731803894043, "rewards/margins": -0.5361618995666504, "rewards/rejected": -21.201156616210938, "step": 983 }, { "epoch": 0.13398692810457516, "grad_norm": 48.19197172692833, "learning_rate": 7.972013418952171e-07, "logits/chosen": 4.137051582336426, "logits/rejected": 4.72794246673584, "logps/chosen": -1.6385531425476074, "logps/rejected": -2.027855634689331, "loss": 4.2917, "rewards/accuracies": 0.75, "rewards/chosen": -16.38553237915039, "rewards/margins": 3.8930234909057617, "rewards/rejected": -20.27855682373047, "step": 984 }, { "epoch": 0.13412309368191722, "grad_norm": 50.42120843563975, "learning_rate": 7.971788440889324e-07, "logits/chosen": 5.742424011230469, "logits/rejected": 5.35345983505249, "logps/chosen": -2.073181629180908, "logps/rejected": -2.0682880878448486, "loss": 5.5683, "rewards/accuracies": 0.5, "rewards/chosen": -20.731815338134766, "rewards/margins": -0.04893374443054199, "rewards/rejected": -20.682880401611328, "step": 985 }, { "epoch": 0.13425925925925927, "grad_norm": 46.1894056467477, "learning_rate": 7.971562565368048e-07, "logits/chosen": 4.596338272094727, "logits/rejected": 4.471016883850098, "logps/chosen": -2.0574839115142822, "logps/rejected": -2.266995906829834, "loss": 4.1483, "rewards/accuracies": 0.5, "rewards/chosen": -20.574838638305664, "rewards/margins": 2.0951199531555176, "rewards/rejected": -22.669958114624023, "step": 986 }, { "epoch": 0.1343954248366013, "grad_norm": 49.78689429366621, "learning_rate": 7.971335792439381e-07, "logits/chosen": 3.3289294242858887, "logits/rejected": 4.109737873077393, "logps/chosen": -1.764394760131836, "logps/rejected": -1.9657104015350342, "loss": 4.2713, "rewards/accuracies": 0.5, "rewards/chosen": -17.64394760131836, "rewards/margins": 2.013155937194824, "rewards/rejected": -19.6571044921875, "step": 987 }, { "epoch": 0.13453159041394336, "grad_norm": 43.39749219455012, "learning_rate": 7.971108122154564e-07, "logits/chosen": 2.3927154541015625, "logits/rejected": 3.695120334625244, "logps/chosen": -1.8220311403274536, "logps/rejected": -2.2636098861694336, "loss": 4.045, "rewards/accuracies": 0.75, "rewards/chosen": -18.220312118530273, "rewards/margins": 4.415787696838379, "rewards/rejected": -22.63610076904297, "step": 988 }, { "epoch": 0.13466775599128541, "grad_norm": 46.73347898455127, "learning_rate": 7.970879554565041e-07, "logits/chosen": 4.252068996429443, "logits/rejected": 2.166208505630493, "logps/chosen": -2.1137869358062744, "logps/rejected": -2.047011375427246, "loss": 3.8749, "rewards/accuracies": 0.5, "rewards/chosen": -21.13787078857422, "rewards/margins": -0.6677556037902832, "rewards/rejected": -20.470115661621094, "step": 989 }, { "epoch": 0.13480392156862744, "grad_norm": 45.709469029967906, "learning_rate": 7.970650089722459e-07, "logits/chosen": 6.054469108581543, "logits/rejected": 5.76076602935791, "logps/chosen": -2.162172794342041, "logps/rejected": -2.4052770137786865, "loss": 3.7916, "rewards/accuracies": 0.5, "rewards/chosen": -21.621726989746094, "rewards/margins": 2.4310429096221924, "rewards/rejected": -24.05276870727539, "step": 990 }, { "epoch": 0.1349400871459695, "grad_norm": 48.60623438515894, "learning_rate": 7.970419727678669e-07, "logits/chosen": 5.205188274383545, "logits/rejected": 6.36868953704834, "logps/chosen": -2.4312639236450195, "logps/rejected": -2.7501049041748047, "loss": 4.6714, "rewards/accuracies": 1.0, "rewards/chosen": -24.312639236450195, "rewards/margins": 3.1884098052978516, "rewards/rejected": -27.501049041748047, "step": 991 }, { "epoch": 0.13507625272331156, "grad_norm": 44.636234093016874, "learning_rate": 7.970188468485719e-07, "logits/chosen": 5.22271728515625, "logits/rejected": 5.722444534301758, "logps/chosen": -2.4301843643188477, "logps/rejected": -2.5844924449920654, "loss": 4.2916, "rewards/accuracies": 0.75, "rewards/chosen": -24.30184555053711, "rewards/margins": 1.5430798530578613, "rewards/rejected": -25.844924926757812, "step": 992 }, { "epoch": 0.1352124183006536, "grad_norm": 45.28524869018446, "learning_rate": 7.969956312195868e-07, "logits/chosen": 5.161330223083496, "logits/rejected": 5.98961067199707, "logps/chosen": -2.6645493507385254, "logps/rejected": -2.7252910137176514, "loss": 4.3854, "rewards/accuracies": 0.5, "rewards/chosen": -26.645492553710938, "rewards/margins": 0.6074180603027344, "rewards/rejected": -27.252910614013672, "step": 993 }, { "epoch": 0.13534858387799564, "grad_norm": 46.74077400138685, "learning_rate": 7.969723258861573e-07, "logits/chosen": 2.7193984985351562, "logits/rejected": 4.0957932472229, "logps/chosen": -2.207775354385376, "logps/rejected": -2.3894920349121094, "loss": 3.8619, "rewards/accuracies": 0.25, "rewards/chosen": -22.077754974365234, "rewards/margins": 1.817164421081543, "rewards/rejected": -23.894920349121094, "step": 994 }, { "epoch": 0.1354847494553377, "grad_norm": 47.72389234929848, "learning_rate": 7.969489308535494e-07, "logits/chosen": 3.6320559978485107, "logits/rejected": 4.079094886779785, "logps/chosen": -1.670238971710205, "logps/rejected": -1.8892056941986084, "loss": 4.0905, "rewards/accuracies": 0.75, "rewards/chosen": -16.702388763427734, "rewards/margins": 2.189667224884033, "rewards/rejected": -18.89205551147461, "step": 995 }, { "epoch": 0.13562091503267973, "grad_norm": 61.92057566681859, "learning_rate": 7.969254461270493e-07, "logits/chosen": 4.475957870483398, "logits/rejected": 4.0688652992248535, "logps/chosen": -1.8753018379211426, "logps/rejected": -2.3181424140930176, "loss": 5.0777, "rewards/accuracies": 1.0, "rewards/chosen": -18.75301742553711, "rewards/margins": 4.428409099578857, "rewards/rejected": -23.181427001953125, "step": 996 }, { "epoch": 0.13575708061002179, "grad_norm": 47.43782306699678, "learning_rate": 7.969018717119635e-07, "logits/chosen": 6.647150039672852, "logits/rejected": 7.23000431060791, "logps/chosen": -2.1353063583374023, "logps/rejected": -2.254739761352539, "loss": 4.0669, "rewards/accuracies": 0.75, "rewards/chosen": -21.353065490722656, "rewards/margins": 1.194333791732788, "rewards/rejected": -22.54739761352539, "step": 997 }, { "epoch": 0.13589324618736384, "grad_norm": 45.46371013699971, "learning_rate": 7.968782076136191e-07, "logits/chosen": 3.090085983276367, "logits/rejected": 5.1781158447265625, "logps/chosen": -2.2127490043640137, "logps/rejected": -2.463121175765991, "loss": 4.1258, "rewards/accuracies": 0.5, "rewards/chosen": -22.127490997314453, "rewards/margins": 2.503721237182617, "rewards/rejected": -24.631210327148438, "step": 998 }, { "epoch": 0.13602941176470587, "grad_norm": 45.40997601504266, "learning_rate": 7.968544538373631e-07, "logits/chosen": 4.4176177978515625, "logits/rejected": 6.5257487297058105, "logps/chosen": -1.9734573364257812, "logps/rejected": -2.458214521408081, "loss": 3.8476, "rewards/accuracies": 0.75, "rewards/chosen": -19.734573364257812, "rewards/margins": 4.847572326660156, "rewards/rejected": -24.58214569091797, "step": 999 }, { "epoch": 0.13616557734204793, "grad_norm": 54.14078280424121, "learning_rate": 7.968306103885627e-07, "logits/chosen": 5.095308303833008, "logits/rejected": 7.081783294677734, "logps/chosen": -2.0963075160980225, "logps/rejected": -2.324903964996338, "loss": 3.911, "rewards/accuracies": 0.75, "rewards/chosen": -20.96307373046875, "rewards/margins": 2.2859649658203125, "rewards/rejected": -23.249038696289062, "step": 1000 }, { "epoch": 0.13630174291938998, "grad_norm": 49.76145772758982, "learning_rate": 7.968066772726057e-07, "logits/chosen": 6.001337051391602, "logits/rejected": 6.779987335205078, "logps/chosen": -2.2453722953796387, "logps/rejected": -2.3215560913085938, "loss": 4.132, "rewards/accuracies": 0.5, "rewards/chosen": -22.453723907470703, "rewards/margins": 0.7618370056152344, "rewards/rejected": -23.215560913085938, "step": 1001 }, { "epoch": 0.136437908496732, "grad_norm": 45.20599930113853, "learning_rate": 7.967826544949e-07, "logits/chosen": 4.415453910827637, "logits/rejected": 5.5224738121032715, "logps/chosen": -2.0833821296691895, "logps/rejected": -2.515896797180176, "loss": 3.4966, "rewards/accuracies": 1.0, "rewards/chosen": -20.83382225036621, "rewards/margins": 4.3251447677612305, "rewards/rejected": -25.158966064453125, "step": 1002 }, { "epoch": 0.13657407407407407, "grad_norm": 50.46145388232553, "learning_rate": 7.967585420608735e-07, "logits/chosen": 5.720084190368652, "logits/rejected": 6.569056510925293, "logps/chosen": -2.3902950286865234, "logps/rejected": -2.559734582901001, "loss": 4.3657, "rewards/accuracies": 0.75, "rewards/chosen": -23.902950286865234, "rewards/margins": 1.6943964958190918, "rewards/rejected": -25.597347259521484, "step": 1003 }, { "epoch": 0.13671023965141613, "grad_norm": 48.59835077328434, "learning_rate": 7.96734339975975e-07, "logits/chosen": 6.277549743652344, "logits/rejected": 5.737805366516113, "logps/chosen": -2.4725770950317383, "logps/rejected": -2.5829086303710938, "loss": 4.0474, "rewards/accuracies": 0.75, "rewards/chosen": -24.725772857666016, "rewards/margins": 1.1033143997192383, "rewards/rejected": -25.829086303710938, "step": 1004 }, { "epoch": 0.13684640522875818, "grad_norm": 47.48201671787137, "learning_rate": 7.967100482456726e-07, "logits/chosen": 6.598492622375488, "logits/rejected": 7.241626739501953, "logps/chosen": -2.2484073638916016, "logps/rejected": -2.5032172203063965, "loss": 4.1314, "rewards/accuracies": 1.0, "rewards/chosen": -22.484073638916016, "rewards/margins": 2.548098087310791, "rewards/rejected": -25.03217124938965, "step": 1005 }, { "epoch": 0.1369825708061002, "grad_norm": 45.618710829607465, "learning_rate": 7.966856668754559e-07, "logits/chosen": 6.41782808303833, "logits/rejected": 7.973243713378906, "logps/chosen": -2.325000047683716, "logps/rejected": -2.7355237007141113, "loss": 4.1539, "rewards/accuracies": 1.0, "rewards/chosen": -23.25, "rewards/margins": 4.105236530303955, "rewards/rejected": -27.35523796081543, "step": 1006 }, { "epoch": 0.13711873638344227, "grad_norm": 51.6208867933134, "learning_rate": 7.966611958708337e-07, "logits/chosen": 5.451704978942871, "logits/rejected": 6.66154670715332, "logps/chosen": -2.2738373279571533, "logps/rejected": -2.3759424686431885, "loss": 4.5483, "rewards/accuracies": 0.5, "rewards/chosen": -22.738374710083008, "rewards/margins": 1.0210504531860352, "rewards/rejected": -23.75942611694336, "step": 1007 }, { "epoch": 0.13725490196078433, "grad_norm": 44.526398191422885, "learning_rate": 7.966366352373354e-07, "logits/chosen": 6.249236106872559, "logits/rejected": 6.043563365936279, "logps/chosen": -2.281723737716675, "logps/rejected": -2.4183239936828613, "loss": 4.4686, "rewards/accuracies": 0.5, "rewards/chosen": -22.817237854003906, "rewards/margins": 1.365999698638916, "rewards/rejected": -24.183237075805664, "step": 1008 }, { "epoch": 0.13739106753812635, "grad_norm": 45.74824016104616, "learning_rate": 7.966119849805107e-07, "logits/chosen": 5.0006561279296875, "logits/rejected": 5.844088554382324, "logps/chosen": -2.572932243347168, "logps/rejected": -2.5682966709136963, "loss": 4.4915, "rewards/accuracies": 0.25, "rewards/chosen": -25.729324340820312, "rewards/margins": -0.046357154846191406, "rewards/rejected": -25.682966232299805, "step": 1009 }, { "epoch": 0.1375272331154684, "grad_norm": 50.07583698373833, "learning_rate": 7.965872451059295e-07, "logits/chosen": 4.802380561828613, "logits/rejected": 6.344061851501465, "logps/chosen": -2.2107934951782227, "logps/rejected": -2.6180267333984375, "loss": 4.2357, "rewards/accuracies": 1.0, "rewards/chosen": -22.107933044433594, "rewards/margins": 4.072335243225098, "rewards/rejected": -26.180267333984375, "step": 1010 }, { "epoch": 0.13766339869281047, "grad_norm": 49.52754801288958, "learning_rate": 7.965624156191822e-07, "logits/chosen": 5.7733154296875, "logits/rejected": 6.368053913116455, "logps/chosen": -1.9966380596160889, "logps/rejected": -2.253098249435425, "loss": 4.4812, "rewards/accuracies": 1.0, "rewards/chosen": -19.966379165649414, "rewards/margins": 2.5646023750305176, "rewards/rejected": -22.530981063842773, "step": 1011 }, { "epoch": 0.1377995642701525, "grad_norm": 51.88978824940081, "learning_rate": 7.96537496525879e-07, "logits/chosen": 5.544942855834961, "logits/rejected": 4.098874092102051, "logps/chosen": -1.6594606637954712, "logps/rejected": -1.793116807937622, "loss": 4.5206, "rewards/accuracies": 0.5, "rewards/chosen": -16.594608306884766, "rewards/margins": 1.3365614414215088, "rewards/rejected": -17.931167602539062, "step": 1012 }, { "epoch": 0.13793572984749455, "grad_norm": 48.21486930936456, "learning_rate": 7.965124878316506e-07, "logits/chosen": 6.837491512298584, "logits/rejected": 6.547076225280762, "logps/chosen": -2.4216904640197754, "logps/rejected": -2.4374685287475586, "loss": 3.8703, "rewards/accuracies": 0.5, "rewards/chosen": -24.21690559387207, "rewards/margins": 0.15777969360351562, "rewards/rejected": -24.374685287475586, "step": 1013 }, { "epoch": 0.1380718954248366, "grad_norm": 47.368589125818175, "learning_rate": 7.96487389542148e-07, "logits/chosen": 6.79013729095459, "logits/rejected": 6.289558410644531, "logps/chosen": -2.1570816040039062, "logps/rejected": -2.354886531829834, "loss": 4.2769, "rewards/accuracies": 0.75, "rewards/chosen": -21.570816040039062, "rewards/margins": 1.9780488014221191, "rewards/rejected": -23.548866271972656, "step": 1014 }, { "epoch": 0.13820806100217864, "grad_norm": 43.18940756410414, "learning_rate": 7.964622016630424e-07, "logits/chosen": 5.34299373626709, "logits/rejected": 7.165875434875488, "logps/chosen": -1.9480900764465332, "logps/rejected": -2.5139260292053223, "loss": 3.7887, "rewards/accuracies": 1.0, "rewards/chosen": -19.480899810791016, "rewards/margins": 5.658359050750732, "rewards/rejected": -25.139259338378906, "step": 1015 }, { "epoch": 0.1383442265795207, "grad_norm": 47.29184706256728, "learning_rate": 7.964369242000252e-07, "logits/chosen": 6.295655250549316, "logits/rejected": 6.984072685241699, "logps/chosen": -2.283302068710327, "logps/rejected": -2.3678412437438965, "loss": 4.4914, "rewards/accuracies": 0.5, "rewards/chosen": -22.83302116394043, "rewards/margins": 0.8453927040100098, "rewards/rejected": -23.67841339111328, "step": 1016 }, { "epoch": 0.13848039215686275, "grad_norm": 47.74400727971823, "learning_rate": 7.964115571588078e-07, "logits/chosen": 2.342904806137085, "logits/rejected": 5.2735466957092285, "logps/chosen": -1.4363536834716797, "logps/rejected": -1.7764179706573486, "loss": 4.248, "rewards/accuracies": 0.75, "rewards/chosen": -14.363536834716797, "rewards/margins": 3.400642156600952, "rewards/rejected": -17.764179229736328, "step": 1017 }, { "epoch": 0.13861655773420478, "grad_norm": 52.75405482912719, "learning_rate": 7.963861005451224e-07, "logits/chosen": 2.921297073364258, "logits/rejected": 4.22599983215332, "logps/chosen": -1.542711853981018, "logps/rejected": -1.7385343313217163, "loss": 4.4118, "rewards/accuracies": 1.0, "rewards/chosen": -15.427118301391602, "rewards/margins": 1.9582245349884033, "rewards/rejected": -17.38534164428711, "step": 1018 }, { "epoch": 0.13875272331154684, "grad_norm": 42.116751294401524, "learning_rate": 7.96360554364721e-07, "logits/chosen": 5.618294715881348, "logits/rejected": 5.720634460449219, "logps/chosen": -2.20989727973938, "logps/rejected": -2.3575439453125, "loss": 4.471, "rewards/accuracies": 0.75, "rewards/chosen": -22.09897232055664, "rewards/margins": 1.476468563079834, "rewards/rejected": -23.575439453125, "step": 1019 }, { "epoch": 0.1388888888888889, "grad_norm": 43.63603720762214, "learning_rate": 7.963349186233759e-07, "logits/chosen": 5.598814487457275, "logits/rejected": 6.189894676208496, "logps/chosen": -1.6234828233718872, "logps/rejected": -1.5805050134658813, "loss": 4.4648, "rewards/accuracies": 0.5, "rewards/chosen": -16.23482894897461, "rewards/margins": -0.429779052734375, "rewards/rejected": -15.805048942565918, "step": 1020 }, { "epoch": 0.13902505446623092, "grad_norm": 41.99822708118574, "learning_rate": 7.9630919332688e-07, "logits/chosen": 5.763742923736572, "logits/rejected": 4.340229034423828, "logps/chosen": -2.0051190853118896, "logps/rejected": -2.0857837200164795, "loss": 4.2445, "rewards/accuracies": 0.75, "rewards/chosen": -20.051189422607422, "rewards/margins": 0.8066473007202148, "rewards/rejected": -20.857837677001953, "step": 1021 }, { "epoch": 0.13916122004357298, "grad_norm": 47.343811954482526, "learning_rate": 7.962833784810457e-07, "logits/chosen": 4.737030029296875, "logits/rejected": 4.57504940032959, "logps/chosen": -1.7685344219207764, "logps/rejected": -1.7274246215820312, "loss": 3.9716, "rewards/accuracies": 0.5, "rewards/chosen": -17.685344696044922, "rewards/margins": -0.41109776496887207, "rewards/rejected": -17.274246215820312, "step": 1022 }, { "epoch": 0.13929738562091504, "grad_norm": 44.78213436812369, "learning_rate": 7.962574740917066e-07, "logits/chosen": 5.581634998321533, "logits/rejected": 7.223572254180908, "logps/chosen": -2.2959578037261963, "logps/rejected": -2.3069121837615967, "loss": 4.06, "rewards/accuracies": 0.25, "rewards/chosen": -22.959577560424805, "rewards/margins": 0.1095438003540039, "rewards/rejected": -23.069122314453125, "step": 1023 }, { "epoch": 0.1394335511982571, "grad_norm": 50.418263366689594, "learning_rate": 7.962314801647157e-07, "logits/chosen": 3.0895938873291016, "logits/rejected": 1.7852187156677246, "logps/chosen": -2.0040173530578613, "logps/rejected": -1.8630855083465576, "loss": 4.7996, "rewards/accuracies": 0.5, "rewards/chosen": -20.040172576904297, "rewards/margins": -1.4093174934387207, "rewards/rejected": -18.630855560302734, "step": 1024 }, { "epoch": 0.13956971677559912, "grad_norm": 51.20895825363692, "learning_rate": 7.962053967059464e-07, "logits/chosen": 4.556878566741943, "logits/rejected": 6.175836563110352, "logps/chosen": -1.9567795991897583, "logps/rejected": -2.5375237464904785, "loss": 5.2357, "rewards/accuracies": 0.75, "rewards/chosen": -19.56779670715332, "rewards/margins": 5.80743932723999, "rewards/rejected": -25.37523651123047, "step": 1025 }, { "epoch": 0.13970588235294118, "grad_norm": 43.72677214078259, "learning_rate": 7.961792237212927e-07, "logits/chosen": 5.326869964599609, "logits/rejected": 6.572521686553955, "logps/chosen": -2.3116960525512695, "logps/rejected": -2.6726267337799072, "loss": 4.151, "rewards/accuracies": 1.0, "rewards/chosen": -23.116962432861328, "rewards/margins": 3.6093053817749023, "rewards/rejected": -26.726268768310547, "step": 1026 }, { "epoch": 0.13984204793028324, "grad_norm": 47.23711777485016, "learning_rate": 7.961529612166685e-07, "logits/chosen": 5.538036346435547, "logits/rejected": 5.233587265014648, "logps/chosen": -1.9885448217391968, "logps/rejected": -2.020040512084961, "loss": 4.249, "rewards/accuracies": 0.25, "rewards/chosen": -19.885448455810547, "rewards/margins": 0.3149547576904297, "rewards/rejected": -20.20040512084961, "step": 1027 }, { "epoch": 0.13997821350762527, "grad_norm": 48.26297819542979, "learning_rate": 7.961266091980082e-07, "logits/chosen": 5.758380889892578, "logits/rejected": 6.7058892250061035, "logps/chosen": -2.1783814430236816, "logps/rejected": -2.1304283142089844, "loss": 4.0218, "rewards/accuracies": 0.5, "rewards/chosen": -21.7838134765625, "rewards/margins": -0.47953057289123535, "rewards/rejected": -21.304283142089844, "step": 1028 }, { "epoch": 0.14011437908496732, "grad_norm": 49.461606003715126, "learning_rate": 7.96100167671266e-07, "logits/chosen": 2.0233209133148193, "logits/rejected": 3.2957189083099365, "logps/chosen": -1.7247480154037476, "logps/rejected": -2.143801212310791, "loss": 4.9112, "rewards/accuracies": 0.75, "rewards/chosen": -17.247482299804688, "rewards/margins": 4.190533638000488, "rewards/rejected": -21.43801498413086, "step": 1029 }, { "epoch": 0.14025054466230938, "grad_norm": 78.4128949861436, "learning_rate": 7.960736366424167e-07, "logits/chosen": 4.4859232902526855, "logits/rejected": 5.78399658203125, "logps/chosen": -1.7223292589187622, "logps/rejected": -1.753121256828308, "loss": 4.3707, "rewards/accuracies": 0.5, "rewards/chosen": -17.22329330444336, "rewards/margins": 0.3079204559326172, "rewards/rejected": -17.531213760375977, "step": 1030 }, { "epoch": 0.1403867102396514, "grad_norm": 46.87544637394054, "learning_rate": 7.960470161174555e-07, "logits/chosen": 5.39923095703125, "logits/rejected": 6.768150329589844, "logps/chosen": -2.055283546447754, "logps/rejected": -2.464139461517334, "loss": 4.0556, "rewards/accuracies": 0.75, "rewards/chosen": -20.55283546447754, "rewards/margins": 4.088558197021484, "rewards/rejected": -24.64139175415039, "step": 1031 }, { "epoch": 0.14052287581699346, "grad_norm": 45.40313471597038, "learning_rate": 7.96020306102397e-07, "logits/chosen": 4.117392539978027, "logits/rejected": 4.539342880249023, "logps/chosen": -1.889591097831726, "logps/rejected": -1.7617037296295166, "loss": 4.2243, "rewards/accuracies": 0.5, "rewards/chosen": -18.895910263061523, "rewards/margins": -1.2788739204406738, "rewards/rejected": -17.617034912109375, "step": 1032 }, { "epoch": 0.14065904139433552, "grad_norm": 51.57296178198962, "learning_rate": 7.959935066032769e-07, "logits/chosen": 5.766148567199707, "logits/rejected": 6.909154891967773, "logps/chosen": -2.018228530883789, "logps/rejected": -2.3192243576049805, "loss": 4.6114, "rewards/accuracies": 0.75, "rewards/chosen": -20.18228530883789, "rewards/margins": 3.0099599361419678, "rewards/rejected": -23.192245483398438, "step": 1033 }, { "epoch": 0.14079520697167755, "grad_norm": 47.66491490010809, "learning_rate": 7.959666176261507e-07, "logits/chosen": 7.724686622619629, "logits/rejected": 5.702220916748047, "logps/chosen": -2.1996216773986816, "logps/rejected": -2.128397226333618, "loss": 4.4325, "rewards/accuracies": 0.25, "rewards/chosen": -21.9962158203125, "rewards/margins": -0.7122447490692139, "rewards/rejected": -21.283971786499023, "step": 1034 }, { "epoch": 0.1409313725490196, "grad_norm": 54.13790026223089, "learning_rate": 7.95939639177094e-07, "logits/chosen": 4.97796630859375, "logits/rejected": 5.28643798828125, "logps/chosen": -2.345182418823242, "logps/rejected": -2.3530054092407227, "loss": 4.7218, "rewards/accuracies": 0.75, "rewards/chosen": -23.45182228088379, "rewards/margins": 0.0782313346862793, "rewards/rejected": -23.530052185058594, "step": 1035 }, { "epoch": 0.14106753812636166, "grad_norm": 49.439111509801535, "learning_rate": 7.95912571262203e-07, "logits/chosen": 5.424799919128418, "logits/rejected": 5.145305633544922, "logps/chosen": -2.220357894897461, "logps/rejected": -2.2263307571411133, "loss": 3.8326, "rewards/accuracies": 0.25, "rewards/chosen": -22.203577041625977, "rewards/margins": 0.05973005294799805, "rewards/rejected": -22.263309478759766, "step": 1036 }, { "epoch": 0.1412037037037037, "grad_norm": 46.56977784124548, "learning_rate": 7.958854138875941e-07, "logits/chosen": 6.588531494140625, "logits/rejected": 7.996070384979248, "logps/chosen": -1.9154024124145508, "logps/rejected": -2.2985329627990723, "loss": 3.9128, "rewards/accuracies": 0.75, "rewards/chosen": -19.154024124145508, "rewards/margins": 3.831306219100952, "rewards/rejected": -22.985328674316406, "step": 1037 }, { "epoch": 0.14133986928104575, "grad_norm": 53.837258731109614, "learning_rate": 7.958581670594032e-07, "logits/chosen": 8.371169090270996, "logits/rejected": 6.339287281036377, "logps/chosen": -2.506917953491211, "logps/rejected": -2.193854331970215, "loss": 4.9474, "rewards/accuracies": 0.25, "rewards/chosen": -25.06917953491211, "rewards/margins": -3.1306374073028564, "rewards/rejected": -21.938541412353516, "step": 1038 }, { "epoch": 0.1414760348583878, "grad_norm": 51.14420245082575, "learning_rate": 7.958308307837873e-07, "logits/chosen": 5.103220462799072, "logits/rejected": 4.67169713973999, "logps/chosen": -2.3873162269592285, "logps/rejected": -2.268601894378662, "loss": 4.1705, "rewards/accuracies": 0.25, "rewards/chosen": -23.87316131591797, "rewards/margins": -1.1871447563171387, "rewards/rejected": -22.686016082763672, "step": 1039 }, { "epoch": 0.14161220043572983, "grad_norm": 56.566455222048496, "learning_rate": 7.958034050669234e-07, "logits/chosen": 7.413856506347656, "logits/rejected": 6.061680793762207, "logps/chosen": -2.113213300704956, "logps/rejected": -1.8814314603805542, "loss": 4.7728, "rewards/accuracies": 0.25, "rewards/chosen": -21.13213348388672, "rewards/margins": -2.317819118499756, "rewards/rejected": -18.814313888549805, "step": 1040 }, { "epoch": 0.1417483660130719, "grad_norm": 51.33436365660264, "learning_rate": 7.957758899150083e-07, "logits/chosen": 5.1677021980285645, "logits/rejected": 5.8287200927734375, "logps/chosen": -2.0133185386657715, "logps/rejected": -2.0151803493499756, "loss": 4.8616, "rewards/accuracies": 0.5, "rewards/chosen": -20.13318634033203, "rewards/margins": 0.018617630004882812, "rewards/rejected": -20.151803970336914, "step": 1041 }, { "epoch": 0.14188453159041395, "grad_norm": 47.691800916925374, "learning_rate": 7.957482853342593e-07, "logits/chosen": 6.663287162780762, "logits/rejected": 5.999980926513672, "logps/chosen": -2.439042329788208, "logps/rejected": -1.9532809257507324, "loss": 4.5613, "rewards/accuracies": 0.5, "rewards/chosen": -24.390422821044922, "rewards/margins": -4.857613563537598, "rewards/rejected": -19.532808303833008, "step": 1042 }, { "epoch": 0.142020697167756, "grad_norm": 48.85435880657411, "learning_rate": 7.95720591330914e-07, "logits/chosen": 6.023458480834961, "logits/rejected": 5.882223606109619, "logps/chosen": -2.5212020874023438, "logps/rejected": -2.5633764266967773, "loss": 4.2504, "rewards/accuracies": 0.5, "rewards/chosen": -25.212020874023438, "rewards/margins": 0.42174196243286133, "rewards/rejected": -25.63376235961914, "step": 1043 }, { "epoch": 0.14215686274509803, "grad_norm": 54.02642753029152, "learning_rate": 7.9569280791123e-07, "logits/chosen": 3.4496264457702637, "logits/rejected": 5.062396049499512, "logps/chosen": -1.8893815279006958, "logps/rejected": -2.235926389694214, "loss": 4.4064, "rewards/accuracies": 1.0, "rewards/chosen": -18.893814086914062, "rewards/margins": 3.4654488563537598, "rewards/rejected": -22.359264373779297, "step": 1044 }, { "epoch": 0.1422930283224401, "grad_norm": 51.16103452087469, "learning_rate": 7.956649350814853e-07, "logits/chosen": 4.699954986572266, "logits/rejected": 4.188632011413574, "logps/chosen": -2.4921836853027344, "logps/rejected": -2.311572313308716, "loss": 4.3148, "rewards/accuracies": 0.5, "rewards/chosen": -24.921838760375977, "rewards/margins": -1.8061156272888184, "rewards/rejected": -23.11572265625, "step": 1045 }, { "epoch": 0.14242919389978215, "grad_norm": 57.60075128022346, "learning_rate": 7.956369728479778e-07, "logits/chosen": 6.127275466918945, "logits/rejected": 7.1096343994140625, "logps/chosen": -2.05462908744812, "logps/rejected": -2.2164840698242188, "loss": 4.4302, "rewards/accuracies": 1.0, "rewards/chosen": -20.546289443969727, "rewards/margins": 1.6185495853424072, "rewards/rejected": -22.164840698242188, "step": 1046 }, { "epoch": 0.14256535947712418, "grad_norm": 61.671903016845235, "learning_rate": 7.956089212170261e-07, "logits/chosen": 7.358280658721924, "logits/rejected": 7.347799301147461, "logps/chosen": -2.1088247299194336, "logps/rejected": -2.3068912029266357, "loss": 4.1656, "rewards/accuracies": 0.5, "rewards/chosen": -21.088247299194336, "rewards/margins": 1.9806647300720215, "rewards/rejected": -23.068910598754883, "step": 1047 }, { "epoch": 0.14270152505446623, "grad_norm": 46.82099163696588, "learning_rate": 7.955807801949682e-07, "logits/chosen": 5.050839900970459, "logits/rejected": 6.0324602127075195, "logps/chosen": -1.9618070125579834, "logps/rejected": -2.3362631797790527, "loss": 3.7749, "rewards/accuracies": 1.0, "rewards/chosen": -19.61806869506836, "rewards/margins": 3.744563102722168, "rewards/rejected": -23.362632751464844, "step": 1048 }, { "epoch": 0.1428376906318083, "grad_norm": 44.026297107432846, "learning_rate": 7.955525497881633e-07, "logits/chosen": 8.015851974487305, "logits/rejected": 7.808614730834961, "logps/chosen": -2.418079376220703, "logps/rejected": -2.7481720447540283, "loss": 4.0271, "rewards/accuracies": 0.5, "rewards/chosen": -24.180795669555664, "rewards/margins": 3.300924777984619, "rewards/rejected": -27.481719970703125, "step": 1049 }, { "epoch": 0.14297385620915032, "grad_norm": 56.76710590732392, "learning_rate": 7.955242300029901e-07, "logits/chosen": 6.722243785858154, "logits/rejected": 6.408138751983643, "logps/chosen": -1.770882487297058, "logps/rejected": -1.6527904272079468, "loss": 4.4803, "rewards/accuracies": 0.25, "rewards/chosen": -17.708824157714844, "rewards/margins": -1.180920124053955, "rewards/rejected": -16.527904510498047, "step": 1050 }, { "epoch": 0.14311002178649238, "grad_norm": 58.77501825176823, "learning_rate": 7.954958208458478e-07, "logits/chosen": 6.099965572357178, "logits/rejected": 7.690578937530518, "logps/chosen": -2.103487253189087, "logps/rejected": -2.104119300842285, "loss": 4.6715, "rewards/accuracies": 0.75, "rewards/chosen": -21.03487205505371, "rewards/margins": 0.006321430206298828, "rewards/rejected": -21.041194915771484, "step": 1051 }, { "epoch": 0.14324618736383443, "grad_norm": 89.37630399299353, "learning_rate": 7.954673223231553e-07, "logits/chosen": 8.168821334838867, "logits/rejected": 7.01918888092041, "logps/chosen": -2.417025566101074, "logps/rejected": -2.530818462371826, "loss": 4.3203, "rewards/accuracies": 0.5, "rewards/chosen": -24.170257568359375, "rewards/margins": 1.1379272937774658, "rewards/rejected": -25.308181762695312, "step": 1052 }, { "epoch": 0.14338235294117646, "grad_norm": 55.03044299858314, "learning_rate": 7.954387344413525e-07, "logits/chosen": 6.285354137420654, "logits/rejected": 7.7118072509765625, "logps/chosen": -1.8568533658981323, "logps/rejected": -2.4189391136169434, "loss": 4.4132, "rewards/accuracies": 0.75, "rewards/chosen": -18.56853485107422, "rewards/margins": 5.620856761932373, "rewards/rejected": -24.18939208984375, "step": 1053 }, { "epoch": 0.14351851851851852, "grad_norm": 52.261145338285104, "learning_rate": 7.95410057206899e-07, "logits/chosen": 6.365130424499512, "logits/rejected": 7.183696746826172, "logps/chosen": -2.086379051208496, "logps/rejected": -2.3186213970184326, "loss": 3.9486, "rewards/accuracies": 0.75, "rewards/chosen": -20.863788604736328, "rewards/margins": 2.3224236965179443, "rewards/rejected": -23.186214447021484, "step": 1054 }, { "epoch": 0.14365468409586057, "grad_norm": 52.06947041850545, "learning_rate": 7.953812906262745e-07, "logits/chosen": 8.42334270477295, "logits/rejected": 7.200815200805664, "logps/chosen": -2.07843279838562, "logps/rejected": -2.0698065757751465, "loss": 4.5746, "rewards/accuracies": 0.5, "rewards/chosen": -20.78432846069336, "rewards/margins": -0.08626508712768555, "rewards/rejected": -20.698062896728516, "step": 1055 }, { "epoch": 0.1437908496732026, "grad_norm": 46.04880586261344, "learning_rate": 7.953524347059792e-07, "logits/chosen": 5.79136848449707, "logits/rejected": 5.997037410736084, "logps/chosen": -2.2104530334472656, "logps/rejected": -2.2293944358825684, "loss": 4.3001, "rewards/accuracies": 0.25, "rewards/chosen": -22.104530334472656, "rewards/margins": 0.18941378593444824, "rewards/rejected": -22.2939453125, "step": 1056 }, { "epoch": 0.14392701525054466, "grad_norm": 63.73277193941704, "learning_rate": 7.953234894525333e-07, "logits/chosen": 6.079222202301025, "logits/rejected": 7.983799934387207, "logps/chosen": -1.9270939826965332, "logps/rejected": -2.4362268447875977, "loss": 3.8043, "rewards/accuracies": 1.0, "rewards/chosen": -19.270938873291016, "rewards/margins": 5.091328144073486, "rewards/rejected": -24.362266540527344, "step": 1057 }, { "epoch": 0.14406318082788672, "grad_norm": 48.33556860199937, "learning_rate": 7.952944548724771e-07, "logits/chosen": 6.490633487701416, "logits/rejected": 5.926776885986328, "logps/chosen": -1.6585322618484497, "logps/rejected": -1.705830693244934, "loss": 4.3275, "rewards/accuracies": 0.5, "rewards/chosen": -16.585323333740234, "rewards/margins": 0.47298455238342285, "rewards/rejected": -17.058305740356445, "step": 1058 }, { "epoch": 0.14419934640522875, "grad_norm": 44.35113371310246, "learning_rate": 7.952653309723713e-07, "logits/chosen": 7.352355480194092, "logits/rejected": 9.262449264526367, "logps/chosen": -2.331267833709717, "logps/rejected": -2.5905957221984863, "loss": 4.4077, "rewards/accuracies": 0.75, "rewards/chosen": -23.31267738342285, "rewards/margins": 2.5932793617248535, "rewards/rejected": -25.905956268310547, "step": 1059 }, { "epoch": 0.1443355119825708, "grad_norm": 44.70169121231648, "learning_rate": 7.952361177587966e-07, "logits/chosen": 8.908016204833984, "logits/rejected": 8.161298751831055, "logps/chosen": -2.625208854675293, "logps/rejected": -2.681849479675293, "loss": 4.113, "rewards/accuracies": 0.5, "rewards/chosen": -26.252090454101562, "rewards/margins": 0.566403865814209, "rewards/rejected": -26.818492889404297, "step": 1060 }, { "epoch": 0.14447167755991286, "grad_norm": 47.80530227201829, "learning_rate": 7.952068152383541e-07, "logits/chosen": 5.3619585037231445, "logits/rejected": 8.519325256347656, "logps/chosen": -2.272505283355713, "logps/rejected": -2.379106283187866, "loss": 4.0375, "rewards/accuracies": 0.25, "rewards/chosen": -22.725053787231445, "rewards/margins": 1.0660090446472168, "rewards/rejected": -23.79106330871582, "step": 1061 }, { "epoch": 0.14460784313725492, "grad_norm": 52.22515395411018, "learning_rate": 7.951774234176648e-07, "logits/chosen": 9.031639099121094, "logits/rejected": 9.230886459350586, "logps/chosen": -2.75531005859375, "logps/rejected": -2.908029079437256, "loss": 4.2301, "rewards/accuracies": 0.75, "rewards/chosen": -27.553102493286133, "rewards/margins": 1.5271897315979004, "rewards/rejected": -29.080291748046875, "step": 1062 }, { "epoch": 0.14474400871459694, "grad_norm": 43.91513499123723, "learning_rate": 7.951479423033703e-07, "logits/chosen": 8.07962417602539, "logits/rejected": 7.422569274902344, "logps/chosen": -2.767977476119995, "logps/rejected": -3.3238353729248047, "loss": 4.4323, "rewards/accuracies": 0.5, "rewards/chosen": -27.67977523803711, "rewards/margins": 5.558579444885254, "rewards/rejected": -33.23835754394531, "step": 1063 }, { "epoch": 0.144880174291939, "grad_norm": 69.4932230800157, "learning_rate": 7.951183719021318e-07, "logits/chosen": 7.5629987716674805, "logits/rejected": 7.8187713623046875, "logps/chosen": -2.2698850631713867, "logps/rejected": -2.3446497917175293, "loss": 4.8487, "rewards/accuracies": 0.25, "rewards/chosen": -22.6988525390625, "rewards/margins": 0.747645378112793, "rewards/rejected": -23.446496963500977, "step": 1064 }, { "epoch": 0.14501633986928106, "grad_norm": 47.98355470110366, "learning_rate": 7.950887122206311e-07, "logits/chosen": 7.486894607543945, "logits/rejected": 6.630537033081055, "logps/chosen": -2.2019453048706055, "logps/rejected": -1.963758945465088, "loss": 4.4259, "rewards/accuracies": 0.25, "rewards/chosen": -22.019453048706055, "rewards/margins": -2.3818647861480713, "rewards/rejected": -19.637588500976562, "step": 1065 }, { "epoch": 0.1451525054466231, "grad_norm": 48.73714211325658, "learning_rate": 7.950589632655699e-07, "logits/chosen": 6.118114471435547, "logits/rejected": 7.51170539855957, "logps/chosen": -2.021106243133545, "logps/rejected": -2.4220685958862305, "loss": 4.2543, "rewards/accuracies": 1.0, "rewards/chosen": -20.2110652923584, "rewards/margins": 4.00961971282959, "rewards/rejected": -24.220684051513672, "step": 1066 }, { "epoch": 0.14528867102396514, "grad_norm": 48.42633348712758, "learning_rate": 7.950291250436706e-07, "logits/chosen": 6.851724624633789, "logits/rejected": 9.034297943115234, "logps/chosen": -2.133194923400879, "logps/rejected": -3.2002954483032227, "loss": 4.1781, "rewards/accuracies": 1.0, "rewards/chosen": -21.33194923400879, "rewards/margins": 10.671005249023438, "rewards/rejected": -32.002952575683594, "step": 1067 }, { "epoch": 0.1454248366013072, "grad_norm": 47.55544538280061, "learning_rate": 7.94999197561675e-07, "logits/chosen": 8.125069618225098, "logits/rejected": 7.927143096923828, "logps/chosen": -2.3166117668151855, "logps/rejected": -2.1458349227905273, "loss": 4.7355, "rewards/accuracies": 0.5, "rewards/chosen": -23.16611671447754, "rewards/margins": -1.707768440246582, "rewards/rejected": -21.458349227905273, "step": 1068 }, { "epoch": 0.14556100217864923, "grad_norm": 47.00298671173161, "learning_rate": 7.949691808263457e-07, "logits/chosen": 5.525528430938721, "logits/rejected": 6.143496513366699, "logps/chosen": -2.0052690505981445, "logps/rejected": -1.9820630550384521, "loss": 3.9041, "rewards/accuracies": 0.5, "rewards/chosen": -20.052692413330078, "rewards/margins": -0.23206162452697754, "rewards/rejected": -19.820629119873047, "step": 1069 }, { "epoch": 0.14569716775599129, "grad_norm": 44.15105435013016, "learning_rate": 7.94939074844465e-07, "logits/chosen": 7.272299766540527, "logits/rejected": 8.182524681091309, "logps/chosen": -2.555860996246338, "logps/rejected": -2.7619271278381348, "loss": 4.0913, "rewards/accuracies": 0.75, "rewards/chosen": -25.558609008789062, "rewards/margins": 2.0606632232666016, "rewards/rejected": -27.619272232055664, "step": 1070 }, { "epoch": 0.14583333333333334, "grad_norm": 60.58486696301464, "learning_rate": 7.94908879622836e-07, "logits/chosen": 7.304986476898193, "logits/rejected": 7.154782772064209, "logps/chosen": -2.546842336654663, "logps/rejected": -2.615126609802246, "loss": 4.1734, "rewards/accuracies": 0.5, "rewards/chosen": -25.468421936035156, "rewards/margins": 0.6828446388244629, "rewards/rejected": -26.151268005371094, "step": 1071 }, { "epoch": 0.14596949891067537, "grad_norm": 49.50790789526729, "learning_rate": 7.94878595168281e-07, "logits/chosen": 6.91555118560791, "logits/rejected": 7.228821754455566, "logps/chosen": -2.2227492332458496, "logps/rejected": -2.4619526863098145, "loss": 3.921, "rewards/accuracies": 0.5, "rewards/chosen": -22.22749137878418, "rewards/margins": 2.392035722732544, "rewards/rejected": -24.619525909423828, "step": 1072 }, { "epoch": 0.14610566448801743, "grad_norm": 47.72069239858087, "learning_rate": 7.948482214876434e-07, "logits/chosen": 6.6384711265563965, "logits/rejected": 8.63119125366211, "logps/chosen": -2.180769920349121, "logps/rejected": -2.760047197341919, "loss": 4.3372, "rewards/accuracies": 1.0, "rewards/chosen": -21.807697296142578, "rewards/margins": 5.792771816253662, "rewards/rejected": -27.60047149658203, "step": 1073 }, { "epoch": 0.14624183006535948, "grad_norm": 43.11694469206641, "learning_rate": 7.948177585877865e-07, "logits/chosen": 5.600986003875732, "logits/rejected": 7.130364418029785, "logps/chosen": -2.11661434173584, "logps/rejected": -2.1427245140075684, "loss": 3.9163, "rewards/accuracies": 0.5, "rewards/chosen": -21.16614532470703, "rewards/margins": 0.26110053062438965, "rewards/rejected": -21.42724609375, "step": 1074 }, { "epoch": 0.1463779956427015, "grad_norm": 62.87181967205608, "learning_rate": 7.947872064755932e-07, "logits/chosen": 6.138467788696289, "logits/rejected": 7.0620317459106445, "logps/chosen": -2.337937355041504, "logps/rejected": -2.610365152359009, "loss": 4.6321, "rewards/accuracies": 0.75, "rewards/chosen": -23.379375457763672, "rewards/margins": 2.7242770195007324, "rewards/rejected": -26.103652954101562, "step": 1075 }, { "epoch": 0.14651416122004357, "grad_norm": 46.89560131706874, "learning_rate": 7.947565651579673e-07, "logits/chosen": 2.84307599067688, "logits/rejected": 6.887186527252197, "logps/chosen": -1.968072533607483, "logps/rejected": -2.373044013977051, "loss": 4.5042, "rewards/accuracies": 0.75, "rewards/chosen": -19.68072509765625, "rewards/margins": 4.049717426300049, "rewards/rejected": -23.73044204711914, "step": 1076 }, { "epoch": 0.14665032679738563, "grad_norm": 43.89421560723156, "learning_rate": 7.947258346418325e-07, "logits/chosen": 6.982088088989258, "logits/rejected": 8.127557754516602, "logps/chosen": -2.4785397052764893, "logps/rejected": -2.650782823562622, "loss": 4.1772, "rewards/accuracies": 0.75, "rewards/chosen": -24.785396575927734, "rewards/margins": 1.7224302291870117, "rewards/rejected": -26.507827758789062, "step": 1077 }, { "epoch": 0.14678649237472766, "grad_norm": 46.24907974370417, "learning_rate": 7.946950149341326e-07, "logits/chosen": 8.006019592285156, "logits/rejected": 6.920699119567871, "logps/chosen": -2.7544026374816895, "logps/rejected": -2.634354829788208, "loss": 4.3089, "rewards/accuracies": 0.25, "rewards/chosen": -27.544025421142578, "rewards/margins": -1.2004766464233398, "rewards/rejected": -26.343547821044922, "step": 1078 }, { "epoch": 0.1469226579520697, "grad_norm": 49.68986606482622, "learning_rate": 7.946641060418313e-07, "logits/chosen": 7.989392280578613, "logits/rejected": 9.237676620483398, "logps/chosen": -2.2085118293762207, "logps/rejected": -2.333956480026245, "loss": 4.461, "rewards/accuracies": 0.5, "rewards/chosen": -22.085119247436523, "rewards/margins": 1.2544443607330322, "rewards/rejected": -23.339563369750977, "step": 1079 }, { "epoch": 0.14705882352941177, "grad_norm": 45.58695804976546, "learning_rate": 7.94633107971913e-07, "logits/chosen": 7.211371421813965, "logits/rejected": 6.395600318908691, "logps/chosen": -2.1516284942626953, "logps/rejected": -2.2511146068573, "loss": 4.3979, "rewards/accuracies": 0.5, "rewards/chosen": -21.51628875732422, "rewards/margins": 0.994858980178833, "rewards/rejected": -22.511146545410156, "step": 1080 }, { "epoch": 0.14719498910675383, "grad_norm": 51.48958516394322, "learning_rate": 7.946020207313819e-07, "logits/chosen": 6.647869110107422, "logits/rejected": 8.231767654418945, "logps/chosen": -2.232465982437134, "logps/rejected": -2.523315906524658, "loss": 4.2285, "rewards/accuracies": 1.0, "rewards/chosen": -22.32465934753418, "rewards/margins": 2.908498764038086, "rewards/rejected": -25.233158111572266, "step": 1081 }, { "epoch": 0.14733115468409586, "grad_norm": 47.7297448667391, "learning_rate": 7.945708443272624e-07, "logits/chosen": 7.546945571899414, "logits/rejected": 8.793795585632324, "logps/chosen": -2.5588040351867676, "logps/rejected": -2.5544469356536865, "loss": 4.2393, "rewards/accuracies": 0.75, "rewards/chosen": -25.588037490844727, "rewards/margins": -0.04356861114501953, "rewards/rejected": -25.54446792602539, "step": 1082 }, { "epoch": 0.1474673202614379, "grad_norm": 57.18540694389503, "learning_rate": 7.94539578766599e-07, "logits/chosen": 7.925384521484375, "logits/rejected": 7.4953508377075195, "logps/chosen": -2.3737986087799072, "logps/rejected": -2.1732616424560547, "loss": 4.4543, "rewards/accuracies": 0.5, "rewards/chosen": -23.737987518310547, "rewards/margins": -2.00536847114563, "rewards/rejected": -21.732616424560547, "step": 1083 }, { "epoch": 0.14760348583877997, "grad_norm": 51.596546951862415, "learning_rate": 7.945082240564566e-07, "logits/chosen": 5.921405792236328, "logits/rejected": 6.3546648025512695, "logps/chosen": -1.859436273574829, "logps/rejected": -2.192460536956787, "loss": 3.6254, "rewards/accuracies": 1.0, "rewards/chosen": -18.594364166259766, "rewards/margins": 3.3302409648895264, "rewards/rejected": -21.924604415893555, "step": 1084 }, { "epoch": 0.147739651416122, "grad_norm": 44.564744587862215, "learning_rate": 7.944767802039199e-07, "logits/chosen": 7.289421081542969, "logits/rejected": 9.394676208496094, "logps/chosen": -2.4941658973693848, "logps/rejected": -3.019066095352173, "loss": 4.1965, "rewards/accuracies": 0.75, "rewards/chosen": -24.94165802001953, "rewards/margins": 5.249000072479248, "rewards/rejected": -30.190658569335938, "step": 1085 }, { "epoch": 0.14787581699346405, "grad_norm": 54.396816549897395, "learning_rate": 7.94445247216094e-07, "logits/chosen": 5.897315979003906, "logits/rejected": 6.629268646240234, "logps/chosen": -1.8931113481521606, "logps/rejected": -2.1480188369750977, "loss": 4.0741, "rewards/accuracies": 1.0, "rewards/chosen": -18.931114196777344, "rewards/margins": 2.5490753650665283, "rewards/rejected": -21.48019027709961, "step": 1086 }, { "epoch": 0.1480119825708061, "grad_norm": 46.22262185931903, "learning_rate": 7.944136251001038e-07, "logits/chosen": 6.426568984985352, "logits/rejected": 7.61230993270874, "logps/chosen": -2.1710290908813477, "logps/rejected": -2.5079541206359863, "loss": 3.9609, "rewards/accuracies": 0.75, "rewards/chosen": -21.71029281616211, "rewards/margins": 3.369248390197754, "rewards/rejected": -25.079540252685547, "step": 1087 }, { "epoch": 0.14814814814814814, "grad_norm": 52.54559849011946, "learning_rate": 7.943819138630948e-07, "logits/chosen": 7.440896987915039, "logits/rejected": 8.07868766784668, "logps/chosen": -2.457139492034912, "logps/rejected": -2.6337475776672363, "loss": 4.4807, "rewards/accuracies": 0.5, "rewards/chosen": -24.571393966674805, "rewards/margins": 1.7660799026489258, "rewards/rejected": -26.337474822998047, "step": 1088 }, { "epoch": 0.1482843137254902, "grad_norm": 49.14200761435321, "learning_rate": 7.943501135122324e-07, "logits/chosen": 6.420352935791016, "logits/rejected": 8.068144798278809, "logps/chosen": -2.4076004028320312, "logps/rejected": -2.7116506099700928, "loss": 3.938, "rewards/accuracies": 1.0, "rewards/chosen": -24.076004028320312, "rewards/margins": 3.0405020713806152, "rewards/rejected": -27.116506576538086, "step": 1089 }, { "epoch": 0.14842047930283225, "grad_norm": 41.21082102650387, "learning_rate": 7.943182240547021e-07, "logits/chosen": 8.239226341247559, "logits/rejected": 7.461877822875977, "logps/chosen": -2.513453960418701, "logps/rejected": -2.5914082527160645, "loss": 3.9364, "rewards/accuracies": 0.5, "rewards/chosen": -25.134536743164062, "rewards/margins": 0.7795429229736328, "rewards/rejected": -25.914081573486328, "step": 1090 }, { "epoch": 0.14855664488017428, "grad_norm": 48.72694512051272, "learning_rate": 7.942862454977097e-07, "logits/chosen": 7.680271148681641, "logits/rejected": 5.9943671226501465, "logps/chosen": -2.422794818878174, "logps/rejected": -2.232612133026123, "loss": 4.2487, "rewards/accuracies": 0.25, "rewards/chosen": -24.227947235107422, "rewards/margins": -1.9018266201019287, "rewards/rejected": -22.326122283935547, "step": 1091 }, { "epoch": 0.14869281045751634, "grad_norm": 48.29237559166202, "learning_rate": 7.942541778484809e-07, "logits/chosen": 8.392349243164062, "logits/rejected": 8.507543563842773, "logps/chosen": -2.470914602279663, "logps/rejected": -2.66229510307312, "loss": 3.9137, "rewards/accuracies": 0.5, "rewards/chosen": -24.709148406982422, "rewards/margins": 1.9138035774230957, "rewards/rejected": -26.62295150756836, "step": 1092 }, { "epoch": 0.1488289760348584, "grad_norm": 47.92642305614693, "learning_rate": 7.942220211142616e-07, "logits/chosen": 7.404452323913574, "logits/rejected": 7.650815486907959, "logps/chosen": -2.1436338424682617, "logps/rejected": -2.297689914703369, "loss": 3.927, "rewards/accuracies": 0.75, "rewards/chosen": -21.436336517333984, "rewards/margins": 1.5405607223510742, "rewards/rejected": -22.976898193359375, "step": 1093 }, { "epoch": 0.14896514161220042, "grad_norm": 49.3141854156365, "learning_rate": 7.94189775302318e-07, "logits/chosen": 7.401819229125977, "logits/rejected": 7.5048298835754395, "logps/chosen": -2.2158546447753906, "logps/rejected": -2.4725449085235596, "loss": 4.6646, "rewards/accuracies": 0.75, "rewards/chosen": -22.158546447753906, "rewards/margins": 2.5669021606445312, "rewards/rejected": -24.725448608398438, "step": 1094 }, { "epoch": 0.14910130718954248, "grad_norm": 55.386197169962344, "learning_rate": 7.941574404199362e-07, "logits/chosen": 8.20209789276123, "logits/rejected": 8.631232261657715, "logps/chosen": -2.826984405517578, "logps/rejected": -2.8230605125427246, "loss": 4.6212, "rewards/accuracies": 0.5, "rewards/chosen": -28.26984405517578, "rewards/margins": -0.039238929748535156, "rewards/rejected": -28.23060417175293, "step": 1095 }, { "epoch": 0.14923747276688454, "grad_norm": 46.24053535974121, "learning_rate": 7.941250164744227e-07, "logits/chosen": 6.746589660644531, "logits/rejected": 7.960080623626709, "logps/chosen": -2.361301898956299, "logps/rejected": -2.5469679832458496, "loss": 4.3275, "rewards/accuracies": 0.75, "rewards/chosen": -23.613019943237305, "rewards/margins": 1.8566608428955078, "rewards/rejected": -25.469680786132812, "step": 1096 }, { "epoch": 0.14937363834422657, "grad_norm": 45.645643734603794, "learning_rate": 7.940925034731039e-07, "logits/chosen": 6.367142200469971, "logits/rejected": 7.537808418273926, "logps/chosen": -2.386108160018921, "logps/rejected": -2.6728861331939697, "loss": 4.4645, "rewards/accuracies": 0.5, "rewards/chosen": -23.861082077026367, "rewards/margins": 2.8677799701690674, "rewards/rejected": -26.72886085510254, "step": 1097 }, { "epoch": 0.14950980392156862, "grad_norm": 69.27574034630445, "learning_rate": 7.940599014233262e-07, "logits/chosen": 7.709702491760254, "logits/rejected": 7.558544158935547, "logps/chosen": -2.620137929916382, "logps/rejected": -2.5606579780578613, "loss": 4.8875, "rewards/accuracies": 0.5, "rewards/chosen": -26.201379776000977, "rewards/margins": -0.5947999954223633, "rewards/rejected": -25.606578826904297, "step": 1098 }, { "epoch": 0.14964596949891068, "grad_norm": 51.46581033964663, "learning_rate": 7.940272103324565e-07, "logits/chosen": 5.490316867828369, "logits/rejected": 6.711139678955078, "logps/chosen": -1.8513031005859375, "logps/rejected": -2.3677210807800293, "loss": 4.2367, "rewards/accuracies": 0.75, "rewards/chosen": -18.513029098510742, "rewards/margins": 5.16417932510376, "rewards/rejected": -23.677207946777344, "step": 1099 }, { "epoch": 0.14978213507625274, "grad_norm": 46.74787677857698, "learning_rate": 7.939944302078815e-07, "logits/chosen": 5.2277021408081055, "logits/rejected": 7.466156482696533, "logps/chosen": -2.246340274810791, "logps/rejected": -2.588789463043213, "loss": 4.157, "rewards/accuracies": 0.75, "rewards/chosen": -22.463401794433594, "rewards/margins": 3.4244909286499023, "rewards/rejected": -25.887893676757812, "step": 1100 }, { "epoch": 0.14991830065359477, "grad_norm": 51.050633676965994, "learning_rate": 7.939615610570083e-07, "logits/chosen": 7.8200178146362305, "logits/rejected": 8.223779678344727, "logps/chosen": -2.3539276123046875, "logps/rejected": -2.4856224060058594, "loss": 4.421, "rewards/accuracies": 0.5, "rewards/chosen": -23.539276123046875, "rewards/margins": 1.3169479370117188, "rewards/rejected": -24.85622215270996, "step": 1101 }, { "epoch": 0.15005446623093682, "grad_norm": 51.45833649425082, "learning_rate": 7.939286028872639e-07, "logits/chosen": 8.198173522949219, "logits/rejected": 9.447385787963867, "logps/chosen": -2.415052890777588, "logps/rejected": -2.815140724182129, "loss": 3.6963, "rewards/accuracies": 1.0, "rewards/chosen": -24.150529861450195, "rewards/margins": 4.000875949859619, "rewards/rejected": -28.151405334472656, "step": 1102 }, { "epoch": 0.15019063180827888, "grad_norm": 75.1162598389309, "learning_rate": 7.938955557060952e-07, "logits/chosen": 6.986461639404297, "logits/rejected": 6.135679244995117, "logps/chosen": -2.1742606163024902, "logps/rejected": -2.3834068775177, "loss": 4.1527, "rewards/accuracies": 1.0, "rewards/chosen": -21.742605209350586, "rewards/margins": 2.091463088989258, "rewards/rejected": -23.834068298339844, "step": 1103 }, { "epoch": 0.1503267973856209, "grad_norm": 49.292428124293544, "learning_rate": 7.938624195209699e-07, "logits/chosen": 6.386392593383789, "logits/rejected": 7.530998706817627, "logps/chosen": -2.098471164703369, "logps/rejected": -2.5879642963409424, "loss": 3.9688, "rewards/accuracies": 1.0, "rewards/chosen": -20.984710693359375, "rewards/margins": 4.89493465423584, "rewards/rejected": -25.8796443939209, "step": 1104 }, { "epoch": 0.15046296296296297, "grad_norm": 56.33496481705276, "learning_rate": 7.938291943393751e-07, "logits/chosen": 7.160405158996582, "logits/rejected": 7.846912384033203, "logps/chosen": -2.2942092418670654, "logps/rejected": -2.671229362487793, "loss": 4.2535, "rewards/accuracies": 0.75, "rewards/chosen": -22.942092895507812, "rewards/margins": 3.770202159881592, "rewards/rejected": -26.712295532226562, "step": 1105 }, { "epoch": 0.15059912854030502, "grad_norm": 45.68840017670658, "learning_rate": 7.937958801688185e-07, "logits/chosen": 5.974714279174805, "logits/rejected": 6.648443698883057, "logps/chosen": -1.9691143035888672, "logps/rejected": -2.428623676300049, "loss": 4.4656, "rewards/accuracies": 1.0, "rewards/chosen": -19.691143035888672, "rewards/margins": 4.595095157623291, "rewards/rejected": -24.286237716674805, "step": 1106 }, { "epoch": 0.15073529411764705, "grad_norm": 45.99790638782137, "learning_rate": 7.937624770168277e-07, "logits/chosen": 7.1137237548828125, "logits/rejected": 8.863107681274414, "logps/chosen": -2.2291576862335205, "logps/rejected": -2.8464953899383545, "loss": 4.0028, "rewards/accuracies": 1.0, "rewards/chosen": -22.29157829284668, "rewards/margins": 6.17337703704834, "rewards/rejected": -28.464954376220703, "step": 1107 }, { "epoch": 0.1508714596949891, "grad_norm": 49.75086677367135, "learning_rate": 7.937289848909503e-07, "logits/chosen": 8.495540618896484, "logits/rejected": 9.75747299194336, "logps/chosen": -2.537351608276367, "logps/rejected": -2.8885278701782227, "loss": 3.9291, "rewards/accuracies": 1.0, "rewards/chosen": -25.373517990112305, "rewards/margins": 3.5117626190185547, "rewards/rejected": -28.88528060913086, "step": 1108 }, { "epoch": 0.15100762527233116, "grad_norm": 47.998034874584896, "learning_rate": 7.93695403798754e-07, "logits/chosen": 7.911487102508545, "logits/rejected": 8.116324424743652, "logps/chosen": -2.6034388542175293, "logps/rejected": -2.644371509552002, "loss": 4.5003, "rewards/accuracies": 0.5, "rewards/chosen": -26.03438949584961, "rewards/margins": 0.40932512283325195, "rewards/rejected": -26.443714141845703, "step": 1109 }, { "epoch": 0.1511437908496732, "grad_norm": 53.73508621120248, "learning_rate": 7.936617337478271e-07, "logits/chosen": 7.846975326538086, "logits/rejected": 8.07398796081543, "logps/chosen": -2.516127824783325, "logps/rejected": -2.513361930847168, "loss": 4.5816, "rewards/accuracies": 0.75, "rewards/chosen": -25.161277770996094, "rewards/margins": -0.02765941619873047, "rewards/rejected": -25.13361930847168, "step": 1110 }, { "epoch": 0.15127995642701525, "grad_norm": 43.6754490513479, "learning_rate": 7.936279747457773e-07, "logits/chosen": 6.595114231109619, "logits/rejected": 8.765579223632812, "logps/chosen": -2.5741915702819824, "logps/rejected": -2.8210973739624023, "loss": 4.1176, "rewards/accuracies": 1.0, "rewards/chosen": -25.741914749145508, "rewards/margins": 2.469059944152832, "rewards/rejected": -28.210975646972656, "step": 1111 }, { "epoch": 0.1514161220043573, "grad_norm": 48.53461497485841, "learning_rate": 7.935941268002329e-07, "logits/chosen": 7.800570964813232, "logits/rejected": 7.140870094299316, "logps/chosen": -2.521850824356079, "logps/rejected": -2.4819018840789795, "loss": 4.5298, "rewards/accuracies": 0.25, "rewards/chosen": -25.218507766723633, "rewards/margins": -0.3994889259338379, "rewards/rejected": -24.819019317626953, "step": 1112 }, { "epoch": 0.15155228758169934, "grad_norm": 45.74515848626242, "learning_rate": 7.935601899188421e-07, "logits/chosen": 7.933063507080078, "logits/rejected": 7.470095634460449, "logps/chosen": -2.617710590362549, "logps/rejected": -2.6650404930114746, "loss": 4.1799, "rewards/accuracies": 0.75, "rewards/chosen": -26.177108764648438, "rewards/margins": 0.4732961654663086, "rewards/rejected": -26.65040397644043, "step": 1113 }, { "epoch": 0.1516884531590414, "grad_norm": 51.410909844869806, "learning_rate": 7.935261641092731e-07, "logits/chosen": 7.308617115020752, "logits/rejected": 8.124489784240723, "logps/chosen": -2.8078982830047607, "logps/rejected": -2.642043113708496, "loss": 4.531, "rewards/accuracies": 0.5, "rewards/chosen": -28.078983306884766, "rewards/margins": -1.6585540771484375, "rewards/rejected": -26.420429229736328, "step": 1114 }, { "epoch": 0.15182461873638345, "grad_norm": 56.69295531765357, "learning_rate": 7.934920493792145e-07, "logits/chosen": 7.073423385620117, "logits/rejected": 8.822671890258789, "logps/chosen": -2.4147751331329346, "logps/rejected": -2.85988187789917, "loss": 4.6191, "rewards/accuracies": 0.75, "rewards/chosen": -24.14775276184082, "rewards/margins": 4.451067924499512, "rewards/rejected": -28.598819732666016, "step": 1115 }, { "epoch": 0.15196078431372548, "grad_norm": 45.38378203520716, "learning_rate": 7.934578457363746e-07, "logits/chosen": 6.154585838317871, "logits/rejected": 7.758505821228027, "logps/chosen": -2.4761385917663574, "logps/rejected": -2.873276710510254, "loss": 4.4567, "rewards/accuracies": 1.0, "rewards/chosen": -24.761384963989258, "rewards/margins": 3.9713845252990723, "rewards/rejected": -28.732769012451172, "step": 1116 }, { "epoch": 0.15209694989106753, "grad_norm": 46.706353315106476, "learning_rate": 7.934235531884821e-07, "logits/chosen": 8.36489200592041, "logits/rejected": 7.776047706604004, "logps/chosen": -2.703859329223633, "logps/rejected": -2.3734636306762695, "loss": 3.8474, "rewards/accuracies": 0.5, "rewards/chosen": -27.038593292236328, "rewards/margins": -3.3039560317993164, "rewards/rejected": -23.734638214111328, "step": 1117 }, { "epoch": 0.1522331154684096, "grad_norm": 55.779490827205635, "learning_rate": 7.933891717432858e-07, "logits/chosen": 6.872611045837402, "logits/rejected": 7.142293930053711, "logps/chosen": -2.7568254470825195, "logps/rejected": -2.4000492095947266, "loss": 4.4114, "rewards/accuracies": 0.25, "rewards/chosen": -27.568256378173828, "rewards/margins": -3.5677647590637207, "rewards/rejected": -24.000492095947266, "step": 1118 }, { "epoch": 0.15236928104575165, "grad_norm": 53.54215252513372, "learning_rate": 7.933547014085542e-07, "logits/chosen": 6.310622692108154, "logits/rejected": 6.389246940612793, "logps/chosen": -2.188443422317505, "logps/rejected": -2.286533832550049, "loss": 4.2291, "rewards/accuracies": 0.5, "rewards/chosen": -21.884435653686523, "rewards/margins": 0.9809060096740723, "rewards/rejected": -22.865341186523438, "step": 1119 }, { "epoch": 0.15250544662309368, "grad_norm": 51.742682034495544, "learning_rate": 7.933201421920765e-07, "logits/chosen": 6.000193119049072, "logits/rejected": 8.124146461486816, "logps/chosen": -2.19301700592041, "logps/rejected": -2.6686196327209473, "loss": 3.7534, "rewards/accuracies": 1.0, "rewards/chosen": -21.93016815185547, "rewards/margins": 4.756026744842529, "rewards/rejected": -26.686195373535156, "step": 1120 }, { "epoch": 0.15264161220043573, "grad_norm": 57.74569651474768, "learning_rate": 7.932854941016613e-07, "logits/chosen": 7.827439785003662, "logits/rejected": 9.166203498840332, "logps/chosen": -2.650038242340088, "logps/rejected": -2.8002889156341553, "loss": 4.7503, "rewards/accuracies": 0.5, "rewards/chosen": -26.500385284423828, "rewards/margins": 1.5025053024291992, "rewards/rejected": -28.002887725830078, "step": 1121 }, { "epoch": 0.1527777777777778, "grad_norm": 58.832784902853724, "learning_rate": 7.932507571451378e-07, "logits/chosen": 7.999378204345703, "logits/rejected": 8.696267127990723, "logps/chosen": -2.754786729812622, "logps/rejected": -2.7638330459594727, "loss": 4.7245, "rewards/accuracies": 0.5, "rewards/chosen": -27.547870635986328, "rewards/margins": 0.09046077728271484, "rewards/rejected": -27.638328552246094, "step": 1122 }, { "epoch": 0.15291394335511982, "grad_norm": 50.79732542165751, "learning_rate": 7.932159313303551e-07, "logits/chosen": 8.095524787902832, "logits/rejected": 8.626598358154297, "logps/chosen": -2.3888678550720215, "logps/rejected": -2.537919521331787, "loss": 4.237, "rewards/accuracies": 0.75, "rewards/chosen": -23.8886775970459, "rewards/margins": 1.490518569946289, "rewards/rejected": -25.379196166992188, "step": 1123 }, { "epoch": 0.15305010893246188, "grad_norm": 47.06489426939921, "learning_rate": 7.931810166651824e-07, "logits/chosen": 8.589064598083496, "logits/rejected": 6.487595558166504, "logps/chosen": -3.0220887660980225, "logps/rejected": -2.7360782623291016, "loss": 4.0647, "rewards/accuracies": 0.25, "rewards/chosen": -30.22088623046875, "rewards/margins": -2.8601036071777344, "rewards/rejected": -27.360782623291016, "step": 1124 }, { "epoch": 0.15318627450980393, "grad_norm": 52.186525903535916, "learning_rate": 7.931460131575089e-07, "logits/chosen": 7.107768535614014, "logits/rejected": 8.813922882080078, "logps/chosen": -2.5710554122924805, "logps/rejected": -2.666779041290283, "loss": 3.9358, "rewards/accuracies": 0.75, "rewards/chosen": -25.710556030273438, "rewards/margins": 0.9572362899780273, "rewards/rejected": -26.66779327392578, "step": 1125 }, { "epoch": 0.15332244008714596, "grad_norm": 65.16349322898691, "learning_rate": 7.931109208152439e-07, "logits/chosen": 8.381075859069824, "logits/rejected": 8.853099822998047, "logps/chosen": -2.672886610031128, "logps/rejected": -2.767268180847168, "loss": 4.4074, "rewards/accuracies": 0.75, "rewards/chosen": -26.728866577148438, "rewards/margins": 0.943817138671875, "rewards/rejected": -27.672683715820312, "step": 1126 }, { "epoch": 0.15345860566448802, "grad_norm": 52.11353478995046, "learning_rate": 7.930757396463169e-07, "logits/chosen": 7.996394157409668, "logits/rejected": 7.880429267883301, "logps/chosen": -2.8095614910125732, "logps/rejected": -2.7835144996643066, "loss": 4.4715, "rewards/accuracies": 0.5, "rewards/chosen": -28.09561538696289, "rewards/margins": -0.2604713439941406, "rewards/rejected": -27.83514404296875, "step": 1127 }, { "epoch": 0.15359477124183007, "grad_norm": 48.82053117520981, "learning_rate": 7.930404696586773e-07, "logits/chosen": 7.054836273193359, "logits/rejected": 7.081324577331543, "logps/chosen": -2.5010874271392822, "logps/rejected": -2.708683967590332, "loss": 4.3409, "rewards/accuracies": 0.75, "rewards/chosen": -25.010875701904297, "rewards/margins": 2.0759644508361816, "rewards/rejected": -27.08683967590332, "step": 1128 }, { "epoch": 0.1537309368191721, "grad_norm": 89.87524304024936, "learning_rate": 7.930051108602947e-07, "logits/chosen": 7.968194007873535, "logits/rejected": 7.559394836425781, "logps/chosen": -2.558565139770508, "logps/rejected": -2.6166303157806396, "loss": 4.0062, "rewards/accuracies": 0.5, "rewards/chosen": -25.585651397705078, "rewards/margins": 0.5806512832641602, "rewards/rejected": -26.166303634643555, "step": 1129 }, { "epoch": 0.15386710239651416, "grad_norm": 55.015201650742334, "learning_rate": 7.929696632591588e-07, "logits/chosen": 8.967996597290039, "logits/rejected": 8.77226448059082, "logps/chosen": -2.66178560256958, "logps/rejected": -2.7870917320251465, "loss": 4.6798, "rewards/accuracies": 0.75, "rewards/chosen": -26.617855072021484, "rewards/margins": 1.2530617713928223, "rewards/rejected": -27.87091636657715, "step": 1130 }, { "epoch": 0.15400326797385622, "grad_norm": 51.20538348333452, "learning_rate": 7.929341268632789e-07, "logits/chosen": 8.909905433654785, "logits/rejected": 8.83310604095459, "logps/chosen": -2.438997507095337, "logps/rejected": -2.645615577697754, "loss": 4.1202, "rewards/accuracies": 0.75, "rewards/chosen": -24.38997459411621, "rewards/margins": 2.0661792755126953, "rewards/rejected": -26.456153869628906, "step": 1131 }, { "epoch": 0.15413943355119825, "grad_norm": 51.88503632670336, "learning_rate": 7.928985016806851e-07, "logits/chosen": 8.903955459594727, "logits/rejected": 8.978463172912598, "logps/chosen": -3.0605521202087402, "logps/rejected": -2.9746899604797363, "loss": 4.2738, "rewards/accuracies": 0.25, "rewards/chosen": -30.605520248413086, "rewards/margins": -0.8586196899414062, "rewards/rejected": -29.746898651123047, "step": 1132 }, { "epoch": 0.1542755991285403, "grad_norm": 50.79296579314532, "learning_rate": 7.928627877194273e-07, "logits/chosen": 6.140006065368652, "logits/rejected": 7.95203161239624, "logps/chosen": -2.23089599609375, "logps/rejected": -2.3642444610595703, "loss": 4.0223, "rewards/accuracies": 0.75, "rewards/chosen": -22.3089599609375, "rewards/margins": 1.333484172821045, "rewards/rejected": -23.64244270324707, "step": 1133 }, { "epoch": 0.15441176470588236, "grad_norm": 44.560198993859956, "learning_rate": 7.92826984987575e-07, "logits/chosen": 7.832927227020264, "logits/rejected": 7.705656051635742, "logps/chosen": -2.758449077606201, "logps/rejected": -2.8656797409057617, "loss": 4.3714, "rewards/accuracies": 0.75, "rewards/chosen": -27.584491729736328, "rewards/margins": 1.0723042488098145, "rewards/rejected": -28.656795501708984, "step": 1134 }, { "epoch": 0.1545479302832244, "grad_norm": 84.8687861154726, "learning_rate": 7.927910934932183e-07, "logits/chosen": 8.605606079101562, "logits/rejected": 8.820699691772461, "logps/chosen": -2.3170971870422363, "logps/rejected": -3.242525339126587, "loss": 3.7956, "rewards/accuracies": 1.0, "rewards/chosen": -23.170970916748047, "rewards/margins": 9.254283905029297, "rewards/rejected": -32.425254821777344, "step": 1135 }, { "epoch": 0.15468409586056645, "grad_norm": 55.60804787846568, "learning_rate": 7.927551132444673e-07, "logits/chosen": 8.109528541564941, "logits/rejected": 8.08955192565918, "logps/chosen": -2.7956957817077637, "logps/rejected": -2.873126983642578, "loss": 4.0968, "rewards/accuracies": 0.75, "rewards/chosen": -27.956958770751953, "rewards/margins": 0.7743101119995117, "rewards/rejected": -28.73126983642578, "step": 1136 }, { "epoch": 0.1548202614379085, "grad_norm": 66.38094526321271, "learning_rate": 7.927190442494518e-07, "logits/chosen": 7.611963272094727, "logits/rejected": 8.75413703918457, "logps/chosen": -2.578045606613159, "logps/rejected": -3.227126121520996, "loss": 4.0626, "rewards/accuracies": 1.0, "rewards/chosen": -25.78045654296875, "rewards/margins": 6.490805625915527, "rewards/rejected": -32.27125930786133, "step": 1137 }, { "epoch": 0.15495642701525056, "grad_norm": 53.59079994724869, "learning_rate": 7.926828865163221e-07, "logits/chosen": 9.136785507202148, "logits/rejected": 9.34780216217041, "logps/chosen": -3.5222764015197754, "logps/rejected": -3.237955331802368, "loss": 4.162, "rewards/accuracies": 0.5, "rewards/chosen": -35.2227668762207, "rewards/margins": -2.8432135581970215, "rewards/rejected": -32.379554748535156, "step": 1138 }, { "epoch": 0.1550925925925926, "grad_norm": 48.514048825530836, "learning_rate": 7.926466400532481e-07, "logits/chosen": 7.240055084228516, "logits/rejected": 7.6533098220825195, "logps/chosen": -2.5932111740112305, "logps/rejected": -2.9825239181518555, "loss": 4.041, "rewards/accuracies": 0.75, "rewards/chosen": -25.932113647460938, "rewards/margins": 3.893127918243408, "rewards/rejected": -29.825241088867188, "step": 1139 }, { "epoch": 0.15522875816993464, "grad_norm": 60.343495888776175, "learning_rate": 7.926103048684203e-07, "logits/chosen": 9.062688827514648, "logits/rejected": 9.05910873413086, "logps/chosen": -2.7550997734069824, "logps/rejected": -3.0322134494781494, "loss": 4.3211, "rewards/accuracies": 0.75, "rewards/chosen": -27.55099868774414, "rewards/margins": 2.7711377143859863, "rewards/rejected": -30.32213592529297, "step": 1140 }, { "epoch": 0.1553649237472767, "grad_norm": 52.34998496454558, "learning_rate": 7.925738809700487e-07, "logits/chosen": 9.663583755493164, "logits/rejected": 9.857624053955078, "logps/chosen": -2.888364791870117, "logps/rejected": -3.0095415115356445, "loss": 4.5095, "rewards/accuracies": 0.75, "rewards/chosen": -28.883647918701172, "rewards/margins": 1.211766242980957, "rewards/rejected": -30.095413208007812, "step": 1141 }, { "epoch": 0.15550108932461873, "grad_norm": 48.33693550261309, "learning_rate": 7.925373683663636e-07, "logits/chosen": 8.16923999786377, "logits/rejected": 8.625839233398438, "logps/chosen": -2.9065358638763428, "logps/rejected": -2.9110288619995117, "loss": 4.3266, "rewards/accuracies": 0.25, "rewards/chosen": -29.065359115600586, "rewards/margins": 0.04493093490600586, "rewards/rejected": -29.11029052734375, "step": 1142 }, { "epoch": 0.1556372549019608, "grad_norm": 53.06829495199242, "learning_rate": 7.925007670656154e-07, "logits/chosen": 9.047494888305664, "logits/rejected": 8.310314178466797, "logps/chosen": -2.582686185836792, "logps/rejected": -2.2463178634643555, "loss": 4.6406, "rewards/accuracies": 0.25, "rewards/chosen": -25.826860427856445, "rewards/margins": -3.3636832237243652, "rewards/rejected": -22.463176727294922, "step": 1143 }, { "epoch": 0.15577342047930284, "grad_norm": 51.05536120649815, "learning_rate": 7.924640770760744e-07, "logits/chosen": 9.71915054321289, "logits/rejected": 9.700967788696289, "logps/chosen": -3.1135482788085938, "logps/rejected": -2.9611129760742188, "loss": 4.41, "rewards/accuracies": 0.5, "rewards/chosen": -31.135482788085938, "rewards/margins": -1.5243525505065918, "rewards/rejected": -29.611129760742188, "step": 1144 }, { "epoch": 0.15590958605664487, "grad_norm": 48.98255232006541, "learning_rate": 7.924272984060311e-07, "logits/chosen": 8.014548301696777, "logits/rejected": 8.1425199508667, "logps/chosen": -2.387683629989624, "logps/rejected": -2.6606011390686035, "loss": 4.4824, "rewards/accuracies": 0.75, "rewards/chosen": -23.8768367767334, "rewards/margins": 2.7291741371154785, "rewards/rejected": -26.60601043701172, "step": 1145 }, { "epoch": 0.15604575163398693, "grad_norm": 48.43641808214925, "learning_rate": 7.923904310637959e-07, "logits/chosen": 9.019365310668945, "logits/rejected": 8.98886489868164, "logps/chosen": -2.805178165435791, "logps/rejected": -2.327613115310669, "loss": 4.2579, "rewards/accuracies": 0.25, "rewards/chosen": -28.051780700683594, "rewards/margins": -4.775650978088379, "rewards/rejected": -23.27613067626953, "step": 1146 }, { "epoch": 0.15618191721132899, "grad_norm": 52.15130137162261, "learning_rate": 7.923534750576993e-07, "logits/chosen": 8.36589241027832, "logits/rejected": 9.192293167114258, "logps/chosen": -2.6576874256134033, "logps/rejected": -2.763162612915039, "loss": 4.3784, "rewards/accuracies": 0.75, "rewards/chosen": -26.576873779296875, "rewards/margins": 1.0547528266906738, "rewards/rejected": -27.63162612915039, "step": 1147 }, { "epoch": 0.15631808278867101, "grad_norm": 45.03166852086459, "learning_rate": 7.923164303960917e-07, "logits/chosen": 6.9306440353393555, "logits/rejected": 6.70695686340332, "logps/chosen": -2.3674135208129883, "logps/rejected": -2.4322686195373535, "loss": 4.4935, "rewards/accuracies": 0.5, "rewards/chosen": -23.674137115478516, "rewards/margins": 0.6485509872436523, "rewards/rejected": -24.32268714904785, "step": 1148 }, { "epoch": 0.15645424836601307, "grad_norm": 51.45788059727349, "learning_rate": 7.922792970873438e-07, "logits/chosen": 8.838412284851074, "logits/rejected": 9.99671745300293, "logps/chosen": -2.7254369258880615, "logps/rejected": -2.9353854656219482, "loss": 3.8091, "rewards/accuracies": 0.75, "rewards/chosen": -27.254369735717773, "rewards/margins": 2.099486827850342, "rewards/rejected": -29.35385513305664, "step": 1149 }, { "epoch": 0.15659041394335513, "grad_norm": 46.61824107435791, "learning_rate": 7.922420751398461e-07, "logits/chosen": 7.514091491699219, "logits/rejected": 8.100318908691406, "logps/chosen": -2.519902229309082, "logps/rejected": -2.6904687881469727, "loss": 4.605, "rewards/accuracies": 0.5, "rewards/chosen": -25.19902229309082, "rewards/margins": 1.7056665420532227, "rewards/rejected": -26.90468978881836, "step": 1150 }, { "epoch": 0.15672657952069716, "grad_norm": 50.725320299261604, "learning_rate": 7.92204764562009e-07, "logits/chosen": 7.654720783233643, "logits/rejected": 8.33786678314209, "logps/chosen": -2.5245418548583984, "logps/rejected": -2.4948627948760986, "loss": 4.397, "rewards/accuracies": 0.5, "rewards/chosen": -25.245420455932617, "rewards/margins": -0.29679155349731445, "rewards/rejected": -24.948627471923828, "step": 1151 }, { "epoch": 0.1568627450980392, "grad_norm": 44.760036975263745, "learning_rate": 7.921673653622636e-07, "logits/chosen": 7.004299163818359, "logits/rejected": 7.077592372894287, "logps/chosen": -2.547724485397339, "logps/rejected": -2.6706345081329346, "loss": 4.474, "rewards/accuracies": 0.5, "rewards/chosen": -25.477245330810547, "rewards/margins": 1.2291021347045898, "rewards/rejected": -26.70634651184082, "step": 1152 }, { "epoch": 0.15699891067538127, "grad_norm": 44.604377637593835, "learning_rate": 7.921298775490603e-07, "logits/chosen": 7.684248924255371, "logits/rejected": 6.539210319519043, "logps/chosen": -2.1987125873565674, "logps/rejected": -2.25244402885437, "loss": 4.2733, "rewards/accuracies": 0.5, "rewards/chosen": -21.98712730407715, "rewards/margins": 0.5373134613037109, "rewards/rejected": -22.52444076538086, "step": 1153 }, { "epoch": 0.1571350762527233, "grad_norm": 42.96783643356912, "learning_rate": 7.920923011308696e-07, "logits/chosen": 9.011078834533691, "logits/rejected": 9.188773155212402, "logps/chosen": -2.7424569129943848, "logps/rejected": -2.7954225540161133, "loss": 4.3914, "rewards/accuracies": 0.5, "rewards/chosen": -27.42456817626953, "rewards/margins": 0.5296578407287598, "rewards/rejected": -27.954227447509766, "step": 1154 }, { "epoch": 0.15727124183006536, "grad_norm": 70.63035732957488, "learning_rate": 7.920546361161825e-07, "logits/chosen": 7.3046464920043945, "logits/rejected": 8.41751480102539, "logps/chosen": -2.424708843231201, "logps/rejected": -2.6276326179504395, "loss": 4.2232, "rewards/accuracies": 0.75, "rewards/chosen": -24.247089385986328, "rewards/margins": 2.0292348861694336, "rewards/rejected": -26.276325225830078, "step": 1155 }, { "epoch": 0.1574074074074074, "grad_norm": 49.80699744894694, "learning_rate": 7.920168825135097e-07, "logits/chosen": 7.609920501708984, "logits/rejected": 9.259225845336914, "logps/chosen": -2.2115843296051025, "logps/rejected": -2.771260976791382, "loss": 4.0236, "rewards/accuracies": 1.0, "rewards/chosen": -22.115842819213867, "rewards/margins": 5.596766471862793, "rewards/rejected": -27.712608337402344, "step": 1156 }, { "epoch": 0.15754357298474944, "grad_norm": 54.2128644693342, "learning_rate": 7.919790403313818e-07, "logits/chosen": 7.226951599121094, "logits/rejected": 8.616751670837402, "logps/chosen": -2.3671154975891113, "logps/rejected": -2.8022561073303223, "loss": 4.6121, "rewards/accuracies": 0.5, "rewards/chosen": -23.671154022216797, "rewards/margins": 4.351408958435059, "rewards/rejected": -28.02256202697754, "step": 1157 }, { "epoch": 0.1576797385620915, "grad_norm": 44.42682241721088, "learning_rate": 7.919411095783496e-07, "logits/chosen": 6.799042701721191, "logits/rejected": 5.2401933670043945, "logps/chosen": -2.4016971588134766, "logps/rejected": -2.170586109161377, "loss": 4.3226, "rewards/accuracies": 0.25, "rewards/chosen": -24.016971588134766, "rewards/margins": -2.3111090660095215, "rewards/rejected": -21.705862045288086, "step": 1158 }, { "epoch": 0.15781590413943355, "grad_norm": 47.43810520202025, "learning_rate": 7.919030902629838e-07, "logits/chosen": 8.118438720703125, "logits/rejected": 7.878485202789307, "logps/chosen": -2.4327619075775146, "logps/rejected": -2.5444278717041016, "loss": 3.8546, "rewards/accuracies": 0.5, "rewards/chosen": -24.327617645263672, "rewards/margins": 1.1166596412658691, "rewards/rejected": -25.444278717041016, "step": 1159 }, { "epoch": 0.1579520697167756, "grad_norm": 52.131680348712926, "learning_rate": 7.918649823938753e-07, "logits/chosen": 6.4860053062438965, "logits/rejected": 7.53181266784668, "logps/chosen": -2.2847299575805664, "logps/rejected": -2.5504636764526367, "loss": 4.303, "rewards/accuracies": 0.75, "rewards/chosen": -22.847301483154297, "rewards/margins": 2.6573362350463867, "rewards/rejected": -25.504638671875, "step": 1160 }, { "epoch": 0.15808823529411764, "grad_norm": 44.724993765113375, "learning_rate": 7.91826785979635e-07, "logits/chosen": 6.9204607009887695, "logits/rejected": 8.575255393981934, "logps/chosen": -2.2706289291381836, "logps/rejected": -2.498764991760254, "loss": 4.2858, "rewards/accuracies": 0.5, "rewards/chosen": -22.706289291381836, "rewards/margins": 2.281360149383545, "rewards/rejected": -24.987648010253906, "step": 1161 }, { "epoch": 0.1582244008714597, "grad_norm": 46.282360102421016, "learning_rate": 7.917885010288933e-07, "logits/chosen": 7.809560775756836, "logits/rejected": 8.784469604492188, "logps/chosen": -2.3839683532714844, "logps/rejected": -2.7231976985931396, "loss": 4.2314, "rewards/accuracies": 0.75, "rewards/chosen": -23.839683532714844, "rewards/margins": 3.3922953605651855, "rewards/rejected": -27.231977462768555, "step": 1162 }, { "epoch": 0.15836056644880175, "grad_norm": 46.04768906287647, "learning_rate": 7.917501275503013e-07, "logits/chosen": 8.538017272949219, "logits/rejected": 7.784733772277832, "logps/chosen": -2.501108169555664, "logps/rejected": -2.3846471309661865, "loss": 4.4935, "rewards/accuracies": 0.25, "rewards/chosen": -25.01108169555664, "rewards/margins": -1.164609432220459, "rewards/rejected": -23.846473693847656, "step": 1163 }, { "epoch": 0.15849673202614378, "grad_norm": 55.14409505682975, "learning_rate": 7.917116655525298e-07, "logits/chosen": 8.162550926208496, "logits/rejected": 7.579042434692383, "logps/chosen": -2.450856924057007, "logps/rejected": -2.319948434829712, "loss": 4.1721, "rewards/accuracies": 0.25, "rewards/chosen": -24.508569717407227, "rewards/margins": -1.3090848922729492, "rewards/rejected": -23.199485778808594, "step": 1164 }, { "epoch": 0.15863289760348584, "grad_norm": 48.57837222124435, "learning_rate": 7.916731150442695e-07, "logits/chosen": 6.586687088012695, "logits/rejected": 7.835637092590332, "logps/chosen": -1.9810789823532104, "logps/rejected": -2.5715081691741943, "loss": 3.5419, "rewards/accuracies": 1.0, "rewards/chosen": -19.810789108276367, "rewards/margins": 5.904292106628418, "rewards/rejected": -25.7150821685791, "step": 1165 }, { "epoch": 0.1587690631808279, "grad_norm": 45.47114679644667, "learning_rate": 7.916344760342312e-07, "logits/chosen": 6.994275093078613, "logits/rejected": 6.381542205810547, "logps/chosen": -1.993330478668213, "logps/rejected": -1.9740815162658691, "loss": 4.0959, "rewards/accuracies": 0.5, "rewards/chosen": -19.933305740356445, "rewards/margins": -0.1924903392791748, "rewards/rejected": -19.740816116333008, "step": 1166 }, { "epoch": 0.15890522875816993, "grad_norm": 47.885648092115645, "learning_rate": 7.915957485311459e-07, "logits/chosen": 9.064716339111328, "logits/rejected": 6.893901348114014, "logps/chosen": -2.4340057373046875, "logps/rejected": -2.2068421840667725, "loss": 4.2062, "rewards/accuracies": 0.0, "rewards/chosen": -24.340059280395508, "rewards/margins": -2.271636962890625, "rewards/rejected": -22.068422317504883, "step": 1167 }, { "epoch": 0.15904139433551198, "grad_norm": 39.83694543525536, "learning_rate": 7.915569325437641e-07, "logits/chosen": 6.62479305267334, "logits/rejected": 8.374641418457031, "logps/chosen": -2.468787908554077, "logps/rejected": -3.0539541244506836, "loss": 3.8829, "rewards/accuracies": 0.75, "rewards/chosen": -24.68787956237793, "rewards/margins": 5.8516621589660645, "rewards/rejected": -30.539541244506836, "step": 1168 }, { "epoch": 0.15917755991285404, "grad_norm": 44.17907737508109, "learning_rate": 7.915180280808568e-07, "logits/chosen": 7.8822784423828125, "logits/rejected": 8.205745697021484, "logps/chosen": -2.0268239974975586, "logps/rejected": -2.5029003620147705, "loss": 4.0426, "rewards/accuracies": 0.75, "rewards/chosen": -20.268238067626953, "rewards/margins": 4.760765552520752, "rewards/rejected": -25.029003143310547, "step": 1169 }, { "epoch": 0.15931372549019607, "grad_norm": 48.240478799291836, "learning_rate": 7.914790351512149e-07, "logits/chosen": 6.825207233428955, "logits/rejected": 6.709481239318848, "logps/chosen": -2.4006712436676025, "logps/rejected": -2.278965711593628, "loss": 4.3218, "rewards/accuracies": 0.5, "rewards/chosen": -24.0067138671875, "rewards/margins": -1.2170567512512207, "rewards/rejected": -22.789657592773438, "step": 1170 }, { "epoch": 0.15944989106753812, "grad_norm": 41.89667872976446, "learning_rate": 7.914399537636488e-07, "logits/chosen": 7.255977630615234, "logits/rejected": 8.588140487670898, "logps/chosen": -2.3489692211151123, "logps/rejected": -2.4386115074157715, "loss": 3.863, "rewards/accuracies": 0.25, "rewards/chosen": -23.48969268798828, "rewards/margins": 0.8964223861694336, "rewards/rejected": -24.38611602783203, "step": 1171 }, { "epoch": 0.15958605664488018, "grad_norm": 44.132972413189506, "learning_rate": 7.914007839269896e-07, "logits/chosen": 6.506877899169922, "logits/rejected": 7.227011203765869, "logps/chosen": -1.987058401107788, "logps/rejected": -2.3061537742614746, "loss": 4.2928, "rewards/accuracies": 1.0, "rewards/chosen": -19.870586395263672, "rewards/margins": 3.1909520626068115, "rewards/rejected": -23.061538696289062, "step": 1172 }, { "epoch": 0.1597222222222222, "grad_norm": 46.648439514965986, "learning_rate": 7.913615256500878e-07, "logits/chosen": 7.127978324890137, "logits/rejected": 7.009485721588135, "logps/chosen": -2.194605827331543, "logps/rejected": -2.2979073524475098, "loss": 4.0348, "rewards/accuracies": 0.75, "rewards/chosen": -21.946060180664062, "rewards/margins": 1.033012866973877, "rewards/rejected": -22.97907257080078, "step": 1173 }, { "epoch": 0.15985838779956427, "grad_norm": 46.703994671496496, "learning_rate": 7.913221789418143e-07, "logits/chosen": 6.202146053314209, "logits/rejected": 5.9574079513549805, "logps/chosen": -2.443464756011963, "logps/rejected": -2.383199691772461, "loss": 4.4231, "rewards/accuracies": 0.5, "rewards/chosen": -24.434646606445312, "rewards/margins": -0.6026501655578613, "rewards/rejected": -23.83199691772461, "step": 1174 }, { "epoch": 0.15999455337690632, "grad_norm": 45.83561935250806, "learning_rate": 7.912827438110598e-07, "logits/chosen": 6.29643440246582, "logits/rejected": 7.757932662963867, "logps/chosen": -1.9356484413146973, "logps/rejected": -2.280552387237549, "loss": 3.9719, "rewards/accuracies": 0.5, "rewards/chosen": -19.35648536682129, "rewards/margins": 3.4490394592285156, "rewards/rejected": -22.805524826049805, "step": 1175 }, { "epoch": 0.16013071895424835, "grad_norm": 49.26792414488684, "learning_rate": 7.91243220266735e-07, "logits/chosen": 7.866071701049805, "logits/rejected": 7.4854736328125, "logps/chosen": -2.341930627822876, "logps/rejected": -2.4589178562164307, "loss": 4.6449, "rewards/accuracies": 0.5, "rewards/chosen": -23.41930389404297, "rewards/margins": 1.1698741912841797, "rewards/rejected": -24.58917999267578, "step": 1176 }, { "epoch": 0.1602668845315904, "grad_norm": 45.23866729370216, "learning_rate": 7.912036083177704e-07, "logits/chosen": 5.3134965896606445, "logits/rejected": 7.031797409057617, "logps/chosen": -2.77933669090271, "logps/rejected": -2.4776968955993652, "loss": 4.5327, "rewards/accuracies": 0.5, "rewards/chosen": -27.793365478515625, "rewards/margins": -3.01639986038208, "rewards/rejected": -24.776966094970703, "step": 1177 }, { "epoch": 0.16040305010893247, "grad_norm": 46.878615211628805, "learning_rate": 7.911639079731169e-07, "logits/chosen": 5.287275314331055, "logits/rejected": 7.326613903045654, "logps/chosen": -2.184382438659668, "logps/rejected": -2.570538282394409, "loss": 4.0847, "rewards/accuracies": 0.75, "rewards/chosen": -21.843826293945312, "rewards/margins": 3.861556053161621, "rewards/rejected": -25.705381393432617, "step": 1178 }, { "epoch": 0.16053921568627452, "grad_norm": 45.87725495399492, "learning_rate": 7.911241192417449e-07, "logits/chosen": 7.23468017578125, "logits/rejected": 6.312538146972656, "logps/chosen": -2.3124256134033203, "logps/rejected": -2.1443490982055664, "loss": 4.1223, "rewards/accuracies": 0.25, "rewards/chosen": -23.12425422668457, "rewards/margins": -1.6807622909545898, "rewards/rejected": -21.443492889404297, "step": 1179 }, { "epoch": 0.16067538126361655, "grad_norm": 52.9320474925238, "learning_rate": 7.910842421326451e-07, "logits/chosen": 6.5196638107299805, "logits/rejected": 7.526887893676758, "logps/chosen": -2.2486274242401123, "logps/rejected": -2.47533917427063, "loss": 4.6589, "rewards/accuracies": 0.5, "rewards/chosen": -22.48627471923828, "rewards/margins": 2.267118215560913, "rewards/rejected": -24.75339126586914, "step": 1180 }, { "epoch": 0.1608115468409586, "grad_norm": 47.17854218548174, "learning_rate": 7.910442766548282e-07, "logits/chosen": 7.987937927246094, "logits/rejected": 7.304052352905273, "logps/chosen": -2.3568787574768066, "logps/rejected": -2.2469558715820312, "loss": 4.0473, "rewards/accuracies": 0.75, "rewards/chosen": -23.568788528442383, "rewards/margins": -1.0992283821105957, "rewards/rejected": -22.469560623168945, "step": 1181 }, { "epoch": 0.16094771241830066, "grad_norm": 49.94021768431721, "learning_rate": 7.910042228173244e-07, "logits/chosen": 6.724863052368164, "logits/rejected": 6.517125129699707, "logps/chosen": -2.665600061416626, "logps/rejected": -2.3832125663757324, "loss": 4.2933, "rewards/accuracies": 0.0, "rewards/chosen": -26.65599822998047, "rewards/margins": -2.8238730430603027, "rewards/rejected": -23.83212661743164, "step": 1182 }, { "epoch": 0.1610838779956427, "grad_norm": 52.262627188333774, "learning_rate": 7.909640806291845e-07, "logits/chosen": 6.616654396057129, "logits/rejected": 6.619044303894043, "logps/chosen": -2.4190545082092285, "logps/rejected": -2.516683340072632, "loss": 4.3374, "rewards/accuracies": 0.5, "rewards/chosen": -24.19054412841797, "rewards/margins": 0.9762880802154541, "rewards/rejected": -25.166831970214844, "step": 1183 }, { "epoch": 0.16122004357298475, "grad_norm": 50.327129444351236, "learning_rate": 7.909238500994789e-07, "logits/chosen": 8.616844177246094, "logits/rejected": 8.12669563293457, "logps/chosen": -2.726093292236328, "logps/rejected": -2.557955265045166, "loss": 3.9957, "rewards/accuracies": 0.5, "rewards/chosen": -27.26093292236328, "rewards/margins": -1.6813769340515137, "rewards/rejected": -25.57955551147461, "step": 1184 }, { "epoch": 0.1613562091503268, "grad_norm": 42.055369196256684, "learning_rate": 7.908835312372978e-07, "logits/chosen": 5.595049858093262, "logits/rejected": 8.008618354797363, "logps/chosen": -2.1636910438537598, "logps/rejected": -2.6391513347625732, "loss": 4.0066, "rewards/accuracies": 1.0, "rewards/chosen": -21.63690948486328, "rewards/margins": 4.754603862762451, "rewards/rejected": -26.39151382446289, "step": 1185 }, { "epoch": 0.16149237472766884, "grad_norm": 44.5216795625819, "learning_rate": 7.908431240517518e-07, "logits/chosen": 8.812153816223145, "logits/rejected": 8.520257949829102, "logps/chosen": -2.477008819580078, "logps/rejected": -2.3924665451049805, "loss": 3.7691, "rewards/accuracies": 0.25, "rewards/chosen": -24.77008819580078, "rewards/margins": -0.8454234600067139, "rewards/rejected": -23.924667358398438, "step": 1186 }, { "epoch": 0.1616285403050109, "grad_norm": 164.8165701785417, "learning_rate": 7.908026285519712e-07, "logits/chosen": 6.501376628875732, "logits/rejected": 8.50999641418457, "logps/chosen": -2.2639291286468506, "logps/rejected": -2.4536232948303223, "loss": 4.3337, "rewards/accuracies": 0.5, "rewards/chosen": -22.63928985595703, "rewards/margins": 1.896942138671875, "rewards/rejected": -24.536231994628906, "step": 1187 }, { "epoch": 0.16176470588235295, "grad_norm": 47.593428695847436, "learning_rate": 7.907620447471062e-07, "logits/chosen": 8.408988952636719, "logits/rejected": 8.329042434692383, "logps/chosen": -2.550058364868164, "logps/rejected": -2.6689651012420654, "loss": 4.0177, "rewards/accuracies": 0.75, "rewards/chosen": -25.50058364868164, "rewards/margins": 1.1890664100646973, "rewards/rejected": -26.689651489257812, "step": 1188 }, { "epoch": 0.16190087145969498, "grad_norm": 42.50915984053982, "learning_rate": 7.907213726463271e-07, "logits/chosen": 8.120033264160156, "logits/rejected": 6.9611334800720215, "logps/chosen": -2.440917730331421, "logps/rejected": -2.486027717590332, "loss": 4.2064, "rewards/accuracies": 0.5, "rewards/chosen": -24.409177780151367, "rewards/margins": 0.4510984420776367, "rewards/rejected": -24.860275268554688, "step": 1189 }, { "epoch": 0.16203703703703703, "grad_norm": 48.091923504014694, "learning_rate": 7.906806122588242e-07, "logits/chosen": 7.384913444519043, "logits/rejected": 8.961355209350586, "logps/chosen": -2.998725414276123, "logps/rejected": -3.208259344100952, "loss": 4.5061, "rewards/accuracies": 0.5, "rewards/chosen": -29.987255096435547, "rewards/margins": 2.095339298248291, "rewards/rejected": -32.08259582519531, "step": 1190 }, { "epoch": 0.1621732026143791, "grad_norm": 45.05655737455554, "learning_rate": 7.906397635938076e-07, "logits/chosen": 8.935664176940918, "logits/rejected": 8.443737030029297, "logps/chosen": -2.676774501800537, "logps/rejected": -2.603914737701416, "loss": 4.1709, "rewards/accuracies": 0.25, "rewards/chosen": -26.767742156982422, "rewards/margins": -0.7285952568054199, "rewards/rejected": -26.039146423339844, "step": 1191 }, { "epoch": 0.16230936819172112, "grad_norm": 48.365165196883645, "learning_rate": 7.905988266605073e-07, "logits/chosen": 8.614310264587402, "logits/rejected": 8.952247619628906, "logps/chosen": -2.806809425354004, "logps/rejected": -3.028243064880371, "loss": 4.428, "rewards/accuracies": 0.5, "rewards/chosen": -28.068092346191406, "rewards/margins": 2.21433687210083, "rewards/rejected": -30.28243064880371, "step": 1192 }, { "epoch": 0.16244553376906318, "grad_norm": 52.58963831093854, "learning_rate": 7.905578014681733e-07, "logits/chosen": 8.13994026184082, "logits/rejected": 9.573905944824219, "logps/chosen": -2.471031665802002, "logps/rejected": -2.713501453399658, "loss": 4.1018, "rewards/accuracies": 1.0, "rewards/chosen": -24.710315704345703, "rewards/margins": 2.424698829650879, "rewards/rejected": -27.135013580322266, "step": 1193 }, { "epoch": 0.16258169934640523, "grad_norm": 47.79379628333817, "learning_rate": 7.90516688026076e-07, "logits/chosen": 7.1538567543029785, "logits/rejected": 7.221102714538574, "logps/chosen": -2.686161994934082, "logps/rejected": -2.5723400115966797, "loss": 3.7762, "rewards/accuracies": 0.25, "rewards/chosen": -26.86161994934082, "rewards/margins": -1.138218879699707, "rewards/rejected": -25.723400115966797, "step": 1194 }, { "epoch": 0.16271786492374726, "grad_norm": 43.461261753558766, "learning_rate": 7.904754863435046e-07, "logits/chosen": 7.177461624145508, "logits/rejected": 6.691168785095215, "logps/chosen": -2.4437432289123535, "logps/rejected": -2.351114511489868, "loss": 3.8999, "rewards/accuracies": 0.25, "rewards/chosen": -24.43743133544922, "rewards/margins": -0.9262866973876953, "rewards/rejected": -23.511144638061523, "step": 1195 }, { "epoch": 0.16285403050108932, "grad_norm": 44.85907113575537, "learning_rate": 7.904341964297696e-07, "logits/chosen": 9.646526336669922, "logits/rejected": 10.156294822692871, "logps/chosen": -3.0514307022094727, "logps/rejected": -3.422116279602051, "loss": 3.7083, "rewards/accuracies": 0.75, "rewards/chosen": -30.514307022094727, "rewards/margins": 3.706857204437256, "rewards/rejected": -34.22116470336914, "step": 1196 }, { "epoch": 0.16299019607843138, "grad_norm": 50.873982104051876, "learning_rate": 7.903928182942005e-07, "logits/chosen": 8.998955726623535, "logits/rejected": 10.48076057434082, "logps/chosen": -2.7419614791870117, "logps/rejected": -3.1137654781341553, "loss": 4.2704, "rewards/accuracies": 1.0, "rewards/chosen": -27.419614791870117, "rewards/margins": 3.7180395126342773, "rewards/rejected": -31.137653350830078, "step": 1197 }, { "epoch": 0.16312636165577343, "grad_norm": 47.63328827847855, "learning_rate": 7.90351351946147e-07, "logits/chosen": 9.198299407958984, "logits/rejected": 9.848628044128418, "logps/chosen": -2.8352928161621094, "logps/rejected": -3.1784884929656982, "loss": 4.0531, "rewards/accuracies": 1.0, "rewards/chosen": -28.352928161621094, "rewards/margins": 3.431957244873047, "rewards/rejected": -31.78488540649414, "step": 1198 }, { "epoch": 0.16326252723311546, "grad_norm": 51.74179592760809, "learning_rate": 7.903097973949789e-07, "logits/chosen": 9.038963317871094, "logits/rejected": 9.342016220092773, "logps/chosen": -3.084888458251953, "logps/rejected": -2.933992385864258, "loss": 4.6554, "rewards/accuracies": 0.0, "rewards/chosen": -30.84888458251953, "rewards/margins": -1.5089592933654785, "rewards/rejected": -29.339923858642578, "step": 1199 }, { "epoch": 0.16339869281045752, "grad_norm": 48.450678903653554, "learning_rate": 7.902681546500858e-07, "logits/chosen": 9.912302017211914, "logits/rejected": 9.349477767944336, "logps/chosen": -3.2187867164611816, "logps/rejected": -3.2950618267059326, "loss": 4.2079, "rewards/accuracies": 0.25, "rewards/chosen": -32.1878662109375, "rewards/margins": 0.7627520561218262, "rewards/rejected": -32.950618743896484, "step": 1200 }, { "epoch": 0.16353485838779958, "grad_norm": 50.59065438960667, "learning_rate": 7.902264237208771e-07, "logits/chosen": 6.757364273071289, "logits/rejected": 7.784629821777344, "logps/chosen": -2.1237258911132812, "logps/rejected": -2.5668301582336426, "loss": 4.501, "rewards/accuracies": 0.75, "rewards/chosen": -21.237258911132812, "rewards/margins": 4.431042671203613, "rewards/rejected": -25.66830062866211, "step": 1201 }, { "epoch": 0.1636710239651416, "grad_norm": 47.237475683569976, "learning_rate": 7.901846046167824e-07, "logits/chosen": 9.065645217895508, "logits/rejected": 8.522613525390625, "logps/chosen": -2.8940999507904053, "logps/rejected": -2.6457526683807373, "loss": 4.2696, "rewards/accuracies": 0.25, "rewards/chosen": -28.940998077392578, "rewards/margins": -2.4834728240966797, "rewards/rejected": -26.45752716064453, "step": 1202 }, { "epoch": 0.16380718954248366, "grad_norm": 45.48774779418618, "learning_rate": 7.901426973472509e-07, "logits/chosen": 8.977079391479492, "logits/rejected": 9.71358871459961, "logps/chosen": -3.129847288131714, "logps/rejected": -3.214543342590332, "loss": 4.1934, "rewards/accuracies": 0.5, "rewards/chosen": -31.298473358154297, "rewards/margins": 0.8469619750976562, "rewards/rejected": -32.14543533325195, "step": 1203 }, { "epoch": 0.16394335511982572, "grad_norm": 47.33226516951672, "learning_rate": 7.901007019217519e-07, "logits/chosen": 8.295066833496094, "logits/rejected": 9.47509765625, "logps/chosen": -2.981811046600342, "logps/rejected": -2.824815273284912, "loss": 3.9087, "rewards/accuracies": 0.25, "rewards/chosen": -29.81810760498047, "rewards/margins": -1.569955825805664, "rewards/rejected": -28.248153686523438, "step": 1204 }, { "epoch": 0.16407952069716775, "grad_norm": 52.92424498331072, "learning_rate": 7.900586183497748e-07, "logits/chosen": 8.436609268188477, "logits/rejected": 8.116808891296387, "logps/chosen": -2.9958548545837402, "logps/rejected": -2.9573655128479004, "loss": 4.7634, "rewards/accuracies": 0.75, "rewards/chosen": -29.95854949951172, "rewards/margins": -0.38489484786987305, "rewards/rejected": -29.573654174804688, "step": 1205 }, { "epoch": 0.1642156862745098, "grad_norm": 61.55709051937049, "learning_rate": 7.900164466408288e-07, "logits/chosen": 8.14411449432373, "logits/rejected": 9.618223190307617, "logps/chosen": -2.695075035095215, "logps/rejected": -2.802316188812256, "loss": 4.395, "rewards/accuracies": 0.5, "rewards/chosen": -26.95075225830078, "rewards/margins": 1.072411060333252, "rewards/rejected": -28.023162841796875, "step": 1206 }, { "epoch": 0.16435185185185186, "grad_norm": 48.05144863779974, "learning_rate": 7.899741868044426e-07, "logits/chosen": 8.616004943847656, "logits/rejected": 10.044116973876953, "logps/chosen": -2.947784900665283, "logps/rejected": -3.081364154815674, "loss": 3.9779, "rewards/accuracies": 0.5, "rewards/chosen": -29.477846145629883, "rewards/margins": 1.3357958793640137, "rewards/rejected": -30.813644409179688, "step": 1207 }, { "epoch": 0.1644880174291939, "grad_norm": 47.044884141089355, "learning_rate": 7.899318388501653e-07, "logits/chosen": 9.107380867004395, "logits/rejected": 8.282937049865723, "logps/chosen": -2.4553885459899902, "logps/rejected": -2.8024797439575195, "loss": 4.2407, "rewards/accuracies": 0.75, "rewards/chosen": -24.55388641357422, "rewards/margins": 3.4709134101867676, "rewards/rejected": -28.024799346923828, "step": 1208 }, { "epoch": 0.16462418300653595, "grad_norm": 46.072102754132544, "learning_rate": 7.898894027875659e-07, "logits/chosen": 7.576505661010742, "logits/rejected": 7.568525314331055, "logps/chosen": -2.523094654083252, "logps/rejected": -2.4398725032806396, "loss": 4.4223, "rewards/accuracies": 0.5, "rewards/chosen": -25.230945587158203, "rewards/margins": -0.832221508026123, "rewards/rejected": -24.398725509643555, "step": 1209 }, { "epoch": 0.164760348583878, "grad_norm": 50.71347971480153, "learning_rate": 7.89846878626233e-07, "logits/chosen": 8.672134399414062, "logits/rejected": 9.576578140258789, "logps/chosen": -2.7513251304626465, "logps/rejected": -2.894209623336792, "loss": 4.5829, "rewards/accuracies": 0.75, "rewards/chosen": -27.51325035095215, "rewards/margins": 1.4288444519042969, "rewards/rejected": -28.942094802856445, "step": 1210 }, { "epoch": 0.16489651416122003, "grad_norm": 52.96235375282198, "learning_rate": 7.898042663757754e-07, "logits/chosen": 8.32214641571045, "logits/rejected": 8.665407180786133, "logps/chosen": -2.607490062713623, "logps/rejected": -2.923043727874756, "loss": 3.9174, "rewards/accuracies": 0.75, "rewards/chosen": -26.074901580810547, "rewards/margins": 3.155536651611328, "rewards/rejected": -29.230438232421875, "step": 1211 }, { "epoch": 0.1650326797385621, "grad_norm": 44.99857002827223, "learning_rate": 7.897615660458216e-07, "logits/chosen": 8.730501174926758, "logits/rejected": 9.176461219787598, "logps/chosen": -2.7381978034973145, "logps/rejected": -2.904910087585449, "loss": 4.2222, "rewards/accuracies": 0.75, "rewards/chosen": -27.381977081298828, "rewards/margins": 1.6671218872070312, "rewards/rejected": -29.04909896850586, "step": 1212 }, { "epoch": 0.16516884531590414, "grad_norm": 50.16882024389279, "learning_rate": 7.897187776460202e-07, "logits/chosen": 7.100518226623535, "logits/rejected": 8.368074417114258, "logps/chosen": -2.750340461730957, "logps/rejected": -2.8926339149475098, "loss": 4.5297, "rewards/accuracies": 0.75, "rewards/chosen": -27.50340461730957, "rewards/margins": 1.4229331016540527, "rewards/rejected": -28.92633819580078, "step": 1213 }, { "epoch": 0.16530501089324617, "grad_norm": 47.42311834921969, "learning_rate": 7.896759011860396e-07, "logits/chosen": 7.015141487121582, "logits/rejected": 7.024620056152344, "logps/chosen": -2.491641044616699, "logps/rejected": -2.5284621715545654, "loss": 4.2987, "rewards/accuracies": 0.5, "rewards/chosen": -24.916412353515625, "rewards/margins": 0.3682103157043457, "rewards/rejected": -25.284622192382812, "step": 1214 }, { "epoch": 0.16544117647058823, "grad_norm": 55.120186255542336, "learning_rate": 7.896329366755679e-07, "logits/chosen": 10.364157676696777, "logits/rejected": 8.95483684539795, "logps/chosen": -3.044438600540161, "logps/rejected": -2.719780206680298, "loss": 4.473, "rewards/accuracies": 0.25, "rewards/chosen": -30.444387435913086, "rewards/margins": -3.2465853691101074, "rewards/rejected": -27.197803497314453, "step": 1215 }, { "epoch": 0.1655773420479303, "grad_norm": 47.132460743597434, "learning_rate": 7.895898841243136e-07, "logits/chosen": 9.69680404663086, "logits/rejected": 8.920520782470703, "logps/chosen": -3.064289093017578, "logps/rejected": -2.6646595001220703, "loss": 4.8142, "rewards/accuracies": 0.25, "rewards/chosen": -30.64289093017578, "rewards/margins": -3.996295928955078, "rewards/rejected": -26.646595001220703, "step": 1216 }, { "epoch": 0.16571350762527234, "grad_norm": 56.44700196573162, "learning_rate": 7.895467435420045e-07, "logits/chosen": 8.222667694091797, "logits/rejected": 9.021637916564941, "logps/chosen": -2.917194128036499, "logps/rejected": -2.7419066429138184, "loss": 5.2029, "rewards/accuracies": 0.5, "rewards/chosen": -29.17194175720215, "rewards/margins": -1.752873420715332, "rewards/rejected": -27.4190673828125, "step": 1217 }, { "epoch": 0.16584967320261437, "grad_norm": 57.51867784268077, "learning_rate": 7.895035149383886e-07, "logits/chosen": 7.98297119140625, "logits/rejected": 8.765294075012207, "logps/chosen": -2.5603785514831543, "logps/rejected": -3.2133378982543945, "loss": 3.6688, "rewards/accuracies": 0.75, "rewards/chosen": -25.603788375854492, "rewards/margins": 6.529592037200928, "rewards/rejected": -32.13338088989258, "step": 1218 }, { "epoch": 0.16598583877995643, "grad_norm": 49.05907742457067, "learning_rate": 7.89460198323234e-07, "logits/chosen": 9.321573257446289, "logits/rejected": 9.106345176696777, "logps/chosen": -2.577190637588501, "logps/rejected": -2.641932964324951, "loss": 3.7618, "rewards/accuracies": 0.5, "rewards/chosen": -25.77190589904785, "rewards/margins": 0.647423267364502, "rewards/rejected": -26.419328689575195, "step": 1219 }, { "epoch": 0.1661220043572985, "grad_norm": 43.763326246667155, "learning_rate": 7.894167937063281e-07, "logits/chosen": 7.264993667602539, "logits/rejected": 9.574773788452148, "logps/chosen": -2.423917055130005, "logps/rejected": -2.950286865234375, "loss": 4.5209, "rewards/accuracies": 0.75, "rewards/chosen": -24.23917007446289, "rewards/margins": 5.263698101043701, "rewards/rejected": -29.50286865234375, "step": 1220 }, { "epoch": 0.16625816993464052, "grad_norm": 45.22058303784412, "learning_rate": 7.893733010974788e-07, "logits/chosen": 8.33610725402832, "logits/rejected": 7.9797163009643555, "logps/chosen": -2.4954819679260254, "logps/rejected": -2.383420705795288, "loss": 3.9642, "rewards/accuracies": 0.5, "rewards/chosen": -24.954818725585938, "rewards/margins": -1.1206116676330566, "rewards/rejected": -23.83420753479004, "step": 1221 }, { "epoch": 0.16639433551198257, "grad_norm": 46.6995782798333, "learning_rate": 7.893297205065135e-07, "logits/chosen": 8.298736572265625, "logits/rejected": 8.365031242370605, "logps/chosen": -2.5961623191833496, "logps/rejected": -2.7154641151428223, "loss": 4.7386, "rewards/accuracies": 0.75, "rewards/chosen": -25.961624145507812, "rewards/margins": 1.1930160522460938, "rewards/rejected": -27.154640197753906, "step": 1222 }, { "epoch": 0.16653050108932463, "grad_norm": 44.50793193270584, "learning_rate": 7.892860519432796e-07, "logits/chosen": 8.012735366821289, "logits/rejected": 6.551321983337402, "logps/chosen": -2.6915931701660156, "logps/rejected": -2.3424034118652344, "loss": 4.726, "rewards/accuracies": 0.5, "rewards/chosen": -26.915931701660156, "rewards/margins": -3.4918971061706543, "rewards/rejected": -23.424034118652344, "step": 1223 }, { "epoch": 0.16666666666666666, "grad_norm": 48.09313686521182, "learning_rate": 7.892422954176444e-07, "logits/chosen": 9.082839965820312, "logits/rejected": 7.845698833465576, "logps/chosen": -2.926738739013672, "logps/rejected": -2.6439051628112793, "loss": 4.5075, "rewards/accuracies": 0.25, "rewards/chosen": -29.26738739013672, "rewards/margins": -2.8283376693725586, "rewards/rejected": -26.439048767089844, "step": 1224 }, { "epoch": 0.16680283224400871, "grad_norm": 45.556887605757595, "learning_rate": 7.891984509394952e-07, "logits/chosen": 8.652215003967285, "logits/rejected": 10.328939437866211, "logps/chosen": -2.7461318969726562, "logps/rejected": -2.987497329711914, "loss": 4.3301, "rewards/accuracies": 0.75, "rewards/chosen": -27.461318969726562, "rewards/margins": 2.4136533737182617, "rewards/rejected": -29.87497329711914, "step": 1225 }, { "epoch": 0.16693899782135077, "grad_norm": 51.25384591649125, "learning_rate": 7.891545185187386e-07, "logits/chosen": 7.945009231567383, "logits/rejected": 7.760476112365723, "logps/chosen": -2.40531849861145, "logps/rejected": -2.394731044769287, "loss": 4.569, "rewards/accuracies": 0.5, "rewards/chosen": -24.053186416625977, "rewards/margins": -0.10587787628173828, "rewards/rejected": -23.947307586669922, "step": 1226 }, { "epoch": 0.1670751633986928, "grad_norm": 45.73622704607392, "learning_rate": 7.891104981653019e-07, "logits/chosen": 7.288423538208008, "logits/rejected": 8.800653457641602, "logps/chosen": -2.2231361865997314, "logps/rejected": -2.451376438140869, "loss": 4.3361, "rewards/accuracies": 0.75, "rewards/chosen": -22.231361389160156, "rewards/margins": 2.2824010848999023, "rewards/rejected": -24.513763427734375, "step": 1227 }, { "epoch": 0.16721132897603486, "grad_norm": 69.82181610986417, "learning_rate": 7.890663898891318e-07, "logits/chosen": 7.302967071533203, "logits/rejected": 8.469223022460938, "logps/chosen": -2.1146082878112793, "logps/rejected": -2.359224796295166, "loss": 4.0131, "rewards/accuracies": 1.0, "rewards/chosen": -21.14608383178711, "rewards/margins": 2.4461631774902344, "rewards/rejected": -23.592247009277344, "step": 1228 }, { "epoch": 0.1673474945533769, "grad_norm": 42.86676770209027, "learning_rate": 7.890221937001946e-07, "logits/chosen": 6.053194046020508, "logits/rejected": 8.245264053344727, "logps/chosen": -2.3127784729003906, "logps/rejected": -2.7532026767730713, "loss": 4.5709, "rewards/accuracies": 1.0, "rewards/chosen": -23.12778663635254, "rewards/margins": 4.40424108505249, "rewards/rejected": -27.532028198242188, "step": 1229 }, { "epoch": 0.16748366013071894, "grad_norm": 48.73221868459403, "learning_rate": 7.889779096084772e-07, "logits/chosen": 7.545917510986328, "logits/rejected": 8.799881935119629, "logps/chosen": -2.462153911590576, "logps/rejected": -2.8918118476867676, "loss": 4.0242, "rewards/accuracies": 0.75, "rewards/chosen": -24.621540069580078, "rewards/margins": 4.296579360961914, "rewards/rejected": -28.918119430541992, "step": 1230 }, { "epoch": 0.167619825708061, "grad_norm": 50.79113118955993, "learning_rate": 7.88933537623986e-07, "logits/chosen": 9.224340438842773, "logits/rejected": 8.548887252807617, "logps/chosen": -2.8033041954040527, "logps/rejected": -3.0774412155151367, "loss": 3.9439, "rewards/accuracies": 0.75, "rewards/chosen": -28.03304100036621, "rewards/margins": 2.7413711547851562, "rewards/rejected": -30.7744140625, "step": 1231 }, { "epoch": 0.16775599128540306, "grad_norm": 46.87434244744879, "learning_rate": 7.888890777567467e-07, "logits/chosen": 6.528298377990723, "logits/rejected": 6.856994152069092, "logps/chosen": -2.310452938079834, "logps/rejected": -2.3437576293945312, "loss": 4.4114, "rewards/accuracies": 0.5, "rewards/chosen": -23.104528427124023, "rewards/margins": 0.33304691314697266, "rewards/rejected": -23.437576293945312, "step": 1232 }, { "epoch": 0.16789215686274508, "grad_norm": 41.995188541266955, "learning_rate": 7.888445300168058e-07, "logits/chosen": 7.494782447814941, "logits/rejected": 6.441041946411133, "logps/chosen": -2.140611171722412, "logps/rejected": -2.2692694664001465, "loss": 4.045, "rewards/accuracies": 1.0, "rewards/chosen": -21.406112670898438, "rewards/margins": 1.2865827083587646, "rewards/rejected": -22.69269371032715, "step": 1233 }, { "epoch": 0.16802832244008714, "grad_norm": 47.55706244672616, "learning_rate": 7.887998944142291e-07, "logits/chosen": 8.521041870117188, "logits/rejected": 9.44980239868164, "logps/chosen": -2.4672560691833496, "logps/rejected": -2.8474068641662598, "loss": 4.5957, "rewards/accuracies": 1.0, "rewards/chosen": -24.67255973815918, "rewards/margins": 3.801506996154785, "rewards/rejected": -28.47406768798828, "step": 1234 }, { "epoch": 0.1681644880174292, "grad_norm": 52.2407889811253, "learning_rate": 7.887551709591024e-07, "logits/chosen": 8.950969696044922, "logits/rejected": 7.9474945068359375, "logps/chosen": -3.0538573265075684, "logps/rejected": -2.536398410797119, "loss": 4.3691, "rewards/accuracies": 0.25, "rewards/chosen": -30.53857421875, "rewards/margins": -5.174587726593018, "rewards/rejected": -25.36398696899414, "step": 1235 }, { "epoch": 0.16830065359477125, "grad_norm": 61.68148481476473, "learning_rate": 7.887103596615315e-07, "logits/chosen": 8.13881778717041, "logits/rejected": 9.216716766357422, "logps/chosen": -2.180954933166504, "logps/rejected": -2.455530881881714, "loss": 4.5458, "rewards/accuracies": 1.0, "rewards/chosen": -21.809551239013672, "rewards/margins": 2.7457590103149414, "rewards/rejected": -24.555309295654297, "step": 1236 }, { "epoch": 0.16843681917211328, "grad_norm": 49.37197980339942, "learning_rate": 7.886654605316415e-07, "logits/chosen": 8.03354263305664, "logits/rejected": 8.389589309692383, "logps/chosen": -2.8975677490234375, "logps/rejected": -2.63590407371521, "loss": 4.3708, "rewards/accuracies": 0.25, "rewards/chosen": -28.975677490234375, "rewards/margins": -2.6166367530822754, "rewards/rejected": -26.359041213989258, "step": 1237 }, { "epoch": 0.16857298474945534, "grad_norm": 70.44231648528854, "learning_rate": 7.886204735795781e-07, "logits/chosen": 7.631965160369873, "logits/rejected": 8.713203430175781, "logps/chosen": -2.1612417697906494, "logps/rejected": -2.416855812072754, "loss": 4.3186, "rewards/accuracies": 0.75, "rewards/chosen": -21.612415313720703, "rewards/margins": 2.556142568588257, "rewards/rejected": -24.168560028076172, "step": 1238 }, { "epoch": 0.1687091503267974, "grad_norm": 46.63972192962632, "learning_rate": 7.885753988155062e-07, "logits/chosen": 8.092073440551758, "logits/rejected": 8.21078109741211, "logps/chosen": -2.4853577613830566, "logps/rejected": -2.455524444580078, "loss": 4.083, "rewards/accuracies": 0.5, "rewards/chosen": -24.85357666015625, "rewards/margins": -0.29833173751831055, "rewards/rejected": -24.55524444580078, "step": 1239 }, { "epoch": 0.16884531590413943, "grad_norm": 42.1275395224524, "learning_rate": 7.88530236249611e-07, "logits/chosen": 7.924899101257324, "logits/rejected": 8.476277351379395, "logps/chosen": -2.144481658935547, "logps/rejected": -2.4092562198638916, "loss": 4.0293, "rewards/accuracies": 0.75, "rewards/chosen": -21.44481658935547, "rewards/margins": 2.6477458477020264, "rewards/rejected": -24.09256362915039, "step": 1240 }, { "epoch": 0.16898148148148148, "grad_norm": 46.964893916759266, "learning_rate": 7.884849858920973e-07, "logits/chosen": 8.710371017456055, "logits/rejected": 9.474968910217285, "logps/chosen": -2.742866277694702, "logps/rejected": -2.9396262168884277, "loss": 4.4045, "rewards/accuracies": 0.5, "rewards/chosen": -27.428661346435547, "rewards/margins": 1.9676012992858887, "rewards/rejected": -29.396263122558594, "step": 1241 }, { "epoch": 0.16911764705882354, "grad_norm": 45.427319306399134, "learning_rate": 7.884396477531898e-07, "logits/chosen": 8.899820327758789, "logits/rejected": 9.14918327331543, "logps/chosen": -2.424838066101074, "logps/rejected": -2.4364776611328125, "loss": 4.4622, "rewards/accuracies": 0.5, "rewards/chosen": -24.24837875366211, "rewards/margins": 0.11639881134033203, "rewards/rejected": -24.364778518676758, "step": 1242 }, { "epoch": 0.16925381263616557, "grad_norm": 55.61025159237514, "learning_rate": 7.88394221843133e-07, "logits/chosen": 6.721340179443359, "logits/rejected": 8.470243453979492, "logps/chosen": -2.8315892219543457, "logps/rejected": -2.84152889251709, "loss": 4.658, "rewards/accuracies": 0.5, "rewards/chosen": -28.315895080566406, "rewards/margins": 0.099395751953125, "rewards/rejected": -28.4152889251709, "step": 1243 }, { "epoch": 0.16938997821350762, "grad_norm": 45.12001986029032, "learning_rate": 7.883487081721913e-07, "logits/chosen": 9.118587493896484, "logits/rejected": 10.006275177001953, "logps/chosen": -2.481294870376587, "logps/rejected": -2.5993106365203857, "loss": 3.8093, "rewards/accuracies": 0.5, "rewards/chosen": -24.812946319580078, "rewards/margins": 1.1801581382751465, "rewards/rejected": -25.993106842041016, "step": 1244 }, { "epoch": 0.16952614379084968, "grad_norm": 46.48320969549027, "learning_rate": 7.883031067506488e-07, "logits/chosen": 9.013968467712402, "logits/rejected": 9.060892105102539, "logps/chosen": -3.0890309810638428, "logps/rejected": -2.8482346534729004, "loss": 4.3217, "rewards/accuracies": 0.5, "rewards/chosen": -30.890308380126953, "rewards/margins": -2.407961845397949, "rewards/rejected": -28.48234748840332, "step": 1245 }, { "epoch": 0.1696623093681917, "grad_norm": 134.3981973896688, "learning_rate": 7.882574175888097e-07, "logits/chosen": 9.512238502502441, "logits/rejected": 8.554282188415527, "logps/chosen": -2.7179436683654785, "logps/rejected": -2.6600894927978516, "loss": 4.588, "rewards/accuracies": 0.5, "rewards/chosen": -27.17943572998047, "rewards/margins": -0.5785412788391113, "rewards/rejected": -26.600894927978516, "step": 1246 }, { "epoch": 0.16979847494553377, "grad_norm": 46.354416426141185, "learning_rate": 7.882116406969976e-07, "logits/chosen": 9.060430526733398, "logits/rejected": 8.934269905090332, "logps/chosen": -2.6407439708709717, "logps/rejected": -2.7294085025787354, "loss": 4.3825, "rewards/accuracies": 0.5, "rewards/chosen": -26.407440185546875, "rewards/margins": 0.8866443634033203, "rewards/rejected": -27.294086456298828, "step": 1247 }, { "epoch": 0.16993464052287582, "grad_norm": 52.31009787131201, "learning_rate": 7.881657760855563e-07, "logits/chosen": 7.623306751251221, "logits/rejected": 7.9084062576293945, "logps/chosen": -2.689683437347412, "logps/rejected": -2.863309860229492, "loss": 3.9816, "rewards/accuracies": 0.75, "rewards/chosen": -26.896833419799805, "rewards/margins": 1.736264705657959, "rewards/rejected": -28.633098602294922, "step": 1248 }, { "epoch": 0.17007080610021785, "grad_norm": 45.30528352155746, "learning_rate": 7.881198237648494e-07, "logits/chosen": 9.906147956848145, "logits/rejected": 10.089578628540039, "logps/chosen": -2.9889984130859375, "logps/rejected": -2.8925135135650635, "loss": 4.4491, "rewards/accuracies": 0.25, "rewards/chosen": -29.889984130859375, "rewards/margins": -0.964848518371582, "rewards/rejected": -28.92513656616211, "step": 1249 }, { "epoch": 0.1702069716775599, "grad_norm": 45.84302960816579, "learning_rate": 7.880737837452601e-07, "logits/chosen": 7.127021789550781, "logits/rejected": 7.969600200653076, "logps/chosen": -2.161421060562134, "logps/rejected": -2.4203953742980957, "loss": 4.4281, "rewards/accuracies": 0.75, "rewards/chosen": -21.61421012878418, "rewards/margins": 2.589743137359619, "rewards/rejected": -24.20395278930664, "step": 1250 }, { "epoch": 0.17034313725490197, "grad_norm": 53.76412999497656, "learning_rate": 7.880276560371914e-07, "logits/chosen": 8.307655334472656, "logits/rejected": 8.853048324584961, "logps/chosen": -2.461514949798584, "logps/rejected": -2.601398468017578, "loss": 4.1832, "rewards/accuracies": 0.75, "rewards/chosen": -24.615150451660156, "rewards/margins": 1.398834228515625, "rewards/rejected": -26.01398468017578, "step": 1251 }, { "epoch": 0.170479302832244, "grad_norm": 42.238938243934044, "learning_rate": 7.879814406510664e-07, "logits/chosen": 9.921258926391602, "logits/rejected": 10.43041706085205, "logps/chosen": -2.8704047203063965, "logps/rejected": -3.0409862995147705, "loss": 4.3194, "rewards/accuracies": 0.75, "rewards/chosen": -28.70404815673828, "rewards/margins": 1.7058143615722656, "rewards/rejected": -30.409862518310547, "step": 1252 }, { "epoch": 0.17061546840958605, "grad_norm": 47.22327454187259, "learning_rate": 7.879351375973277e-07, "logits/chosen": 10.19507122039795, "logits/rejected": 10.54759407043457, "logps/chosen": -3.0958380699157715, "logps/rejected": -3.0329105854034424, "loss": 4.2287, "rewards/accuracies": 0.25, "rewards/chosen": -30.9583797454834, "rewards/margins": -0.6292743682861328, "rewards/rejected": -30.329105377197266, "step": 1253 }, { "epoch": 0.1707516339869281, "grad_norm": 43.49525918574068, "learning_rate": 7.87888746886438e-07, "logits/chosen": 9.587306022644043, "logits/rejected": 9.930798530578613, "logps/chosen": -2.81183123588562, "logps/rejected": -3.228473663330078, "loss": 3.9908, "rewards/accuracies": 0.75, "rewards/chosen": -28.11831283569336, "rewards/margins": 4.16642427444458, "rewards/rejected": -32.28473663330078, "step": 1254 }, { "epoch": 0.17088779956427017, "grad_norm": 44.7724523136541, "learning_rate": 7.878422685288799e-07, "logits/chosen": 7.8131022453308105, "logits/rejected": 8.611413955688477, "logps/chosen": -2.6941399574279785, "logps/rejected": -2.583127498626709, "loss": 4.6223, "rewards/accuracies": 0.5, "rewards/chosen": -26.9414005279541, "rewards/margins": -1.1101269721984863, "rewards/rejected": -25.831274032592773, "step": 1255 }, { "epoch": 0.1710239651416122, "grad_norm": 55.0524474965729, "learning_rate": 7.87795702535155e-07, "logits/chosen": 9.436850547790527, "logits/rejected": 9.902798652648926, "logps/chosen": -2.69787859916687, "logps/rejected": -3.027595043182373, "loss": 4.1131, "rewards/accuracies": 1.0, "rewards/chosen": -26.97878646850586, "rewards/margins": 3.297163963317871, "rewards/rejected": -30.275951385498047, "step": 1256 }, { "epoch": 0.17116013071895425, "grad_norm": 61.76445640217308, "learning_rate": 7.877490489157855e-07, "logits/chosen": 9.49083137512207, "logits/rejected": 9.599079132080078, "logps/chosen": -2.8950769901275635, "logps/rejected": -2.986938714981079, "loss": 4.1863, "rewards/accuracies": 0.5, "rewards/chosen": -28.950769424438477, "rewards/margins": 0.9186177253723145, "rewards/rejected": -29.869386672973633, "step": 1257 }, { "epoch": 0.1712962962962963, "grad_norm": 61.70795767729648, "learning_rate": 7.877023076813134e-07, "logits/chosen": 8.604788780212402, "logits/rejected": 9.297220230102539, "logps/chosen": -2.844646453857422, "logps/rejected": -2.916949987411499, "loss": 4.3025, "rewards/accuracies": 0.75, "rewards/chosen": -28.44646453857422, "rewards/margins": 0.7230362892150879, "rewards/rejected": -29.169498443603516, "step": 1258 }, { "epoch": 0.17143246187363834, "grad_norm": 43.96064233552516, "learning_rate": 7.876554788423e-07, "logits/chosen": 8.458938598632812, "logits/rejected": 8.176738739013672, "logps/chosen": -2.644636392593384, "logps/rejected": -2.5465476512908936, "loss": 3.9177, "rewards/accuracies": 0.25, "rewards/chosen": -26.44636344909668, "rewards/margins": -0.9808859825134277, "rewards/rejected": -25.465476989746094, "step": 1259 }, { "epoch": 0.1715686274509804, "grad_norm": 46.397223242405715, "learning_rate": 7.876085624093268e-07, "logits/chosen": 8.286495208740234, "logits/rejected": 8.556797981262207, "logps/chosen": -2.6331663131713867, "logps/rejected": -2.7809994220733643, "loss": 4.0685, "rewards/accuracies": 0.75, "rewards/chosen": -26.331661224365234, "rewards/margins": 1.4783329963684082, "rewards/rejected": -27.809995651245117, "step": 1260 }, { "epoch": 0.17170479302832245, "grad_norm": 44.854277948663444, "learning_rate": 7.875615583929949e-07, "logits/chosen": 9.116693496704102, "logits/rejected": 9.715124130249023, "logps/chosen": -2.7132060527801514, "logps/rejected": -2.520772933959961, "loss": 3.9528, "rewards/accuracies": 0.25, "rewards/chosen": -27.132061004638672, "rewards/margins": -1.9243321418762207, "rewards/rejected": -25.207727432250977, "step": 1261 }, { "epoch": 0.17184095860566448, "grad_norm": 72.8246474578965, "learning_rate": 7.875144668039254e-07, "logits/chosen": 8.046377182006836, "logits/rejected": 8.052467346191406, "logps/chosen": -2.508469581604004, "logps/rejected": -2.5930192470550537, "loss": 4.2105, "rewards/accuracies": 0.5, "rewards/chosen": -25.084693908691406, "rewards/margins": 0.8454995155334473, "rewards/rejected": -25.930192947387695, "step": 1262 }, { "epoch": 0.17197712418300654, "grad_norm": 45.75122928636438, "learning_rate": 7.874672876527586e-07, "logits/chosen": 7.4538254737854, "logits/rejected": 10.720962524414062, "logps/chosen": -2.7599449157714844, "logps/rejected": -3.161634683609009, "loss": 4.2915, "rewards/accuracies": 0.75, "rewards/chosen": -27.599449157714844, "rewards/margins": 4.016897201538086, "rewards/rejected": -31.61634635925293, "step": 1263 }, { "epoch": 0.1721132897603486, "grad_norm": 42.39333729828198, "learning_rate": 7.874200209501557e-07, "logits/chosen": 8.286649703979492, "logits/rejected": 9.470621109008789, "logps/chosen": -2.4471821784973145, "logps/rejected": -2.6487855911254883, "loss": 3.9063, "rewards/accuracies": 0.5, "rewards/chosen": -24.471820831298828, "rewards/margins": 2.0160365104675293, "rewards/rejected": -26.487857818603516, "step": 1264 }, { "epoch": 0.17224945533769062, "grad_norm": 48.322107540785865, "learning_rate": 7.873726667067964e-07, "logits/chosen": 9.15971565246582, "logits/rejected": 10.044114112854004, "logps/chosen": -2.654775619506836, "logps/rejected": -2.8483057022094727, "loss": 4.5763, "rewards/accuracies": 0.75, "rewards/chosen": -26.547754287719727, "rewards/margins": 1.9353013038635254, "rewards/rejected": -28.483055114746094, "step": 1265 }, { "epoch": 0.17238562091503268, "grad_norm": 49.37851432261237, "learning_rate": 7.87325224933381e-07, "logits/chosen": 7.472884654998779, "logits/rejected": 9.335637092590332, "logps/chosen": -2.313297986984253, "logps/rejected": -2.441592216491699, "loss": 4.0192, "rewards/accuracies": 0.75, "rewards/chosen": -23.132980346679688, "rewards/margins": 1.282942771911621, "rewards/rejected": -24.415924072265625, "step": 1266 }, { "epoch": 0.17252178649237473, "grad_norm": 44.62895815293876, "learning_rate": 7.872776956406294e-07, "logits/chosen": 7.114078044891357, "logits/rejected": 8.629109382629395, "logps/chosen": -2.5525102615356445, "logps/rejected": -2.910330057144165, "loss": 4.3955, "rewards/accuracies": 1.0, "rewards/chosen": -25.525100708007812, "rewards/margins": 3.5781989097595215, "rewards/rejected": -29.103300094604492, "step": 1267 }, { "epoch": 0.17265795206971676, "grad_norm": 44.6484746656619, "learning_rate": 7.872300788392811e-07, "logits/chosen": 9.02127742767334, "logits/rejected": 6.843494415283203, "logps/chosen": -2.876861572265625, "logps/rejected": -2.5896799564361572, "loss": 4.3073, "rewards/accuracies": 0.0, "rewards/chosen": -28.76861572265625, "rewards/margins": -2.8718161582946777, "rewards/rejected": -25.896800994873047, "step": 1268 }, { "epoch": 0.17279411764705882, "grad_norm": 42.35457513568847, "learning_rate": 7.871823745400957e-07, "logits/chosen": 7.287012100219727, "logits/rejected": 8.370485305786133, "logps/chosen": -2.462876796722412, "logps/rejected": -2.719925880432129, "loss": 4.1682, "rewards/accuracies": 0.75, "rewards/chosen": -24.628767013549805, "rewards/margins": 2.570492744445801, "rewards/rejected": -27.199260711669922, "step": 1269 }, { "epoch": 0.17293028322440088, "grad_norm": 50.67678780873444, "learning_rate": 7.871345827538524e-07, "logits/chosen": 10.217430114746094, "logits/rejected": 9.95366096496582, "logps/chosen": -2.913581609725952, "logps/rejected": -2.811647653579712, "loss": 4.2577, "rewards/accuracies": 0.25, "rewards/chosen": -29.135814666748047, "rewards/margins": -1.0193381309509277, "rewards/rejected": -28.116477966308594, "step": 1270 }, { "epoch": 0.1730664488017429, "grad_norm": 46.874028425450035, "learning_rate": 7.870867034913498e-07, "logits/chosen": 9.678644180297852, "logits/rejected": 9.645111083984375, "logps/chosen": -2.814304828643799, "logps/rejected": -3.1124703884124756, "loss": 4.1933, "rewards/accuracies": 0.75, "rewards/chosen": -28.143047332763672, "rewards/margins": 2.981654644012451, "rewards/rejected": -31.12470245361328, "step": 1271 }, { "epoch": 0.17320261437908496, "grad_norm": 44.64437963451122, "learning_rate": 7.87038736763407e-07, "logits/chosen": 7.631937503814697, "logits/rejected": 9.491973876953125, "logps/chosen": -2.3721938133239746, "logps/rejected": -2.7566583156585693, "loss": 4.2707, "rewards/accuracies": 0.75, "rewards/chosen": -23.721939086914062, "rewards/margins": 3.8446431159973145, "rewards/rejected": -27.56658172607422, "step": 1272 }, { "epoch": 0.17333877995642702, "grad_norm": 48.434580283060676, "learning_rate": 7.869906825808623e-07, "logits/chosen": 7.7485032081604, "logits/rejected": 8.113645553588867, "logps/chosen": -2.121635913848877, "logps/rejected": -2.272251605987549, "loss": 4.1667, "rewards/accuracies": 0.75, "rewards/chosen": -21.216358184814453, "rewards/margins": 1.5061593055725098, "rewards/rejected": -22.722518920898438, "step": 1273 }, { "epoch": 0.17347494553376908, "grad_norm": 43.755160007467744, "learning_rate": 7.86942540954574e-07, "logits/chosen": 7.300650596618652, "logits/rejected": 7.606579780578613, "logps/chosen": -2.3821849822998047, "logps/rejected": -2.651379108428955, "loss": 4.1213, "rewards/accuracies": 0.5, "rewards/chosen": -23.821849822998047, "rewards/margins": 2.6919403076171875, "rewards/rejected": -26.513790130615234, "step": 1274 }, { "epoch": 0.1736111111111111, "grad_norm": 55.41102254235875, "learning_rate": 7.868943118954202e-07, "logits/chosen": 8.90568733215332, "logits/rejected": 9.100958824157715, "logps/chosen": -3.0260095596313477, "logps/rejected": -2.783247947692871, "loss": 4.4922, "rewards/accuracies": 0.25, "rewards/chosen": -30.260095596313477, "rewards/margins": -2.427617073059082, "rewards/rejected": -27.832477569580078, "step": 1275 }, { "epoch": 0.17374727668845316, "grad_norm": 45.91745077745122, "learning_rate": 7.868459954142982e-07, "logits/chosen": 9.083772659301758, "logits/rejected": 9.378050804138184, "logps/chosen": -2.91959810256958, "logps/rejected": -2.866305112838745, "loss": 4.4824, "rewards/accuracies": 0.25, "rewards/chosen": -29.19598388671875, "rewards/margins": -0.532930850982666, "rewards/rejected": -28.66305160522461, "step": 1276 }, { "epoch": 0.17388344226579522, "grad_norm": 47.61980435499553, "learning_rate": 7.867975915221261e-07, "logits/chosen": 8.350170135498047, "logits/rejected": 8.522912979125977, "logps/chosen": -2.2855613231658936, "logps/rejected": -2.4437379837036133, "loss": 4.0334, "rewards/accuracies": 0.75, "rewards/chosen": -22.855613708496094, "rewards/margins": 1.5817651748657227, "rewards/rejected": -24.4373779296875, "step": 1277 }, { "epoch": 0.17401960784313725, "grad_norm": 40.67002177346131, "learning_rate": 7.867491002298408e-07, "logits/chosen": 9.75747013092041, "logits/rejected": 9.976947784423828, "logps/chosen": -3.1420912742614746, "logps/rejected": -3.036515235900879, "loss": 3.8125, "rewards/accuracies": 0.25, "rewards/chosen": -31.42091178894043, "rewards/margins": -1.0557608604431152, "rewards/rejected": -30.365150451660156, "step": 1278 }, { "epoch": 0.1741557734204793, "grad_norm": 49.77937593046394, "learning_rate": 7.867005215483995e-07, "logits/chosen": 8.422972679138184, "logits/rejected": 6.8179216384887695, "logps/chosen": -2.5796024799346924, "logps/rejected": -2.65240478515625, "loss": 4.4733, "rewards/accuracies": 0.5, "rewards/chosen": -25.796024322509766, "rewards/margins": 0.7280220985412598, "rewards/rejected": -26.5240478515625, "step": 1279 }, { "epoch": 0.17429193899782136, "grad_norm": 60.962048208870314, "learning_rate": 7.866518554887787e-07, "logits/chosen": 9.643272399902344, "logits/rejected": 9.948626518249512, "logps/chosen": -2.9586710929870605, "logps/rejected": -3.1463751792907715, "loss": 3.9109, "rewards/accuracies": 0.75, "rewards/chosen": -29.586711883544922, "rewards/margins": 1.877039909362793, "rewards/rejected": -31.4637508392334, "step": 1280 }, { "epoch": 0.1744281045751634, "grad_norm": 49.48674704285428, "learning_rate": 7.866031020619752e-07, "logits/chosen": 8.084953308105469, "logits/rejected": 9.32613754272461, "logps/chosen": -2.4592480659484863, "logps/rejected": -3.079942464828491, "loss": 4.0971, "rewards/accuracies": 1.0, "rewards/chosen": -24.592479705810547, "rewards/margins": 6.206945896148682, "rewards/rejected": -30.799427032470703, "step": 1281 }, { "epoch": 0.17456427015250545, "grad_norm": 52.371098858031246, "learning_rate": 7.86554261279005e-07, "logits/chosen": 7.8071088790893555, "logits/rejected": 8.861183166503906, "logps/chosen": -2.5917716026306152, "logps/rejected": -2.7848148345947266, "loss": 4.2032, "rewards/accuracies": 0.75, "rewards/chosen": -25.91771697998047, "rewards/margins": 1.9304327964782715, "rewards/rejected": -27.8481502532959, "step": 1282 }, { "epoch": 0.1747004357298475, "grad_norm": 49.21574593572897, "learning_rate": 7.865053331509042e-07, "logits/chosen": 8.048540115356445, "logits/rejected": 9.318962097167969, "logps/chosen": -2.1066060066223145, "logps/rejected": -2.598313808441162, "loss": 4.0517, "rewards/accuracies": 0.75, "rewards/chosen": -21.066059112548828, "rewards/margins": 4.917079448699951, "rewards/rejected": -25.983139038085938, "step": 1283 }, { "epoch": 0.17483660130718953, "grad_norm": 47.390934828439036, "learning_rate": 7.864563176887286e-07, "logits/chosen": 8.761039733886719, "logits/rejected": 10.564815521240234, "logps/chosen": -2.703883171081543, "logps/rejected": -3.025277853012085, "loss": 4.2846, "rewards/accuracies": 0.75, "rewards/chosen": -27.038833618164062, "rewards/margins": 3.2139453887939453, "rewards/rejected": -30.252779006958008, "step": 1284 }, { "epoch": 0.1749727668845316, "grad_norm": 55.989424491928624, "learning_rate": 7.864072149035534e-07, "logits/chosen": 8.71324348449707, "logits/rejected": 9.74647331237793, "logps/chosen": -2.930933952331543, "logps/rejected": -3.1913814544677734, "loss": 3.5017, "rewards/accuracies": 0.75, "rewards/chosen": -29.309341430664062, "rewards/margins": 2.604473114013672, "rewards/rejected": -31.9138126373291, "step": 1285 }, { "epoch": 0.17510893246187365, "grad_norm": 53.556859567738265, "learning_rate": 7.863580248064739e-07, "logits/chosen": 7.669706344604492, "logits/rejected": 8.26579475402832, "logps/chosen": -2.6120524406433105, "logps/rejected": -2.628973960876465, "loss": 3.8805, "rewards/accuracies": 0.75, "rewards/chosen": -26.120525360107422, "rewards/margins": 0.16921377182006836, "rewards/rejected": -26.28973960876465, "step": 1286 }, { "epoch": 0.17524509803921567, "grad_norm": 70.2591364912337, "learning_rate": 7.863087474086051e-07, "logits/chosen": 9.341255187988281, "logits/rejected": 8.56302547454834, "logps/chosen": -2.7550930976867676, "logps/rejected": -2.5680689811706543, "loss": 4.7273, "rewards/accuracies": 0.25, "rewards/chosen": -27.55093002319336, "rewards/margins": -1.8702406883239746, "rewards/rejected": -25.68069076538086, "step": 1287 }, { "epoch": 0.17538126361655773, "grad_norm": 47.86207784147279, "learning_rate": 7.862593827210815e-07, "logits/chosen": 8.549424171447754, "logits/rejected": 8.933584213256836, "logps/chosen": -2.308319091796875, "logps/rejected": -2.3824784755706787, "loss": 4.2653, "rewards/accuracies": 0.5, "rewards/chosen": -23.083192825317383, "rewards/margins": 0.7415924072265625, "rewards/rejected": -23.824783325195312, "step": 1288 }, { "epoch": 0.1755174291938998, "grad_norm": 68.87167140127774, "learning_rate": 7.862099307550576e-07, "logits/chosen": 9.562422752380371, "logits/rejected": 8.621639251708984, "logps/chosen": -2.763106346130371, "logps/rejected": -2.56426739692688, "loss": 4.6481, "rewards/accuracies": 0.0, "rewards/chosen": -27.631065368652344, "rewards/margins": -1.9883911609649658, "rewards/rejected": -25.64267349243164, "step": 1289 }, { "epoch": 0.17565359477124182, "grad_norm": 48.290739479310574, "learning_rate": 7.861603915217074e-07, "logits/chosen": 9.496155738830566, "logits/rejected": 10.342447280883789, "logps/chosen": -2.9406609535217285, "logps/rejected": -2.952253580093384, "loss": 4.445, "rewards/accuracies": 0.5, "rewards/chosen": -29.40660858154297, "rewards/margins": 0.11592769622802734, "rewards/rejected": -29.522537231445312, "step": 1290 }, { "epoch": 0.17578976034858387, "grad_norm": 54.20260171255681, "learning_rate": 7.861107650322246e-07, "logits/chosen": 9.571749687194824, "logits/rejected": 9.3209867477417, "logps/chosen": -2.740058422088623, "logps/rejected": -2.7292604446411133, "loss": 4.0893, "rewards/accuracies": 0.25, "rewards/chosen": -27.400585174560547, "rewards/margins": -0.10797834396362305, "rewards/rejected": -27.292606353759766, "step": 1291 }, { "epoch": 0.17592592592592593, "grad_norm": 47.046131433066044, "learning_rate": 7.860610512978229e-07, "logits/chosen": 8.470409393310547, "logits/rejected": 8.349843978881836, "logps/chosen": -2.4442391395568848, "logps/rejected": -2.6758623123168945, "loss": 3.8899, "rewards/accuracies": 0.75, "rewards/chosen": -24.442392349243164, "rewards/margins": 2.316230297088623, "rewards/rejected": -26.758623123168945, "step": 1292 }, { "epoch": 0.176062091503268, "grad_norm": 51.219099297590425, "learning_rate": 7.860112503297354e-07, "logits/chosen": 9.811771392822266, "logits/rejected": 8.87702751159668, "logps/chosen": -2.7664031982421875, "logps/rejected": -2.502713680267334, "loss": 3.8681, "rewards/accuracies": 0.25, "rewards/chosen": -27.664031982421875, "rewards/margins": -2.6368966102600098, "rewards/rejected": -25.027135848999023, "step": 1293 }, { "epoch": 0.17619825708061002, "grad_norm": 43.193131197470244, "learning_rate": 7.859613621392152e-07, "logits/chosen": 7.284927845001221, "logits/rejected": 9.299821853637695, "logps/chosen": -2.4468889236450195, "logps/rejected": -2.9238696098327637, "loss": 3.4096, "rewards/accuracies": 1.0, "rewards/chosen": -24.468887329101562, "rewards/margins": 4.769806385040283, "rewards/rejected": -29.23869514465332, "step": 1294 }, { "epoch": 0.17633442265795207, "grad_norm": 45.14381773059098, "learning_rate": 7.859113867375347e-07, "logits/chosen": 9.530488967895508, "logits/rejected": 9.607772827148438, "logps/chosen": -2.7126622200012207, "logps/rejected": -2.831169605255127, "loss": 4.4726, "rewards/accuracies": 0.5, "rewards/chosen": -27.12662124633789, "rewards/margins": 1.1850757598876953, "rewards/rejected": -28.31169891357422, "step": 1295 }, { "epoch": 0.17647058823529413, "grad_norm": 44.6807033188283, "learning_rate": 7.858613241359864e-07, "logits/chosen": 9.705795288085938, "logits/rejected": 10.672904968261719, "logps/chosen": -2.676936149597168, "logps/rejected": -2.8448379039764404, "loss": 3.6842, "rewards/accuracies": 0.75, "rewards/chosen": -26.769359588623047, "rewards/margins": 1.6790175437927246, "rewards/rejected": -28.448379516601562, "step": 1296 }, { "epoch": 0.17660675381263616, "grad_norm": 58.63967887889971, "learning_rate": 7.858111743458823e-07, "logits/chosen": 8.762245178222656, "logits/rejected": 8.276427268981934, "logps/chosen": -2.5112218856811523, "logps/rejected": -2.6382083892822266, "loss": 4.6367, "rewards/accuracies": 0.5, "rewards/chosen": -25.11221694946289, "rewards/margins": 1.2698640823364258, "rewards/rejected": -26.382083892822266, "step": 1297 }, { "epoch": 0.17674291938997821, "grad_norm": 42.55933727613376, "learning_rate": 7.857609373785544e-07, "logits/chosen": 8.346094131469727, "logits/rejected": 8.943838119506836, "logps/chosen": -2.138026475906372, "logps/rejected": -2.4908461570739746, "loss": 3.859, "rewards/accuracies": 0.75, "rewards/chosen": -21.380264282226562, "rewards/margins": 3.5281975269317627, "rewards/rejected": -24.908462524414062, "step": 1298 }, { "epoch": 0.17687908496732027, "grad_norm": 45.88410795292552, "learning_rate": 7.857106132453539e-07, "logits/chosen": 8.931011199951172, "logits/rejected": 9.686620712280273, "logps/chosen": -2.6054534912109375, "logps/rejected": -2.844187021255493, "loss": 3.9991, "rewards/accuracies": 1.0, "rewards/chosen": -26.054534912109375, "rewards/margins": 2.3873367309570312, "rewards/rejected": -28.441871643066406, "step": 1299 }, { "epoch": 0.1770152505446623, "grad_norm": 94.27806636236996, "learning_rate": 7.856602019576521e-07, "logits/chosen": 7.9843902587890625, "logits/rejected": 10.108194351196289, "logps/chosen": -2.5476183891296387, "logps/rejected": -2.734616279602051, "loss": 4.0725, "rewards/accuracies": 1.0, "rewards/chosen": -25.476184844970703, "rewards/margins": 1.8699774742126465, "rewards/rejected": -27.346160888671875, "step": 1300 }, { "epoch": 0.17715141612200436, "grad_norm": 61.898413872472446, "learning_rate": 7.856097035268396e-07, "logits/chosen": 10.213859558105469, "logits/rejected": 8.85774040222168, "logps/chosen": -3.0398077964782715, "logps/rejected": -2.9256277084350586, "loss": 4.5285, "rewards/accuracies": 0.25, "rewards/chosen": -30.39807891845703, "rewards/margins": -1.1418004035949707, "rewards/rejected": -29.25627899169922, "step": 1301 }, { "epoch": 0.1772875816993464, "grad_norm": 55.09018821019504, "learning_rate": 7.855591179643271e-07, "logits/chosen": 9.47698974609375, "logits/rejected": 10.019433975219727, "logps/chosen": -2.0071706771850586, "logps/rejected": -2.6865386962890625, "loss": 4.5551, "rewards/accuracies": 1.0, "rewards/chosen": -20.071706771850586, "rewards/margins": 6.7936787605285645, "rewards/rejected": -26.865386962890625, "step": 1302 }, { "epoch": 0.17742374727668844, "grad_norm": 45.66157562124003, "learning_rate": 7.855084452815448e-07, "logits/chosen": 6.952203273773193, "logits/rejected": 7.7143964767456055, "logps/chosen": -2.304696559906006, "logps/rejected": -2.327536106109619, "loss": 3.7852, "rewards/accuracies": 0.25, "rewards/chosen": -23.046966552734375, "rewards/margins": 0.2283954620361328, "rewards/rejected": -23.275360107421875, "step": 1303 }, { "epoch": 0.1775599128540305, "grad_norm": 42.01804444710504, "learning_rate": 7.854576854899428e-07, "logits/chosen": 6.993441581726074, "logits/rejected": 7.2320404052734375, "logps/chosen": -2.316403865814209, "logps/rejected": -2.5724165439605713, "loss": 4.1358, "rewards/accuracies": 0.75, "rewards/chosen": -23.164039611816406, "rewards/margins": 2.5601258277893066, "rewards/rejected": -25.724164962768555, "step": 1304 }, { "epoch": 0.17769607843137256, "grad_norm": 46.21872220037969, "learning_rate": 7.854068386009905e-07, "logits/chosen": 8.655352592468262, "logits/rejected": 8.097249984741211, "logps/chosen": -2.74363374710083, "logps/rejected": -2.7309179306030273, "loss": 3.7431, "rewards/accuracies": 0.75, "rewards/chosen": -27.436338424682617, "rewards/margins": -0.12716007232666016, "rewards/rejected": -27.30917739868164, "step": 1305 }, { "epoch": 0.17783224400871459, "grad_norm": 56.14236019693723, "learning_rate": 7.853559046261771e-07, "logits/chosen": 8.90906047821045, "logits/rejected": 9.978336334228516, "logps/chosen": -2.6388630867004395, "logps/rejected": -2.8459224700927734, "loss": 4.2271, "rewards/accuracies": 0.75, "rewards/chosen": -26.388629913330078, "rewards/margins": 2.0705957412719727, "rewards/rejected": -28.459226608276367, "step": 1306 }, { "epoch": 0.17796840958605664, "grad_norm": 46.730967419680454, "learning_rate": 7.853048835770118e-07, "logits/chosen": 8.44369125366211, "logits/rejected": 9.661638259887695, "logps/chosen": -2.8775205612182617, "logps/rejected": -3.085461139678955, "loss": 4.0652, "rewards/accuracies": 0.75, "rewards/chosen": -28.775203704833984, "rewards/margins": 2.0794053077697754, "rewards/rejected": -30.854610443115234, "step": 1307 }, { "epoch": 0.1781045751633987, "grad_norm": 42.94087516602237, "learning_rate": 7.852537754650229e-07, "logits/chosen": 6.538280487060547, "logits/rejected": 9.628881454467773, "logps/chosen": -2.1345739364624023, "logps/rejected": -2.826770067214966, "loss": 4.2829, "rewards/accuracies": 1.0, "rewards/chosen": -21.345741271972656, "rewards/margins": 6.921961784362793, "rewards/rejected": -28.2677001953125, "step": 1308 }, { "epoch": 0.17824074074074073, "grad_norm": 50.577042911642444, "learning_rate": 7.852025803017591e-07, "logits/chosen": 7.428853988647461, "logits/rejected": 8.235700607299805, "logps/chosen": -2.5533194541931152, "logps/rejected": -2.6879501342773438, "loss": 4.1941, "rewards/accuracies": 0.75, "rewards/chosen": -25.533191680908203, "rewards/margins": 1.346311092376709, "rewards/rejected": -26.87950325012207, "step": 1309 }, { "epoch": 0.17837690631808278, "grad_norm": 47.6031781281734, "learning_rate": 7.851512980987882e-07, "logits/chosen": 8.616228103637695, "logits/rejected": 8.412328720092773, "logps/chosen": -2.8765182495117188, "logps/rejected": -3.240140914916992, "loss": 3.6925, "rewards/accuracies": 1.0, "rewards/chosen": -28.765182495117188, "rewards/margins": 3.636223793029785, "rewards/rejected": -32.401405334472656, "step": 1310 }, { "epoch": 0.17851307189542484, "grad_norm": 51.19954070598363, "learning_rate": 7.850999288676977e-07, "logits/chosen": 6.797005653381348, "logits/rejected": 8.350231170654297, "logps/chosen": -2.573770046234131, "logps/rejected": -2.840582847595215, "loss": 4.5708, "rewards/accuracies": 0.75, "rewards/chosen": -25.737701416015625, "rewards/margins": 2.6681265830993652, "rewards/rejected": -28.40582847595215, "step": 1311 }, { "epoch": 0.1786492374727669, "grad_norm": 50.14765553632467, "learning_rate": 7.850484726200949e-07, "logits/chosen": 7.904426574707031, "logits/rejected": 8.87701416015625, "logps/chosen": -2.7667856216430664, "logps/rejected": -3.1120166778564453, "loss": 4.412, "rewards/accuracies": 0.75, "rewards/chosen": -27.667858123779297, "rewards/margins": 3.452305793762207, "rewards/rejected": -31.120162963867188, "step": 1312 }, { "epoch": 0.17878540305010893, "grad_norm": 46.533251016227574, "learning_rate": 7.849969293676071e-07, "logits/chosen": 7.039062023162842, "logits/rejected": 9.476337432861328, "logps/chosen": -2.5982227325439453, "logps/rejected": -3.124006748199463, "loss": 3.795, "rewards/accuracies": 1.0, "rewards/chosen": -25.982227325439453, "rewards/margins": 5.257841110229492, "rewards/rejected": -31.240068435668945, "step": 1313 }, { "epoch": 0.17892156862745098, "grad_norm": 42.50166239048462, "learning_rate": 7.849452991218805e-07, "logits/chosen": 8.771438598632812, "logits/rejected": 8.446882247924805, "logps/chosen": -2.8905787467956543, "logps/rejected": -2.673626184463501, "loss": 4.0869, "rewards/accuracies": 0.25, "rewards/chosen": -28.905784606933594, "rewards/margins": -2.1695241928100586, "rewards/rejected": -26.73626136779785, "step": 1314 }, { "epoch": 0.17905773420479304, "grad_norm": 44.70096712399427, "learning_rate": 7.848935818945817e-07, "logits/chosen": 7.721320629119873, "logits/rejected": 7.368414878845215, "logps/chosen": -2.8021442890167236, "logps/rejected": -2.934647560119629, "loss": 4.3237, "rewards/accuracies": 0.75, "rewards/chosen": -28.02144432067871, "rewards/margins": 1.3250317573547363, "rewards/rejected": -29.346473693847656, "step": 1315 }, { "epoch": 0.17919389978213507, "grad_norm": 51.91123686916754, "learning_rate": 7.848417776973964e-07, "logits/chosen": 7.745542526245117, "logits/rejected": 8.90375804901123, "logps/chosen": -2.7900233268737793, "logps/rejected": -3.2429327964782715, "loss": 4.5371, "rewards/accuracies": 0.75, "rewards/chosen": -27.900232315063477, "rewards/margins": 4.529095649719238, "rewards/rejected": -32.42932891845703, "step": 1316 }, { "epoch": 0.17933006535947713, "grad_norm": 44.262603134984566, "learning_rate": 7.847898865420304e-07, "logits/chosen": 9.290715217590332, "logits/rejected": 9.361499786376953, "logps/chosen": -3.03603458404541, "logps/rejected": -2.9926958084106445, "loss": 3.9574, "rewards/accuracies": 0.5, "rewards/chosen": -30.36034393310547, "rewards/margins": -0.43338680267333984, "rewards/rejected": -29.926959991455078, "step": 1317 }, { "epoch": 0.17946623093681918, "grad_norm": 48.10694366026449, "learning_rate": 7.847379084402088e-07, "logits/chosen": 9.140036582946777, "logits/rejected": 9.116342544555664, "logps/chosen": -2.9807772636413574, "logps/rejected": -3.07143497467041, "loss": 4.0332, "rewards/accuracies": 0.75, "rewards/chosen": -29.807771682739258, "rewards/margins": 0.9065794944763184, "rewards/rejected": -30.714351654052734, "step": 1318 }, { "epoch": 0.1796023965141612, "grad_norm": 50.43135231196726, "learning_rate": 7.846858434036765e-07, "logits/chosen": 7.677341938018799, "logits/rejected": 8.842384338378906, "logps/chosen": -2.213636636734009, "logps/rejected": -2.6503708362579346, "loss": 4.1948, "rewards/accuracies": 1.0, "rewards/chosen": -22.136367797851562, "rewards/margins": 4.367339611053467, "rewards/rejected": -26.503707885742188, "step": 1319 }, { "epoch": 0.17973856209150327, "grad_norm": 175.253238228689, "learning_rate": 7.846336914441981e-07, "logits/chosen": 7.713603973388672, "logits/rejected": 9.1264066696167, "logps/chosen": -2.4919309616088867, "logps/rejected": -3.203920364379883, "loss": 4.1501, "rewards/accuracies": 1.0, "rewards/chosen": -24.919307708740234, "rewards/margins": 7.119894027709961, "rewards/rejected": -32.03919982910156, "step": 1320 }, { "epoch": 0.17987472766884532, "grad_norm": 93.82446098998302, "learning_rate": 7.845814525735575e-07, "logits/chosen": 8.992136001586914, "logits/rejected": 9.466462135314941, "logps/chosen": -3.0524096488952637, "logps/rejected": -4.253414154052734, "loss": 4.4831, "rewards/accuracies": 0.75, "rewards/chosen": -30.52409553527832, "rewards/margins": 12.010045051574707, "rewards/rejected": -42.534141540527344, "step": 1321 }, { "epoch": 0.18001089324618735, "grad_norm": 43.05112887632272, "learning_rate": 7.845291268035588e-07, "logits/chosen": 8.598143577575684, "logits/rejected": 8.14716625213623, "logps/chosen": -2.5587656497955322, "logps/rejected": -2.7650842666625977, "loss": 3.8379, "rewards/accuracies": 0.5, "rewards/chosen": -25.587656021118164, "rewards/margins": 2.063185691833496, "rewards/rejected": -27.650842666625977, "step": 1322 }, { "epoch": 0.1801470588235294, "grad_norm": 46.78271198612174, "learning_rate": 7.844767141460254e-07, "logits/chosen": 9.06753921508789, "logits/rejected": 8.453546524047852, "logps/chosen": -2.6988282203674316, "logps/rejected": -2.653449058532715, "loss": 4.5466, "rewards/accuracies": 0.5, "rewards/chosen": -26.988283157348633, "rewards/margins": -0.4537930488586426, "rewards/rejected": -26.534488677978516, "step": 1323 }, { "epoch": 0.18028322440087147, "grad_norm": 44.608002518215066, "learning_rate": 7.844242146128003e-07, "logits/chosen": 8.709989547729492, "logits/rejected": 7.797305107116699, "logps/chosen": -3.049839973449707, "logps/rejected": -2.9358420372009277, "loss": 4.3516, "rewards/accuracies": 0.5, "rewards/chosen": -30.49839973449707, "rewards/margins": -1.1399798393249512, "rewards/rejected": -29.358421325683594, "step": 1324 }, { "epoch": 0.1804193899782135, "grad_norm": 50.421643065335886, "learning_rate": 7.843716282157463e-07, "logits/chosen": 9.571029663085938, "logits/rejected": 9.923508644104004, "logps/chosen": -2.617427349090576, "logps/rejected": -2.677485466003418, "loss": 4.0514, "rewards/accuracies": 0.5, "rewards/chosen": -26.174272537231445, "rewards/margins": 0.6005802154541016, "rewards/rejected": -26.774852752685547, "step": 1325 }, { "epoch": 0.18055555555555555, "grad_norm": 49.845721309615826, "learning_rate": 7.843189549667456e-07, "logits/chosen": 9.82248306274414, "logits/rejected": 9.43262004852295, "logps/chosen": -2.9201292991638184, "logps/rejected": -2.9965171813964844, "loss": 4.528, "rewards/accuracies": 0.75, "rewards/chosen": -29.2012939453125, "rewards/margins": 0.763878345489502, "rewards/rejected": -29.965171813964844, "step": 1326 }, { "epoch": 0.1806917211328976, "grad_norm": 42.66126283225123, "learning_rate": 7.842661948777001e-07, "logits/chosen": 8.476232528686523, "logits/rejected": 9.138541221618652, "logps/chosen": -2.5608205795288086, "logps/rejected": -2.748173236846924, "loss": 3.7826, "rewards/accuracies": 0.5, "rewards/chosen": -25.608205795288086, "rewards/margins": 1.8735270500183105, "rewards/rejected": -27.481733322143555, "step": 1327 }, { "epoch": 0.18082788671023964, "grad_norm": 43.59011022695937, "learning_rate": 7.842133479605316e-07, "logits/chosen": 9.584754943847656, "logits/rejected": 9.042560577392578, "logps/chosen": -2.5137972831726074, "logps/rejected": -2.5414364337921143, "loss": 4.2153, "rewards/accuracies": 0.25, "rewards/chosen": -25.137969970703125, "rewards/margins": 0.27639293670654297, "rewards/rejected": -25.414363861083984, "step": 1328 }, { "epoch": 0.1809640522875817, "grad_norm": 47.18249981092637, "learning_rate": 7.841604142271812e-07, "logits/chosen": 7.133713722229004, "logits/rejected": 6.85468864440918, "logps/chosen": -2.813255786895752, "logps/rejected": -2.750551700592041, "loss": 4.5388, "rewards/accuracies": 0.75, "rewards/chosen": -28.132556915283203, "rewards/margins": -0.6270394325256348, "rewards/rejected": -27.505517959594727, "step": 1329 }, { "epoch": 0.18110021786492375, "grad_norm": 46.98300848865491, "learning_rate": 7.841073936896098e-07, "logits/chosen": 9.413037300109863, "logits/rejected": 10.38302993774414, "logps/chosen": -2.9080312252044678, "logps/rejected": -2.988375663757324, "loss": 3.6865, "rewards/accuracies": 0.75, "rewards/chosen": -29.080310821533203, "rewards/margins": 0.8034462928771973, "rewards/rejected": -29.883758544921875, "step": 1330 }, { "epoch": 0.1812363834422658, "grad_norm": 54.64648605905222, "learning_rate": 7.840542863597976e-07, "logits/chosen": 8.192914962768555, "logits/rejected": 8.84432601928711, "logps/chosen": -2.251232147216797, "logps/rejected": -2.778550148010254, "loss": 4.122, "rewards/accuracies": 1.0, "rewards/chosen": -22.51232147216797, "rewards/margins": 5.273181915283203, "rewards/rejected": -27.785503387451172, "step": 1331 }, { "epoch": 0.18137254901960784, "grad_norm": 45.88941395715989, "learning_rate": 7.840010922497448e-07, "logits/chosen": 9.339456558227539, "logits/rejected": 8.758707046508789, "logps/chosen": -2.641119956970215, "logps/rejected": -2.618368625640869, "loss": 4.0095, "rewards/accuracies": 0.5, "rewards/chosen": -26.41120147705078, "rewards/margins": -0.22751522064208984, "rewards/rejected": -26.183685302734375, "step": 1332 }, { "epoch": 0.1815087145969499, "grad_norm": 48.29524648393327, "learning_rate": 7.83947811371471e-07, "logits/chosen": 8.02261734008789, "logits/rejected": 9.036222457885742, "logps/chosen": -2.5807840824127197, "logps/rejected": -2.770522356033325, "loss": 4.4418, "rewards/accuracies": 1.0, "rewards/chosen": -25.80784034729004, "rewards/margins": 1.897383213043213, "rewards/rejected": -27.705223083496094, "step": 1333 }, { "epoch": 0.18164488017429195, "grad_norm": 46.95148461094512, "learning_rate": 7.838944437370154e-07, "logits/chosen": 9.078540802001953, "logits/rejected": 9.489168167114258, "logps/chosen": -3.1286346912384033, "logps/rejected": -3.176274299621582, "loss": 4.2138, "rewards/accuracies": 0.5, "rewards/chosen": -31.286348342895508, "rewards/margins": 0.4763932228088379, "rewards/rejected": -31.762741088867188, "step": 1334 }, { "epoch": 0.18178104575163398, "grad_norm": 76.25179645352937, "learning_rate": 7.838409893584371e-07, "logits/chosen": 10.5423583984375, "logits/rejected": 10.375706672668457, "logps/chosen": -2.898308277130127, "logps/rejected": -2.8002519607543945, "loss": 4.1953, "rewards/accuracies": 0.5, "rewards/chosen": -28.983081817626953, "rewards/margins": -0.9805622100830078, "rewards/rejected": -28.002517700195312, "step": 1335 }, { "epoch": 0.18191721132897604, "grad_norm": 44.6042684555009, "learning_rate": 7.837874482478142e-07, "logits/chosen": 9.272310256958008, "logits/rejected": 10.112323760986328, "logps/chosen": -2.8877406120300293, "logps/rejected": -3.148242473602295, "loss": 3.9092, "rewards/accuracies": 1.0, "rewards/chosen": -28.87740707397461, "rewards/margins": 2.6050148010253906, "rewards/rejected": -31.482421875, "step": 1336 }, { "epoch": 0.1820533769063181, "grad_norm": 47.657697858245044, "learning_rate": 7.837338204172452e-07, "logits/chosen": 8.608688354492188, "logits/rejected": 11.220664978027344, "logps/chosen": -2.444748640060425, "logps/rejected": -2.880571126937866, "loss": 3.7702, "rewards/accuracies": 1.0, "rewards/chosen": -24.447486877441406, "rewards/margins": 4.358224391937256, "rewards/rejected": -28.80571174621582, "step": 1337 }, { "epoch": 0.18218954248366012, "grad_norm": 47.87769796910776, "learning_rate": 7.836801058788472e-07, "logits/chosen": 9.735295295715332, "logits/rejected": 10.65838623046875, "logps/chosen": -3.2688984870910645, "logps/rejected": -3.5713891983032227, "loss": 4.6372, "rewards/accuracies": 1.0, "rewards/chosen": -32.68898391723633, "rewards/margins": 3.0249075889587402, "rewards/rejected": -35.71389389038086, "step": 1338 }, { "epoch": 0.18232570806100218, "grad_norm": 48.05096729933243, "learning_rate": 7.83626304644758e-07, "logits/chosen": 9.17547607421875, "logits/rejected": 10.311275482177734, "logps/chosen": -2.7897443771362305, "logps/rejected": -2.928412437438965, "loss": 4.1181, "rewards/accuracies": 0.5, "rewards/chosen": -27.897443771362305, "rewards/margins": 1.3866806030273438, "rewards/rejected": -29.28412437438965, "step": 1339 }, { "epoch": 0.18246187363834424, "grad_norm": 50.60868172492785, "learning_rate": 7.835724167271341e-07, "logits/chosen": 9.712453842163086, "logits/rejected": 9.568883895874023, "logps/chosen": -2.225795030593872, "logps/rejected": -2.3443236351013184, "loss": 3.5498, "rewards/accuracies": 0.75, "rewards/chosen": -22.257949829101562, "rewards/margins": 1.1852855682373047, "rewards/rejected": -23.4432373046875, "step": 1340 }, { "epoch": 0.18259803921568626, "grad_norm": 46.915072660973856, "learning_rate": 7.835184421381519e-07, "logits/chosen": 10.346368789672852, "logits/rejected": 11.430965423583984, "logps/chosen": -3.1288821697235107, "logps/rejected": -3.5207719802856445, "loss": 3.8534, "rewards/accuracies": 1.0, "rewards/chosen": -31.288820266723633, "rewards/margins": 3.9188990592956543, "rewards/rejected": -35.20771789550781, "step": 1341 }, { "epoch": 0.18273420479302832, "grad_norm": 51.05209675652282, "learning_rate": 7.834643808900078e-07, "logits/chosen": 8.313867568969727, "logits/rejected": 10.076004028320312, "logps/chosen": -2.6997060775756836, "logps/rejected": -3.0974020957946777, "loss": 4.0544, "rewards/accuracies": 1.0, "rewards/chosen": -26.99706268310547, "rewards/margins": 3.9769606590270996, "rewards/rejected": -30.974023818969727, "step": 1342 }, { "epoch": 0.18287037037037038, "grad_norm": 69.87692731213616, "learning_rate": 7.834102329949168e-07, "logits/chosen": 10.78073501586914, "logits/rejected": 9.342853546142578, "logps/chosen": -3.193713903427124, "logps/rejected": -3.4202675819396973, "loss": 4.0419, "rewards/accuracies": 0.5, "rewards/chosen": -31.937137603759766, "rewards/margins": 2.2655367851257324, "rewards/rejected": -34.202674865722656, "step": 1343 }, { "epoch": 0.1830065359477124, "grad_norm": 56.127573126555305, "learning_rate": 7.833559984651144e-07, "logits/chosen": 8.735811233520508, "logits/rejected": 10.232507705688477, "logps/chosen": -2.546027183532715, "logps/rejected": -2.901289463043213, "loss": 3.9404, "rewards/accuracies": 0.75, "rewards/chosen": -25.460269927978516, "rewards/margins": 3.5526249408721924, "rewards/rejected": -29.012893676757812, "step": 1344 }, { "epoch": 0.18314270152505446, "grad_norm": 46.109012721797356, "learning_rate": 7.833016773128554e-07, "logits/chosen": 6.679217338562012, "logits/rejected": 8.604286193847656, "logps/chosen": -2.1791796684265137, "logps/rejected": -2.6349806785583496, "loss": 4.033, "rewards/accuracies": 1.0, "rewards/chosen": -21.791797637939453, "rewards/margins": 4.558011054992676, "rewards/rejected": -26.349807739257812, "step": 1345 }, { "epoch": 0.18327886710239652, "grad_norm": 45.67484928354285, "learning_rate": 7.832472695504139e-07, "logits/chosen": 10.27675724029541, "logits/rejected": 10.119391441345215, "logps/chosen": -2.8967084884643555, "logps/rejected": -3.365297794342041, "loss": 4.1966, "rewards/accuracies": 0.75, "rewards/chosen": -28.967086791992188, "rewards/margins": 4.685891628265381, "rewards/rejected": -33.652976989746094, "step": 1346 }, { "epoch": 0.18341503267973855, "grad_norm": 46.29110084767961, "learning_rate": 7.831927751900838e-07, "logits/chosen": 9.513994216918945, "logits/rejected": 10.164950370788574, "logps/chosen": -3.053359031677246, "logps/rejected": -2.598226547241211, "loss": 3.8538, "rewards/accuracies": 0.5, "rewards/chosen": -30.533592224121094, "rewards/margins": -4.551327228546143, "rewards/rejected": -25.98226547241211, "step": 1347 }, { "epoch": 0.1835511982570806, "grad_norm": 55.94929957823874, "learning_rate": 7.831381942441789e-07, "logits/chosen": 9.601903915405273, "logits/rejected": 10.290390014648438, "logps/chosen": -2.9269843101501465, "logps/rejected": -2.8831827640533447, "loss": 4.7447, "rewards/accuracies": 0.5, "rewards/chosen": -29.26984214782715, "rewards/margins": -0.43801450729370117, "rewards/rejected": -28.83182716369629, "step": 1348 }, { "epoch": 0.18368736383442266, "grad_norm": 48.64705971836791, "learning_rate": 7.830835267250317e-07, "logits/chosen": 9.736154556274414, "logits/rejected": 10.048152923583984, "logps/chosen": -2.738100528717041, "logps/rejected": -3.190201759338379, "loss": 4.2128, "rewards/accuracies": 0.75, "rewards/chosen": -27.381004333496094, "rewards/margins": 4.521012306213379, "rewards/rejected": -31.90201759338379, "step": 1349 }, { "epoch": 0.18382352941176472, "grad_norm": 45.10347329979892, "learning_rate": 7.830287726449953e-07, "logits/chosen": 8.769933700561523, "logits/rejected": 9.88935375213623, "logps/chosen": -2.631272792816162, "logps/rejected": -3.2275819778442383, "loss": 3.9584, "rewards/accuracies": 1.0, "rewards/chosen": -26.312728881835938, "rewards/margins": 5.963088512420654, "rewards/rejected": -32.27581787109375, "step": 1350 }, { "epoch": 0.18395969498910675, "grad_norm": 49.610821997445306, "learning_rate": 7.829739320164414e-07, "logits/chosen": 10.253240585327148, "logits/rejected": 9.109930992126465, "logps/chosen": -3.1146554946899414, "logps/rejected": -2.770916223526001, "loss": 4.2889, "rewards/accuracies": 0.25, "rewards/chosen": -31.146554946899414, "rewards/margins": -3.4373927116394043, "rewards/rejected": -27.70916175842285, "step": 1351 }, { "epoch": 0.1840958605664488, "grad_norm": 47.19416490126298, "learning_rate": 7.829190048517619e-07, "logits/chosen": 9.61937141418457, "logits/rejected": 8.709619522094727, "logps/chosen": -2.6394970417022705, "logps/rejected": -2.5815463066101074, "loss": 4.2098, "rewards/accuracies": 0.5, "rewards/chosen": -26.394969940185547, "rewards/margins": -0.5795073509216309, "rewards/rejected": -25.81546401977539, "step": 1352 }, { "epoch": 0.18423202614379086, "grad_norm": 43.88450763521514, "learning_rate": 7.82863991163368e-07, "logits/chosen": 8.958877563476562, "logits/rejected": 9.06096076965332, "logps/chosen": -2.8477187156677246, "logps/rejected": -3.001716136932373, "loss": 4.1142, "rewards/accuracies": 0.5, "rewards/chosen": -28.477190017700195, "rewards/margins": 1.5399727821350098, "rewards/rejected": -30.017162322998047, "step": 1353 }, { "epoch": 0.1843681917211329, "grad_norm": 51.12524111291966, "learning_rate": 7.828088909636906e-07, "logits/chosen": 8.959188461303711, "logits/rejected": 9.827420234680176, "logps/chosen": -2.6927237510681152, "logps/rejected": -2.8028087615966797, "loss": 4.494, "rewards/accuracies": 0.75, "rewards/chosen": -26.927234649658203, "rewards/margins": 1.100853443145752, "rewards/rejected": -28.02808952331543, "step": 1354 }, { "epoch": 0.18450435729847495, "grad_norm": 53.98457815688342, "learning_rate": 7.827537042651798e-07, "logits/chosen": 8.893094062805176, "logits/rejected": 9.894906997680664, "logps/chosen": -2.6706202030181885, "logps/rejected": -2.898362636566162, "loss": 4.567, "rewards/accuracies": 0.75, "rewards/chosen": -26.706201553344727, "rewards/margins": 2.2774252891540527, "rewards/rejected": -28.983627319335938, "step": 1355 }, { "epoch": 0.184640522875817, "grad_norm": 47.10464727521514, "learning_rate": 7.826984310803057e-07, "logits/chosen": 10.192037582397461, "logits/rejected": 9.356834411621094, "logps/chosen": -3.073298454284668, "logps/rejected": -2.7759671211242676, "loss": 4.219, "rewards/accuracies": 0.0, "rewards/chosen": -30.732986450195312, "rewards/margins": -2.973313808441162, "rewards/rejected": -27.759672164916992, "step": 1356 }, { "epoch": 0.18477668845315903, "grad_norm": 47.51927419213509, "learning_rate": 7.826430714215576e-07, "logits/chosen": 8.327648162841797, "logits/rejected": 9.209705352783203, "logps/chosen": -2.2067511081695557, "logps/rejected": -2.4478671550750732, "loss": 4.3079, "rewards/accuracies": 0.75, "rewards/chosen": -22.067508697509766, "rewards/margins": 2.411160945892334, "rewards/rejected": -24.47867202758789, "step": 1357 }, { "epoch": 0.1849128540305011, "grad_norm": 54.57540186507603, "learning_rate": 7.825876253014448e-07, "logits/chosen": 8.669706344604492, "logits/rejected": 10.71351432800293, "logps/chosen": -2.5942201614379883, "logps/rejected": -2.838146209716797, "loss": 4.0075, "rewards/accuracies": 0.75, "rewards/chosen": -25.942201614379883, "rewards/margins": 2.439258098602295, "rewards/rejected": -28.381460189819336, "step": 1358 }, { "epoch": 0.18504901960784315, "grad_norm": 48.0516220398086, "learning_rate": 7.825320927324954e-07, "logits/chosen": 8.217096328735352, "logits/rejected": 8.894645690917969, "logps/chosen": -2.4700205326080322, "logps/rejected": -2.6509013175964355, "loss": 4.2424, "rewards/accuracies": 0.5, "rewards/chosen": -24.700204849243164, "rewards/margins": 1.8088068962097168, "rewards/rejected": -26.50901222229004, "step": 1359 }, { "epoch": 0.18518518518518517, "grad_norm": 45.99246648422586, "learning_rate": 7.824764737272575e-07, "logits/chosen": 10.122997283935547, "logits/rejected": 10.11087703704834, "logps/chosen": -3.0002260208129883, "logps/rejected": -2.7448291778564453, "loss": 4.5645, "rewards/accuracies": 0.25, "rewards/chosen": -30.00225830078125, "rewards/margins": -2.5539655685424805, "rewards/rejected": -27.448293685913086, "step": 1360 }, { "epoch": 0.18532135076252723, "grad_norm": 50.13112792833105, "learning_rate": 7.82420768298299e-07, "logits/chosen": 8.223730087280273, "logits/rejected": 8.522046089172363, "logps/chosen": -2.2443442344665527, "logps/rejected": -2.552863836288452, "loss": 4.177, "rewards/accuracies": 1.0, "rewards/chosen": -22.443443298339844, "rewards/margins": 3.0851950645446777, "rewards/rejected": -25.528636932373047, "step": 1361 }, { "epoch": 0.1854575163398693, "grad_norm": 48.03687439641714, "learning_rate": 7.823649764582066e-07, "logits/chosen": 10.458288192749023, "logits/rejected": 10.48809814453125, "logps/chosen": -3.2313075065612793, "logps/rejected": -3.0329091548919678, "loss": 4.7586, "rewards/accuracies": 0.25, "rewards/chosen": -32.31307601928711, "rewards/margins": -1.9839839935302734, "rewards/rejected": -30.329090118408203, "step": 1362 }, { "epoch": 0.18559368191721132, "grad_norm": 49.28112361424598, "learning_rate": 7.823090982195872e-07, "logits/chosen": 11.096752166748047, "logits/rejected": 9.865400314331055, "logps/chosen": -2.907754898071289, "logps/rejected": -2.8114190101623535, "loss": 4.9454, "rewards/accuracies": 0.5, "rewards/chosen": -29.07754898071289, "rewards/margins": -0.963355541229248, "rewards/rejected": -28.114192962646484, "step": 1363 }, { "epoch": 0.18572984749455337, "grad_norm": 44.023206126732205, "learning_rate": 7.822531335950669e-07, "logits/chosen": 8.764164924621582, "logits/rejected": 8.375840187072754, "logps/chosen": -2.672489881515503, "logps/rejected": -2.7815704345703125, "loss": 3.906, "rewards/accuracies": 0.5, "rewards/chosen": -26.724899291992188, "rewards/margins": 1.090804100036621, "rewards/rejected": -27.815704345703125, "step": 1364 }, { "epoch": 0.18586601307189543, "grad_norm": 46.64381687582574, "learning_rate": 7.821970825972913e-07, "logits/chosen": 9.155696868896484, "logits/rejected": 9.151718139648438, "logps/chosen": -2.4067904949188232, "logps/rejected": -2.7847700119018555, "loss": 3.5947, "rewards/accuracies": 0.75, "rewards/chosen": -24.06790542602539, "rewards/margins": 3.779796600341797, "rewards/rejected": -27.847702026367188, "step": 1365 }, { "epoch": 0.18600217864923746, "grad_norm": 46.14860234160133, "learning_rate": 7.821409452389255e-07, "logits/chosen": 8.973176002502441, "logits/rejected": 8.346728324890137, "logps/chosen": -2.7119343280792236, "logps/rejected": -2.6147143840789795, "loss": 3.8213, "rewards/accuracies": 0.25, "rewards/chosen": -27.119342803955078, "rewards/margins": -0.9721994400024414, "rewards/rejected": -26.147144317626953, "step": 1366 }, { "epoch": 0.18613834422657952, "grad_norm": 53.32726964045519, "learning_rate": 7.820847215326544e-07, "logits/chosen": 8.5297269821167, "logits/rejected": 8.799429893493652, "logps/chosen": -2.5448081493377686, "logps/rejected": -2.9150614738464355, "loss": 4.5751, "rewards/accuracies": 0.75, "rewards/chosen": -25.44808006286621, "rewards/margins": 3.70253324508667, "rewards/rejected": -29.150611877441406, "step": 1367 }, { "epoch": 0.18627450980392157, "grad_norm": 41.36410414831897, "learning_rate": 7.820284114911822e-07, "logits/chosen": 9.460119247436523, "logits/rejected": 8.721275329589844, "logps/chosen": -2.6340174674987793, "logps/rejected": -2.5464539527893066, "loss": 4.1829, "rewards/accuracies": 0.25, "rewards/chosen": -26.340173721313477, "rewards/margins": -0.8756341934204102, "rewards/rejected": -25.46453857421875, "step": 1368 }, { "epoch": 0.18641067538126363, "grad_norm": 43.05276593071401, "learning_rate": 7.819720151272324e-07, "logits/chosen": 9.058177947998047, "logits/rejected": 8.753425598144531, "logps/chosen": -2.422506809234619, "logps/rejected": -2.4652132987976074, "loss": 4.5077, "rewards/accuracies": 0.5, "rewards/chosen": -24.225067138671875, "rewards/margins": 0.4270668029785156, "rewards/rejected": -24.652135848999023, "step": 1369 }, { "epoch": 0.18654684095860566, "grad_norm": 44.149362025074915, "learning_rate": 7.819155324535484e-07, "logits/chosen": 9.09316635131836, "logits/rejected": 8.837065696716309, "logps/chosen": -2.6342506408691406, "logps/rejected": -2.7623281478881836, "loss": 4.2636, "rewards/accuracies": 0.75, "rewards/chosen": -26.342506408691406, "rewards/margins": 1.2807750701904297, "rewards/rejected": -27.623279571533203, "step": 1370 }, { "epoch": 0.18668300653594772, "grad_norm": 39.731471813398784, "learning_rate": 7.81858963482893e-07, "logits/chosen": 9.061895370483398, "logits/rejected": 9.631524085998535, "logps/chosen": -2.473060131072998, "logps/rejected": -2.6182374954223633, "loss": 4.0706, "rewards/accuracies": 0.75, "rewards/chosen": -24.730602264404297, "rewards/margins": 1.4517745971679688, "rewards/rejected": -26.182376861572266, "step": 1371 }, { "epoch": 0.18681917211328977, "grad_norm": 45.90112428101656, "learning_rate": 7.818023082280482e-07, "logits/chosen": 9.404279708862305, "logits/rejected": 10.38174819946289, "logps/chosen": -2.619701623916626, "logps/rejected": -2.571302890777588, "loss": 4.9627, "rewards/accuracies": 0.5, "rewards/chosen": -26.19701385498047, "rewards/margins": -0.48398494720458984, "rewards/rejected": -25.713029861450195, "step": 1372 }, { "epoch": 0.1869553376906318, "grad_norm": 43.008913117569556, "learning_rate": 7.81745566701816e-07, "logits/chosen": 9.25516128540039, "logits/rejected": 10.634424209594727, "logps/chosen": -2.5485777854919434, "logps/rejected": -2.917529582977295, "loss": 3.9394, "rewards/accuracies": 1.0, "rewards/chosen": -25.48577880859375, "rewards/margins": 3.689517021179199, "rewards/rejected": -29.17529296875, "step": 1373 }, { "epoch": 0.18709150326797386, "grad_norm": 74.46862883753722, "learning_rate": 7.816887389170174e-07, "logits/chosen": 7.483752250671387, "logits/rejected": 7.948940277099609, "logps/chosen": -2.71976900100708, "logps/rejected": -2.6460413932800293, "loss": 4.4599, "rewards/accuracies": 0.25, "rewards/chosen": -27.197689056396484, "rewards/margins": -0.7372751235961914, "rewards/rejected": -26.46041488647461, "step": 1374 }, { "epoch": 0.18722766884531591, "grad_norm": 42.580331380292485, "learning_rate": 7.816318248864931e-07, "logits/chosen": 7.785876750946045, "logits/rejected": 7.6894965171813965, "logps/chosen": -2.4817442893981934, "logps/rejected": -2.3975284099578857, "loss": 3.8174, "rewards/accuracies": 0.25, "rewards/chosen": -24.81744384765625, "rewards/margins": -0.8421587944030762, "rewards/rejected": -23.975284576416016, "step": 1375 }, { "epoch": 0.18736383442265794, "grad_norm": 48.02452831361647, "learning_rate": 7.815748246231035e-07, "logits/chosen": 9.510337829589844, "logits/rejected": 10.560956954956055, "logps/chosen": -2.7587943077087402, "logps/rejected": -2.740643262863159, "loss": 4.6119, "rewards/accuracies": 0.5, "rewards/chosen": -27.58794403076172, "rewards/margins": -0.18151235580444336, "rewards/rejected": -27.40643310546875, "step": 1376 }, { "epoch": 0.1875, "grad_norm": 46.27292161079147, "learning_rate": 7.81517738139728e-07, "logits/chosen": 9.9613037109375, "logits/rejected": 9.056510925292969, "logps/chosen": -2.8897833824157715, "logps/rejected": -2.6394028663635254, "loss": 4.6074, "rewards/accuracies": 0.0, "rewards/chosen": -28.8978328704834, "rewards/margins": -2.5038037300109863, "rewards/rejected": -26.394027709960938, "step": 1377 }, { "epoch": 0.18763616557734206, "grad_norm": 39.3039621382149, "learning_rate": 7.81460565449266e-07, "logits/chosen": 8.936925888061523, "logits/rejected": 9.399545669555664, "logps/chosen": -2.3258676528930664, "logps/rejected": -2.970195770263672, "loss": 3.6709, "rewards/accuracies": 0.75, "rewards/chosen": -23.258678436279297, "rewards/margins": 6.443281650543213, "rewards/rejected": -29.70195960998535, "step": 1378 }, { "epoch": 0.18777233115468409, "grad_norm": 43.69509498645254, "learning_rate": 7.81403306564636e-07, "logits/chosen": 9.970996856689453, "logits/rejected": 7.8032917976379395, "logps/chosen": -2.95949649810791, "logps/rejected": -2.699737071990967, "loss": 3.9438, "rewards/accuracies": 0.25, "rewards/chosen": -29.5949649810791, "rewards/margins": -2.597593307495117, "rewards/rejected": -26.997371673583984, "step": 1379 }, { "epoch": 0.18790849673202614, "grad_norm": 43.168975869485486, "learning_rate": 7.813459614987762e-07, "logits/chosen": 8.904621124267578, "logits/rejected": 9.532183647155762, "logps/chosen": -2.2073111534118652, "logps/rejected": -2.6213040351867676, "loss": 4.5788, "rewards/accuracies": 1.0, "rewards/chosen": -22.07311248779297, "rewards/margins": 4.139930248260498, "rewards/rejected": -26.213041305541992, "step": 1380 }, { "epoch": 0.1880446623093682, "grad_norm": 39.71028618451692, "learning_rate": 7.812885302646442e-07, "logits/chosen": 9.1212158203125, "logits/rejected": 8.401924133300781, "logps/chosen": -2.5268330574035645, "logps/rejected": -2.567314624786377, "loss": 3.8569, "rewards/accuracies": 0.5, "rewards/chosen": -25.268329620361328, "rewards/margins": 0.404815673828125, "rewards/rejected": -25.673145294189453, "step": 1381 }, { "epoch": 0.18818082788671023, "grad_norm": 40.75215320973116, "learning_rate": 7.81231012875217e-07, "logits/chosen": 9.962976455688477, "logits/rejected": 10.523368835449219, "logps/chosen": -2.9065277576446533, "logps/rejected": -3.0160346031188965, "loss": 3.5955, "rewards/accuracies": 0.75, "rewards/chosen": -29.065279006958008, "rewards/margins": 1.095067024230957, "rewards/rejected": -30.16034507751465, "step": 1382 }, { "epoch": 0.18831699346405228, "grad_norm": 40.92928761748121, "learning_rate": 7.811734093434911e-07, "logits/chosen": 7.715935707092285, "logits/rejected": 8.882156372070312, "logps/chosen": -2.6002559661865234, "logps/rejected": -2.9392693042755127, "loss": 4.5043, "rewards/accuracies": 0.75, "rewards/chosen": -26.002559661865234, "rewards/margins": 3.390134811401367, "rewards/rejected": -29.39269256591797, "step": 1383 }, { "epoch": 0.18845315904139434, "grad_norm": 44.51095942744878, "learning_rate": 7.811157196824825e-07, "logits/chosen": 9.545315742492676, "logits/rejected": 11.169431686401367, "logps/chosen": -3.517887592315674, "logps/rejected": -3.314135789871216, "loss": 4.0519, "rewards/accuracies": 0.75, "rewards/chosen": -35.17887878417969, "rewards/margins": -2.037519931793213, "rewards/rejected": -33.141357421875, "step": 1384 }, { "epoch": 0.18858932461873637, "grad_norm": 43.34975024394475, "learning_rate": 7.810579439052268e-07, "logits/chosen": 10.435356140136719, "logits/rejected": 9.135393142700195, "logps/chosen": -2.8680810928344727, "logps/rejected": -2.6841039657592773, "loss": 4.4229, "rewards/accuracies": 0.25, "rewards/chosen": -28.68081283569336, "rewards/margins": -1.8397727012634277, "rewards/rejected": -26.841039657592773, "step": 1385 }, { "epoch": 0.18872549019607843, "grad_norm": 42.04645202497812, "learning_rate": 7.810000820247788e-07, "logits/chosen": 9.716689109802246, "logits/rejected": 9.915887832641602, "logps/chosen": -2.6641035079956055, "logps/rejected": -2.920287609100342, "loss": 4.1272, "rewards/accuracies": 1.0, "rewards/chosen": -26.641036987304688, "rewards/margins": 2.561840057373047, "rewards/rejected": -29.202877044677734, "step": 1386 }, { "epoch": 0.18886165577342048, "grad_norm": 43.920324156758426, "learning_rate": 7.809421340542128e-07, "logits/chosen": 9.850574493408203, "logits/rejected": 9.056726455688477, "logps/chosen": -2.916274309158325, "logps/rejected": -2.8572802543640137, "loss": 4.491, "rewards/accuracies": 0.5, "rewards/chosen": -29.162744522094727, "rewards/margins": -0.5899415016174316, "rewards/rejected": -28.572803497314453, "step": 1387 }, { "epoch": 0.18899782135076254, "grad_norm": 53.08774746283805, "learning_rate": 7.808841000066229e-07, "logits/chosen": 9.716751098632812, "logits/rejected": 9.270944595336914, "logps/chosen": -2.8926360607147217, "logps/rejected": -3.088618516921997, "loss": 4.3038, "rewards/accuracies": 0.75, "rewards/chosen": -28.926361083984375, "rewards/margins": 1.9598240852355957, "rewards/rejected": -30.886184692382812, "step": 1388 }, { "epoch": 0.18913398692810457, "grad_norm": 51.332652115037924, "learning_rate": 7.808259798951221e-07, "logits/chosen": 10.0178804397583, "logits/rejected": 9.671952247619629, "logps/chosen": -2.713184118270874, "logps/rejected": -2.7003378868103027, "loss": 3.5361, "rewards/accuracies": 0.25, "rewards/chosen": -27.131839752197266, "rewards/margins": -0.1284618377685547, "rewards/rejected": -27.00337791442871, "step": 1389 }, { "epoch": 0.18927015250544663, "grad_norm": 47.02108051276142, "learning_rate": 7.80767773732843e-07, "logits/chosen": 8.937543869018555, "logits/rejected": 8.922920227050781, "logps/chosen": -2.1580727100372314, "logps/rejected": -2.4497528076171875, "loss": 3.9958, "rewards/accuracies": 1.0, "rewards/chosen": -21.580726623535156, "rewards/margins": 2.916799545288086, "rewards/rejected": -24.497528076171875, "step": 1390 }, { "epoch": 0.18940631808278868, "grad_norm": 42.95103329625517, "learning_rate": 7.807094815329383e-07, "logits/chosen": 9.14013671875, "logits/rejected": 9.441476821899414, "logps/chosen": -2.777886390686035, "logps/rejected": -2.6164135932922363, "loss": 4.2634, "rewards/accuracies": 0.25, "rewards/chosen": -27.77886199951172, "rewards/margins": -1.6147270202636719, "rewards/rejected": -26.164134979248047, "step": 1391 }, { "epoch": 0.1895424836601307, "grad_norm": 47.82880147382934, "learning_rate": 7.80651103308579e-07, "logits/chosen": 9.605568885803223, "logits/rejected": 11.603001594543457, "logps/chosen": -2.945199489593506, "logps/rejected": -3.2299389839172363, "loss": 4.178, "rewards/accuracies": 0.5, "rewards/chosen": -29.451995849609375, "rewards/margins": 2.8473944664001465, "rewards/rejected": -32.29939270019531, "step": 1392 }, { "epoch": 0.18967864923747277, "grad_norm": 44.7780534778726, "learning_rate": 7.805926390729566e-07, "logits/chosen": 10.094022750854492, "logits/rejected": 9.926078796386719, "logps/chosen": -3.152008533477783, "logps/rejected": -3.195600748062134, "loss": 3.8662, "rewards/accuracies": 0.5, "rewards/chosen": -31.52008628845215, "rewards/margins": 0.43592214584350586, "rewards/rejected": -31.956008911132812, "step": 1393 }, { "epoch": 0.18981481481481483, "grad_norm": 51.937004321516774, "learning_rate": 7.805340888392813e-07, "logits/chosen": 7.801556587219238, "logits/rejected": 8.559396743774414, "logps/chosen": -2.5792834758758545, "logps/rejected": -2.8476827144622803, "loss": 4.1625, "rewards/accuracies": 0.75, "rewards/chosen": -25.792835235595703, "rewards/margins": 2.6839919090270996, "rewards/rejected": -28.476825714111328, "step": 1394 }, { "epoch": 0.18995098039215685, "grad_norm": 44.4283036506797, "learning_rate": 7.804754526207831e-07, "logits/chosen": 7.488901138305664, "logits/rejected": 10.169963836669922, "logps/chosen": -2.6813857555389404, "logps/rejected": -2.8964152336120605, "loss": 4.0145, "rewards/accuracies": 0.5, "rewards/chosen": -26.813858032226562, "rewards/margins": 2.1502928733825684, "rewards/rejected": -28.96415138244629, "step": 1395 }, { "epoch": 0.1900871459694989, "grad_norm": 46.6271780508641, "learning_rate": 7.804167304307114e-07, "logits/chosen": 8.637527465820312, "logits/rejected": 8.093347549438477, "logps/chosen": -2.8317785263061523, "logps/rejected": -2.9506235122680664, "loss": 4.0249, "rewards/accuracies": 0.5, "rewards/chosen": -28.317785263061523, "rewards/margins": 1.1884493827819824, "rewards/rejected": -29.50623321533203, "step": 1396 }, { "epoch": 0.19022331154684097, "grad_norm": 50.22228529346494, "learning_rate": 7.803579222823348e-07, "logits/chosen": 9.446855545043945, "logits/rejected": 9.54970932006836, "logps/chosen": -2.6828980445861816, "logps/rejected": -2.7002053260803223, "loss": 4.4293, "rewards/accuracies": 0.5, "rewards/chosen": -26.8289794921875, "rewards/margins": 0.17307281494140625, "rewards/rejected": -27.002052307128906, "step": 1397 }, { "epoch": 0.190359477124183, "grad_norm": 46.93485577356288, "learning_rate": 7.802990281889418e-07, "logits/chosen": 9.528192520141602, "logits/rejected": 7.43110990524292, "logps/chosen": -2.59613299369812, "logps/rejected": -2.2613019943237305, "loss": 4.333, "rewards/accuracies": 0.25, "rewards/chosen": -25.96133041381836, "rewards/margins": -3.3483080863952637, "rewards/rejected": -22.613021850585938, "step": 1398 }, { "epoch": 0.19049564270152505, "grad_norm": 52.95806206522151, "learning_rate": 7.802400481638396e-07, "logits/chosen": 9.400938987731934, "logits/rejected": 9.721193313598633, "logps/chosen": -2.882861375808716, "logps/rejected": -2.7486629486083984, "loss": 4.0832, "rewards/accuracies": 0.5, "rewards/chosen": -28.828611373901367, "rewards/margins": -1.3419818878173828, "rewards/rejected": -27.486631393432617, "step": 1399 }, { "epoch": 0.1906318082788671, "grad_norm": 54.636043078157314, "learning_rate": 7.801809822203555e-07, "logits/chosen": 10.298938751220703, "logits/rejected": 8.496502876281738, "logps/chosen": -2.7992656230926514, "logps/rejected": -2.4843451976776123, "loss": 4.8178, "rewards/accuracies": 0.25, "rewards/chosen": -27.992656707763672, "rewards/margins": -3.1492042541503906, "rewards/rejected": -24.84345245361328, "step": 1400 }, { "epoch": 0.19076797385620914, "grad_norm": 49.57536305889623, "learning_rate": 7.801218303718358e-07, "logits/chosen": 8.281044960021973, "logits/rejected": 9.828479766845703, "logps/chosen": -2.7538108825683594, "logps/rejected": -2.960878849029541, "loss": 3.7644, "rewards/accuracies": 0.75, "rewards/chosen": -27.538108825683594, "rewards/margins": 2.0706787109375, "rewards/rejected": -29.608787536621094, "step": 1401 }, { "epoch": 0.1909041394335512, "grad_norm": 49.27178248289167, "learning_rate": 7.800625926316464e-07, "logits/chosen": 9.041743278503418, "logits/rejected": 9.399553298950195, "logps/chosen": -3.243682861328125, "logps/rejected": -3.4176831245422363, "loss": 4.1494, "rewards/accuracies": 0.75, "rewards/chosen": -32.43682861328125, "rewards/margins": 1.7400031089782715, "rewards/rejected": -34.17683029174805, "step": 1402 }, { "epoch": 0.19104030501089325, "grad_norm": 45.61244061791615, "learning_rate": 7.800032690131727e-07, "logits/chosen": 7.430322647094727, "logits/rejected": 8.254188537597656, "logps/chosen": -2.540493965148926, "logps/rejected": -2.934587001800537, "loss": 4.1428, "rewards/accuracies": 0.75, "rewards/chosen": -25.404937744140625, "rewards/margins": 3.9409327507019043, "rewards/rejected": -29.345870971679688, "step": 1403 }, { "epoch": 0.19117647058823528, "grad_norm": 47.92785825808564, "learning_rate": 7.799438595298191e-07, "logits/chosen": 9.711482048034668, "logits/rejected": 10.051280975341797, "logps/chosen": -3.188821315765381, "logps/rejected": -2.661098003387451, "loss": 4.3754, "rewards/accuracies": 0.25, "rewards/chosen": -31.888214111328125, "rewards/margins": -5.277233600616455, "rewards/rejected": -26.610980987548828, "step": 1404 }, { "epoch": 0.19131263616557734, "grad_norm": 83.18259478743013, "learning_rate": 7.798843641950098e-07, "logits/chosen": 9.871641159057617, "logits/rejected": 8.361701011657715, "logps/chosen": -2.7264485359191895, "logps/rejected": -2.625802755355835, "loss": 4.1306, "rewards/accuracies": 0.25, "rewards/chosen": -27.264484405517578, "rewards/margins": -1.006455898284912, "rewards/rejected": -26.258028030395508, "step": 1405 }, { "epoch": 0.1914488017429194, "grad_norm": 45.16196833501355, "learning_rate": 7.798247830221883e-07, "logits/chosen": 8.31587028503418, "logits/rejected": 8.44569206237793, "logps/chosen": -2.809232473373413, "logps/rejected": -2.870121717453003, "loss": 3.8346, "rewards/accuracies": 0.5, "rewards/chosen": -28.092323303222656, "rewards/margins": 0.6088943481445312, "rewards/rejected": -28.701217651367188, "step": 1406 }, { "epoch": 0.19158496732026145, "grad_norm": 43.15075994124876, "learning_rate": 7.797651160248173e-07, "logits/chosen": 5.86435079574585, "logits/rejected": 8.907793045043945, "logps/chosen": -2.2660200595855713, "logps/rejected": -2.74520206451416, "loss": 3.6064, "rewards/accuracies": 1.0, "rewards/chosen": -22.660200119018555, "rewards/margins": 4.791820049285889, "rewards/rejected": -27.45201873779297, "step": 1407 }, { "epoch": 0.19172113289760348, "grad_norm": 43.0960606862917, "learning_rate": 7.797053632163793e-07, "logits/chosen": 9.137687683105469, "logits/rejected": 8.994029998779297, "logps/chosen": -2.7767512798309326, "logps/rejected": -2.998880624771118, "loss": 3.859, "rewards/accuracies": 0.75, "rewards/chosen": -27.767513275146484, "rewards/margins": 2.2212934494018555, "rewards/rejected": -29.988807678222656, "step": 1408 }, { "epoch": 0.19185729847494554, "grad_norm": 45.79218704407086, "learning_rate": 7.796455246103757e-07, "logits/chosen": 9.635754585266113, "logits/rejected": 8.260087966918945, "logps/chosen": -2.839120626449585, "logps/rejected": -2.7998366355895996, "loss": 4.511, "rewards/accuracies": 0.5, "rewards/chosen": -28.391206741333008, "rewards/margins": -0.3928413391113281, "rewards/rejected": -27.99836540222168, "step": 1409 }, { "epoch": 0.1919934640522876, "grad_norm": 65.16657770686385, "learning_rate": 7.795856002203278e-07, "logits/chosen": 7.864844799041748, "logits/rejected": 10.069198608398438, "logps/chosen": -2.72247576713562, "logps/rejected": -3.0553243160247803, "loss": 3.8672, "rewards/accuracies": 0.75, "rewards/chosen": -27.22475814819336, "rewards/margins": 3.328484535217285, "rewards/rejected": -30.553241729736328, "step": 1410 }, { "epoch": 0.19212962962962962, "grad_norm": 50.131286843457254, "learning_rate": 7.795255900597757e-07, "logits/chosen": 8.420543670654297, "logits/rejected": 9.391084671020508, "logps/chosen": -2.9602675437927246, "logps/rejected": -3.2420969009399414, "loss": 4.2834, "rewards/accuracies": 0.75, "rewards/chosen": -29.602676391601562, "rewards/margins": 2.8182921409606934, "rewards/rejected": -32.42096710205078, "step": 1411 }, { "epoch": 0.19226579520697168, "grad_norm": 50.13532468259227, "learning_rate": 7.794654941422793e-07, "logits/chosen": 9.307212829589844, "logits/rejected": 7.6254682540893555, "logps/chosen": -3.1865413188934326, "logps/rejected": -2.7991578578948975, "loss": 3.9216, "rewards/accuracies": 0.25, "rewards/chosen": -31.86541175842285, "rewards/margins": -3.873833179473877, "rewards/rejected": -27.9915771484375, "step": 1412 }, { "epoch": 0.19240196078431374, "grad_norm": 44.42351684535834, "learning_rate": 7.79405312481418e-07, "logits/chosen": 8.255430221557617, "logits/rejected": 9.89922046661377, "logps/chosen": -2.766974449157715, "logps/rejected": -2.91304874420166, "loss": 3.9771, "rewards/accuracies": 0.5, "rewards/chosen": -27.669742584228516, "rewards/margins": 1.460742473602295, "rewards/rejected": -29.13048553466797, "step": 1413 }, { "epoch": 0.19253812636165576, "grad_norm": 46.21762956291915, "learning_rate": 7.793450450907899e-07, "logits/chosen": 6.977984428405762, "logits/rejected": 8.861536026000977, "logps/chosen": -3.0217325687408447, "logps/rejected": -3.20483136177063, "loss": 4.6078, "rewards/accuracies": 0.75, "rewards/chosen": -30.217327117919922, "rewards/margins": 1.8309879302978516, "rewards/rejected": -32.04831314086914, "step": 1414 }, { "epoch": 0.19267429193899782, "grad_norm": 49.562281053481136, "learning_rate": 7.792846919840134e-07, "logits/chosen": 7.557511329650879, "logits/rejected": 9.08896255493164, "logps/chosen": -2.7388916015625, "logps/rejected": -3.1293673515319824, "loss": 4.1377, "rewards/accuracies": 1.0, "rewards/chosen": -27.388916015625, "rewards/margins": 3.904758930206299, "rewards/rejected": -31.293676376342773, "step": 1415 }, { "epoch": 0.19281045751633988, "grad_norm": 46.0233828590438, "learning_rate": 7.792242531747254e-07, "logits/chosen": 9.016619682312012, "logits/rejected": 9.75579833984375, "logps/chosen": -2.5336692333221436, "logps/rejected": -3.109391212463379, "loss": 4.2864, "rewards/accuracies": 1.0, "rewards/chosen": -25.336692810058594, "rewards/margins": 5.7572197914123535, "rewards/rejected": -31.09391212463379, "step": 1416 }, { "epoch": 0.1929466230936819, "grad_norm": 45.47128106302691, "learning_rate": 7.791637286765827e-07, "logits/chosen": 9.358530044555664, "logits/rejected": 8.278900146484375, "logps/chosen": -2.916264772415161, "logps/rejected": -3.0507569313049316, "loss": 4.4684, "rewards/accuracies": 0.5, "rewards/chosen": -29.162647247314453, "rewards/margins": 1.3449196815490723, "rewards/rejected": -30.507568359375, "step": 1417 }, { "epoch": 0.19308278867102396, "grad_norm": 61.948822842082976, "learning_rate": 7.791031185032613e-07, "logits/chosen": 6.310737609863281, "logits/rejected": 9.500275611877441, "logps/chosen": -2.282313823699951, "logps/rejected": -3.0007870197296143, "loss": 4.565, "rewards/accuracies": 0.75, "rewards/chosen": -22.823137283325195, "rewards/margins": 7.184731960296631, "rewards/rejected": -30.007869720458984, "step": 1418 }, { "epoch": 0.19321895424836602, "grad_norm": 53.65010698499398, "learning_rate": 7.790424226684566e-07, "logits/chosen": 9.231705665588379, "logits/rejected": 9.46037769317627, "logps/chosen": -2.906078815460205, "logps/rejected": -3.2229690551757812, "loss": 4.8871, "rewards/accuracies": 0.75, "rewards/chosen": -29.060791015625, "rewards/margins": 3.1688995361328125, "rewards/rejected": -32.22969055175781, "step": 1419 }, { "epoch": 0.19335511982570805, "grad_norm": 51.38958727269336, "learning_rate": 7.789816411858834e-07, "logits/chosen": 8.392578125, "logits/rejected": 9.452919006347656, "logps/chosen": -2.885490894317627, "logps/rejected": -2.8896961212158203, "loss": 4.6546, "rewards/accuracies": 0.75, "rewards/chosen": -28.854907989501953, "rewards/margins": 0.04205179214477539, "rewards/rejected": -28.896961212158203, "step": 1420 }, { "epoch": 0.1934912854030501, "grad_norm": 50.53414327414775, "learning_rate": 7.789207740692756e-07, "logits/chosen": 10.502703666687012, "logits/rejected": 10.400654792785645, "logps/chosen": -3.0411629676818848, "logps/rejected": -3.2562413215637207, "loss": 3.9921, "rewards/accuracies": 0.75, "rewards/chosen": -30.411632537841797, "rewards/margins": 2.1507811546325684, "rewards/rejected": -32.56241226196289, "step": 1421 }, { "epoch": 0.19362745098039216, "grad_norm": 48.25318293198131, "learning_rate": 7.788598213323868e-07, "logits/chosen": 9.984909057617188, "logits/rejected": 9.608039855957031, "logps/chosen": -2.8906359672546387, "logps/rejected": -2.848775863647461, "loss": 4.515, "rewards/accuracies": 0.5, "rewards/chosen": -28.906360626220703, "rewards/margins": -0.41860198974609375, "rewards/rejected": -28.48775863647461, "step": 1422 }, { "epoch": 0.1937636165577342, "grad_norm": 47.51228131934887, "learning_rate": 7.787987829889894e-07, "logits/chosen": 9.186555862426758, "logits/rejected": 9.099655151367188, "logps/chosen": -3.0193774700164795, "logps/rejected": -3.1830239295959473, "loss": 4.2117, "rewards/accuracies": 0.75, "rewards/chosen": -30.193775177001953, "rewards/margins": 1.6364660263061523, "rewards/rejected": -31.830238342285156, "step": 1423 }, { "epoch": 0.19389978213507625, "grad_norm": 49.77398889989463, "learning_rate": 7.787376590528761e-07, "logits/chosen": 8.834890365600586, "logits/rejected": 8.366927146911621, "logps/chosen": -2.900989055633545, "logps/rejected": -2.9940567016601562, "loss": 3.6293, "rewards/accuracies": 0.75, "rewards/chosen": -29.0098876953125, "rewards/margins": 0.9306788444519043, "rewards/rejected": -29.940567016601562, "step": 1424 }, { "epoch": 0.1940359477124183, "grad_norm": 74.37397409158372, "learning_rate": 7.786764495378578e-07, "logits/chosen": 10.095561981201172, "logits/rejected": 9.197192192077637, "logps/chosen": -3.4224514961242676, "logps/rejected": -3.2503161430358887, "loss": 4.297, "rewards/accuracies": 0.25, "rewards/chosen": -34.22451400756836, "rewards/margins": -1.7213549613952637, "rewards/rejected": -32.50315856933594, "step": 1425 }, { "epoch": 0.19417211328976036, "grad_norm": 81.1477049339845, "learning_rate": 7.786151544577658e-07, "logits/chosen": 8.070650100708008, "logits/rejected": 8.864618301391602, "logps/chosen": -2.4333181381225586, "logps/rejected": -2.926071882247925, "loss": 3.9898, "rewards/accuracies": 1.0, "rewards/chosen": -24.333179473876953, "rewards/margins": 4.927540302276611, "rewards/rejected": -29.260719299316406, "step": 1426 }, { "epoch": 0.1943082788671024, "grad_norm": 44.27983651601205, "learning_rate": 7.785537738264499e-07, "logits/chosen": 9.152587890625, "logits/rejected": 10.022639274597168, "logps/chosen": -2.8415169715881348, "logps/rejected": -3.3858072757720947, "loss": 3.7457, "rewards/accuracies": 1.0, "rewards/chosen": -28.41516876220703, "rewards/margins": 5.442902088165283, "rewards/rejected": -33.858070373535156, "step": 1427 }, { "epoch": 0.19444444444444445, "grad_norm": 43.83841443751021, "learning_rate": 7.784923076577796e-07, "logits/chosen": 9.25407886505127, "logits/rejected": 10.336372375488281, "logps/chosen": -3.0345253944396973, "logps/rejected": -3.297396421432495, "loss": 3.7246, "rewards/accuracies": 0.75, "rewards/chosen": -30.345252990722656, "rewards/margins": 2.6287102699279785, "rewards/rejected": -32.973960876464844, "step": 1428 }, { "epoch": 0.1945806100217865, "grad_norm": 43.15858275617871, "learning_rate": 7.784307559656438e-07, "logits/chosen": 8.178897857666016, "logits/rejected": 10.169759750366211, "logps/chosen": -2.666630268096924, "logps/rejected": -3.4572598934173584, "loss": 3.9486, "rewards/accuracies": 1.0, "rewards/chosen": -26.666301727294922, "rewards/margins": 7.906297206878662, "rewards/rejected": -34.57259750366211, "step": 1429 }, { "epoch": 0.19471677559912853, "grad_norm": 52.041379948211954, "learning_rate": 7.783691187639505e-07, "logits/chosen": 9.662242889404297, "logits/rejected": 10.202850341796875, "logps/chosen": -3.1301920413970947, "logps/rejected": -3.081408977508545, "loss": 4.4851, "rewards/accuracies": 0.5, "rewards/chosen": -31.301918029785156, "rewards/margins": -0.48783111572265625, "rewards/rejected": -30.814088821411133, "step": 1430 }, { "epoch": 0.1948529411764706, "grad_norm": 50.784754081008245, "learning_rate": 7.783073960666273e-07, "logits/chosen": 8.701744079589844, "logits/rejected": 9.265340805053711, "logps/chosen": -2.723775863647461, "logps/rejected": -2.905930757522583, "loss": 4.6251, "rewards/accuracies": 0.75, "rewards/chosen": -27.23775863647461, "rewards/margins": 1.8215479850769043, "rewards/rejected": -29.059307098388672, "step": 1431 }, { "epoch": 0.19498910675381265, "grad_norm": 46.03292033638059, "learning_rate": 7.782455878876207e-07, "logits/chosen": 7.336472034454346, "logits/rejected": 8.687309265136719, "logps/chosen": -2.784029006958008, "logps/rejected": -2.9931464195251465, "loss": 4.0415, "rewards/accuracies": 0.75, "rewards/chosen": -27.840290069580078, "rewards/margins": 2.091172695159912, "rewards/rejected": -29.93146324157715, "step": 1432 }, { "epoch": 0.19512527233115468, "grad_norm": 39.96934414359964, "learning_rate": 7.78183694240897e-07, "logits/chosen": 9.87495231628418, "logits/rejected": 8.463025093078613, "logps/chosen": -3.1998767852783203, "logps/rejected": -3.0185585021972656, "loss": 4.0352, "rewards/accuracies": 0.5, "rewards/chosen": -31.998767852783203, "rewards/margins": -1.8131837844848633, "rewards/rejected": -30.185585021972656, "step": 1433 }, { "epoch": 0.19526143790849673, "grad_norm": 53.9433644944088, "learning_rate": 7.781217151404414e-07, "logits/chosen": 9.455964088439941, "logits/rejected": 9.242420196533203, "logps/chosen": -2.8436758518218994, "logps/rejected": -2.7293002605438232, "loss": 4.2704, "rewards/accuracies": 0.25, "rewards/chosen": -28.436758041381836, "rewards/margins": -1.143754482269287, "rewards/rejected": -27.29300308227539, "step": 1434 }, { "epoch": 0.1953976034858388, "grad_norm": 44.46309356735921, "learning_rate": 7.780596506002587e-07, "logits/chosen": 9.859846115112305, "logits/rejected": 8.988874435424805, "logps/chosen": -3.091970920562744, "logps/rejected": -2.9363622665405273, "loss": 4.0182, "rewards/accuracies": 0.25, "rewards/chosen": -30.919708251953125, "rewards/margins": -1.5560836791992188, "rewards/rejected": -29.363624572753906, "step": 1435 }, { "epoch": 0.19553376906318082, "grad_norm": 46.10543728528157, "learning_rate": 7.779975006343729e-07, "logits/chosen": 10.01549243927002, "logits/rejected": 8.170310020446777, "logps/chosen": -3.2857909202575684, "logps/rejected": -3.026228904724121, "loss": 4.2428, "rewards/accuracies": 0.25, "rewards/chosen": -32.85791015625, "rewards/margins": -2.595620632171631, "rewards/rejected": -30.26228904724121, "step": 1436 }, { "epoch": 0.19566993464052287, "grad_norm": 122.54486747433727, "learning_rate": 7.779352652568272e-07, "logits/chosen": 9.644449234008789, "logits/rejected": 9.1419095993042, "logps/chosen": -3.279587745666504, "logps/rejected": -3.337552547454834, "loss": 3.8855, "rewards/accuracies": 0.5, "rewards/chosen": -32.795875549316406, "rewards/margins": 0.5796475410461426, "rewards/rejected": -33.375526428222656, "step": 1437 }, { "epoch": 0.19580610021786493, "grad_norm": 47.7501920815396, "learning_rate": 7.778729444816843e-07, "logits/chosen": 10.044188499450684, "logits/rejected": 9.692174911499023, "logps/chosen": -2.8433215618133545, "logps/rejected": -2.9163808822631836, "loss": 4.4873, "rewards/accuracies": 0.75, "rewards/chosen": -28.433216094970703, "rewards/margins": 0.7305946350097656, "rewards/rejected": -29.16381072998047, "step": 1438 }, { "epoch": 0.19594226579520696, "grad_norm": 45.35753949519482, "learning_rate": 7.778105383230262e-07, "logits/chosen": 8.213859558105469, "logits/rejected": 8.535127639770508, "logps/chosen": -2.369323253631592, "logps/rejected": -2.9668540954589844, "loss": 3.6531, "rewards/accuracies": 1.0, "rewards/chosen": -23.693233489990234, "rewards/margins": 5.975306510925293, "rewards/rejected": -29.668540954589844, "step": 1439 }, { "epoch": 0.19607843137254902, "grad_norm": 48.904317808420906, "learning_rate": 7.777480467949538e-07, "logits/chosen": 8.840709686279297, "logits/rejected": 8.565544128417969, "logps/chosen": -3.1883575916290283, "logps/rejected": -3.175034999847412, "loss": 4.262, "rewards/accuracies": 0.5, "rewards/chosen": -31.883575439453125, "rewards/margins": -0.1332249641418457, "rewards/rejected": -31.750349044799805, "step": 1440 }, { "epoch": 0.19621459694989107, "grad_norm": 50.61877086963339, "learning_rate": 7.776854699115878e-07, "logits/chosen": 9.287450790405273, "logits/rejected": 9.395622253417969, "logps/chosen": -3.30017352104187, "logps/rejected": -3.321747303009033, "loss": 4.1848, "rewards/accuracies": 0.25, "rewards/chosen": -33.00173568725586, "rewards/margins": 0.21573543548583984, "rewards/rejected": -33.217472076416016, "step": 1441 }, { "epoch": 0.1963507625272331, "grad_norm": 45.40762914698662, "learning_rate": 7.776228076870678e-07, "logits/chosen": 8.590129852294922, "logits/rejected": 8.867591857910156, "logps/chosen": -2.7648048400878906, "logps/rejected": -2.9648914337158203, "loss": 4.4733, "rewards/accuracies": 0.75, "rewards/chosen": -27.648048400878906, "rewards/margins": 2.0008678436279297, "rewards/rejected": -29.648914337158203, "step": 1442 }, { "epoch": 0.19648692810457516, "grad_norm": 46.32662673014026, "learning_rate": 7.775600601355532e-07, "logits/chosen": 9.013203620910645, "logits/rejected": 8.640427589416504, "logps/chosen": -3.1024222373962402, "logps/rejected": -2.859396457672119, "loss": 3.8561, "rewards/accuracies": 0.5, "rewards/chosen": -31.02422332763672, "rewards/margins": -2.4302563667297363, "rewards/rejected": -28.593965530395508, "step": 1443 }, { "epoch": 0.19662309368191722, "grad_norm": 58.49735809578905, "learning_rate": 7.774972272712217e-07, "logits/chosen": 8.170985221862793, "logits/rejected": 10.22265338897705, "logps/chosen": -2.444774627685547, "logps/rejected": -3.117020606994629, "loss": 4.5202, "rewards/accuracies": 1.0, "rewards/chosen": -24.44774627685547, "rewards/margins": 6.722458362579346, "rewards/rejected": -31.17020606994629, "step": 1444 }, { "epoch": 0.19675925925925927, "grad_norm": 59.27145002667411, "learning_rate": 7.774343091082716e-07, "logits/chosen": 8.62274169921875, "logits/rejected": 8.017119407653809, "logps/chosen": -2.6086833477020264, "logps/rejected": -2.4068355560302734, "loss": 4.6306, "rewards/accuracies": 0.25, "rewards/chosen": -26.086833953857422, "rewards/margins": -2.0184788703918457, "rewards/rejected": -24.068355560302734, "step": 1445 }, { "epoch": 0.1968954248366013, "grad_norm": 47.639461245590724, "learning_rate": 7.773713056609192e-07, "logits/chosen": 8.046990394592285, "logits/rejected": 7.338481426239014, "logps/chosen": -2.512003183364868, "logps/rejected": -2.4388787746429443, "loss": 4.4503, "rewards/accuracies": 0.5, "rewards/chosen": -25.120033264160156, "rewards/margins": -0.7312450408935547, "rewards/rejected": -24.38878631591797, "step": 1446 }, { "epoch": 0.19703159041394336, "grad_norm": 53.16812937108844, "learning_rate": 7.773082169434011e-07, "logits/chosen": 9.094379425048828, "logits/rejected": 10.085216522216797, "logps/chosen": -3.360833168029785, "logps/rejected": -3.6685972213745117, "loss": 3.8611, "rewards/accuracies": 0.75, "rewards/chosen": -33.60832977294922, "rewards/margins": 3.0776424407958984, "rewards/rejected": -36.68597412109375, "step": 1447 }, { "epoch": 0.19716775599128541, "grad_norm": 48.395168425876314, "learning_rate": 7.772450429699723e-07, "logits/chosen": 6.32399845123291, "logits/rejected": 8.318540573120117, "logps/chosen": -2.402905225753784, "logps/rejected": -2.7487869262695312, "loss": 3.9628, "rewards/accuracies": 0.75, "rewards/chosen": -24.029052734375, "rewards/margins": 3.4588184356689453, "rewards/rejected": -27.487871170043945, "step": 1448 }, { "epoch": 0.19730392156862744, "grad_norm": 51.2520283030313, "learning_rate": 7.771817837549079e-07, "logits/chosen": 6.982491493225098, "logits/rejected": 7.587825298309326, "logps/chosen": -2.890021324157715, "logps/rejected": -2.8637847900390625, "loss": 4.1881, "rewards/accuracies": 0.25, "rewards/chosen": -28.90021514892578, "rewards/margins": -0.26236486434936523, "rewards/rejected": -28.637847900390625, "step": 1449 }, { "epoch": 0.1974400871459695, "grad_norm": 48.92838516229369, "learning_rate": 7.771184393125016e-07, "logits/chosen": 8.488727569580078, "logits/rejected": 8.836777687072754, "logps/chosen": -2.55854868888855, "logps/rejected": -2.7019786834716797, "loss": 4.0283, "rewards/accuracies": 0.75, "rewards/chosen": -25.585487365722656, "rewards/margins": 1.4342999458312988, "rewards/rejected": -27.019786834716797, "step": 1450 }, { "epoch": 0.19757625272331156, "grad_norm": 46.8013789086363, "learning_rate": 7.770550096570665e-07, "logits/chosen": 8.507009506225586, "logits/rejected": 10.305585861206055, "logps/chosen": -2.658696174621582, "logps/rejected": -3.1344337463378906, "loss": 4.0863, "rewards/accuracies": 1.0, "rewards/chosen": -26.586963653564453, "rewards/margins": 4.757372856140137, "rewards/rejected": -31.344337463378906, "step": 1451 }, { "epoch": 0.1977124183006536, "grad_norm": 45.57097213497954, "learning_rate": 7.769914948029355e-07, "logits/chosen": 10.300090789794922, "logits/rejected": 10.669528007507324, "logps/chosen": -3.042023181915283, "logps/rejected": -3.5486438274383545, "loss": 4.0889, "rewards/accuracies": 0.75, "rewards/chosen": -30.42023277282715, "rewards/margins": 5.0662055015563965, "rewards/rejected": -35.4864387512207, "step": 1452 }, { "epoch": 0.19784858387799564, "grad_norm": 58.041124813702325, "learning_rate": 7.769278947644598e-07, "logits/chosen": 9.019883155822754, "logits/rejected": 9.806371688842773, "logps/chosen": -2.8721859455108643, "logps/rejected": -3.085418701171875, "loss": 4.1228, "rewards/accuracies": 0.75, "rewards/chosen": -28.721858978271484, "rewards/margins": 2.1323275566101074, "rewards/rejected": -30.85418701171875, "step": 1453 }, { "epoch": 0.1979847494553377, "grad_norm": 47.12615998539214, "learning_rate": 7.768642095560105e-07, "logits/chosen": 9.247065544128418, "logits/rejected": 10.774068832397461, "logps/chosen": -2.9091572761535645, "logps/rejected": -3.4947197437286377, "loss": 3.9437, "rewards/accuracies": 0.75, "rewards/chosen": -29.091571807861328, "rewards/margins": 5.855624198913574, "rewards/rejected": -34.94719696044922, "step": 1454 }, { "epoch": 0.19812091503267973, "grad_norm": 43.711784032588895, "learning_rate": 7.76800439191978e-07, "logits/chosen": 8.436380386352539, "logits/rejected": 7.033947944641113, "logps/chosen": -2.8736414909362793, "logps/rejected": -2.743246555328369, "loss": 3.7526, "rewards/accuracies": 0.5, "rewards/chosen": -28.73641586303711, "rewards/margins": -1.303950309753418, "rewards/rejected": -27.432464599609375, "step": 1455 }, { "epoch": 0.19825708061002179, "grad_norm": 50.61613020747094, "learning_rate": 7.767365836867716e-07, "logits/chosen": 10.080606460571289, "logits/rejected": 9.564727783203125, "logps/chosen": -3.3689374923706055, "logps/rejected": -3.1593525409698486, "loss": 4.0669, "rewards/accuracies": 0.5, "rewards/chosen": -33.68937683105469, "rewards/margins": -2.0958504676818848, "rewards/rejected": -31.593524932861328, "step": 1456 }, { "epoch": 0.19839324618736384, "grad_norm": 83.15324726382521, "learning_rate": 7.7667264305482e-07, "logits/chosen": 8.951769828796387, "logits/rejected": 10.650959968566895, "logps/chosen": -2.94549560546875, "logps/rejected": -3.5228450298309326, "loss": 4.1459, "rewards/accuracies": 1.0, "rewards/chosen": -29.454954147338867, "rewards/margins": 5.773494720458984, "rewards/rejected": -35.22844696044922, "step": 1457 }, { "epoch": 0.19852941176470587, "grad_norm": 49.439143392199185, "learning_rate": 7.766086173105709e-07, "logits/chosen": 9.908032417297363, "logits/rejected": 9.02869987487793, "logps/chosen": -3.208796501159668, "logps/rejected": -2.9409165382385254, "loss": 4.0294, "rewards/accuracies": 0.5, "rewards/chosen": -32.08796691894531, "rewards/margins": -2.678800106048584, "rewards/rejected": -29.40916633605957, "step": 1458 }, { "epoch": 0.19866557734204793, "grad_norm": 51.75229039182556, "learning_rate": 7.765445064684918e-07, "logits/chosen": 9.003669738769531, "logits/rejected": 9.268579483032227, "logps/chosen": -2.9879302978515625, "logps/rejected": -3.321084499359131, "loss": 4.3264, "rewards/accuracies": 0.75, "rewards/chosen": -29.879302978515625, "rewards/margins": 3.3315420150756836, "rewards/rejected": -33.210845947265625, "step": 1459 }, { "epoch": 0.19880174291938998, "grad_norm": 45.66531413795341, "learning_rate": 7.764803105430689e-07, "logits/chosen": 9.23869514465332, "logits/rejected": 10.389788627624512, "logps/chosen": -3.099674940109253, "logps/rejected": -3.394679069519043, "loss": 4.0322, "rewards/accuracies": 0.75, "rewards/chosen": -30.996749877929688, "rewards/margins": 2.950040340423584, "rewards/rejected": -33.9467887878418, "step": 1460 }, { "epoch": 0.198937908496732, "grad_norm": 182.4523425304293, "learning_rate": 7.764160295488078e-07, "logits/chosen": 8.99901008605957, "logits/rejected": 9.679401397705078, "logps/chosen": -2.4807703495025635, "logps/rejected": -2.4212398529052734, "loss": 3.825, "rewards/accuracies": 0.5, "rewards/chosen": -24.80770492553711, "rewards/margins": -0.5953049659729004, "rewards/rejected": -24.212398529052734, "step": 1461 }, { "epoch": 0.19907407407407407, "grad_norm": 48.34852358111159, "learning_rate": 7.763516635002333e-07, "logits/chosen": 10.055469512939453, "logits/rejected": 10.915822982788086, "logps/chosen": -3.1860463619232178, "logps/rejected": -3.2262237071990967, "loss": 4.4917, "rewards/accuracies": 0.5, "rewards/chosen": -31.860462188720703, "rewards/margins": 0.40177488327026367, "rewards/rejected": -32.262237548828125, "step": 1462 }, { "epoch": 0.19921023965141613, "grad_norm": 51.33696984701474, "learning_rate": 7.762872124118895e-07, "logits/chosen": 10.621923446655273, "logits/rejected": 11.125564575195312, "logps/chosen": -3.4355556964874268, "logps/rejected": -3.286452293395996, "loss": 3.611, "rewards/accuracies": 0.5, "rewards/chosen": -34.35555648803711, "rewards/margins": -1.4910330772399902, "rewards/rejected": -32.864524841308594, "step": 1463 }, { "epoch": 0.19934640522875818, "grad_norm": 86.7492933798764, "learning_rate": 7.762226762983397e-07, "logits/chosen": 9.117464065551758, "logits/rejected": 9.93181037902832, "logps/chosen": -2.8384909629821777, "logps/rejected": -3.224900484085083, "loss": 4.1239, "rewards/accuracies": 0.75, "rewards/chosen": -28.384910583496094, "rewards/margins": 3.864095687866211, "rewards/rejected": -32.24900436401367, "step": 1464 }, { "epoch": 0.1994825708061002, "grad_norm": 42.529480447690666, "learning_rate": 7.761580551741662e-07, "logits/chosen": 10.538615226745605, "logits/rejected": 10.228837966918945, "logps/chosen": -3.100710391998291, "logps/rejected": -3.1946821212768555, "loss": 4.2811, "rewards/accuracies": 0.75, "rewards/chosen": -31.007102966308594, "rewards/margins": 0.9397187232971191, "rewards/rejected": -31.946821212768555, "step": 1465 }, { "epoch": 0.19961873638344227, "grad_norm": 50.65444138640298, "learning_rate": 7.760933490539708e-07, "logits/chosen": 7.4539384841918945, "logits/rejected": 7.5749993324279785, "logps/chosen": -2.906237840652466, "logps/rejected": -2.7938594818115234, "loss": 4.6134, "rewards/accuracies": 0.25, "rewards/chosen": -29.0623779296875, "rewards/margins": -1.1237850189208984, "rewards/rejected": -27.9385929107666, "step": 1466 }, { "epoch": 0.19975490196078433, "grad_norm": 45.19686440919136, "learning_rate": 7.760285579523744e-07, "logits/chosen": 10.230865478515625, "logits/rejected": 10.893077850341797, "logps/chosen": -2.981006383895874, "logps/rejected": -3.230674982070923, "loss": 3.9547, "rewards/accuracies": 0.75, "rewards/chosen": -29.810062408447266, "rewards/margins": 2.496685028076172, "rewards/rejected": -32.30674743652344, "step": 1467 }, { "epoch": 0.19989106753812635, "grad_norm": 48.872173603128594, "learning_rate": 7.759636818840171e-07, "logits/chosen": 9.562965393066406, "logits/rejected": 7.65647554397583, "logps/chosen": -2.555471658706665, "logps/rejected": -2.2843799591064453, "loss": 4.5941, "rewards/accuracies": 0.25, "rewards/chosen": -25.554716110229492, "rewards/margins": -2.7109169960021973, "rewards/rejected": -22.843799591064453, "step": 1468 }, { "epoch": 0.2000272331154684, "grad_norm": 56.27504239591899, "learning_rate": 7.758987208635581e-07, "logits/chosen": 10.426261901855469, "logits/rejected": 10.272636413574219, "logps/chosen": -3.0200438499450684, "logps/rejected": -3.202613353729248, "loss": 4.1766, "rewards/accuracies": 0.75, "rewards/chosen": -30.200439453125, "rewards/margins": 1.8256926536560059, "rewards/rejected": -32.02613067626953, "step": 1469 }, { "epoch": 0.20016339869281047, "grad_norm": 42.84258113397826, "learning_rate": 7.758336749056757e-07, "logits/chosen": 8.742557525634766, "logits/rejected": 9.607854843139648, "logps/chosen": -2.6509780883789062, "logps/rejected": -3.178340196609497, "loss": 3.8934, "rewards/accuracies": 1.0, "rewards/chosen": -26.509780883789062, "rewards/margins": 5.273619174957275, "rewards/rejected": -31.783401489257812, "step": 1470 }, { "epoch": 0.2002995642701525, "grad_norm": 48.674614311813386, "learning_rate": 7.75768544025068e-07, "logits/chosen": 10.079938888549805, "logits/rejected": 10.263986587524414, "logps/chosen": -3.277217149734497, "logps/rejected": -3.5961575508117676, "loss": 3.9421, "rewards/accuracies": 0.75, "rewards/chosen": -32.77217102050781, "rewards/margins": 3.1894021034240723, "rewards/rejected": -35.96157455444336, "step": 1471 }, { "epoch": 0.20043572984749455, "grad_norm": 174.9050570280535, "learning_rate": 7.757033282364517e-07, "logits/chosen": 9.159131050109863, "logits/rejected": 11.045656204223633, "logps/chosen": -2.984546422958374, "logps/rejected": -3.2914273738861084, "loss": 4.0108, "rewards/accuracies": 0.75, "rewards/chosen": -29.8454647064209, "rewards/margins": 3.0688085556030273, "rewards/rejected": -32.91427230834961, "step": 1472 }, { "epoch": 0.2005718954248366, "grad_norm": 46.43110786417672, "learning_rate": 7.756380275545627e-07, "logits/chosen": 9.383152961730957, "logits/rejected": 10.943801879882812, "logps/chosen": -2.712529182434082, "logps/rejected": -2.9223742485046387, "loss": 4.2742, "rewards/accuracies": 0.75, "rewards/chosen": -27.125289916992188, "rewards/margins": 2.098454475402832, "rewards/rejected": -29.223743438720703, "step": 1473 }, { "epoch": 0.20070806100217864, "grad_norm": 58.30734585654349, "learning_rate": 7.755726419941563e-07, "logits/chosen": 10.630681991577148, "logits/rejected": 11.079618453979492, "logps/chosen": -3.1282739639282227, "logps/rejected": -3.611553907394409, "loss": 4.0407, "rewards/accuracies": 0.75, "rewards/chosen": -31.282737731933594, "rewards/margins": 4.832801818847656, "rewards/rejected": -36.11553955078125, "step": 1474 }, { "epoch": 0.2008442265795207, "grad_norm": 53.78994717912727, "learning_rate": 7.755071715700069e-07, "logits/chosen": 9.69851303100586, "logits/rejected": 9.820222854614258, "logps/chosen": -3.0454812049865723, "logps/rejected": -2.8603315353393555, "loss": 4.2308, "rewards/accuracies": 0.25, "rewards/chosen": -30.454811096191406, "rewards/margins": -1.8514952659606934, "rewards/rejected": -28.603315353393555, "step": 1475 }, { "epoch": 0.20098039215686275, "grad_norm": 48.93467351170447, "learning_rate": 7.754416162969081e-07, "logits/chosen": 9.128679275512695, "logits/rejected": 9.921143531799316, "logps/chosen": -2.908176898956299, "logps/rejected": -2.8394153118133545, "loss": 4.6186, "rewards/accuracies": 0.25, "rewards/chosen": -29.081769943237305, "rewards/margins": -0.6876163482666016, "rewards/rejected": -28.394153594970703, "step": 1476 }, { "epoch": 0.20111655773420478, "grad_norm": 50.789137847519854, "learning_rate": 7.753759761896727e-07, "logits/chosen": 9.980033874511719, "logits/rejected": 11.266616821289062, "logps/chosen": -3.284510374069214, "logps/rejected": -3.027937412261963, "loss": 3.8431, "rewards/accuracies": 0.25, "rewards/chosen": -32.8451042175293, "rewards/margins": -2.5657291412353516, "rewards/rejected": -30.279375076293945, "step": 1477 }, { "epoch": 0.20125272331154684, "grad_norm": 51.91303568986363, "learning_rate": 7.753102512631326e-07, "logits/chosen": 10.933070182800293, "logits/rejected": 10.207855224609375, "logps/chosen": -3.2922000885009766, "logps/rejected": -3.2824082374572754, "loss": 3.8797, "rewards/accuracies": 0.5, "rewards/chosen": -32.92200469970703, "rewards/margins": -0.09791898727416992, "rewards/rejected": -32.82408142089844, "step": 1478 }, { "epoch": 0.2013888888888889, "grad_norm": 47.297920809074114, "learning_rate": 7.75244441532139e-07, "logits/chosen": 9.656197547912598, "logits/rejected": 10.266362190246582, "logps/chosen": -2.71645450592041, "logps/rejected": -2.770266056060791, "loss": 4.5183, "rewards/accuracies": 0.5, "rewards/chosen": -27.164546966552734, "rewards/margins": 0.538114070892334, "rewards/rejected": -27.702659606933594, "step": 1479 }, { "epoch": 0.20152505446623092, "grad_norm": 42.13926539069677, "learning_rate": 7.751785470115619e-07, "logits/chosen": 10.726734161376953, "logits/rejected": 10.96585464477539, "logps/chosen": -3.600510358810425, "logps/rejected": -3.2929511070251465, "loss": 3.9826, "rewards/accuracies": 0.0, "rewards/chosen": -36.005104064941406, "rewards/margins": -3.075594902038574, "rewards/rejected": -32.92951202392578, "step": 1480 }, { "epoch": 0.20166122004357298, "grad_norm": 51.03582502040539, "learning_rate": 7.751125677162908e-07, "logits/chosen": 9.688996315002441, "logits/rejected": 10.686437606811523, "logps/chosen": -2.980539321899414, "logps/rejected": -3.1871676445007324, "loss": 4.243, "rewards/accuracies": 0.75, "rewards/chosen": -29.80539321899414, "rewards/margins": 2.0662841796875, "rewards/rejected": -31.87167739868164, "step": 1481 }, { "epoch": 0.20179738562091504, "grad_norm": 56.35500826717604, "learning_rate": 7.750465036612343e-07, "logits/chosen": 10.339637756347656, "logits/rejected": 8.941152572631836, "logps/chosen": -3.3281850814819336, "logps/rejected": -3.1954238414764404, "loss": 4.7111, "rewards/accuracies": 0.5, "rewards/chosen": -33.28185272216797, "rewards/margins": -1.3276138305664062, "rewards/rejected": -31.954238891601562, "step": 1482 }, { "epoch": 0.2019335511982571, "grad_norm": 45.57785453329041, "learning_rate": 7.749803548613203e-07, "logits/chosen": 10.225269317626953, "logits/rejected": 11.33920669555664, "logps/chosen": -3.137554168701172, "logps/rejected": -3.3560261726379395, "loss": 4.0214, "rewards/accuracies": 1.0, "rewards/chosen": -31.37554168701172, "rewards/margins": 2.184718132019043, "rewards/rejected": -33.56026077270508, "step": 1483 }, { "epoch": 0.20206971677559912, "grad_norm": 44.23717968861959, "learning_rate": 7.749141213314954e-07, "logits/chosen": 8.734895706176758, "logits/rejected": 8.732568740844727, "logps/chosen": -2.5332014560699463, "logps/rejected": -2.7105536460876465, "loss": 4.4105, "rewards/accuracies": 0.75, "rewards/chosen": -25.332015991210938, "rewards/margins": 1.7735204696655273, "rewards/rejected": -27.105533599853516, "step": 1484 }, { "epoch": 0.20220588235294118, "grad_norm": 45.07212351222551, "learning_rate": 7.748478030867257e-07, "logits/chosen": 9.576896667480469, "logits/rejected": 9.32303524017334, "logps/chosen": -3.3210983276367188, "logps/rejected": -3.04284930229187, "loss": 3.9501, "rewards/accuracies": 0.5, "rewards/chosen": -33.21098327636719, "rewards/margins": -2.7824912071228027, "rewards/rejected": -30.42849349975586, "step": 1485 }, { "epoch": 0.20234204793028324, "grad_norm": 43.81782886044254, "learning_rate": 7.747814001419964e-07, "logits/chosen": 10.945674896240234, "logits/rejected": 9.819061279296875, "logps/chosen": -3.546253204345703, "logps/rejected": -3.5631370544433594, "loss": 4.0462, "rewards/accuracies": 0.5, "rewards/chosen": -35.46253204345703, "rewards/margins": 0.1688375473022461, "rewards/rejected": -35.631370544433594, "step": 1486 }, { "epoch": 0.20247821350762527, "grad_norm": 46.999013676157396, "learning_rate": 7.747149125123117e-07, "logits/chosen": 9.98316764831543, "logits/rejected": 9.848362922668457, "logps/chosen": -3.2880005836486816, "logps/rejected": -3.3920023441314697, "loss": 3.8439, "rewards/accuracies": 0.75, "rewards/chosen": -32.8800048828125, "rewards/margins": 1.0400166511535645, "rewards/rejected": -33.920021057128906, "step": 1487 }, { "epoch": 0.20261437908496732, "grad_norm": 48.85319106207643, "learning_rate": 7.746483402126952e-07, "logits/chosen": 10.858562469482422, "logits/rejected": 11.099361419677734, "logps/chosen": -3.166644334793091, "logps/rejected": -2.9488277435302734, "loss": 4.6695, "rewards/accuracies": 0.25, "rewards/chosen": -31.666444778442383, "rewards/margins": -2.178168296813965, "rewards/rejected": -29.4882755279541, "step": 1488 }, { "epoch": 0.20275054466230938, "grad_norm": 47.30487418472292, "learning_rate": 7.745816832581893e-07, "logits/chosen": 8.595192909240723, "logits/rejected": 10.371667861938477, "logps/chosen": -3.1025185585021973, "logps/rejected": -3.4112515449523926, "loss": 4.3299, "rewards/accuracies": 0.5, "rewards/chosen": -31.025184631347656, "rewards/margins": 3.0873327255249023, "rewards/rejected": -34.112518310546875, "step": 1489 }, { "epoch": 0.2028867102396514, "grad_norm": 44.71913990388612, "learning_rate": 7.745149416638558e-07, "logits/chosen": 9.884319305419922, "logits/rejected": 11.443364143371582, "logps/chosen": -2.692561149597168, "logps/rejected": -2.781334638595581, "loss": 4.3881, "rewards/accuracies": 0.75, "rewards/chosen": -26.925609588623047, "rewards/margins": 0.8877358436584473, "rewards/rejected": -27.813344955444336, "step": 1490 }, { "epoch": 0.20302287581699346, "grad_norm": 45.52882622427578, "learning_rate": 7.744481154447754e-07, "logits/chosen": 10.396110534667969, "logits/rejected": 10.812126159667969, "logps/chosen": -3.4027302265167236, "logps/rejected": -3.546025276184082, "loss": 3.0179, "rewards/accuracies": 0.75, "rewards/chosen": -34.02730178833008, "rewards/margins": 1.4329490661621094, "rewards/rejected": -35.46025085449219, "step": 1491 }, { "epoch": 0.20315904139433552, "grad_norm": 41.68196873228656, "learning_rate": 7.743812046160481e-07, "logits/chosen": 10.270793914794922, "logits/rejected": 10.228995323181152, "logps/chosen": -3.4184303283691406, "logps/rejected": -3.207890748977661, "loss": 4.217, "rewards/accuracies": 0.0, "rewards/chosen": -34.184303283691406, "rewards/margins": -2.1053977012634277, "rewards/rejected": -32.07890701293945, "step": 1492 }, { "epoch": 0.20329520697167755, "grad_norm": 41.25265935067465, "learning_rate": 7.743142091927929e-07, "logits/chosen": 9.492667198181152, "logits/rejected": 11.740612983703613, "logps/chosen": -2.9292495250701904, "logps/rejected": -3.5370779037475586, "loss": 3.9608, "rewards/accuracies": 0.75, "rewards/chosen": -29.292495727539062, "rewards/margins": 6.078283786773682, "rewards/rejected": -35.37078094482422, "step": 1493 }, { "epoch": 0.2034313725490196, "grad_norm": 45.726413897539864, "learning_rate": 7.742471291901481e-07, "logits/chosen": 10.416006088256836, "logits/rejected": 10.682535171508789, "logps/chosen": -2.970582962036133, "logps/rejected": -2.8831615447998047, "loss": 4.2832, "rewards/accuracies": 0.25, "rewards/chosen": -29.705829620361328, "rewards/margins": -0.8742127418518066, "rewards/rejected": -28.83161735534668, "step": 1494 }, { "epoch": 0.20356753812636166, "grad_norm": 45.545789227183384, "learning_rate": 7.741799646232709e-07, "logits/chosen": 8.472463607788086, "logits/rejected": 10.310511589050293, "logps/chosen": -2.4923839569091797, "logps/rejected": -3.023167848587036, "loss": 4.0318, "rewards/accuracies": 1.0, "rewards/chosen": -24.923839569091797, "rewards/margins": 5.307840824127197, "rewards/rejected": -30.231678009033203, "step": 1495 }, { "epoch": 0.2037037037037037, "grad_norm": 48.06516706217952, "learning_rate": 7.741127155073377e-07, "logits/chosen": 7.933136940002441, "logits/rejected": 8.750874519348145, "logps/chosen": -2.7490882873535156, "logps/rejected": -2.871457815170288, "loss": 4.2572, "rewards/accuracies": 0.25, "rewards/chosen": -27.490882873535156, "rewards/margins": 1.2236948013305664, "rewards/rejected": -28.71457862854004, "step": 1496 }, { "epoch": 0.20383986928104575, "grad_norm": 46.92876813360535, "learning_rate": 7.740453818575439e-07, "logits/chosen": 10.107250213623047, "logits/rejected": 10.743568420410156, "logps/chosen": -2.5579569339752197, "logps/rejected": -2.882481098175049, "loss": 3.9587, "rewards/accuracies": 0.75, "rewards/chosen": -25.57956886291504, "rewards/margins": 3.2452430725097656, "rewards/rejected": -28.824811935424805, "step": 1497 }, { "epoch": 0.2039760348583878, "grad_norm": 53.69328397687956, "learning_rate": 7.739779636891041e-07, "logits/chosen": 9.468549728393555, "logits/rejected": 10.51112174987793, "logps/chosen": -2.724238395690918, "logps/rejected": -3.1025028228759766, "loss": 4.3422, "rewards/accuracies": 0.75, "rewards/chosen": -27.242385864257812, "rewards/margins": 3.7826437950134277, "rewards/rejected": -31.025028228759766, "step": 1498 }, { "epoch": 0.20411220043572983, "grad_norm": 44.45824437841151, "learning_rate": 7.739104610172523e-07, "logits/chosen": 9.0387601852417, "logits/rejected": 8.702025413513184, "logps/chosen": -2.8542935848236084, "logps/rejected": -2.7229208946228027, "loss": 4.0616, "rewards/accuracies": 0.5, "rewards/chosen": -28.542936325073242, "rewards/margins": -1.3137245178222656, "rewards/rejected": -27.229209899902344, "step": 1499 }, { "epoch": 0.2042483660130719, "grad_norm": 41.57371850310257, "learning_rate": 7.738428738572409e-07, "logits/chosen": 9.692127227783203, "logits/rejected": 9.367395401000977, "logps/chosen": -2.780317783355713, "logps/rejected": -3.0381460189819336, "loss": 3.6742, "rewards/accuracies": 0.75, "rewards/chosen": -27.803176879882812, "rewards/margins": 2.578282356262207, "rewards/rejected": -30.381458282470703, "step": 1500 }, { "epoch": 0.20438453159041395, "grad_norm": 44.67578663814388, "learning_rate": 7.73775202224342e-07, "logits/chosen": 8.556045532226562, "logits/rejected": 8.737282752990723, "logps/chosen": -2.6863980293273926, "logps/rejected": -2.826643943786621, "loss": 4.1833, "rewards/accuracies": 0.75, "rewards/chosen": -26.86397933959961, "rewards/margins": 1.4024605751037598, "rewards/rejected": -28.266437530517578, "step": 1501 }, { "epoch": 0.204520697167756, "grad_norm": 47.72383594280144, "learning_rate": 7.737074461338466e-07, "logits/chosen": 7.316582202911377, "logits/rejected": 10.040130615234375, "logps/chosen": -2.5013904571533203, "logps/rejected": -3.076038360595703, "loss": 4.0163, "rewards/accuracies": 0.75, "rewards/chosen": -25.013904571533203, "rewards/margins": 5.746479511260986, "rewards/rejected": -30.76038360595703, "step": 1502 }, { "epoch": 0.20465686274509803, "grad_norm": 41.1621298423703, "learning_rate": 7.736396056010645e-07, "logits/chosen": 9.57284927368164, "logits/rejected": 9.748025894165039, "logps/chosen": -3.256669521331787, "logps/rejected": -3.1597237586975098, "loss": 4.1291, "rewards/accuracies": 0.5, "rewards/chosen": -32.56669235229492, "rewards/margins": -0.9694547653198242, "rewards/rejected": -31.597238540649414, "step": 1503 }, { "epoch": 0.2047930283224401, "grad_norm": 46.68958692297606, "learning_rate": 7.735716806413249e-07, "logits/chosen": 9.047962188720703, "logits/rejected": 9.456048965454102, "logps/chosen": -3.0808606147766113, "logps/rejected": -3.4306929111480713, "loss": 4.0911, "rewards/accuracies": 0.75, "rewards/chosen": -30.80860710144043, "rewards/margins": 3.498321056365967, "rewards/rejected": -34.30692672729492, "step": 1504 }, { "epoch": 0.20492919389978215, "grad_norm": 45.13107138408783, "learning_rate": 7.735036712699763e-07, "logits/chosen": 7.4150848388671875, "logits/rejected": 8.246355056762695, "logps/chosen": -2.43450665473938, "logps/rejected": -2.612436532974243, "loss": 4.7239, "rewards/accuracies": 0.75, "rewards/chosen": -24.34506607055664, "rewards/margins": 1.7792987823486328, "rewards/rejected": -26.124364852905273, "step": 1505 }, { "epoch": 0.20506535947712418, "grad_norm": 40.83513124766465, "learning_rate": 7.734355775023856e-07, "logits/chosen": 7.402055740356445, "logits/rejected": 8.505611419677734, "logps/chosen": -2.711662769317627, "logps/rejected": -2.9735450744628906, "loss": 4.2562, "rewards/accuracies": 0.75, "rewards/chosen": -27.11663055419922, "rewards/margins": 2.6188206672668457, "rewards/rejected": -29.735450744628906, "step": 1506 }, { "epoch": 0.20520152505446623, "grad_norm": 43.50546023977554, "learning_rate": 7.733673993539394e-07, "logits/chosen": 10.452285766601562, "logits/rejected": 10.141242980957031, "logps/chosen": -2.8688111305236816, "logps/rejected": -2.7563276290893555, "loss": 4.0439, "rewards/accuracies": 0.5, "rewards/chosen": -28.688114166259766, "rewards/margins": -1.1248393058776855, "rewards/rejected": -27.563274383544922, "step": 1507 }, { "epoch": 0.2053376906318083, "grad_norm": 41.637120233040406, "learning_rate": 7.73299136840043e-07, "logits/chosen": 8.91121768951416, "logits/rejected": 9.56515121459961, "logps/chosen": -2.8861818313598633, "logps/rejected": -3.1739864349365234, "loss": 3.8455, "rewards/accuracies": 0.75, "rewards/chosen": -28.86181640625, "rewards/margins": 2.878046989440918, "rewards/rejected": -31.739866256713867, "step": 1508 }, { "epoch": 0.20547385620915032, "grad_norm": 47.723454898955616, "learning_rate": 7.732307899761209e-07, "logits/chosen": 9.039308547973633, "logits/rejected": 9.877425193786621, "logps/chosen": -2.7111754417419434, "logps/rejected": -2.9312753677368164, "loss": 3.854, "rewards/accuracies": 0.25, "rewards/chosen": -27.11175537109375, "rewards/margins": 2.201000213623047, "rewards/rejected": -29.312755584716797, "step": 1509 }, { "epoch": 0.20561002178649238, "grad_norm": 42.33557325079274, "learning_rate": 7.731623587776167e-07, "logits/chosen": 9.279367446899414, "logits/rejected": 9.887613296508789, "logps/chosen": -3.0156073570251465, "logps/rejected": -3.1096506118774414, "loss": 4.2136, "rewards/accuracies": 0.75, "rewards/chosen": -30.15607452392578, "rewards/margins": 0.9404306411743164, "rewards/rejected": -31.09650421142578, "step": 1510 }, { "epoch": 0.20574618736383443, "grad_norm": 43.89468677715297, "learning_rate": 7.730938432599929e-07, "logits/chosen": 9.243326187133789, "logits/rejected": 7.901089668273926, "logps/chosen": -2.534489154815674, "logps/rejected": -2.426858901977539, "loss": 4.2481, "rewards/accuracies": 0.5, "rewards/chosen": -25.344892501831055, "rewards/margins": -1.0763030052185059, "rewards/rejected": -24.26858901977539, "step": 1511 }, { "epoch": 0.20588235294117646, "grad_norm": 44.12730644877694, "learning_rate": 7.730252434387311e-07, "logits/chosen": 9.79200553894043, "logits/rejected": 9.489849090576172, "logps/chosen": -2.6076555252075195, "logps/rejected": -2.6327810287475586, "loss": 4.1938, "rewards/accuracies": 0.25, "rewards/chosen": -26.076553344726562, "rewards/margins": 0.25125551223754883, "rewards/rejected": -26.327810287475586, "step": 1512 }, { "epoch": 0.20601851851851852, "grad_norm": 44.540745395611566, "learning_rate": 7.729565593293323e-07, "logits/chosen": 9.803686141967773, "logits/rejected": 9.584250450134277, "logps/chosen": -3.0356976985931396, "logps/rejected": -3.1972439289093018, "loss": 4.1424, "rewards/accuracies": 0.75, "rewards/chosen": -30.356975555419922, "rewards/margins": 1.6154627799987793, "rewards/rejected": -31.972440719604492, "step": 1513 }, { "epoch": 0.20615468409586057, "grad_norm": 44.02453022525998, "learning_rate": 7.728877909473159e-07, "logits/chosen": 9.056403160095215, "logits/rejected": 10.060811996459961, "logps/chosen": -2.561628818511963, "logps/rejected": -2.782029628753662, "loss": 4.162, "rewards/accuracies": 0.75, "rewards/chosen": -25.616287231445312, "rewards/margins": 2.204008102416992, "rewards/rejected": -27.820295333862305, "step": 1514 }, { "epoch": 0.2062908496732026, "grad_norm": 52.31531093464424, "learning_rate": 7.728189383082208e-07, "logits/chosen": 10.386983871459961, "logits/rejected": 10.045450210571289, "logps/chosen": -3.138911485671997, "logps/rejected": -3.078035831451416, "loss": 4.2746, "rewards/accuracies": 0.5, "rewards/chosen": -31.389114379882812, "rewards/margins": -0.6087589263916016, "rewards/rejected": -30.780357360839844, "step": 1515 }, { "epoch": 0.20642701525054466, "grad_norm": 46.67854446264733, "learning_rate": 7.727500014276049e-07, "logits/chosen": 9.892107963562012, "logits/rejected": 9.672849655151367, "logps/chosen": -2.374948740005493, "logps/rejected": -2.5813827514648438, "loss": 4.2343, "rewards/accuracies": 0.75, "rewards/chosen": -23.749486923217773, "rewards/margins": 2.0643386840820312, "rewards/rejected": -25.813827514648438, "step": 1516 }, { "epoch": 0.20656318082788672, "grad_norm": 42.3034557676663, "learning_rate": 7.72680980321045e-07, "logits/chosen": 8.714057922363281, "logits/rejected": 10.216775894165039, "logps/chosen": -3.111717462539673, "logps/rejected": -3.270214557647705, "loss": 4.1135, "rewards/accuracies": 0.75, "rewards/chosen": -31.11717414855957, "rewards/margins": 1.5849723815917969, "rewards/rejected": -32.702144622802734, "step": 1517 }, { "epoch": 0.20669934640522875, "grad_norm": 47.94583952116449, "learning_rate": 7.726118750041369e-07, "logits/chosen": 9.36197280883789, "logits/rejected": 10.435904502868652, "logps/chosen": -2.649604320526123, "logps/rejected": -3.0394420623779297, "loss": 3.741, "rewards/accuracies": 1.0, "rewards/chosen": -26.496044158935547, "rewards/margins": 3.89837646484375, "rewards/rejected": -30.394420623779297, "step": 1518 }, { "epoch": 0.2068355119825708, "grad_norm": 48.48918898791553, "learning_rate": 7.725426854924956e-07, "logits/chosen": 9.807392120361328, "logits/rejected": 8.913664817810059, "logps/chosen": -3.2237296104431152, "logps/rejected": -3.101740837097168, "loss": 3.8786, "rewards/accuracies": 0.25, "rewards/chosen": -32.23729705810547, "rewards/margins": -1.219886302947998, "rewards/rejected": -31.01740837097168, "step": 1519 }, { "epoch": 0.20697167755991286, "grad_norm": 45.68863589781404, "learning_rate": 7.72473411801755e-07, "logits/chosen": 9.182605743408203, "logits/rejected": 10.268871307373047, "logps/chosen": -2.824150800704956, "logps/rejected": -3.095468282699585, "loss": 3.8759, "rewards/accuracies": 0.5, "rewards/chosen": -28.241506576538086, "rewards/margins": 2.7131757736206055, "rewards/rejected": -30.954681396484375, "step": 1520 }, { "epoch": 0.20710784313725492, "grad_norm": 39.587220398847464, "learning_rate": 7.724040539475683e-07, "logits/chosen": 9.704468727111816, "logits/rejected": 9.808188438415527, "logps/chosen": -3.0432193279266357, "logps/rejected": -3.230257034301758, "loss": 3.5322, "rewards/accuracies": 0.75, "rewards/chosen": -30.432193756103516, "rewards/margins": 1.8703765869140625, "rewards/rejected": -32.30256652832031, "step": 1521 }, { "epoch": 0.20724400871459694, "grad_norm": 47.05782262711256, "learning_rate": 7.723346119456072e-07, "logits/chosen": 8.721800804138184, "logits/rejected": 9.232776641845703, "logps/chosen": -2.9015002250671387, "logps/rejected": -2.9910287857055664, "loss": 4.3443, "rewards/accuracies": 0.75, "rewards/chosen": -29.015003204345703, "rewards/margins": 0.8952856063842773, "rewards/rejected": -29.910287857055664, "step": 1522 }, { "epoch": 0.207380174291939, "grad_norm": 44.0390269766141, "learning_rate": 7.722650858115628e-07, "logits/chosen": 10.714138984680176, "logits/rejected": 10.900224685668945, "logps/chosen": -3.542597532272339, "logps/rejected": -3.960082530975342, "loss": 3.627, "rewards/accuracies": 0.75, "rewards/chosen": -35.42597579956055, "rewards/margins": 4.174849510192871, "rewards/rejected": -39.600826263427734, "step": 1523 }, { "epoch": 0.20751633986928106, "grad_norm": 49.755396114465306, "learning_rate": 7.72195475561145e-07, "logits/chosen": 10.039295196533203, "logits/rejected": 10.056184768676758, "logps/chosen": -3.3620765209198, "logps/rejected": -3.3816421031951904, "loss": 4.4728, "rewards/accuracies": 0.5, "rewards/chosen": -33.620765686035156, "rewards/margins": 0.19565677642822266, "rewards/rejected": -33.81642150878906, "step": 1524 }, { "epoch": 0.2076525054466231, "grad_norm": 48.11164138834254, "learning_rate": 7.72125781210083e-07, "logits/chosen": 9.14140510559082, "logits/rejected": 9.753700256347656, "logps/chosen": -2.966533660888672, "logps/rejected": -3.0037319660186768, "loss": 4.1676, "rewards/accuracies": 0.5, "rewards/chosen": -29.66533660888672, "rewards/margins": 0.3719825744628906, "rewards/rejected": -30.03731918334961, "step": 1525 }, { "epoch": 0.20778867102396514, "grad_norm": 50.804369277153306, "learning_rate": 7.720560027741246e-07, "logits/chosen": 8.674100875854492, "logits/rejected": 8.793551445007324, "logps/chosen": -3.0142390727996826, "logps/rejected": -3.0013420581817627, "loss": 3.8415, "rewards/accuracies": 0.25, "rewards/chosen": -30.142391204833984, "rewards/margins": -0.12897062301635742, "rewards/rejected": -30.01342010498047, "step": 1526 }, { "epoch": 0.2079248366013072, "grad_norm": 47.5694574391649, "learning_rate": 7.71986140269037e-07, "logits/chosen": 9.32482624053955, "logits/rejected": 10.755514144897461, "logps/chosen": -2.864231586456299, "logps/rejected": -3.227268695831299, "loss": 4.2507, "rewards/accuracies": 0.75, "rewards/chosen": -28.642314910888672, "rewards/margins": 3.6303720474243164, "rewards/rejected": -32.27268981933594, "step": 1527 }, { "epoch": 0.20806100217864923, "grad_norm": 58.24957138257525, "learning_rate": 7.719161937106062e-07, "logits/chosen": 10.7779541015625, "logits/rejected": 11.93337631225586, "logps/chosen": -3.03326416015625, "logps/rejected": -3.2769522666931152, "loss": 4.1688, "rewards/accuracies": 0.75, "rewards/chosen": -30.3326416015625, "rewards/margins": 2.436880111694336, "rewards/rejected": -32.76952362060547, "step": 1528 }, { "epoch": 0.20819716775599129, "grad_norm": 57.17290049076445, "learning_rate": 7.71846163114637e-07, "logits/chosen": 10.117679595947266, "logits/rejected": 10.296953201293945, "logps/chosen": -3.0367612838745117, "logps/rejected": -2.9579248428344727, "loss": 3.507, "rewards/accuracies": 0.5, "rewards/chosen": -30.367612838745117, "rewards/margins": -0.7883648872375488, "rewards/rejected": -29.579248428344727, "step": 1529 }, { "epoch": 0.20833333333333334, "grad_norm": 45.73969599852763, "learning_rate": 7.717760484969536e-07, "logits/chosen": 10.189569473266602, "logits/rejected": 10.15321159362793, "logps/chosen": -3.6180505752563477, "logps/rejected": -3.5061123371124268, "loss": 4.1038, "rewards/accuracies": 0.75, "rewards/chosen": -36.180503845214844, "rewards/margins": -1.1193819046020508, "rewards/rejected": -35.061126708984375, "step": 1530 }, { "epoch": 0.20846949891067537, "grad_norm": 51.48288096211662, "learning_rate": 7.71705849873399e-07, "logits/chosen": 9.41354751586914, "logits/rejected": 9.000357627868652, "logps/chosen": -2.8980579376220703, "logps/rejected": -2.810690402984619, "loss": 3.6829, "rewards/accuracies": 0.5, "rewards/chosen": -28.980581283569336, "rewards/margins": -0.8736767768859863, "rewards/rejected": -28.106904983520508, "step": 1531 }, { "epoch": 0.20860566448801743, "grad_norm": 45.143839675084074, "learning_rate": 7.716355672598349e-07, "logits/chosen": 8.725870132446289, "logits/rejected": 10.026203155517578, "logps/chosen": -3.1705563068389893, "logps/rejected": -3.0087692737579346, "loss": 4.2093, "rewards/accuracies": 0.5, "rewards/chosen": -31.705564498901367, "rewards/margins": -1.6178722381591797, "rewards/rejected": -30.087692260742188, "step": 1532 }, { "epoch": 0.20874183006535948, "grad_norm": 44.82019166236812, "learning_rate": 7.715652006721425e-07, "logits/chosen": 7.388460636138916, "logits/rejected": 7.609871864318848, "logps/chosen": -2.877157211303711, "logps/rejected": -2.9936611652374268, "loss": 3.3407, "rewards/accuracies": 0.75, "rewards/chosen": -28.77157211303711, "rewards/margins": 1.1650400161743164, "rewards/rejected": -29.936613082885742, "step": 1533 }, { "epoch": 0.2088779956427015, "grad_norm": 45.84741489944219, "learning_rate": 7.714947501262216e-07, "logits/chosen": 10.253591537475586, "logits/rejected": 10.838438034057617, "logps/chosen": -3.1272034645080566, "logps/rejected": -3.230996608734131, "loss": 4.2128, "rewards/accuracies": 0.5, "rewards/chosen": -31.27203369140625, "rewards/margins": 1.037933349609375, "rewards/rejected": -32.309967041015625, "step": 1534 }, { "epoch": 0.20901416122004357, "grad_norm": 44.296403622159424, "learning_rate": 7.714242156379911e-07, "logits/chosen": 9.87187671661377, "logits/rejected": 10.889223098754883, "logps/chosen": -2.9971256256103516, "logps/rejected": -3.129911184310913, "loss": 4.1267, "rewards/accuracies": 0.5, "rewards/chosen": -29.97125816345215, "rewards/margins": 1.3278536796569824, "rewards/rejected": -31.299110412597656, "step": 1535 }, { "epoch": 0.20915032679738563, "grad_norm": 48.15176436775785, "learning_rate": 7.713535972233889e-07, "logits/chosen": 8.995546340942383, "logits/rejected": 9.977904319763184, "logps/chosen": -2.722032070159912, "logps/rejected": -3.0239291191101074, "loss": 3.5803, "rewards/accuracies": 0.75, "rewards/chosen": -27.220319747924805, "rewards/margins": 3.0189719200134277, "rewards/rejected": -30.23929214477539, "step": 1536 }, { "epoch": 0.20928649237472766, "grad_norm": 65.84049497131903, "learning_rate": 7.712828948983717e-07, "logits/chosen": 10.194751739501953, "logits/rejected": 9.867254257202148, "logps/chosen": -2.8534154891967773, "logps/rejected": -2.7652907371520996, "loss": 4.5972, "rewards/accuracies": 0.75, "rewards/chosen": -28.534156799316406, "rewards/margins": -0.8812494277954102, "rewards/rejected": -27.65290641784668, "step": 1537 }, { "epoch": 0.2094226579520697, "grad_norm": 42.08194651970758, "learning_rate": 7.712121086789154e-07, "logits/chosen": 10.224977493286133, "logits/rejected": 10.070632934570312, "logps/chosen": -2.8487751483917236, "logps/rejected": -3.0619938373565674, "loss": 3.4351, "rewards/accuracies": 0.75, "rewards/chosen": -28.48775291442871, "rewards/margins": 2.1321849822998047, "rewards/rejected": -30.619937896728516, "step": 1538 }, { "epoch": 0.20955882352941177, "grad_norm": 50.33544815035252, "learning_rate": 7.711412385810146e-07, "logits/chosen": 7.218438625335693, "logits/rejected": 7.250033378601074, "logps/chosen": -2.668710231781006, "logps/rejected": -2.5074710845947266, "loss": 3.6264, "rewards/accuracies": 0.5, "rewards/chosen": -26.687101364135742, "rewards/margins": -1.6123909950256348, "rewards/rejected": -25.074710845947266, "step": 1539 }, { "epoch": 0.20969498910675383, "grad_norm": 53.378091557937296, "learning_rate": 7.710702846206832e-07, "logits/chosen": 10.01108169555664, "logits/rejected": 9.943954467773438, "logps/chosen": -3.160529136657715, "logps/rejected": -3.355512857437134, "loss": 4.1318, "rewards/accuracies": 1.0, "rewards/chosen": -31.60529327392578, "rewards/margins": 1.9498376846313477, "rewards/rejected": -33.55513000488281, "step": 1540 }, { "epoch": 0.20983115468409586, "grad_norm": 51.67635532464326, "learning_rate": 7.709992468139536e-07, "logits/chosen": 9.43004035949707, "logits/rejected": 9.067878723144531, "logps/chosen": -2.6687469482421875, "logps/rejected": -3.1250882148742676, "loss": 3.9412, "rewards/accuracies": 1.0, "rewards/chosen": -26.687469482421875, "rewards/margins": 4.563412189483643, "rewards/rejected": -31.25088119506836, "step": 1541 }, { "epoch": 0.2099673202614379, "grad_norm": 51.288748657909125, "learning_rate": 7.709281251768774e-07, "logits/chosen": 9.742925643920898, "logits/rejected": 10.8892822265625, "logps/chosen": -2.816255807876587, "logps/rejected": -3.0753555297851562, "loss": 4.0123, "rewards/accuracies": 0.75, "rewards/chosen": -28.162559509277344, "rewards/margins": 2.5909957885742188, "rewards/rejected": -30.753555297851562, "step": 1542 }, { "epoch": 0.21010348583877997, "grad_norm": 53.55314318595173, "learning_rate": 7.708569197255252e-07, "logits/chosen": 9.371946334838867, "logits/rejected": 10.526754379272461, "logps/chosen": -2.9112939834594727, "logps/rejected": -2.9439144134521484, "loss": 4.8486, "rewards/accuracies": 0.5, "rewards/chosen": -29.11294174194336, "rewards/margins": 0.3262052536010742, "rewards/rejected": -29.439146041870117, "step": 1543 }, { "epoch": 0.210239651416122, "grad_norm": 55.02814099140027, "learning_rate": 7.707856304759865e-07, "logits/chosen": 9.330326080322266, "logits/rejected": 10.115194320678711, "logps/chosen": -2.7048563957214355, "logps/rejected": -3.112196922302246, "loss": 4.3643, "rewards/accuracies": 0.75, "rewards/chosen": -27.048564910888672, "rewards/margins": 4.07340669631958, "rewards/rejected": -31.121971130371094, "step": 1544 }, { "epoch": 0.21037581699346405, "grad_norm": 52.074266368203645, "learning_rate": 7.707142574443697e-07, "logits/chosen": 10.321742057800293, "logits/rejected": 10.251505851745605, "logps/chosen": -3.083007335662842, "logps/rejected": -3.080840587615967, "loss": 4.4437, "rewards/accuracies": 0.5, "rewards/chosen": -30.830076217651367, "rewards/margins": -0.021669864654541016, "rewards/rejected": -30.80840492248535, "step": 1545 }, { "epoch": 0.2105119825708061, "grad_norm": 52.08522987535709, "learning_rate": 7.706428006468021e-07, "logits/chosen": 8.61694049835205, "logits/rejected": 8.688776969909668, "logps/chosen": -2.6617093086242676, "logps/rejected": -2.773793935775757, "loss": 4.3269, "rewards/accuracies": 1.0, "rewards/chosen": -26.61709213256836, "rewards/margins": 1.1208477020263672, "rewards/rejected": -27.737937927246094, "step": 1546 }, { "epoch": 0.21064814814814814, "grad_norm": 49.510974681111755, "learning_rate": 7.705712600994297e-07, "logits/chosen": 9.531960487365723, "logits/rejected": 8.290363311767578, "logps/chosen": -2.254295587539673, "logps/rejected": -2.3517189025878906, "loss": 4.1566, "rewards/accuracies": 0.75, "rewards/chosen": -22.542957305908203, "rewards/margins": 0.9742321968078613, "rewards/rejected": -23.517189025878906, "step": 1547 }, { "epoch": 0.2107843137254902, "grad_norm": 60.02654535781318, "learning_rate": 7.704996358184182e-07, "logits/chosen": 9.242998123168945, "logits/rejected": 9.746833801269531, "logps/chosen": -2.9103760719299316, "logps/rejected": -3.0895237922668457, "loss": 5.2086, "rewards/accuracies": 0.5, "rewards/chosen": -29.103761672973633, "rewards/margins": 1.7914738655090332, "rewards/rejected": -30.89523696899414, "step": 1548 }, { "epoch": 0.21092047930283225, "grad_norm": 48.68923384971629, "learning_rate": 7.704279278199512e-07, "logits/chosen": 8.13200569152832, "logits/rejected": 9.943814277648926, "logps/chosen": -2.613844394683838, "logps/rejected": -2.797645092010498, "loss": 4.2075, "rewards/accuracies": 0.75, "rewards/chosen": -26.138444900512695, "rewards/margins": 1.8380050659179688, "rewards/rejected": -27.97644805908203, "step": 1549 }, { "epoch": 0.21105664488017428, "grad_norm": 45.77729946380763, "learning_rate": 7.703561361202321e-07, "logits/chosen": 9.962621688842773, "logits/rejected": 11.278772354125977, "logps/chosen": -3.4043421745300293, "logps/rejected": -3.6741909980773926, "loss": 4.4333, "rewards/accuracies": 0.75, "rewards/chosen": -34.043426513671875, "rewards/margins": 2.6984872817993164, "rewards/rejected": -36.741912841796875, "step": 1550 }, { "epoch": 0.21119281045751634, "grad_norm": 60.27474368492548, "learning_rate": 7.702842607354826e-07, "logits/chosen": 11.144126892089844, "logits/rejected": 10.219108581542969, "logps/chosen": -3.2045021057128906, "logps/rejected": -3.0659143924713135, "loss": 4.7176, "rewards/accuracies": 0.5, "rewards/chosen": -32.045021057128906, "rewards/margins": -1.3858757019042969, "rewards/rejected": -30.659143447875977, "step": 1551 }, { "epoch": 0.2113289760348584, "grad_norm": 47.716091284797464, "learning_rate": 7.702123016819435e-07, "logits/chosen": 10.375082969665527, "logits/rejected": 9.876230239868164, "logps/chosen": -2.921783208847046, "logps/rejected": -2.9222750663757324, "loss": 4.5467, "rewards/accuracies": 0.75, "rewards/chosen": -29.217832565307617, "rewards/margins": 0.004917621612548828, "rewards/rejected": -29.22275161743164, "step": 1552 }, { "epoch": 0.21146514161220042, "grad_norm": 43.30117820565079, "learning_rate": 7.701402589758747e-07, "logits/chosen": 10.633543968200684, "logits/rejected": 11.400642395019531, "logps/chosen": -3.1097025871276855, "logps/rejected": -3.2519354820251465, "loss": 4.1276, "rewards/accuracies": 0.75, "rewards/chosen": -31.097026824951172, "rewards/margins": 1.4223265647888184, "rewards/rejected": -32.519351959228516, "step": 1553 }, { "epoch": 0.21160130718954248, "grad_norm": 54.07825628166811, "learning_rate": 7.700681326335547e-07, "logits/chosen": 9.646444320678711, "logits/rejected": 10.180931091308594, "logps/chosen": -2.8027234077453613, "logps/rejected": -3.0033175945281982, "loss": 3.678, "rewards/accuracies": 0.5, "rewards/chosen": -28.027233123779297, "rewards/margins": 2.005941390991211, "rewards/rejected": -30.03317642211914, "step": 1554 }, { "epoch": 0.21173747276688454, "grad_norm": 45.668663362042125, "learning_rate": 7.699959226712812e-07, "logits/chosen": 9.379344940185547, "logits/rejected": 9.927227020263672, "logps/chosen": -2.605499744415283, "logps/rejected": -2.7804951667785645, "loss": 4.5379, "rewards/accuracies": 0.5, "rewards/chosen": -26.054996490478516, "rewards/margins": 1.7499537467956543, "rewards/rejected": -27.804950714111328, "step": 1555 }, { "epoch": 0.21187363834422657, "grad_norm": 43.475658572147346, "learning_rate": 7.699236291053705e-07, "logits/chosen": 8.906113624572754, "logits/rejected": 9.77227783203125, "logps/chosen": -2.6137561798095703, "logps/rejected": -3.001037120819092, "loss": 4.3563, "rewards/accuracies": 0.75, "rewards/chosen": -26.137561798095703, "rewards/margins": 3.8728113174438477, "rewards/rejected": -30.010372161865234, "step": 1556 }, { "epoch": 0.21200980392156862, "grad_norm": 45.88572133182712, "learning_rate": 7.698512519521579e-07, "logits/chosen": 10.785995483398438, "logits/rejected": 11.45409107208252, "logps/chosen": -3.4210398197174072, "logps/rejected": -3.3872270584106445, "loss": 4.6721, "rewards/accuracies": 0.5, "rewards/chosen": -34.21039581298828, "rewards/margins": -0.33812856674194336, "rewards/rejected": -33.87226867675781, "step": 1557 }, { "epoch": 0.21214596949891068, "grad_norm": 38.97913671425377, "learning_rate": 7.697787912279977e-07, "logits/chosen": 9.455667495727539, "logits/rejected": 10.793655395507812, "logps/chosen": -2.8360443115234375, "logps/rejected": -3.1333441734313965, "loss": 3.9259, "rewards/accuracies": 0.5, "rewards/chosen": -28.360443115234375, "rewards/margins": 2.972996711730957, "rewards/rejected": -31.33344078063965, "step": 1558 }, { "epoch": 0.21228213507625274, "grad_norm": 42.37434122278926, "learning_rate": 7.697062469492632e-07, "logits/chosen": 10.489158630371094, "logits/rejected": 11.437152862548828, "logps/chosen": -2.873065233230591, "logps/rejected": -3.0749759674072266, "loss": 3.8641, "rewards/accuracies": 0.75, "rewards/chosen": -28.730653762817383, "rewards/margins": 2.019106388092041, "rewards/rejected": -30.749757766723633, "step": 1559 }, { "epoch": 0.21241830065359477, "grad_norm": 48.436183560398106, "learning_rate": 7.69633619132346e-07, "logits/chosen": 10.19582748413086, "logits/rejected": 10.546493530273438, "logps/chosen": -3.0797841548919678, "logps/rejected": -3.4544029235839844, "loss": 4.2234, "rewards/accuracies": 1.0, "rewards/chosen": -30.797840118408203, "rewards/margins": 3.7461862564086914, "rewards/rejected": -34.544029235839844, "step": 1560 }, { "epoch": 0.21255446623093682, "grad_norm": 44.8678530491253, "learning_rate": 7.695609077936572e-07, "logits/chosen": 9.697696685791016, "logits/rejected": 10.367888450622559, "logps/chosen": -2.97361421585083, "logps/rejected": -3.3600125312805176, "loss": 3.9073, "rewards/accuracies": 0.75, "rewards/chosen": -29.736141204833984, "rewards/margins": 3.8639841079711914, "rewards/rejected": -33.60012435913086, "step": 1561 }, { "epoch": 0.21269063180827888, "grad_norm": 44.13068125119902, "learning_rate": 7.694881129496265e-07, "logits/chosen": 11.561951637268066, "logits/rejected": 10.896025657653809, "logps/chosen": -2.873722553253174, "logps/rejected": -2.9358670711517334, "loss": 4.0654, "rewards/accuracies": 0.5, "rewards/chosen": -28.737224578857422, "rewards/margins": 0.6214456558227539, "rewards/rejected": -29.35866928100586, "step": 1562 }, { "epoch": 0.2128267973856209, "grad_norm": 52.37383493086172, "learning_rate": 7.694152346167024e-07, "logits/chosen": 11.03155517578125, "logits/rejected": 9.596665382385254, "logps/chosen": -3.0076661109924316, "logps/rejected": -2.8220934867858887, "loss": 4.2342, "rewards/accuracies": 0.25, "rewards/chosen": -30.07666015625, "rewards/margins": -1.855727195739746, "rewards/rejected": -28.220932006835938, "step": 1563 }, { "epoch": 0.21296296296296297, "grad_norm": 43.433080471591, "learning_rate": 7.693422728113524e-07, "logits/chosen": 9.269872665405273, "logits/rejected": 9.84201431274414, "logps/chosen": -2.477376699447632, "logps/rejected": -2.861974000930786, "loss": 3.885, "rewards/accuracies": 1.0, "rewards/chosen": -24.773765563964844, "rewards/margins": 3.8459715843200684, "rewards/rejected": -28.619739532470703, "step": 1564 }, { "epoch": 0.21309912854030502, "grad_norm": 64.22388362165802, "learning_rate": 7.69269227550063e-07, "logits/chosen": 10.586523056030273, "logits/rejected": 9.999717712402344, "logps/chosen": -3.4214975833892822, "logps/rejected": -3.3819141387939453, "loss": 4.8781, "rewards/accuracies": 0.25, "rewards/chosen": -34.21497344970703, "rewards/margins": -0.39583253860473633, "rewards/rejected": -33.81914520263672, "step": 1565 }, { "epoch": 0.21323529411764705, "grad_norm": 39.51665088292218, "learning_rate": 7.691960988493391e-07, "logits/chosen": 9.82270622253418, "logits/rejected": 11.60020637512207, "logps/chosen": -3.079766035079956, "logps/rejected": -3.5527899265289307, "loss": 4.4508, "rewards/accuracies": 1.0, "rewards/chosen": -30.79766082763672, "rewards/margins": 4.730239391326904, "rewards/rejected": -35.52790069580078, "step": 1566 }, { "epoch": 0.2133714596949891, "grad_norm": 45.6717191618595, "learning_rate": 7.691228867257049e-07, "logits/chosen": 11.09040355682373, "logits/rejected": 10.990453720092773, "logps/chosen": -2.6527934074401855, "logps/rejected": -2.7908847332000732, "loss": 4.1048, "rewards/accuracies": 0.75, "rewards/chosen": -26.527931213378906, "rewards/margins": 1.3809146881103516, "rewards/rejected": -27.90884780883789, "step": 1567 }, { "epoch": 0.21350762527233116, "grad_norm": 52.95723069172379, "learning_rate": 7.690495911957032e-07, "logits/chosen": 10.723564147949219, "logits/rejected": 10.962862014770508, "logps/chosen": -2.9556798934936523, "logps/rejected": -2.9450912475585938, "loss": 4.4307, "rewards/accuracies": 0.25, "rewards/chosen": -29.55679702758789, "rewards/margins": -0.10588455200195312, "rewards/rejected": -29.45091438293457, "step": 1568 }, { "epoch": 0.2136437908496732, "grad_norm": 40.28997968755801, "learning_rate": 7.689762122758959e-07, "logits/chosen": 9.807342529296875, "logits/rejected": 9.859928131103516, "logps/chosen": -3.0566468238830566, "logps/rejected": -3.10284686088562, "loss": 4.0158, "rewards/accuracies": 0.5, "rewards/chosen": -30.56646728515625, "rewards/margins": 0.4620018005371094, "rewards/rejected": -31.028470993041992, "step": 1569 }, { "epoch": 0.21377995642701525, "grad_norm": 48.77677746945588, "learning_rate": 7.689027499828632e-07, "logits/chosen": 12.077262878417969, "logits/rejected": 11.707015037536621, "logps/chosen": -3.4423892498016357, "logps/rejected": -3.2451069355010986, "loss": 4.238, "rewards/accuracies": 0.5, "rewards/chosen": -34.42388916015625, "rewards/margins": -1.9728240966796875, "rewards/rejected": -32.45106887817383, "step": 1570 }, { "epoch": 0.2139161220043573, "grad_norm": 47.870789202762886, "learning_rate": 7.68829204333205e-07, "logits/chosen": 10.53920841217041, "logits/rejected": 11.307806015014648, "logps/chosen": -3.221538782119751, "logps/rejected": -3.15159273147583, "loss": 3.6363, "rewards/accuracies": 0.5, "rewards/chosen": -32.21538543701172, "rewards/margins": -0.6994595527648926, "rewards/rejected": -31.515928268432617, "step": 1571 }, { "epoch": 0.21405228758169934, "grad_norm": 51.82163645805322, "learning_rate": 7.687555753435391e-07, "logits/chosen": 12.079962730407715, "logits/rejected": 12.149277687072754, "logps/chosen": -3.1449806690216064, "logps/rejected": -3.3113508224487305, "loss": 4.4786, "rewards/accuracies": 0.75, "rewards/chosen": -31.449806213378906, "rewards/margins": 1.663701057434082, "rewards/rejected": -33.11351013183594, "step": 1572 }, { "epoch": 0.2141884531590414, "grad_norm": 45.04829859850359, "learning_rate": 7.686818630305029e-07, "logits/chosen": 10.925690650939941, "logits/rejected": 11.579972267150879, "logps/chosen": -2.9774346351623535, "logps/rejected": -3.033027172088623, "loss": 4.3888, "rewards/accuracies": 0.75, "rewards/chosen": -29.77434539794922, "rewards/margins": 0.5559267997741699, "rewards/rejected": -30.33027458190918, "step": 1573 }, { "epoch": 0.21432461873638345, "grad_norm": 76.84230009112066, "learning_rate": 7.686080674107522e-07, "logits/chosen": 11.662179946899414, "logits/rejected": 11.946144104003906, "logps/chosen": -3.227997064590454, "logps/rejected": -3.6523854732513428, "loss": 4.323, "rewards/accuracies": 0.75, "rewards/chosen": -32.27996826171875, "rewards/margins": 4.2438836097717285, "rewards/rejected": -36.52385711669922, "step": 1574 }, { "epoch": 0.21446078431372548, "grad_norm": 45.35839926626292, "learning_rate": 7.685341885009617e-07, "logits/chosen": 10.08857250213623, "logits/rejected": 11.014826774597168, "logps/chosen": -2.7975378036499023, "logps/rejected": -3.1170408725738525, "loss": 4.5713, "rewards/accuracies": 0.75, "rewards/chosen": -27.975378036499023, "rewards/margins": 3.1950302124023438, "rewards/rejected": -31.17041015625, "step": 1575 }, { "epoch": 0.21459694989106753, "grad_norm": 45.474241792761234, "learning_rate": 7.68460226317825e-07, "logits/chosen": 10.086790084838867, "logits/rejected": 10.060630798339844, "logps/chosen": -2.8086307048797607, "logps/rejected": -2.7529537677764893, "loss": 4.2224, "rewards/accuracies": 0.5, "rewards/chosen": -28.086307525634766, "rewards/margins": -0.5567693710327148, "rewards/rejected": -27.529537200927734, "step": 1576 }, { "epoch": 0.2147331154684096, "grad_norm": 44.73419416853653, "learning_rate": 7.683861808780544e-07, "logits/chosen": 10.079610824584961, "logits/rejected": 11.767459869384766, "logps/chosen": -3.130331039428711, "logps/rejected": -3.3101999759674072, "loss": 4.4504, "rewards/accuracies": 0.75, "rewards/chosen": -31.30331039428711, "rewards/margins": 1.7986879348754883, "rewards/rejected": -33.10199737548828, "step": 1577 }, { "epoch": 0.21486928104575165, "grad_norm": 112.46061016349007, "learning_rate": 7.683120521983813e-07, "logits/chosen": 10.654108047485352, "logits/rejected": 10.959383010864258, "logps/chosen": -2.884528160095215, "logps/rejected": -2.724503993988037, "loss": 4.6938, "rewards/accuracies": 0.25, "rewards/chosen": -28.84528160095215, "rewards/margins": -1.6002397537231445, "rewards/rejected": -27.245040893554688, "step": 1578 }, { "epoch": 0.21500544662309368, "grad_norm": 50.575978120291666, "learning_rate": 7.682378402955553e-07, "logits/chosen": 11.97059440612793, "logits/rejected": 12.24110221862793, "logps/chosen": -3.0862483978271484, "logps/rejected": -3.0775792598724365, "loss": 4.3238, "rewards/accuracies": 0.5, "rewards/chosen": -30.86248207092285, "rewards/margins": -0.08668947219848633, "rewards/rejected": -30.775793075561523, "step": 1579 }, { "epoch": 0.21514161220043573, "grad_norm": 79.21635652603608, "learning_rate": 7.681635451863455e-07, "logits/chosen": 11.106842041015625, "logits/rejected": 10.92310619354248, "logps/chosen": -2.8555612564086914, "logps/rejected": -2.969874143600464, "loss": 4.1445, "rewards/accuracies": 0.75, "rewards/chosen": -28.555612564086914, "rewards/margins": 1.1431283950805664, "rewards/rejected": -29.698741912841797, "step": 1580 }, { "epoch": 0.2152777777777778, "grad_norm": 45.16067877030821, "learning_rate": 7.680891668875393e-07, "logits/chosen": 10.230786323547363, "logits/rejected": 10.919756889343262, "logps/chosen": -3.2294979095458984, "logps/rejected": -3.1828160285949707, "loss": 4.6577, "rewards/accuracies": 0.25, "rewards/chosen": -32.294979095458984, "rewards/margins": -0.46682071685791016, "rewards/rejected": -31.82815933227539, "step": 1581 }, { "epoch": 0.21541394335511982, "grad_norm": 41.485358457022464, "learning_rate": 7.680147054159432e-07, "logits/chosen": 10.935575485229492, "logits/rejected": 11.133050918579102, "logps/chosen": -2.813828706741333, "logps/rejected": -3.216252565383911, "loss": 3.9354, "rewards/accuracies": 0.75, "rewards/chosen": -28.138288497924805, "rewards/margins": 4.024238586425781, "rewards/rejected": -32.16252517700195, "step": 1582 }, { "epoch": 0.21555010893246188, "grad_norm": 44.375395738203395, "learning_rate": 7.679401607883825e-07, "logits/chosen": 10.791585922241211, "logits/rejected": 11.027165412902832, "logps/chosen": -2.763566017150879, "logps/rejected": -2.9322116374969482, "loss": 4.4154, "rewards/accuracies": 0.75, "rewards/chosen": -27.635662078857422, "rewards/margins": 1.6864547729492188, "rewards/rejected": -29.32211685180664, "step": 1583 }, { "epoch": 0.21568627450980393, "grad_norm": 42.86530217836519, "learning_rate": 7.678655330217008e-07, "logits/chosen": 9.958681106567383, "logits/rejected": 9.775585174560547, "logps/chosen": -2.837013006210327, "logps/rejected": -2.953031539916992, "loss": 4.3272, "rewards/accuracies": 0.75, "rewards/chosen": -28.37013053894043, "rewards/margins": 1.1601839065551758, "rewards/rejected": -29.530315399169922, "step": 1584 }, { "epoch": 0.21582244008714596, "grad_norm": 55.976384251212664, "learning_rate": 7.677908221327614e-07, "logits/chosen": 11.209444046020508, "logits/rejected": 10.584822654724121, "logps/chosen": -3.0021064281463623, "logps/rejected": -3.149989128112793, "loss": 4.4094, "rewards/accuracies": 0.75, "rewards/chosen": -30.02106285095215, "rewards/margins": 1.4788284301757812, "rewards/rejected": -31.499893188476562, "step": 1585 }, { "epoch": 0.21595860566448802, "grad_norm": 41.32097353081319, "learning_rate": 7.677160281384454e-07, "logits/chosen": 9.941484451293945, "logits/rejected": 10.140827178955078, "logps/chosen": -2.7566676139831543, "logps/rejected": -2.905017375946045, "loss": 3.7954, "rewards/accuracies": 0.5, "rewards/chosen": -27.56667709350586, "rewards/margins": 1.4834961891174316, "rewards/rejected": -29.0501708984375, "step": 1586 }, { "epoch": 0.21609477124183007, "grad_norm": 47.46400015614144, "learning_rate": 7.676411510556532e-07, "logits/chosen": 9.876925468444824, "logits/rejected": 10.383536338806152, "logps/chosen": -3.0716381072998047, "logps/rejected": -3.2048983573913574, "loss": 4.1595, "rewards/accuracies": 0.75, "rewards/chosen": -30.716379165649414, "rewards/margins": 1.3326044082641602, "rewards/rejected": -32.04898452758789, "step": 1587 }, { "epoch": 0.2162309368191721, "grad_norm": 44.385089729442434, "learning_rate": 7.675661909013041e-07, "logits/chosen": 9.471830368041992, "logits/rejected": 9.919109344482422, "logps/chosen": -2.881720542907715, "logps/rejected": -3.1558403968811035, "loss": 4.5198, "rewards/accuracies": 1.0, "rewards/chosen": -28.81720542907715, "rewards/margins": 2.7411980628967285, "rewards/rejected": -31.55840301513672, "step": 1588 }, { "epoch": 0.21636710239651416, "grad_norm": 47.68462536209936, "learning_rate": 7.674911476923358e-07, "logits/chosen": 9.956418991088867, "logits/rejected": 10.419933319091797, "logps/chosen": -2.768460273742676, "logps/rejected": -3.2109789848327637, "loss": 3.9858, "rewards/accuracies": 0.75, "rewards/chosen": -27.68460464477539, "rewards/margins": 4.4251861572265625, "rewards/rejected": -32.10979080200195, "step": 1589 }, { "epoch": 0.21650326797385622, "grad_norm": 45.10661056385973, "learning_rate": 7.674160214457049e-07, "logits/chosen": 10.881233215332031, "logits/rejected": 11.441486358642578, "logps/chosen": -2.843858480453491, "logps/rejected": -2.947603225708008, "loss": 4.5539, "rewards/accuracies": 0.25, "rewards/chosen": -28.438583374023438, "rewards/margins": 1.0374479293823242, "rewards/rejected": -29.476032257080078, "step": 1590 }, { "epoch": 0.21663943355119825, "grad_norm": 44.57508291321513, "learning_rate": 7.673408121783869e-07, "logits/chosen": 10.103057861328125, "logits/rejected": 9.245742797851562, "logps/chosen": -2.650386333465576, "logps/rejected": -2.656229019165039, "loss": 4.413, "rewards/accuracies": 0.25, "rewards/chosen": -26.503862380981445, "rewards/margins": 0.05842781066894531, "rewards/rejected": -26.56229019165039, "step": 1591 }, { "epoch": 0.2167755991285403, "grad_norm": 48.29223126550441, "learning_rate": 7.672655199073759e-07, "logits/chosen": 11.923867225646973, "logits/rejected": 11.448737144470215, "logps/chosen": -3.1085386276245117, "logps/rejected": -3.4016637802124023, "loss": 3.9885, "rewards/accuracies": 0.75, "rewards/chosen": -31.085386276245117, "rewards/margins": 2.9312515258789062, "rewards/rejected": -34.01663589477539, "step": 1592 }, { "epoch": 0.21691176470588236, "grad_norm": 43.453631535036486, "learning_rate": 7.671901446496848e-07, "logits/chosen": 9.680303573608398, "logits/rejected": 9.909717559814453, "logps/chosen": -2.8555209636688232, "logps/rejected": -2.9058711528778076, "loss": 4.4334, "rewards/accuracies": 0.5, "rewards/chosen": -28.55521011352539, "rewards/margins": 0.5035009384155273, "rewards/rejected": -29.058712005615234, "step": 1593 }, { "epoch": 0.2170479302832244, "grad_norm": 68.68829838056878, "learning_rate": 7.671146864223454e-07, "logits/chosen": 11.36831283569336, "logits/rejected": 11.891420364379883, "logps/chosen": -3.2359843254089355, "logps/rejected": -3.3531274795532227, "loss": 4.2271, "rewards/accuracies": 0.75, "rewards/chosen": -32.35984420776367, "rewards/margins": 1.1714301109313965, "rewards/rejected": -33.531272888183594, "step": 1594 }, { "epoch": 0.21718409586056645, "grad_norm": 54.235292400900214, "learning_rate": 7.67039145242408e-07, "logits/chosen": 11.288214683532715, "logits/rejected": 11.568746566772461, "logps/chosen": -3.1438746452331543, "logps/rejected": -3.412966012954712, "loss": 3.6228, "rewards/accuracies": 1.0, "rewards/chosen": -31.438745498657227, "rewards/margins": 2.6909146308898926, "rewards/rejected": -34.129661560058594, "step": 1595 }, { "epoch": 0.2173202614379085, "grad_norm": 57.285109867045016, "learning_rate": 7.669635211269417e-07, "logits/chosen": 10.570929527282715, "logits/rejected": 10.578927993774414, "logps/chosen": -2.7240843772888184, "logps/rejected": -2.8992738723754883, "loss": 3.5766, "rewards/accuracies": 0.75, "rewards/chosen": -27.2408447265625, "rewards/margins": 1.7518939971923828, "rewards/rejected": -28.992738723754883, "step": 1596 }, { "epoch": 0.21745642701525056, "grad_norm": 42.81211819786957, "learning_rate": 7.668878140930344e-07, "logits/chosen": 10.101282119750977, "logits/rejected": 11.65392017364502, "logps/chosen": -2.9671711921691895, "logps/rejected": -3.1771738529205322, "loss": 4.6262, "rewards/accuracies": 0.75, "rewards/chosen": -29.671709060668945, "rewards/margins": 2.1000289916992188, "rewards/rejected": -31.771739959716797, "step": 1597 }, { "epoch": 0.2175925925925926, "grad_norm": 58.64362246096512, "learning_rate": 7.668120241577929e-07, "logits/chosen": 10.054708480834961, "logits/rejected": 10.254323959350586, "logps/chosen": -3.102809429168701, "logps/rejected": -3.0334486961364746, "loss": 4.1872, "rewards/accuracies": 0.25, "rewards/chosen": -31.028095245361328, "rewards/margins": -0.6936087608337402, "rewards/rejected": -30.33448600769043, "step": 1598 }, { "epoch": 0.21772875816993464, "grad_norm": 45.94315216413913, "learning_rate": 7.667361513383423e-07, "logits/chosen": 10.08065128326416, "logits/rejected": 12.198878288269043, "logps/chosen": -2.7471909523010254, "logps/rejected": -3.3772168159484863, "loss": 3.7745, "rewards/accuracies": 1.0, "rewards/chosen": -27.47191047668457, "rewards/margins": 6.300259113311768, "rewards/rejected": -33.77217102050781, "step": 1599 }, { "epoch": 0.2178649237472767, "grad_norm": 40.1889819803394, "learning_rate": 7.666601956518269e-07, "logits/chosen": 9.0031156539917, "logits/rejected": 9.472840309143066, "logps/chosen": -2.4908697605133057, "logps/rejected": -2.5027883052825928, "loss": 3.4659, "rewards/accuracies": 0.5, "rewards/chosen": -24.9086971282959, "rewards/margins": 0.1191868782043457, "rewards/rejected": -25.027883529663086, "step": 1600 }, { "epoch": 0.21800108932461873, "grad_norm": 58.15480601559658, "learning_rate": 7.665841571154094e-07, "logits/chosen": 9.691415786743164, "logits/rejected": 9.926609992980957, "logps/chosen": -3.0230321884155273, "logps/rejected": -3.28220272064209, "loss": 3.8082, "rewards/accuracies": 0.75, "rewards/chosen": -30.230323791503906, "rewards/margins": 2.5917019844055176, "rewards/rejected": -32.822025299072266, "step": 1601 }, { "epoch": 0.2181372549019608, "grad_norm": 48.33040623976027, "learning_rate": 7.665080357462715e-07, "logits/chosen": 9.460947036743164, "logits/rejected": 10.675315856933594, "logps/chosen": -2.927478551864624, "logps/rejected": -3.0162510871887207, "loss": 4.2297, "rewards/accuracies": 0.5, "rewards/chosen": -29.274784088134766, "rewards/margins": 0.8877263069152832, "rewards/rejected": -30.162511825561523, "step": 1602 }, { "epoch": 0.21827342047930284, "grad_norm": 43.79391985585045, "learning_rate": 7.664318315616134e-07, "logits/chosen": 10.023183822631836, "logits/rejected": 11.127506256103516, "logps/chosen": -2.8341572284698486, "logps/rejected": -3.183180570602417, "loss": 4.3321, "rewards/accuracies": 0.75, "rewards/chosen": -28.341571807861328, "rewards/margins": 3.490231990814209, "rewards/rejected": -31.831806182861328, "step": 1603 }, { "epoch": 0.21840958605664487, "grad_norm": 48.54701775687578, "learning_rate": 7.663555445786538e-07, "logits/chosen": 10.22732162475586, "logits/rejected": 10.517293930053711, "logps/chosen": -2.8963303565979004, "logps/rejected": -3.3477492332458496, "loss": 4.1749, "rewards/accuracies": 0.75, "rewards/chosen": -28.963302612304688, "rewards/margins": 4.514192581176758, "rewards/rejected": -33.47749328613281, "step": 1604 }, { "epoch": 0.21854575163398693, "grad_norm": 56.291104589110866, "learning_rate": 7.662791748146307e-07, "logits/chosen": 9.53569507598877, "logits/rejected": 9.811695098876953, "logps/chosen": -3.005734920501709, "logps/rejected": -3.3002121448516846, "loss": 3.6991, "rewards/accuracies": 0.75, "rewards/chosen": -30.057348251342773, "rewards/margins": 2.9447731971740723, "rewards/rejected": -33.00212097167969, "step": 1605 }, { "epoch": 0.21868191721132899, "grad_norm": 43.403523019131555, "learning_rate": 7.662027222868003e-07, "logits/chosen": 8.650094032287598, "logits/rejected": 9.909659385681152, "logps/chosen": -3.0831363201141357, "logps/rejected": -3.140498161315918, "loss": 3.8258, "rewards/accuracies": 0.5, "rewards/chosen": -30.831363677978516, "rewards/margins": 0.5736179351806641, "rewards/rejected": -31.40498161315918, "step": 1606 }, { "epoch": 0.21881808278867101, "grad_norm": 43.796255968478704, "learning_rate": 7.661261870124377e-07, "logits/chosen": 9.963878631591797, "logits/rejected": 10.498764991760254, "logps/chosen": -3.1831986904144287, "logps/rejected": -3.2819576263427734, "loss": 3.9785, "rewards/accuracies": 0.75, "rewards/chosen": -31.831985473632812, "rewards/margins": 0.9875879287719727, "rewards/rejected": -32.819576263427734, "step": 1607 }, { "epoch": 0.21895424836601307, "grad_norm": 46.55052453720085, "learning_rate": 7.660495690088368e-07, "logits/chosen": 12.045798301696777, "logits/rejected": 11.697813034057617, "logps/chosen": -3.234245538711548, "logps/rejected": -3.4901175498962402, "loss": 4.0269, "rewards/accuracies": 0.5, "rewards/chosen": -32.34245681762695, "rewards/margins": 2.5587196350097656, "rewards/rejected": -34.90117645263672, "step": 1608 }, { "epoch": 0.21909041394335513, "grad_norm": 47.74965146299758, "learning_rate": 7.659728682933099e-07, "logits/chosen": 10.133358001708984, "logits/rejected": 10.749563217163086, "logps/chosen": -3.1170921325683594, "logps/rejected": -3.3770227432250977, "loss": 4.2026, "rewards/accuracies": 0.5, "rewards/chosen": -31.170921325683594, "rewards/margins": 2.599304676055908, "rewards/rejected": -33.770225524902344, "step": 1609 }, { "epoch": 0.21922657952069716, "grad_norm": 45.062117321997, "learning_rate": 7.658960848831883e-07, "logits/chosen": 9.927925109863281, "logits/rejected": 11.365047454833984, "logps/chosen": -2.581301212310791, "logps/rejected": -3.114560604095459, "loss": 3.9545, "rewards/accuracies": 1.0, "rewards/chosen": -25.813011169433594, "rewards/margins": 5.332596778869629, "rewards/rejected": -31.145606994628906, "step": 1610 }, { "epoch": 0.2193627450980392, "grad_norm": 50.044783883589034, "learning_rate": 7.658192187958218e-07, "logits/chosen": 11.231378555297852, "logits/rejected": 11.92220687866211, "logps/chosen": -3.457463502883911, "logps/rejected": -3.547950506210327, "loss": 4.3719, "rewards/accuracies": 0.75, "rewards/chosen": -34.57463455200195, "rewards/margins": 0.9048700332641602, "rewards/rejected": -35.4795036315918, "step": 1611 }, { "epoch": 0.21949891067538127, "grad_norm": 90.22013059324078, "learning_rate": 7.65742270048579e-07, "logits/chosen": 9.525300979614258, "logits/rejected": 10.711931228637695, "logps/chosen": -3.01318359375, "logps/rejected": -3.0637383460998535, "loss": 3.8383, "rewards/accuracies": 0.5, "rewards/chosen": -30.131834030151367, "rewards/margins": 0.5055465698242188, "rewards/rejected": -30.637380599975586, "step": 1612 }, { "epoch": 0.2196350762527233, "grad_norm": 58.30161520763311, "learning_rate": 7.656652386588468e-07, "logits/chosen": 12.146688461303711, "logits/rejected": 10.542142868041992, "logps/chosen": -3.4991047382354736, "logps/rejected": -3.106689453125, "loss": 4.9019, "rewards/accuracies": 0.0, "rewards/chosen": -34.99104690551758, "rewards/margins": -3.9241538047790527, "rewards/rejected": -31.06689453125, "step": 1613 }, { "epoch": 0.21977124183006536, "grad_norm": 44.574329266555594, "learning_rate": 7.655881246440316e-07, "logits/chosen": 10.14571762084961, "logits/rejected": 12.40176773071289, "logps/chosen": -3.30540132522583, "logps/rejected": -3.8851706981658936, "loss": 4.2123, "rewards/accuracies": 1.0, "rewards/chosen": -33.054012298583984, "rewards/margins": 5.797694206237793, "rewards/rejected": -38.851707458496094, "step": 1614 }, { "epoch": 0.2199074074074074, "grad_norm": 55.285919701899395, "learning_rate": 7.655109280215575e-07, "logits/chosen": 10.86552906036377, "logits/rejected": 10.702056884765625, "logps/chosen": -3.3341221809387207, "logps/rejected": -3.53643536567688, "loss": 4.1507, "rewards/accuracies": 0.75, "rewards/chosen": -33.341217041015625, "rewards/margins": 2.0231332778930664, "rewards/rejected": -35.36435317993164, "step": 1615 }, { "epoch": 0.22004357298474944, "grad_norm": 52.02812033643638, "learning_rate": 7.654336488088678e-07, "logits/chosen": 11.167387008666992, "logits/rejected": 10.94675064086914, "logps/chosen": -3.0311429500579834, "logps/rejected": -3.369400978088379, "loss": 4.4152, "rewards/accuracies": 0.75, "rewards/chosen": -30.311429977416992, "rewards/margins": 3.3825793266296387, "rewards/rejected": -33.694007873535156, "step": 1616 }, { "epoch": 0.2201797385620915, "grad_norm": 47.76678772959992, "learning_rate": 7.653562870234245e-07, "logits/chosen": 10.218888282775879, "logits/rejected": 10.541285514831543, "logps/chosen": -3.156928777694702, "logps/rejected": -3.2673230171203613, "loss": 4.5193, "rewards/accuracies": 0.75, "rewards/chosen": -31.569290161132812, "rewards/margins": 1.1039423942565918, "rewards/rejected": -32.67323303222656, "step": 1617 }, { "epoch": 0.22031590413943355, "grad_norm": 42.37499352930705, "learning_rate": 7.652788426827081e-07, "logits/chosen": 10.53187084197998, "logits/rejected": 11.150362014770508, "logps/chosen": -3.150851249694824, "logps/rejected": -3.471803903579712, "loss": 4.1177, "rewards/accuracies": 0.75, "rewards/chosen": -31.508514404296875, "rewards/margins": 3.209526538848877, "rewards/rejected": -34.718040466308594, "step": 1618 }, { "epoch": 0.2204520697167756, "grad_norm": 44.36100179509427, "learning_rate": 7.652013158042179e-07, "logits/chosen": 11.141995429992676, "logits/rejected": 11.265195846557617, "logps/chosen": -3.5081746578216553, "logps/rejected": -3.3211655616760254, "loss": 4.4555, "rewards/accuracies": 0.5, "rewards/chosen": -35.08174514770508, "rewards/margins": -1.8700909614562988, "rewards/rejected": -33.21165466308594, "step": 1619 }, { "epoch": 0.22058823529411764, "grad_norm": 47.87351910788295, "learning_rate": 7.651237064054713e-07, "logits/chosen": 10.400090217590332, "logits/rejected": 10.833552360534668, "logps/chosen": -3.0836243629455566, "logps/rejected": -3.056903839111328, "loss": 3.4304, "rewards/accuracies": 0.5, "rewards/chosen": -30.83624267578125, "rewards/margins": -0.26720380783081055, "rewards/rejected": -30.56903839111328, "step": 1620 }, { "epoch": 0.2207244008714597, "grad_norm": 48.32615135888765, "learning_rate": 7.650460145040053e-07, "logits/chosen": 10.890365600585938, "logits/rejected": 8.807487487792969, "logps/chosen": -3.153139114379883, "logps/rejected": -3.129150867462158, "loss": 3.8811, "rewards/accuracies": 0.25, "rewards/chosen": -31.531391143798828, "rewards/margins": -0.23987913131713867, "rewards/rejected": -31.29151153564453, "step": 1621 }, { "epoch": 0.22086056644880175, "grad_norm": 44.62767896640824, "learning_rate": 7.649682401173748e-07, "logits/chosen": 10.495162963867188, "logits/rejected": 10.21986198425293, "logps/chosen": -2.9378085136413574, "logps/rejected": -3.103151798248291, "loss": 4.2938, "rewards/accuracies": 0.75, "rewards/chosen": -29.37808609008789, "rewards/margins": 1.6534333229064941, "rewards/rejected": -31.031518936157227, "step": 1622 }, { "epoch": 0.22099673202614378, "grad_norm": 47.19338092898566, "learning_rate": 7.648903832631536e-07, "logits/chosen": 10.86962890625, "logits/rejected": 10.178182601928711, "logps/chosen": -3.0357351303100586, "logps/rejected": -3.3470284938812256, "loss": 3.9251, "rewards/accuracies": 1.0, "rewards/chosen": -30.357351303100586, "rewards/margins": 3.112934112548828, "rewards/rejected": -33.47028350830078, "step": 1623 }, { "epoch": 0.22113289760348584, "grad_norm": 43.076542859341814, "learning_rate": 7.64812443958934e-07, "logits/chosen": 10.921299934387207, "logits/rejected": 10.243478775024414, "logps/chosen": -3.0114426612854004, "logps/rejected": -3.150120735168457, "loss": 4.48, "rewards/accuracies": 0.75, "rewards/chosen": -30.11442756652832, "rewards/margins": 1.3867802619934082, "rewards/rejected": -31.50120735168457, "step": 1624 }, { "epoch": 0.2212690631808279, "grad_norm": 44.817902625228456, "learning_rate": 7.647344222223273e-07, "logits/chosen": 8.838811874389648, "logits/rejected": 10.527068138122559, "logps/chosen": -2.786623954772949, "logps/rejected": -3.4333367347717285, "loss": 4.2098, "rewards/accuracies": 1.0, "rewards/chosen": -27.866239547729492, "rewards/margins": 6.467127799987793, "rewards/rejected": -34.33336639404297, "step": 1625 }, { "epoch": 0.22140522875816993, "grad_norm": 42.74440972747129, "learning_rate": 7.646563180709627e-07, "logits/chosen": 9.236627578735352, "logits/rejected": 10.181097984313965, "logps/chosen": -3.0554065704345703, "logps/rejected": -3.281161308288574, "loss": 4.9702, "rewards/accuracies": 0.5, "rewards/chosen": -30.554065704345703, "rewards/margins": 2.257546901702881, "rewards/rejected": -32.811614990234375, "step": 1626 }, { "epoch": 0.22154139433551198, "grad_norm": 49.370537338779144, "learning_rate": 7.64578131522489e-07, "logits/chosen": 11.158700942993164, "logits/rejected": 10.925641059875488, "logps/chosen": -3.155543088912964, "logps/rejected": -3.456148386001587, "loss": 3.9859, "rewards/accuracies": 1.0, "rewards/chosen": -31.555431365966797, "rewards/margins": 3.0060524940490723, "rewards/rejected": -34.561485290527344, "step": 1627 }, { "epoch": 0.22167755991285404, "grad_norm": 57.76079642131959, "learning_rate": 7.644998625945728e-07, "logits/chosen": 10.222707748413086, "logits/rejected": 9.823457717895508, "logps/chosen": -3.3209638595581055, "logps/rejected": -3.273080348968506, "loss": 4.5544, "rewards/accuracies": 0.5, "rewards/chosen": -33.20963668823242, "rewards/margins": -0.4788365364074707, "rewards/rejected": -32.730804443359375, "step": 1628 }, { "epoch": 0.22181372549019607, "grad_norm": 39.679195644121535, "learning_rate": 7.644215113048996e-07, "logits/chosen": 11.722719192504883, "logits/rejected": 11.467218399047852, "logps/chosen": -3.2142465114593506, "logps/rejected": -3.2409796714782715, "loss": 4.7863, "rewards/accuracies": 0.5, "rewards/chosen": -32.14246368408203, "rewards/margins": 0.267333984375, "rewards/rejected": -32.40979766845703, "step": 1629 }, { "epoch": 0.22194989106753812, "grad_norm": 41.69964069718356, "learning_rate": 7.643430776711736e-07, "logits/chosen": 9.853963851928711, "logits/rejected": 9.965550422668457, "logps/chosen": -3.0643107891082764, "logps/rejected": -3.1914727687835693, "loss": 4.6983, "rewards/accuracies": 0.75, "rewards/chosen": -30.643108367919922, "rewards/margins": 1.271620273590088, "rewards/rejected": -31.91472625732422, "step": 1630 }, { "epoch": 0.22208605664488018, "grad_norm": 41.0351946795863, "learning_rate": 7.642645617111175e-07, "logits/chosen": 10.484396934509277, "logits/rejected": 10.878316879272461, "logps/chosen": -2.9669060707092285, "logps/rejected": -3.1096465587615967, "loss": 3.9886, "rewards/accuracies": 0.5, "rewards/chosen": -29.6690616607666, "rewards/margins": 1.4274044036865234, "rewards/rejected": -31.096466064453125, "step": 1631 }, { "epoch": 0.2222222222222222, "grad_norm": 42.884064109016194, "learning_rate": 7.641859634424726e-07, "logits/chosen": 10.641426086425781, "logits/rejected": 11.673528671264648, "logps/chosen": -3.0408544540405273, "logps/rejected": -3.243760108947754, "loss": 3.8809, "rewards/accuracies": 0.5, "rewards/chosen": -30.408544540405273, "rewards/margins": 2.029057502746582, "rewards/rejected": -32.437599182128906, "step": 1632 }, { "epoch": 0.22235838779956427, "grad_norm": 48.108675262136124, "learning_rate": 7.64107282882999e-07, "logits/chosen": 11.005939483642578, "logits/rejected": 10.856314659118652, "logps/chosen": -2.992366313934326, "logps/rejected": -3.2189090251922607, "loss": 4.4925, "rewards/accuracies": 0.5, "rewards/chosen": -29.923664093017578, "rewards/margins": 2.2654247283935547, "rewards/rejected": -32.1890869140625, "step": 1633 }, { "epoch": 0.22249455337690632, "grad_norm": 41.89676390945042, "learning_rate": 7.640285200504749e-07, "logits/chosen": 9.655696868896484, "logits/rejected": 9.787144660949707, "logps/chosen": -3.105323314666748, "logps/rejected": -3.122670888900757, "loss": 4.3191, "rewards/accuracies": 0.5, "rewards/chosen": -31.05323028564453, "rewards/margins": 0.1734757423400879, "rewards/rejected": -31.226707458496094, "step": 1634 }, { "epoch": 0.22263071895424835, "grad_norm": 44.69812747522324, "learning_rate": 7.639496749626978e-07, "logits/chosen": 9.475046157836914, "logits/rejected": 10.160125732421875, "logps/chosen": -3.148862838745117, "logps/rejected": -3.1197850704193115, "loss": 4.1289, "rewards/accuracies": 0.25, "rewards/chosen": -31.488628387451172, "rewards/margins": -0.29077768325805664, "rewards/rejected": -31.197851181030273, "step": 1635 }, { "epoch": 0.2227668845315904, "grad_norm": 39.32230384861713, "learning_rate": 7.638707476374831e-07, "logits/chosen": 9.22298812866211, "logits/rejected": 10.020679473876953, "logps/chosen": -2.7317698001861572, "logps/rejected": -2.733680486679077, "loss": 3.2427, "rewards/accuracies": 0.75, "rewards/chosen": -27.317699432373047, "rewards/margins": 0.019106388092041016, "rewards/rejected": -27.336803436279297, "step": 1636 }, { "epoch": 0.22290305010893247, "grad_norm": 46.98817894753056, "learning_rate": 7.637917380926652e-07, "logits/chosen": 10.082611083984375, "logits/rejected": 9.88493537902832, "logps/chosen": -2.9201602935791016, "logps/rejected": -3.4075684547424316, "loss": 4.1292, "rewards/accuracies": 1.0, "rewards/chosen": -29.201601028442383, "rewards/margins": 4.874084949493408, "rewards/rejected": -34.07568359375, "step": 1637 }, { "epoch": 0.22303921568627452, "grad_norm": 42.47094318513402, "learning_rate": 7.637126463460969e-07, "logits/chosen": 9.121017456054688, "logits/rejected": 11.28831958770752, "logps/chosen": -2.9755210876464844, "logps/rejected": -3.2681901454925537, "loss": 4.7548, "rewards/accuracies": 0.5, "rewards/chosen": -29.755210876464844, "rewards/margins": 2.9266915321350098, "rewards/rejected": -32.68190002441406, "step": 1638 }, { "epoch": 0.22317538126361655, "grad_norm": 94.87899641409135, "learning_rate": 7.636334724156497e-07, "logits/chosen": 10.432904243469238, "logits/rejected": 10.655848503112793, "logps/chosen": -2.798644542694092, "logps/rejected": -2.9323270320892334, "loss": 4.1596, "rewards/accuracies": 0.75, "rewards/chosen": -27.986446380615234, "rewards/margins": 1.3368234634399414, "rewards/rejected": -29.32326889038086, "step": 1639 }, { "epoch": 0.2233115468409586, "grad_norm": 45.66337847147884, "learning_rate": 7.635542163192137e-07, "logits/chosen": 9.512348175048828, "logits/rejected": 9.655261039733887, "logps/chosen": -2.8016324043273926, "logps/rejected": -3.0753872394561768, "loss": 3.9037, "rewards/accuracies": 0.75, "rewards/chosen": -28.01632308959961, "rewards/margins": 2.737549304962158, "rewards/rejected": -30.753873825073242, "step": 1640 }, { "epoch": 0.22344771241830066, "grad_norm": 48.40545033520928, "learning_rate": 7.634748780746973e-07, "logits/chosen": 9.759700775146484, "logits/rejected": 11.106935501098633, "logps/chosen": -2.9703030586242676, "logps/rejected": -2.9769692420959473, "loss": 3.6966, "rewards/accuracies": 0.75, "rewards/chosen": -29.703033447265625, "rewards/margins": 0.06665802001953125, "rewards/rejected": -29.769691467285156, "step": 1641 }, { "epoch": 0.2235838779956427, "grad_norm": 40.491996757210444, "learning_rate": 7.633954577000276e-07, "logits/chosen": 9.789640426635742, "logits/rejected": 11.9633207321167, "logps/chosen": -2.707335948944092, "logps/rejected": -3.429046869277954, "loss": 4.1318, "rewards/accuracies": 1.0, "rewards/chosen": -27.0733585357666, "rewards/margins": 7.2171101570129395, "rewards/rejected": -34.290470123291016, "step": 1642 }, { "epoch": 0.22372004357298475, "grad_norm": 42.432178794336345, "learning_rate": 7.633159552131504e-07, "logits/chosen": 10.468042373657227, "logits/rejected": 9.427547454833984, "logps/chosen": -2.6221652030944824, "logps/rejected": -2.668233633041382, "loss": 4.2908, "rewards/accuracies": 0.75, "rewards/chosen": -26.221651077270508, "rewards/margins": 0.46068525314331055, "rewards/rejected": -26.682334899902344, "step": 1643 }, { "epoch": 0.2238562091503268, "grad_norm": 37.02651694226247, "learning_rate": 7.632363706320299e-07, "logits/chosen": 8.580713272094727, "logits/rejected": 10.523149490356445, "logps/chosen": -2.6034507751464844, "logps/rejected": -2.962418794631958, "loss": 3.6632, "rewards/accuracies": 1.0, "rewards/chosen": -26.034509658813477, "rewards/margins": 3.5896782875061035, "rewards/rejected": -29.624187469482422, "step": 1644 }, { "epoch": 0.22399237472766884, "grad_norm": 40.68468779846955, "learning_rate": 7.631567039746491e-07, "logits/chosen": 10.345193862915039, "logits/rejected": 11.530272483825684, "logps/chosen": -2.8221065998077393, "logps/rejected": -3.025268316268921, "loss": 4.142, "rewards/accuracies": 0.75, "rewards/chosen": -28.221065521240234, "rewards/margins": 2.0316162109375, "rewards/rejected": -30.252681732177734, "step": 1645 }, { "epoch": 0.2241285403050109, "grad_norm": 60.52111372121374, "learning_rate": 7.63076955259009e-07, "logits/chosen": 10.507709503173828, "logits/rejected": 11.117980003356934, "logps/chosen": -2.8617963790893555, "logps/rejected": -2.7450904846191406, "loss": 4.6094, "rewards/accuracies": 0.5, "rewards/chosen": -28.617963790893555, "rewards/margins": -1.1670575141906738, "rewards/rejected": -27.450904846191406, "step": 1646 }, { "epoch": 0.22426470588235295, "grad_norm": 44.99073708017938, "learning_rate": 7.629971245031296e-07, "logits/chosen": 9.545750617980957, "logits/rejected": 10.376251220703125, "logps/chosen": -2.7579259872436523, "logps/rejected": -2.9062414169311523, "loss": 4.4815, "rewards/accuracies": 0.75, "rewards/chosen": -27.579259872436523, "rewards/margins": 1.4831538200378418, "rewards/rejected": -29.06241226196289, "step": 1647 }, { "epoch": 0.22440087145969498, "grad_norm": 44.66680410599971, "learning_rate": 7.629172117250494e-07, "logits/chosen": 11.221088409423828, "logits/rejected": 10.60708999633789, "logps/chosen": -2.926281690597534, "logps/rejected": -2.9472579956054688, "loss": 4.0029, "rewards/accuracies": 0.75, "rewards/chosen": -29.2628173828125, "rewards/margins": 0.2097606658935547, "rewards/rejected": -29.472579956054688, "step": 1648 }, { "epoch": 0.22453703703703703, "grad_norm": 44.8660790392851, "learning_rate": 7.628372169428253e-07, "logits/chosen": 10.635000228881836, "logits/rejected": 11.191045761108398, "logps/chosen": -3.029935836791992, "logps/rejected": -3.0673210620880127, "loss": 4.3778, "rewards/accuracies": 0.5, "rewards/chosen": -30.299358367919922, "rewards/margins": 0.37385129928588867, "rewards/rejected": -30.67321014404297, "step": 1649 }, { "epoch": 0.2246732026143791, "grad_norm": 47.868052389844514, "learning_rate": 7.627571401745328e-07, "logits/chosen": 10.65023422241211, "logits/rejected": 11.024513244628906, "logps/chosen": -2.9272825717926025, "logps/rejected": -2.9272866249084473, "loss": 4.1116, "rewards/accuracies": 0.25, "rewards/chosen": -29.272825241088867, "rewards/margins": 4.100799560546875e-05, "rewards/rejected": -29.272865295410156, "step": 1650 }, { "epoch": 0.22480936819172112, "grad_norm": 144.3620645061734, "learning_rate": 7.626769814382658e-07, "logits/chosen": 8.590291976928711, "logits/rejected": 10.40049934387207, "logps/chosen": -2.5205979347229004, "logps/rejected": -2.8967955112457275, "loss": 4.0624, "rewards/accuracies": 0.75, "rewards/chosen": -25.20598030090332, "rewards/margins": 3.7619752883911133, "rewards/rejected": -28.967954635620117, "step": 1651 }, { "epoch": 0.22494553376906318, "grad_norm": 65.62221500622795, "learning_rate": 7.62596740752137e-07, "logits/chosen": 11.592370986938477, "logits/rejected": 11.92125129699707, "logps/chosen": -3.132261276245117, "logps/rejected": -3.4889302253723145, "loss": 4.3564, "rewards/accuracies": 0.75, "rewards/chosen": -31.322612762451172, "rewards/margins": 3.5666909217834473, "rewards/rejected": -34.889305114746094, "step": 1652 }, { "epoch": 0.22508169934640523, "grad_norm": 44.2889048659221, "learning_rate": 7.625164181342775e-07, "logits/chosen": 11.040139198303223, "logits/rejected": 11.370241165161133, "logps/chosen": -3.2095017433166504, "logps/rejected": -3.2397518157958984, "loss": 4.5447, "rewards/accuracies": 0.5, "rewards/chosen": -32.09501647949219, "rewards/margins": 0.30249977111816406, "rewards/rejected": -32.39751434326172, "step": 1653 }, { "epoch": 0.22521786492374726, "grad_norm": 38.32298466282237, "learning_rate": 7.624360136028366e-07, "logits/chosen": 9.205256462097168, "logits/rejected": 9.670255661010742, "logps/chosen": -2.8652544021606445, "logps/rejected": -2.8477704524993896, "loss": 3.7414, "rewards/accuracies": 0.5, "rewards/chosen": -28.652542114257812, "rewards/margins": -0.17483758926391602, "rewards/rejected": -28.477705001831055, "step": 1654 }, { "epoch": 0.22535403050108932, "grad_norm": 50.115387468409025, "learning_rate": 7.623555271759825e-07, "logits/chosen": 10.035829544067383, "logits/rejected": 10.903833389282227, "logps/chosen": -2.542863368988037, "logps/rejected": -2.6339685916900635, "loss": 4.2989, "rewards/accuracies": 0.5, "rewards/chosen": -25.428630828857422, "rewards/margins": 0.9110536575317383, "rewards/rejected": -26.33968734741211, "step": 1655 }, { "epoch": 0.22549019607843138, "grad_norm": 50.80792000943568, "learning_rate": 7.622749588719018e-07, "logits/chosen": 10.254295349121094, "logits/rejected": 10.88498306274414, "logps/chosen": -2.654196262359619, "logps/rejected": -2.9013900756835938, "loss": 4.0355, "rewards/accuracies": 1.0, "rewards/chosen": -26.541963577270508, "rewards/margins": 2.471937656402588, "rewards/rejected": -29.013900756835938, "step": 1656 }, { "epoch": 0.22562636165577343, "grad_norm": 43.779069556842245, "learning_rate": 7.621943087087995e-07, "logits/chosen": 11.672687530517578, "logits/rejected": 10.6768159866333, "logps/chosen": -3.182405471801758, "logps/rejected": -2.8698630332946777, "loss": 3.9664, "rewards/accuracies": 0.0, "rewards/chosen": -31.824054718017578, "rewards/margins": -3.1254234313964844, "rewards/rejected": -28.698631286621094, "step": 1657 }, { "epoch": 0.22576252723311546, "grad_norm": 43.66435017747805, "learning_rate": 7.621135767048993e-07, "logits/chosen": 9.868463516235352, "logits/rejected": 10.67496395111084, "logps/chosen": -2.771808385848999, "logps/rejected": -2.9041833877563477, "loss": 4.4201, "rewards/accuracies": 0.5, "rewards/chosen": -27.718082427978516, "rewards/margins": 1.3237495422363281, "rewards/rejected": -29.041831970214844, "step": 1658 }, { "epoch": 0.22589869281045752, "grad_norm": 59.461615746817614, "learning_rate": 7.620327628784432e-07, "logits/chosen": 11.239673614501953, "logits/rejected": 11.101846694946289, "logps/chosen": -3.412686347961426, "logps/rejected": -3.178316593170166, "loss": 3.8598, "rewards/accuracies": 0.5, "rewards/chosen": -34.126861572265625, "rewards/margins": -2.3436970710754395, "rewards/rejected": -31.783166885375977, "step": 1659 }, { "epoch": 0.22603485838779958, "grad_norm": 48.80739026701476, "learning_rate": 7.619518672476916e-07, "logits/chosen": 10.967519760131836, "logits/rejected": 8.997150421142578, "logps/chosen": -2.960390329360962, "logps/rejected": -2.5804860591888428, "loss": 4.5276, "rewards/accuracies": 0.25, "rewards/chosen": -29.60390281677246, "rewards/margins": -3.7990427017211914, "rewards/rejected": -25.804859161376953, "step": 1660 }, { "epoch": 0.2261710239651416, "grad_norm": 42.08848078784872, "learning_rate": 7.618708898309238e-07, "logits/chosen": 10.854156494140625, "logits/rejected": 10.393638610839844, "logps/chosen": -2.6066813468933105, "logps/rejected": -2.709216594696045, "loss": 3.9443, "rewards/accuracies": 0.5, "rewards/chosen": -26.06681251525879, "rewards/margins": 1.0253548622131348, "rewards/rejected": -27.092166900634766, "step": 1661 }, { "epoch": 0.22630718954248366, "grad_norm": 52.227004581999466, "learning_rate": 7.617898306464371e-07, "logits/chosen": 9.274967193603516, "logits/rejected": 11.961173057556152, "logps/chosen": -2.739023208618164, "logps/rejected": -3.1160542964935303, "loss": 4.3589, "rewards/accuracies": 0.75, "rewards/chosen": -27.39023208618164, "rewards/margins": 3.7703094482421875, "rewards/rejected": -31.16054344177246, "step": 1662 }, { "epoch": 0.22644335511982572, "grad_norm": 44.151859965421195, "learning_rate": 7.617086897125476e-07, "logits/chosen": 10.986348152160645, "logits/rejected": 11.574036598205566, "logps/chosen": -3.143928289413452, "logps/rejected": -3.5926992893218994, "loss": 4.3654, "rewards/accuracies": 0.75, "rewards/chosen": -31.43928337097168, "rewards/margins": 4.487709045410156, "rewards/rejected": -35.92699432373047, "step": 1663 }, { "epoch": 0.22657952069716775, "grad_norm": 47.481676695419544, "learning_rate": 7.616274670475897e-07, "logits/chosen": 11.38037109375, "logits/rejected": 11.318157196044922, "logps/chosen": -3.2215957641601562, "logps/rejected": -3.2526352405548096, "loss": 4.8033, "rewards/accuracies": 0.75, "rewards/chosen": -32.21595764160156, "rewards/margins": 0.3103952407836914, "rewards/rejected": -32.52635192871094, "step": 1664 }, { "epoch": 0.2267156862745098, "grad_norm": 70.96860893190484, "learning_rate": 7.615461626699164e-07, "logits/chosen": 10.11532211303711, "logits/rejected": 10.987262725830078, "logps/chosen": -2.656858444213867, "logps/rejected": -2.985711097717285, "loss": 4.2299, "rewards/accuracies": 1.0, "rewards/chosen": -26.568584442138672, "rewards/margins": 3.2885255813598633, "rewards/rejected": -29.85711097717285, "step": 1665 }, { "epoch": 0.22685185185185186, "grad_norm": 39.10626357093841, "learning_rate": 7.614647765978991e-07, "logits/chosen": 9.83400821685791, "logits/rejected": 10.40849494934082, "logps/chosen": -3.045243978500366, "logps/rejected": -3.120997905731201, "loss": 4.1705, "rewards/accuracies": 0.75, "rewards/chosen": -30.452438354492188, "rewards/margins": 0.757540225982666, "rewards/rejected": -31.209980010986328, "step": 1666 }, { "epoch": 0.2269880174291939, "grad_norm": 58.50444974432627, "learning_rate": 7.613833088499278e-07, "logits/chosen": 10.881624221801758, "logits/rejected": 11.930301666259766, "logps/chosen": -3.0353362560272217, "logps/rejected": -3.4500489234924316, "loss": 3.9891, "rewards/accuracies": 1.0, "rewards/chosen": -30.353363037109375, "rewards/margins": 4.14712381362915, "rewards/rejected": -34.50048828125, "step": 1667 }, { "epoch": 0.22712418300653595, "grad_norm": 42.0716216128628, "learning_rate": 7.613017594444104e-07, "logits/chosen": 10.405208587646484, "logits/rejected": 11.461286544799805, "logps/chosen": -3.1600279808044434, "logps/rejected": -3.3665926456451416, "loss": 4.1544, "rewards/accuracies": 1.0, "rewards/chosen": -31.600278854370117, "rewards/margins": 2.0656485557556152, "rewards/rejected": -33.66592788696289, "step": 1668 }, { "epoch": 0.227260348583878, "grad_norm": 41.909138371324914, "learning_rate": 7.61220128399774e-07, "logits/chosen": 10.660594940185547, "logits/rejected": 10.506797790527344, "logps/chosen": -3.124776840209961, "logps/rejected": -3.193486452102661, "loss": 4.0597, "rewards/accuracies": 0.5, "rewards/chosen": -31.24776840209961, "rewards/margins": 0.6870965957641602, "rewards/rejected": -31.93486785888672, "step": 1669 }, { "epoch": 0.22739651416122003, "grad_norm": 48.9479365722069, "learning_rate": 7.611384157344638e-07, "logits/chosen": 10.58033561706543, "logits/rejected": 10.673517227172852, "logps/chosen": -3.0070247650146484, "logps/rejected": -2.903243064880371, "loss": 3.9675, "rewards/accuracies": 0.5, "rewards/chosen": -30.070249557495117, "rewards/margins": -1.0378179550170898, "rewards/rejected": -29.03243064880371, "step": 1670 }, { "epoch": 0.2275326797385621, "grad_norm": 38.68728168411803, "learning_rate": 7.610566214669432e-07, "logits/chosen": 9.25743293762207, "logits/rejected": 10.884855270385742, "logps/chosen": -2.935692548751831, "logps/rejected": -3.2585082054138184, "loss": 3.985, "rewards/accuracies": 0.5, "rewards/chosen": -29.35692596435547, "rewards/margins": 3.228158473968506, "rewards/rejected": -32.5850830078125, "step": 1671 }, { "epoch": 0.22766884531590414, "grad_norm": 40.094767876380885, "learning_rate": 7.609747456156946e-07, "logits/chosen": 11.521053314208984, "logits/rejected": 10.90502643585205, "logps/chosen": -3.084685802459717, "logps/rejected": -3.1696126461029053, "loss": 4.1093, "rewards/accuracies": 0.5, "rewards/chosen": -30.84685707092285, "rewards/margins": 0.849268913269043, "rewards/rejected": -31.69612693786621, "step": 1672 }, { "epoch": 0.22780501089324617, "grad_norm": 43.40780072455534, "learning_rate": 7.608927881992182e-07, "logits/chosen": 10.747735977172852, "logits/rejected": 10.09867000579834, "logps/chosen": -3.0309534072875977, "logps/rejected": -2.9585437774658203, "loss": 4.0226, "rewards/accuracies": 0.25, "rewards/chosen": -30.309534072875977, "rewards/margins": -0.7240948677062988, "rewards/rejected": -29.585437774658203, "step": 1673 }, { "epoch": 0.22794117647058823, "grad_norm": 43.96793787509702, "learning_rate": 7.608107492360333e-07, "logits/chosen": 10.68275260925293, "logits/rejected": 10.298789978027344, "logps/chosen": -3.011507034301758, "logps/rejected": -3.1843819618225098, "loss": 4.2983, "rewards/accuracies": 0.5, "rewards/chosen": -30.115070343017578, "rewards/margins": 1.728750228881836, "rewards/rejected": -31.843820571899414, "step": 1674 }, { "epoch": 0.2280773420479303, "grad_norm": 47.36965355819193, "learning_rate": 7.60728628744677e-07, "logits/chosen": 9.889408111572266, "logits/rejected": 12.19996452331543, "logps/chosen": -2.9972376823425293, "logps/rejected": -3.7529687881469727, "loss": 4.5618, "rewards/accuracies": 1.0, "rewards/chosen": -29.972373962402344, "rewards/margins": 7.557314872741699, "rewards/rejected": -37.52968978881836, "step": 1675 }, { "epoch": 0.22821350762527234, "grad_norm": 44.04922167021342, "learning_rate": 7.606464267437052e-07, "logits/chosen": 8.858728408813477, "logits/rejected": 10.443780899047852, "logps/chosen": -2.754366159439087, "logps/rejected": -2.988521099090576, "loss": 3.7076, "rewards/accuracies": 0.75, "rewards/chosen": -27.54366111755371, "rewards/margins": 2.341550827026367, "rewards/rejected": -29.885211944580078, "step": 1676 }, { "epoch": 0.22834967320261437, "grad_norm": 75.59974233749766, "learning_rate": 7.605641432516923e-07, "logits/chosen": 11.304961204528809, "logits/rejected": 11.385576248168945, "logps/chosen": -3.134490489959717, "logps/rejected": -3.357576847076416, "loss": 4.2245, "rewards/accuracies": 0.75, "rewards/chosen": -31.34490394592285, "rewards/margins": 2.230863571166992, "rewards/rejected": -33.575767517089844, "step": 1677 }, { "epoch": 0.22848583877995643, "grad_norm": 50.044944553701974, "learning_rate": 7.604817782872307e-07, "logits/chosen": 11.340263366699219, "logits/rejected": 11.920843124389648, "logps/chosen": -3.247110366821289, "logps/rejected": -3.3812923431396484, "loss": 4.4639, "rewards/accuracies": 0.75, "rewards/chosen": -32.471099853515625, "rewards/margins": 1.3418207168579102, "rewards/rejected": -33.812923431396484, "step": 1678 }, { "epoch": 0.2286220043572985, "grad_norm": 46.15706365410287, "learning_rate": 7.603993318689315e-07, "logits/chosen": 10.502483367919922, "logits/rejected": 11.721492767333984, "logps/chosen": -3.2260451316833496, "logps/rejected": -3.3007261753082275, "loss": 4.2351, "rewards/accuracies": 0.25, "rewards/chosen": -32.26045227050781, "rewards/margins": 0.7468109130859375, "rewards/rejected": -33.00726318359375, "step": 1679 }, { "epoch": 0.22875816993464052, "grad_norm": 44.193389444024426, "learning_rate": 7.603168040154242e-07, "logits/chosen": 12.201294898986816, "logits/rejected": 10.532487869262695, "logps/chosen": -3.3698575496673584, "logps/rejected": -3.1163487434387207, "loss": 4.1803, "rewards/accuracies": 0.25, "rewards/chosen": -33.698577880859375, "rewards/margins": -2.535090446472168, "rewards/rejected": -31.16348648071289, "step": 1680 }, { "epoch": 0.22889433551198257, "grad_norm": 47.33762570127503, "learning_rate": 7.602341947453566e-07, "logits/chosen": 10.687660217285156, "logits/rejected": 10.467544555664062, "logps/chosen": -2.808258295059204, "logps/rejected": -2.973501682281494, "loss": 4.565, "rewards/accuracies": 0.5, "rewards/chosen": -28.082582473754883, "rewards/margins": 1.652435302734375, "rewards/rejected": -29.735017776489258, "step": 1681 }, { "epoch": 0.22903050108932463, "grad_norm": 74.26027923805212, "learning_rate": 7.60151504077395e-07, "logits/chosen": 12.52549934387207, "logits/rejected": 10.14528751373291, "logps/chosen": -3.2926840782165527, "logps/rejected": -2.9057517051696777, "loss": 3.5148, "rewards/accuracies": 0.0, "rewards/chosen": -32.926841735839844, "rewards/margins": -3.8693227767944336, "rewards/rejected": -29.057518005371094, "step": 1682 }, { "epoch": 0.22916666666666666, "grad_norm": 42.53329303468523, "learning_rate": 7.60068732030224e-07, "logits/chosen": 10.475566864013672, "logits/rejected": 11.631034851074219, "logps/chosen": -2.9867842197418213, "logps/rejected": -3.169492244720459, "loss": 4.465, "rewards/accuracies": 0.5, "rewards/chosen": -29.867843627929688, "rewards/margins": 1.8270812034606934, "rewards/rejected": -31.694923400878906, "step": 1683 }, { "epoch": 0.22930283224400871, "grad_norm": 44.94894112530197, "learning_rate": 7.599858786225466e-07, "logits/chosen": 10.61454963684082, "logits/rejected": 11.678840637207031, "logps/chosen": -3.0693840980529785, "logps/rejected": -3.507786750793457, "loss": 3.5551, "rewards/accuracies": 1.0, "rewards/chosen": -30.69384002685547, "rewards/margins": 4.38402795791626, "rewards/rejected": -35.0778694152832, "step": 1684 }, { "epoch": 0.22943899782135077, "grad_norm": 44.402528589734956, "learning_rate": 7.599029438730843e-07, "logits/chosen": 9.820629119873047, "logits/rejected": 10.841809272766113, "logps/chosen": -2.8232338428497314, "logps/rejected": -3.0735230445861816, "loss": 3.9478, "rewards/accuracies": 0.5, "rewards/chosen": -28.232337951660156, "rewards/margins": 2.5028934478759766, "rewards/rejected": -30.735231399536133, "step": 1685 }, { "epoch": 0.2295751633986928, "grad_norm": 58.5942986054744, "learning_rate": 7.598199278005769e-07, "logits/chosen": 10.499229431152344, "logits/rejected": 10.480260848999023, "logps/chosen": -2.9755754470825195, "logps/rejected": -2.904984474182129, "loss": 4.1819, "rewards/accuracies": 0.5, "rewards/chosen": -29.755752563476562, "rewards/margins": -0.7059087753295898, "rewards/rejected": -29.04984474182129, "step": 1686 }, { "epoch": 0.22971132897603486, "grad_norm": 43.38920723187016, "learning_rate": 7.597368304237823e-07, "logits/chosen": 10.938580513000488, "logits/rejected": 11.614532470703125, "logps/chosen": -3.2517988681793213, "logps/rejected": -3.401811122894287, "loss": 3.8237, "rewards/accuracies": 0.75, "rewards/chosen": -32.51799011230469, "rewards/margins": 1.5001225471496582, "rewards/rejected": -34.01811218261719, "step": 1687 }, { "epoch": 0.2298474945533769, "grad_norm": 43.351290846736944, "learning_rate": 7.596536517614774e-07, "logits/chosen": 10.756317138671875, "logits/rejected": 9.166336059570312, "logps/chosen": -3.093130350112915, "logps/rejected": -2.8258750438690186, "loss": 3.8829, "rewards/accuracies": 0.0, "rewards/chosen": -30.931304931640625, "rewards/margins": -2.6725549697875977, "rewards/rejected": -28.25874900817871, "step": 1688 }, { "epoch": 0.22998366013071894, "grad_norm": 42.06079013367801, "learning_rate": 7.59570391832457e-07, "logits/chosen": 9.80886459350586, "logits/rejected": 10.450651168823242, "logps/chosen": -2.7226743698120117, "logps/rejected": -3.0964643955230713, "loss": 4.0224, "rewards/accuracies": 1.0, "rewards/chosen": -27.22674560546875, "rewards/margins": 3.737898826599121, "rewards/rejected": -30.964641571044922, "step": 1689 }, { "epoch": 0.230119825708061, "grad_norm": 51.971466269007884, "learning_rate": 7.594870506555343e-07, "logits/chosen": 11.129470825195312, "logits/rejected": 11.51686954498291, "logps/chosen": -3.0853333473205566, "logps/rejected": -3.285191059112549, "loss": 3.919, "rewards/accuracies": 0.75, "rewards/chosen": -30.85333251953125, "rewards/margins": 1.9985785484313965, "rewards/rejected": -32.85190963745117, "step": 1690 }, { "epoch": 0.23025599128540306, "grad_norm": 48.39175588087228, "learning_rate": 7.594036282495409e-07, "logits/chosen": 10.315446853637695, "logits/rejected": 9.783103942871094, "logps/chosen": -2.9113237857818604, "logps/rejected": -2.868406295776367, "loss": 3.9152, "rewards/accuracies": 0.5, "rewards/chosen": -29.113239288330078, "rewards/margins": -0.42917680740356445, "rewards/rejected": -28.684062957763672, "step": 1691 }, { "epoch": 0.23039215686274508, "grad_norm": 57.312050088739426, "learning_rate": 7.593201246333269e-07, "logits/chosen": 10.781612396240234, "logits/rejected": 11.686368942260742, "logps/chosen": -3.038048267364502, "logps/rejected": -3.437258005142212, "loss": 3.8462, "rewards/accuracies": 1.0, "rewards/chosen": -30.380481719970703, "rewards/margins": 3.992098808288574, "rewards/rejected": -34.372581481933594, "step": 1692 }, { "epoch": 0.23052832244008714, "grad_norm": 45.785621846898366, "learning_rate": 7.592365398257605e-07, "logits/chosen": 9.834821701049805, "logits/rejected": 11.470680236816406, "logps/chosen": -2.6658835411071777, "logps/rejected": -2.844371795654297, "loss": 3.9424, "rewards/accuracies": 0.75, "rewards/chosen": -26.65883445739746, "rewards/margins": 1.7848825454711914, "rewards/rejected": -28.44371795654297, "step": 1693 }, { "epoch": 0.2306644880174292, "grad_norm": 55.65643455123506, "learning_rate": 7.591528738457284e-07, "logits/chosen": 11.602377891540527, "logits/rejected": 11.151602745056152, "logps/chosen": -3.2077159881591797, "logps/rejected": -3.285417318344116, "loss": 4.3043, "rewards/accuracies": 0.25, "rewards/chosen": -32.0771598815918, "rewards/margins": 0.7770147323608398, "rewards/rejected": -32.85417175292969, "step": 1694 }, { "epoch": 0.23080065359477125, "grad_norm": 43.72700656528529, "learning_rate": 7.59069126712136e-07, "logits/chosen": 10.701475143432617, "logits/rejected": 10.271453857421875, "logps/chosen": -3.056086540222168, "logps/rejected": -3.0533902645111084, "loss": 4.054, "rewards/accuracies": 0.5, "rewards/chosen": -30.56086540222168, "rewards/margins": -0.026963233947753906, "rewards/rejected": -30.533903121948242, "step": 1695 }, { "epoch": 0.23093681917211328, "grad_norm": 46.57241670096041, "learning_rate": 7.589852984439059e-07, "logits/chosen": 10.713554382324219, "logits/rejected": 11.111617088317871, "logps/chosen": -2.86262845993042, "logps/rejected": -3.237545967102051, "loss": 3.8983, "rewards/accuracies": 0.75, "rewards/chosen": -28.626283645629883, "rewards/margins": 3.749175548553467, "rewards/rejected": -32.37546157836914, "step": 1696 }, { "epoch": 0.23107298474945534, "grad_norm": 48.79737427934875, "learning_rate": 7.589013890599804e-07, "logits/chosen": 10.039203643798828, "logits/rejected": 11.09589672088623, "logps/chosen": -2.7789440155029297, "logps/rejected": -2.8823680877685547, "loss": 3.964, "rewards/accuracies": 0.5, "rewards/chosen": -27.789438247680664, "rewards/margins": 1.0342426300048828, "rewards/rejected": -28.823680877685547, "step": 1697 }, { "epoch": 0.2312091503267974, "grad_norm": 43.37140618309106, "learning_rate": 7.588173985793193e-07, "logits/chosen": 8.424032211303711, "logits/rejected": 9.414321899414062, "logps/chosen": -2.73077654838562, "logps/rejected": -2.799691677093506, "loss": 3.4783, "rewards/accuracies": 0.5, "rewards/chosen": -27.30776596069336, "rewards/margins": 0.6891512870788574, "rewards/rejected": -27.996917724609375, "step": 1698 }, { "epoch": 0.23134531590413943, "grad_norm": 43.553578555233535, "learning_rate": 7.587333270209011e-07, "logits/chosen": 10.510050773620605, "logits/rejected": 11.945631980895996, "logps/chosen": -2.8519833087921143, "logps/rejected": -3.308210611343384, "loss": 4.0765, "rewards/accuracies": 0.75, "rewards/chosen": -28.519832611083984, "rewards/margins": 4.56227445602417, "rewards/rejected": -33.08210754394531, "step": 1699 }, { "epoch": 0.23148148148148148, "grad_norm": 56.821570615282184, "learning_rate": 7.586491744037222e-07, "logits/chosen": 9.341010093688965, "logits/rejected": 10.37631607055664, "logps/chosen": -2.7129271030426025, "logps/rejected": -2.7016830444335938, "loss": 3.6666, "rewards/accuracies": 0.5, "rewards/chosen": -27.129268646240234, "rewards/margins": -0.1124410629272461, "rewards/rejected": -27.016830444335938, "step": 1700 }, { "epoch": 0.23161764705882354, "grad_norm": 66.9734494935451, "learning_rate": 7.585649407467977e-07, "logits/chosen": 11.675107955932617, "logits/rejected": 11.074348449707031, "logps/chosen": -3.362696886062622, "logps/rejected": -3.593071460723877, "loss": 4.5743, "rewards/accuracies": 0.5, "rewards/chosen": -33.62696838378906, "rewards/margins": 2.303743839263916, "rewards/rejected": -35.93071365356445, "step": 1701 }, { "epoch": 0.23175381263616557, "grad_norm": 62.12899050125139, "learning_rate": 7.58480626069161e-07, "logits/chosen": 10.237360000610352, "logits/rejected": 11.138479232788086, "logps/chosen": -2.9101829528808594, "logps/rejected": -2.989323139190674, "loss": 3.9979, "rewards/accuracies": 0.5, "rewards/chosen": -29.101829528808594, "rewards/margins": 0.7914037704467773, "rewards/rejected": -29.893232345581055, "step": 1702 }, { "epoch": 0.23188997821350762, "grad_norm": 45.20138204897656, "learning_rate": 7.583962303898636e-07, "logits/chosen": 10.201204299926758, "logits/rejected": 9.587929725646973, "logps/chosen": -3.0274393558502197, "logps/rejected": -2.8689019680023193, "loss": 3.8104, "rewards/accuracies": 0.25, "rewards/chosen": -30.274394989013672, "rewards/margins": -1.5853748321533203, "rewards/rejected": -28.68901824951172, "step": 1703 }, { "epoch": 0.23202614379084968, "grad_norm": 135.09149926981408, "learning_rate": 7.583117537279755e-07, "logits/chosen": 11.392133712768555, "logits/rejected": 9.545080184936523, "logps/chosen": -3.024667263031006, "logps/rejected": -2.8603451251983643, "loss": 4.31, "rewards/accuracies": 0.75, "rewards/chosen": -30.246673583984375, "rewards/margins": -1.6432228088378906, "rewards/rejected": -28.603450775146484, "step": 1704 }, { "epoch": 0.2321623093681917, "grad_norm": 40.21229650132523, "learning_rate": 7.582271961025846e-07, "logits/chosen": 10.05545711517334, "logits/rejected": 10.589179992675781, "logps/chosen": -2.91326904296875, "logps/rejected": -3.3495664596557617, "loss": 3.689, "rewards/accuracies": 0.75, "rewards/chosen": -29.1326904296875, "rewards/margins": 4.362975597381592, "rewards/rejected": -33.49566650390625, "step": 1705 }, { "epoch": 0.23229847494553377, "grad_norm": 50.11353578239022, "learning_rate": 7.581425575327976e-07, "logits/chosen": 10.698482513427734, "logits/rejected": 10.26488208770752, "logps/chosen": -3.250354766845703, "logps/rejected": -3.398556709289551, "loss": 3.955, "rewards/accuracies": 0.75, "rewards/chosen": -32.50354766845703, "rewards/margins": 1.4820189476013184, "rewards/rejected": -33.985565185546875, "step": 1706 }, { "epoch": 0.23243464052287582, "grad_norm": 51.55654078298247, "learning_rate": 7.580578380377394e-07, "logits/chosen": 11.284257888793945, "logits/rejected": 10.799561500549316, "logps/chosen": -3.1621646881103516, "logps/rejected": -2.961312770843506, "loss": 4.6277, "rewards/accuracies": 0.25, "rewards/chosen": -31.621646881103516, "rewards/margins": -2.0085182189941406, "rewards/rejected": -29.613128662109375, "step": 1707 }, { "epoch": 0.23257080610021785, "grad_norm": 52.81830565469967, "learning_rate": 7.57973037636553e-07, "logits/chosen": 10.40884780883789, "logits/rejected": 10.416467666625977, "logps/chosen": -2.9761977195739746, "logps/rejected": -3.1624996662139893, "loss": 4.0828, "rewards/accuracies": 0.5, "rewards/chosen": -29.761978149414062, "rewards/margins": 1.8630180358886719, "rewards/rejected": -31.624996185302734, "step": 1708 }, { "epoch": 0.2327069716775599, "grad_norm": 50.0713945226452, "learning_rate": 7.578881563483997e-07, "logits/chosen": 9.872007369995117, "logits/rejected": 10.049020767211914, "logps/chosen": -2.6994383335113525, "logps/rejected": -2.997076988220215, "loss": 4.1459, "rewards/accuracies": 0.75, "rewards/chosen": -26.994382858276367, "rewards/margins": 2.9763851165771484, "rewards/rejected": -29.970767974853516, "step": 1709 }, { "epoch": 0.23284313725490197, "grad_norm": 46.92494487713668, "learning_rate": 7.57803194192459e-07, "logits/chosen": 11.118364334106445, "logits/rejected": 11.574735641479492, "logps/chosen": -3.018855094909668, "logps/rejected": -3.290644407272339, "loss": 4.1831, "rewards/accuracies": 0.75, "rewards/chosen": -30.188552856445312, "rewards/margins": 2.7178921699523926, "rewards/rejected": -32.90644454956055, "step": 1710 }, { "epoch": 0.232979302832244, "grad_norm": 45.22098157478699, "learning_rate": 7.577181511879291e-07, "logits/chosen": 11.048133850097656, "logits/rejected": 9.393190383911133, "logps/chosen": -3.053152322769165, "logps/rejected": -3.1572113037109375, "loss": 3.786, "rewards/accuracies": 0.5, "rewards/chosen": -30.531522750854492, "rewards/margins": 1.0405893325805664, "rewards/rejected": -31.572113037109375, "step": 1711 }, { "epoch": 0.23311546840958605, "grad_norm": 46.69087463103457, "learning_rate": 7.57633027354026e-07, "logits/chosen": 10.138679504394531, "logits/rejected": 10.803705215454102, "logps/chosen": -2.6583399772644043, "logps/rejected": -3.04541015625, "loss": 4.3427, "rewards/accuracies": 0.5, "rewards/chosen": -26.583398818969727, "rewards/margins": 3.870704174041748, "rewards/rejected": -30.4541015625, "step": 1712 }, { "epoch": 0.2332516339869281, "grad_norm": 57.243734373917476, "learning_rate": 7.575478227099841e-07, "logits/chosen": 10.312750816345215, "logits/rejected": 11.339773178100586, "logps/chosen": -2.9802117347717285, "logps/rejected": -3.5277740955352783, "loss": 4.475, "rewards/accuracies": 1.0, "rewards/chosen": -29.8021183013916, "rewards/margins": 5.475622653961182, "rewards/rejected": -35.277740478515625, "step": 1713 }, { "epoch": 0.23338779956427017, "grad_norm": 44.442923640973916, "learning_rate": 7.574625372750562e-07, "logits/chosen": 10.11562728881836, "logits/rejected": 10.875985145568848, "logps/chosen": -3.178605079650879, "logps/rejected": -3.384347438812256, "loss": 4.1713, "rewards/accuracies": 0.75, "rewards/chosen": -31.786048889160156, "rewards/margins": 2.0574235916137695, "rewards/rejected": -33.843475341796875, "step": 1714 }, { "epoch": 0.2335239651416122, "grad_norm": 51.37511289084574, "learning_rate": 7.57377171068513e-07, "logits/chosen": 11.051248550415039, "logits/rejected": 10.920319557189941, "logps/chosen": -3.443848133087158, "logps/rejected": -3.1420083045959473, "loss": 4.1921, "rewards/accuracies": 0.75, "rewards/chosen": -34.43848419189453, "rewards/margins": -3.018401622772217, "rewards/rejected": -31.420082092285156, "step": 1715 }, { "epoch": 0.23366013071895425, "grad_norm": 61.63849484175621, "learning_rate": 7.572917241096441e-07, "logits/chosen": 10.04985523223877, "logits/rejected": 10.614331245422363, "logps/chosen": -2.9343152046203613, "logps/rejected": -3.0225090980529785, "loss": 4.143, "rewards/accuracies": 0.5, "rewards/chosen": -29.343151092529297, "rewards/margins": 0.8819384574890137, "rewards/rejected": -30.22509002685547, "step": 1716 }, { "epoch": 0.2337962962962963, "grad_norm": 44.210425900859235, "learning_rate": 7.572061964177566e-07, "logits/chosen": 9.624258041381836, "logits/rejected": 10.763957977294922, "logps/chosen": -3.0537023544311523, "logps/rejected": -3.392636775970459, "loss": 4.1424, "rewards/accuracies": 0.75, "rewards/chosen": -30.537025451660156, "rewards/margins": 3.3893423080444336, "rewards/rejected": -33.926368713378906, "step": 1717 }, { "epoch": 0.23393246187363834, "grad_norm": 45.756288437362, "learning_rate": 7.571205880121764e-07, "logits/chosen": 8.301310539245605, "logits/rejected": 9.93313217163086, "logps/chosen": -2.1267385482788086, "logps/rejected": -2.693434715270996, "loss": 3.2016, "rewards/accuracies": 1.0, "rewards/chosen": -21.26738739013672, "rewards/margins": 5.666962146759033, "rewards/rejected": -26.93434715270996, "step": 1718 }, { "epoch": 0.2340686274509804, "grad_norm": 48.38580325929136, "learning_rate": 7.570348989122473e-07, "logits/chosen": 10.858552932739258, "logits/rejected": 11.022293090820312, "logps/chosen": -2.77388858795166, "logps/rejected": -3.137110471725464, "loss": 4.4367, "rewards/accuracies": 0.75, "rewards/chosen": -27.738887786865234, "rewards/margins": 3.6322178840637207, "rewards/rejected": -31.371105194091797, "step": 1719 }, { "epoch": 0.23420479302832245, "grad_norm": 47.14451533171347, "learning_rate": 7.569491291373316e-07, "logits/chosen": 10.67770767211914, "logits/rejected": 10.579325675964355, "logps/chosen": -3.3780770301818848, "logps/rejected": -3.346339225769043, "loss": 4.4644, "rewards/accuracies": 0.5, "rewards/chosen": -33.78076934814453, "rewards/margins": -0.3173789978027344, "rewards/rejected": -33.46339416503906, "step": 1720 }, { "epoch": 0.23434095860566448, "grad_norm": 47.22780101641829, "learning_rate": 7.568632787068095e-07, "logits/chosen": 9.835417747497559, "logits/rejected": 9.384275436401367, "logps/chosen": -2.8760337829589844, "logps/rejected": -2.723377227783203, "loss": 4.0558, "rewards/accuracies": 0.5, "rewards/chosen": -28.760337829589844, "rewards/margins": -1.526564598083496, "rewards/rejected": -27.23377227783203, "step": 1721 }, { "epoch": 0.23447712418300654, "grad_norm": 49.06017705911048, "learning_rate": 7.567773476400797e-07, "logits/chosen": 10.216256141662598, "logits/rejected": 9.789691925048828, "logps/chosen": -3.017636775970459, "logps/rejected": -2.857638359069824, "loss": 4.4546, "rewards/accuracies": 0.25, "rewards/chosen": -30.176366806030273, "rewards/margins": -1.5999846458435059, "rewards/rejected": -28.57638168334961, "step": 1722 }, { "epoch": 0.2346132897603486, "grad_norm": 50.84684554055727, "learning_rate": 7.566913359565591e-07, "logits/chosen": 9.927295684814453, "logits/rejected": 10.995981216430664, "logps/chosen": -3.114959239959717, "logps/rejected": -3.6042051315307617, "loss": 4.112, "rewards/accuracies": 1.0, "rewards/chosen": -31.14959144592285, "rewards/margins": 4.89246129989624, "rewards/rejected": -36.04205322265625, "step": 1723 }, { "epoch": 0.23474945533769062, "grad_norm": 41.633052195290816, "learning_rate": 7.566052436756827e-07, "logits/chosen": 9.058893203735352, "logits/rejected": 10.883697509765625, "logps/chosen": -2.769658327102661, "logps/rejected": -3.2201602458953857, "loss": 3.4302, "rewards/accuracies": 1.0, "rewards/chosen": -27.696582794189453, "rewards/margins": 4.505019664764404, "rewards/rejected": -32.201602935791016, "step": 1724 }, { "epoch": 0.23488562091503268, "grad_norm": 42.92789053154131, "learning_rate": 7.565190708169037e-07, "logits/chosen": 10.686274528503418, "logits/rejected": 10.897031784057617, "logps/chosen": -3.2650341987609863, "logps/rejected": -3.4530763626098633, "loss": 3.8871, "rewards/accuracies": 0.75, "rewards/chosen": -32.65034484863281, "rewards/margins": 1.8804197311401367, "rewards/rejected": -34.53076171875, "step": 1725 }, { "epoch": 0.23502178649237473, "grad_norm": 46.65128417022572, "learning_rate": 7.564328173996937e-07, "logits/chosen": 9.197044372558594, "logits/rejected": 10.618419647216797, "logps/chosen": -2.8049755096435547, "logps/rejected": -3.1906137466430664, "loss": 3.8824, "rewards/accuracies": 0.75, "rewards/chosen": -28.049755096435547, "rewards/margins": 3.8563804626464844, "rewards/rejected": -31.906137466430664, "step": 1726 }, { "epoch": 0.23515795206971676, "grad_norm": 46.65903117995384, "learning_rate": 7.563464834435424e-07, "logits/chosen": 10.818344116210938, "logits/rejected": 11.893547058105469, "logps/chosen": -3.201720714569092, "logps/rejected": -3.9008612632751465, "loss": 3.447, "rewards/accuracies": 0.75, "rewards/chosen": -32.01720428466797, "rewards/margins": 6.991405010223389, "rewards/rejected": -39.00861358642578, "step": 1727 }, { "epoch": 0.23529411764705882, "grad_norm": 56.19036250492746, "learning_rate": 7.562600689679573e-07, "logits/chosen": 12.11201286315918, "logits/rejected": 11.234903335571289, "logps/chosen": -3.2876029014587402, "logps/rejected": -3.3493356704711914, "loss": 4.1145, "rewards/accuracies": 0.75, "rewards/chosen": -32.87602996826172, "rewards/margins": 0.6173253059387207, "rewards/rejected": -33.49335479736328, "step": 1728 }, { "epoch": 0.23543028322440088, "grad_norm": 58.88309378668805, "learning_rate": 7.561735739924649e-07, "logits/chosen": 11.158060073852539, "logits/rejected": 11.663581848144531, "logps/chosen": -3.198064088821411, "logps/rejected": -3.044861078262329, "loss": 3.7602, "rewards/accuracies": 0.0, "rewards/chosen": -31.980640411376953, "rewards/margins": -1.532029628753662, "rewards/rejected": -30.448610305786133, "step": 1729 }, { "epoch": 0.2355664488017429, "grad_norm": 42.43150928383943, "learning_rate": 7.560869985366094e-07, "logits/chosen": 10.046686172485352, "logits/rejected": 10.315864562988281, "logps/chosen": -2.8742775917053223, "logps/rejected": -3.1085569858551025, "loss": 3.8455, "rewards/accuracies": 0.75, "rewards/chosen": -28.74277687072754, "rewards/margins": 2.3427929878234863, "rewards/rejected": -31.085569381713867, "step": 1730 }, { "epoch": 0.23570261437908496, "grad_norm": 40.14678369900368, "learning_rate": 7.560003426199531e-07, "logits/chosen": 10.790721893310547, "logits/rejected": 11.902522087097168, "logps/chosen": -3.1091387271881104, "logps/rejected": -3.271315336227417, "loss": 3.7849, "rewards/accuracies": 0.75, "rewards/chosen": -31.091388702392578, "rewards/margins": 1.6217665672302246, "rewards/rejected": -32.71315383911133, "step": 1731 }, { "epoch": 0.23583877995642702, "grad_norm": 47.67690269275301, "learning_rate": 7.559136062620766e-07, "logits/chosen": 10.61816692352295, "logits/rejected": 11.130411148071289, "logps/chosen": -2.7322893142700195, "logps/rejected": -2.7921600341796875, "loss": 4.1837, "rewards/accuracies": 0.75, "rewards/chosen": -27.322891235351562, "rewards/margins": 0.5987081527709961, "rewards/rejected": -27.921600341796875, "step": 1732 }, { "epoch": 0.23597494553376908, "grad_norm": 57.349360701181666, "learning_rate": 7.558267894825787e-07, "logits/chosen": 11.544036865234375, "logits/rejected": 12.583381652832031, "logps/chosen": -3.1035523414611816, "logps/rejected": -3.366396903991699, "loss": 4.1857, "rewards/accuracies": 0.75, "rewards/chosen": -31.0355224609375, "rewards/margins": 2.6284475326538086, "rewards/rejected": -33.663970947265625, "step": 1733 }, { "epoch": 0.2361111111111111, "grad_norm": 49.643551305540086, "learning_rate": 7.557398923010764e-07, "logits/chosen": 9.643022537231445, "logits/rejected": 10.168386459350586, "logps/chosen": -2.8248867988586426, "logps/rejected": -2.7029106616973877, "loss": 4.1943, "rewards/accuracies": 0.5, "rewards/chosen": -28.248868942260742, "rewards/margins": -1.2197628021240234, "rewards/rejected": -27.02910614013672, "step": 1734 }, { "epoch": 0.23624727668845316, "grad_norm": 50.39432697888651, "learning_rate": 7.55652914737205e-07, "logits/chosen": 11.745807647705078, "logits/rejected": 11.109773635864258, "logps/chosen": -2.700625419616699, "logps/rejected": -2.9393534660339355, "loss": 4.054, "rewards/accuracies": 0.75, "rewards/chosen": -27.00625228881836, "rewards/margins": 2.3872814178466797, "rewards/rejected": -29.393535614013672, "step": 1735 }, { "epoch": 0.23638344226579522, "grad_norm": 60.768631283690404, "learning_rate": 7.555658568106176e-07, "logits/chosen": 11.052433013916016, "logits/rejected": 10.64535140991211, "logps/chosen": -3.259784460067749, "logps/rejected": -3.1593101024627686, "loss": 4.2756, "rewards/accuracies": 0.5, "rewards/chosen": -32.597843170166016, "rewards/margins": -1.0047426223754883, "rewards/rejected": -31.593101501464844, "step": 1736 }, { "epoch": 0.23651960784313725, "grad_norm": 46.96463224148518, "learning_rate": 7.554787185409857e-07, "logits/chosen": 10.776233673095703, "logits/rejected": 11.892093658447266, "logps/chosen": -3.175004482269287, "logps/rejected": -3.2442777156829834, "loss": 4.2395, "rewards/accuracies": 0.5, "rewards/chosen": -31.750045776367188, "rewards/margins": 0.6927337646484375, "rewards/rejected": -32.442779541015625, "step": 1737 }, { "epoch": 0.2366557734204793, "grad_norm": 48.60487608406577, "learning_rate": 7.553914999479989e-07, "logits/chosen": 10.711296081542969, "logits/rejected": 12.106624603271484, "logps/chosen": -3.1590776443481445, "logps/rejected": -3.3207361698150635, "loss": 4.1494, "rewards/accuracies": 0.75, "rewards/chosen": -31.590776443481445, "rewards/margins": 1.6165852546691895, "rewards/rejected": -33.207359313964844, "step": 1738 }, { "epoch": 0.23679193899782136, "grad_norm": 51.236479274946014, "learning_rate": 7.55304201051365e-07, "logits/chosen": 11.257755279541016, "logits/rejected": 11.416688919067383, "logps/chosen": -3.3222293853759766, "logps/rejected": -3.3286893367767334, "loss": 3.9683, "rewards/accuracies": 0.5, "rewards/chosen": -33.222293853759766, "rewards/margins": 0.06459999084472656, "rewards/rejected": -33.286895751953125, "step": 1739 }, { "epoch": 0.2369281045751634, "grad_norm": 49.68552111599995, "learning_rate": 7.552168218708099e-07, "logits/chosen": 10.630207061767578, "logits/rejected": 11.420242309570312, "logps/chosen": -3.402651071548462, "logps/rejected": -3.397890090942383, "loss": 4.5227, "rewards/accuracies": 0.5, "rewards/chosen": -34.026512145996094, "rewards/margins": -0.047609806060791016, "rewards/rejected": -33.97890090942383, "step": 1740 }, { "epoch": 0.23706427015250545, "grad_norm": 53.2613585822371, "learning_rate": 7.551293624260778e-07, "logits/chosen": 10.080947875976562, "logits/rejected": 11.464597702026367, "logps/chosen": -3.2430710792541504, "logps/rejected": -3.449343681335449, "loss": 3.9399, "rewards/accuracies": 1.0, "rewards/chosen": -32.43070983886719, "rewards/margins": 2.06272554397583, "rewards/rejected": -34.49343490600586, "step": 1741 }, { "epoch": 0.2372004357298475, "grad_norm": 51.590304248964216, "learning_rate": 7.550418227369305e-07, "logits/chosen": 10.811939239501953, "logits/rejected": 10.29231071472168, "logps/chosen": -3.317444086074829, "logps/rejected": -3.1842479705810547, "loss": 4.1514, "rewards/accuracies": 0.0, "rewards/chosen": -33.17444610595703, "rewards/margins": -1.3319621086120605, "rewards/rejected": -31.842479705810547, "step": 1742 }, { "epoch": 0.23733660130718953, "grad_norm": 50.63246936288941, "learning_rate": 7.549542028231487e-07, "logits/chosen": 9.435007095336914, "logits/rejected": 11.998369216918945, "logps/chosen": -3.0843472480773926, "logps/rejected": -3.5662004947662354, "loss": 4.0855, "rewards/accuracies": 1.0, "rewards/chosen": -30.843473434448242, "rewards/margins": 4.818531036376953, "rewards/rejected": -35.66200256347656, "step": 1743 }, { "epoch": 0.2374727668845316, "grad_norm": 50.84841018910479, "learning_rate": 7.548665027045306e-07, "logits/chosen": 10.467428207397461, "logits/rejected": 10.384330749511719, "logps/chosen": -3.0069527626037598, "logps/rejected": -2.873180389404297, "loss": 3.526, "rewards/accuracies": 0.5, "rewards/chosen": -30.069530487060547, "rewards/margins": -1.337724208831787, "rewards/rejected": -28.73180389404297, "step": 1744 }, { "epoch": 0.23760893246187365, "grad_norm": 45.387512042319216, "learning_rate": 7.547787224008929e-07, "logits/chosen": 10.619878768920898, "logits/rejected": 11.276144027709961, "logps/chosen": -3.036795139312744, "logps/rejected": -3.5251739025115967, "loss": 4.4746, "rewards/accuracies": 1.0, "rewards/chosen": -30.367950439453125, "rewards/margins": 4.883787631988525, "rewards/rejected": -35.251739501953125, "step": 1745 }, { "epoch": 0.23774509803921567, "grad_norm": 45.117911850368905, "learning_rate": 7.546908619320702e-07, "logits/chosen": 10.947162628173828, "logits/rejected": 9.366363525390625, "logps/chosen": -3.065286159515381, "logps/rejected": -2.8688254356384277, "loss": 3.9642, "rewards/accuracies": 0.5, "rewards/chosen": -30.65285873413086, "rewards/margins": -1.9646053314208984, "rewards/rejected": -28.688255310058594, "step": 1746 }, { "epoch": 0.23788126361655773, "grad_norm": 48.04547045280538, "learning_rate": 7.546029213179153e-07, "logits/chosen": 11.455737113952637, "logits/rejected": 10.829659461975098, "logps/chosen": -3.0237693786621094, "logps/rejected": -3.000469207763672, "loss": 4.3155, "rewards/accuracies": 0.5, "rewards/chosen": -30.237693786621094, "rewards/margins": -0.2330021858215332, "rewards/rejected": -30.00469207763672, "step": 1747 }, { "epoch": 0.2380174291938998, "grad_norm": 48.33379301554035, "learning_rate": 7.545149005782993e-07, "logits/chosen": 11.446779251098633, "logits/rejected": 11.615448951721191, "logps/chosen": -3.069218635559082, "logps/rejected": -3.5134308338165283, "loss": 4.2683, "rewards/accuracies": 1.0, "rewards/chosen": -30.692188262939453, "rewards/margins": 4.442120552062988, "rewards/rejected": -35.134307861328125, "step": 1748 }, { "epoch": 0.23815359477124182, "grad_norm": 42.74425495039551, "learning_rate": 7.54426799733111e-07, "logits/chosen": 9.726961135864258, "logits/rejected": 11.730304718017578, "logps/chosen": -3.0168700218200684, "logps/rejected": -3.5274271965026855, "loss": 3.8141, "rewards/accuracies": 0.75, "rewards/chosen": -30.168699264526367, "rewards/margins": 5.105571746826172, "rewards/rejected": -35.274269104003906, "step": 1749 }, { "epoch": 0.23828976034858387, "grad_norm": 43.59051029499634, "learning_rate": 7.543386188022575e-07, "logits/chosen": 11.003935813903809, "logits/rejected": 10.581446647644043, "logps/chosen": -3.1967573165893555, "logps/rejected": -3.2713308334350586, "loss": 4.2155, "rewards/accuracies": 0.75, "rewards/chosen": -31.967573165893555, "rewards/margins": 0.7457361221313477, "rewards/rejected": -32.71331024169922, "step": 1750 }, { "epoch": 0.23842592592592593, "grad_norm": 46.50232186189072, "learning_rate": 7.542503578056642e-07, "logits/chosen": 11.491132736206055, "logits/rejected": 10.935248374938965, "logps/chosen": -3.3769752979278564, "logps/rejected": -3.5180282592773438, "loss": 4.4565, "rewards/accuracies": 0.25, "rewards/chosen": -33.769752502441406, "rewards/margins": 1.4105286598205566, "rewards/rejected": -35.18028259277344, "step": 1751 }, { "epoch": 0.238562091503268, "grad_norm": 44.0789693236735, "learning_rate": 7.541620167632743e-07, "logits/chosen": 10.491111755371094, "logits/rejected": 11.538862228393555, "logps/chosen": -3.2023673057556152, "logps/rejected": -3.5927414894104004, "loss": 3.6181, "rewards/accuracies": 0.75, "rewards/chosen": -32.02367401123047, "rewards/margins": 3.9037394523620605, "rewards/rejected": -35.92741394042969, "step": 1752 }, { "epoch": 0.23869825708061002, "grad_norm": 44.22599964323989, "learning_rate": 7.540735956950491e-07, "logits/chosen": 10.198253631591797, "logits/rejected": 11.258382797241211, "logps/chosen": -3.2776923179626465, "logps/rejected": -3.482625722885132, "loss": 4.4143, "rewards/accuracies": 0.75, "rewards/chosen": -32.77692413330078, "rewards/margins": 2.0493359565734863, "rewards/rejected": -34.826255798339844, "step": 1753 }, { "epoch": 0.23883442265795207, "grad_norm": 50.90021078953224, "learning_rate": 7.539850946209683e-07, "logits/chosen": 11.131150245666504, "logits/rejected": 11.934369087219238, "logps/chosen": -3.3651037216186523, "logps/rejected": -3.287074327468872, "loss": 4.6035, "rewards/accuracies": 0.5, "rewards/chosen": -33.65103530883789, "rewards/margins": -0.7802915573120117, "rewards/rejected": -32.87074279785156, "step": 1754 }, { "epoch": 0.23897058823529413, "grad_norm": 42.39615335929274, "learning_rate": 7.538965135610291e-07, "logits/chosen": 10.699024200439453, "logits/rejected": 11.029953002929688, "logps/chosen": -3.023933172225952, "logps/rejected": -3.0716452598571777, "loss": 4.076, "rewards/accuracies": 0.75, "rewards/chosen": -30.239330291748047, "rewards/margins": 0.47712039947509766, "rewards/rejected": -30.716453552246094, "step": 1755 }, { "epoch": 0.23910675381263616, "grad_norm": 49.94612879515869, "learning_rate": 7.538078525352474e-07, "logits/chosen": 9.859086036682129, "logits/rejected": 9.936210632324219, "logps/chosen": -3.0673749446868896, "logps/rejected": -3.067779779434204, "loss": 4.6852, "rewards/accuracies": 0.75, "rewards/chosen": -30.673748016357422, "rewards/margins": 0.004048824310302734, "rewards/rejected": -30.677797317504883, "step": 1756 }, { "epoch": 0.23924291938997821, "grad_norm": 49.86368170572504, "learning_rate": 7.537191115636569e-07, "logits/chosen": 11.980847358703613, "logits/rejected": 12.232516288757324, "logps/chosen": -3.164642810821533, "logps/rejected": -3.2562224864959717, "loss": 4.2188, "rewards/accuracies": 0.5, "rewards/chosen": -31.646427154541016, "rewards/margins": 0.9157986640930176, "rewards/rejected": -32.562225341796875, "step": 1757 }, { "epoch": 0.23937908496732027, "grad_norm": 47.15105036658889, "learning_rate": 7.536302906663092e-07, "logits/chosen": 11.37532901763916, "logits/rejected": 11.09687614440918, "logps/chosen": -3.1848196983337402, "logps/rejected": -3.5673396587371826, "loss": 3.9685, "rewards/accuracies": 0.75, "rewards/chosen": -31.848196029663086, "rewards/margins": 3.825200080871582, "rewards/rejected": -35.67339324951172, "step": 1758 }, { "epoch": 0.2395152505446623, "grad_norm": 49.11664421112399, "learning_rate": 7.535413898632741e-07, "logits/chosen": 11.281052589416504, "logits/rejected": 11.083154678344727, "logps/chosen": -3.5604147911071777, "logps/rejected": -3.117587089538574, "loss": 4.6297, "rewards/accuracies": 0.25, "rewards/chosen": -35.604148864746094, "rewards/margins": -4.428278923034668, "rewards/rejected": -31.175870895385742, "step": 1759 }, { "epoch": 0.23965141612200436, "grad_norm": 44.62872152096061, "learning_rate": 7.534524091746396e-07, "logits/chosen": 10.066158294677734, "logits/rejected": 9.914485931396484, "logps/chosen": -3.20636248588562, "logps/rejected": -3.1937665939331055, "loss": 4.2649, "rewards/accuracies": 0.5, "rewards/chosen": -32.063621520996094, "rewards/margins": -0.12595701217651367, "rewards/rejected": -31.937667846679688, "step": 1760 }, { "epoch": 0.2397875816993464, "grad_norm": 42.28551160528368, "learning_rate": 7.533633486205117e-07, "logits/chosen": 11.799995422363281, "logits/rejected": 10.746095657348633, "logps/chosen": -2.949899196624756, "logps/rejected": -3.1046290397644043, "loss": 4.2005, "rewards/accuracies": 0.5, "rewards/chosen": -29.498992919921875, "rewards/margins": 1.5472984313964844, "rewards/rejected": -31.04629135131836, "step": 1761 }, { "epoch": 0.23992374727668844, "grad_norm": 48.24600685656371, "learning_rate": 7.532742082210142e-07, "logits/chosen": 11.900554656982422, "logits/rejected": 11.522590637207031, "logps/chosen": -3.039376735687256, "logps/rejected": -3.369316577911377, "loss": 4.4097, "rewards/accuracies": 0.75, "rewards/chosen": -30.393766403198242, "rewards/margins": 3.2993998527526855, "rewards/rejected": -33.69316864013672, "step": 1762 }, { "epoch": 0.2400599128540305, "grad_norm": 43.198010747221105, "learning_rate": 7.531849879962891e-07, "logits/chosen": 12.131477355957031, "logits/rejected": 10.634910583496094, "logps/chosen": -3.33516788482666, "logps/rejected": -3.2267637252807617, "loss": 4.4376, "rewards/accuracies": 0.5, "rewards/chosen": -33.35167694091797, "rewards/margins": -1.0840411186218262, "rewards/rejected": -32.26763916015625, "step": 1763 }, { "epoch": 0.24019607843137256, "grad_norm": 43.69725833801254, "learning_rate": 7.530956879664964e-07, "logits/chosen": 10.983776092529297, "logits/rejected": 10.7159423828125, "logps/chosen": -3.4170327186584473, "logps/rejected": -3.351040840148926, "loss": 3.7408, "rewards/accuracies": 0.5, "rewards/chosen": -34.17033004760742, "rewards/margins": -0.6599206924438477, "rewards/rejected": -33.510406494140625, "step": 1764 }, { "epoch": 0.24033224400871459, "grad_norm": 61.83402348796169, "learning_rate": 7.530063081518145e-07, "logits/chosen": 11.038545608520508, "logits/rejected": 11.737228393554688, "logps/chosen": -2.964362382888794, "logps/rejected": -3.3027281761169434, "loss": 3.9701, "rewards/accuracies": 0.75, "rewards/chosen": -29.64362335205078, "rewards/margins": 3.3836593627929688, "rewards/rejected": -33.02728271484375, "step": 1765 }, { "epoch": 0.24046840958605664, "grad_norm": 52.90820300444308, "learning_rate": 7.529168485724392e-07, "logits/chosen": 10.422392845153809, "logits/rejected": 10.805753707885742, "logps/chosen": -3.238002061843872, "logps/rejected": -3.449341058731079, "loss": 3.9388, "rewards/accuracies": 0.75, "rewards/chosen": -32.38002014160156, "rewards/margins": 2.1133875846862793, "rewards/rejected": -34.493408203125, "step": 1766 }, { "epoch": 0.2406045751633987, "grad_norm": 44.3293748332176, "learning_rate": 7.528273092485847e-07, "logits/chosen": 11.018263816833496, "logits/rejected": 11.063629150390625, "logps/chosen": -3.3724937438964844, "logps/rejected": -3.4325523376464844, "loss": 3.9682, "rewards/accuracies": 0.5, "rewards/chosen": -33.724937438964844, "rewards/margins": 0.6005878448486328, "rewards/rejected": -34.325523376464844, "step": 1767 }, { "epoch": 0.24074074074074073, "grad_norm": 44.205638556181526, "learning_rate": 7.527376902004832e-07, "logits/chosen": 11.332447052001953, "logits/rejected": 11.77577018737793, "logps/chosen": -3.3539438247680664, "logps/rejected": -3.607862949371338, "loss": 3.9027, "rewards/accuracies": 0.75, "rewards/chosen": -33.53943634033203, "rewards/margins": 2.5391931533813477, "rewards/rejected": -36.07863235473633, "step": 1768 }, { "epoch": 0.24087690631808278, "grad_norm": 54.30488224635707, "learning_rate": 7.526479914483849e-07, "logits/chosen": 10.372004508972168, "logits/rejected": 12.139503479003906, "logps/chosen": -3.0799458026885986, "logps/rejected": -3.69423246383667, "loss": 4.3793, "rewards/accuracies": 1.0, "rewards/chosen": -30.799457550048828, "rewards/margins": 6.142867088317871, "rewards/rejected": -36.942325592041016, "step": 1769 }, { "epoch": 0.24101307189542484, "grad_norm": 62.31506973710705, "learning_rate": 7.525582130125577e-07, "logits/chosen": 11.416259765625, "logits/rejected": 11.56649398803711, "logps/chosen": -3.344097137451172, "logps/rejected": -3.4633774757385254, "loss": 4.2662, "rewards/accuracies": 0.75, "rewards/chosen": -33.44097137451172, "rewards/margins": 1.1928014755249023, "rewards/rejected": -34.63377380371094, "step": 1770 }, { "epoch": 0.2411492374727669, "grad_norm": 48.10691062864638, "learning_rate": 7.524683549132883e-07, "logits/chosen": 10.2711763381958, "logits/rejected": 11.037205696105957, "logps/chosen": -3.027712345123291, "logps/rejected": -3.3332786560058594, "loss": 4.5973, "rewards/accuracies": 0.75, "rewards/chosen": -30.277122497558594, "rewards/margins": 3.055666446685791, "rewards/rejected": -33.33279037475586, "step": 1771 }, { "epoch": 0.24128540305010893, "grad_norm": 44.503260359563114, "learning_rate": 7.523784171708804e-07, "logits/chosen": 12.045357704162598, "logits/rejected": 12.575665473937988, "logps/chosen": -3.7153549194335938, "logps/rejected": -3.555917739868164, "loss": 4.221, "rewards/accuracies": 0.25, "rewards/chosen": -37.15354919433594, "rewards/margins": -1.5943713188171387, "rewards/rejected": -35.559181213378906, "step": 1772 }, { "epoch": 0.24142156862745098, "grad_norm": 44.949162791738665, "learning_rate": 7.522883998056564e-07, "logits/chosen": 10.66981315612793, "logits/rejected": 11.091869354248047, "logps/chosen": -3.4883432388305664, "logps/rejected": -3.4944326877593994, "loss": 4.0048, "rewards/accuracies": 0.5, "rewards/chosen": -34.88343048095703, "rewards/margins": 0.06089353561401367, "rewards/rejected": -34.9443244934082, "step": 1773 }, { "epoch": 0.24155773420479304, "grad_norm": 76.97045849898235, "learning_rate": 7.521983028379564e-07, "logits/chosen": 10.598240852355957, "logits/rejected": 10.972877502441406, "logps/chosen": -2.929586410522461, "logps/rejected": -3.3780674934387207, "loss": 3.6833, "rewards/accuracies": 1.0, "rewards/chosen": -29.29586410522461, "rewards/margins": 4.4848103523254395, "rewards/rejected": -33.780677795410156, "step": 1774 }, { "epoch": 0.24169389978213507, "grad_norm": 56.28420186996668, "learning_rate": 7.521081262881385e-07, "logits/chosen": 10.699043273925781, "logits/rejected": 10.528764724731445, "logps/chosen": -3.0636627674102783, "logps/rejected": -3.2356679439544678, "loss": 4.2494, "rewards/accuracies": 0.5, "rewards/chosen": -30.636629104614258, "rewards/margins": 1.7200498580932617, "rewards/rejected": -32.3566780090332, "step": 1775 }, { "epoch": 0.24183006535947713, "grad_norm": 56.41959381122118, "learning_rate": 7.520178701765789e-07, "logits/chosen": 9.896464347839355, "logits/rejected": 10.295783996582031, "logps/chosen": -3.0321178436279297, "logps/rejected": -3.322141170501709, "loss": 4.5705, "rewards/accuracies": 0.75, "rewards/chosen": -30.321178436279297, "rewards/margins": 2.90023136138916, "rewards/rejected": -33.22140884399414, "step": 1776 }, { "epoch": 0.24196623093681918, "grad_norm": 44.83827617916501, "learning_rate": 7.51927534523672e-07, "logits/chosen": 11.175981521606445, "logits/rejected": 11.84994888305664, "logps/chosen": -3.224102735519409, "logps/rejected": -3.519139528274536, "loss": 3.6418, "rewards/accuracies": 1.0, "rewards/chosen": -32.24102783203125, "rewards/margins": 2.950366973876953, "rewards/rejected": -35.19139862060547, "step": 1777 }, { "epoch": 0.2421023965141612, "grad_norm": 45.990769851309274, "learning_rate": 7.518371193498294e-07, "logits/chosen": 11.809762954711914, "logits/rejected": 11.438337326049805, "logps/chosen": -3.7173149585723877, "logps/rejected": -3.456752300262451, "loss": 4.2195, "rewards/accuracies": 0.25, "rewards/chosen": -37.17314910888672, "rewards/margins": -2.6056294441223145, "rewards/rejected": -34.56752014160156, "step": 1778 }, { "epoch": 0.24223856209150327, "grad_norm": 41.29558367221283, "learning_rate": 7.517466246754813e-07, "logits/chosen": 11.038836479187012, "logits/rejected": 12.751455307006836, "logps/chosen": -3.1053876876831055, "logps/rejected": -3.4735569953918457, "loss": 3.6414, "rewards/accuracies": 1.0, "rewards/chosen": -31.053878784179688, "rewards/margins": 3.681692123413086, "rewards/rejected": -34.735572814941406, "step": 1779 }, { "epoch": 0.24237472766884532, "grad_norm": 45.73635034051777, "learning_rate": 7.516560505210758e-07, "logits/chosen": 10.58272933959961, "logits/rejected": 11.961606979370117, "logps/chosen": -3.184469223022461, "logps/rejected": -3.3725509643554688, "loss": 3.707, "rewards/accuracies": 0.75, "rewards/chosen": -31.84469223022461, "rewards/margins": 1.8808178901672363, "rewards/rejected": -33.72550964355469, "step": 1780 }, { "epoch": 0.24251089324618735, "grad_norm": 43.95505876679999, "learning_rate": 7.51565396907079e-07, "logits/chosen": 11.039167404174805, "logits/rejected": 10.661821365356445, "logps/chosen": -3.220851421356201, "logps/rejected": -3.31467604637146, "loss": 4.0526, "rewards/accuracies": 0.5, "rewards/chosen": -32.20851135253906, "rewards/margins": 0.9382472038269043, "rewards/rejected": -33.146759033203125, "step": 1781 }, { "epoch": 0.2426470588235294, "grad_norm": 50.99198273552075, "learning_rate": 7.514746638539747e-07, "logits/chosen": 11.2246675491333, "logits/rejected": 11.499025344848633, "logps/chosen": -3.0342912673950195, "logps/rejected": -3.4613595008850098, "loss": 3.8429, "rewards/accuracies": 1.0, "rewards/chosen": -30.342912673950195, "rewards/margins": 4.270680904388428, "rewards/rejected": -34.61359405517578, "step": 1782 }, { "epoch": 0.24278322440087147, "grad_norm": 47.79828702098773, "learning_rate": 7.513838513822646e-07, "logits/chosen": 12.056610107421875, "logits/rejected": 11.635721206665039, "logps/chosen": -3.3415639400482178, "logps/rejected": -3.213505268096924, "loss": 4.2207, "rewards/accuracies": 0.75, "rewards/chosen": -33.41564178466797, "rewards/margins": -1.280585765838623, "rewards/rejected": -32.13505554199219, "step": 1783 }, { "epoch": 0.2429193899782135, "grad_norm": 44.73003861508756, "learning_rate": 7.512929595124689e-07, "logits/chosen": 11.201366424560547, "logits/rejected": 11.572200775146484, "logps/chosen": -3.015415906906128, "logps/rejected": -3.3734560012817383, "loss": 4.3625, "rewards/accuracies": 1.0, "rewards/chosen": -30.154159545898438, "rewards/margins": 3.580402374267578, "rewards/rejected": -33.734561920166016, "step": 1784 }, { "epoch": 0.24305555555555555, "grad_norm": 46.59515916062208, "learning_rate": 7.512019882651251e-07, "logits/chosen": 10.999489784240723, "logits/rejected": 10.7575101852417, "logps/chosen": -3.208272933959961, "logps/rejected": -3.2302818298339844, "loss": 4.1772, "rewards/accuracies": 0.5, "rewards/chosen": -32.082725524902344, "rewards/margins": 0.22008943557739258, "rewards/rejected": -32.302818298339844, "step": 1785 }, { "epoch": 0.2431917211328976, "grad_norm": 42.35086442249243, "learning_rate": 7.511109376607891e-07, "logits/chosen": 10.073320388793945, "logits/rejected": 11.633989334106445, "logps/chosen": -2.9611544609069824, "logps/rejected": -3.2587766647338867, "loss": 4.2016, "rewards/accuracies": 0.75, "rewards/chosen": -29.61154556274414, "rewards/margins": 2.976222038269043, "rewards/rejected": -32.5877685546875, "step": 1786 }, { "epoch": 0.24332788671023964, "grad_norm": 42.49300903349434, "learning_rate": 7.510198077200343e-07, "logits/chosen": 10.415030479431152, "logits/rejected": 12.027722358703613, "logps/chosen": -2.8662538528442383, "logps/rejected": -3.4506397247314453, "loss": 4.0771, "rewards/accuracies": 1.0, "rewards/chosen": -28.66253662109375, "rewards/margins": 5.84385871887207, "rewards/rejected": -34.50639343261719, "step": 1787 }, { "epoch": 0.2434640522875817, "grad_norm": 45.219597521893014, "learning_rate": 7.509285984634523e-07, "logits/chosen": 11.114099502563477, "logits/rejected": 11.386645317077637, "logps/chosen": -3.4985387325286865, "logps/rejected": -3.450622797012329, "loss": 4.3818, "rewards/accuracies": 0.25, "rewards/chosen": -34.985389709472656, "rewards/margins": -0.4791593551635742, "rewards/rejected": -34.5062255859375, "step": 1788 }, { "epoch": 0.24360021786492375, "grad_norm": 41.376219115786135, "learning_rate": 7.508373099116529e-07, "logits/chosen": 11.78886890411377, "logits/rejected": 11.632425308227539, "logps/chosen": -3.357778549194336, "logps/rejected": -3.0644068717956543, "loss": 4.1219, "rewards/accuracies": 0.5, "rewards/chosen": -33.57778549194336, "rewards/margins": -2.933718681335449, "rewards/rejected": -30.644067764282227, "step": 1789 }, { "epoch": 0.2437363834422658, "grad_norm": 41.96710888673771, "learning_rate": 7.507459420852631e-07, "logits/chosen": 11.28701400756836, "logits/rejected": 11.818609237670898, "logps/chosen": -3.4286036491394043, "logps/rejected": -3.604970932006836, "loss": 3.7419, "rewards/accuracies": 0.75, "rewards/chosen": -34.28603744506836, "rewards/margins": 1.763674259185791, "rewards/rejected": -36.049713134765625, "step": 1790 }, { "epoch": 0.24387254901960784, "grad_norm": 44.984961734000905, "learning_rate": 7.506544950049285e-07, "logits/chosen": 9.6209716796875, "logits/rejected": 11.13068962097168, "logps/chosen": -2.7896933555603027, "logps/rejected": -3.1795644760131836, "loss": 4.0835, "rewards/accuracies": 0.75, "rewards/chosen": -27.896934509277344, "rewards/margins": 3.898710250854492, "rewards/rejected": -31.795644760131836, "step": 1791 }, { "epoch": 0.2440087145969499, "grad_norm": 50.81358336676065, "learning_rate": 7.505629686913121e-07, "logits/chosen": 11.599807739257812, "logits/rejected": 11.025075912475586, "logps/chosen": -3.273519992828369, "logps/rejected": -3.0981321334838867, "loss": 4.4203, "rewards/accuracies": 0.0, "rewards/chosen": -32.735198974609375, "rewards/margins": -1.7538785934448242, "rewards/rejected": -30.981319427490234, "step": 1792 }, { "epoch": 0.24414488017429195, "grad_norm": 50.04822092111291, "learning_rate": 7.504713631650952e-07, "logits/chosen": 9.695564270019531, "logits/rejected": 10.708366394042969, "logps/chosen": -2.979320764541626, "logps/rejected": -3.152907371520996, "loss": 4.1559, "rewards/accuracies": 0.75, "rewards/chosen": -29.7932071685791, "rewards/margins": 1.735865592956543, "rewards/rejected": -31.529075622558594, "step": 1793 }, { "epoch": 0.24428104575163398, "grad_norm": 39.258476490525474, "learning_rate": 7.503796784469769e-07, "logits/chosen": 10.842737197875977, "logits/rejected": 11.966521263122559, "logps/chosen": -2.9248886108398438, "logps/rejected": -3.3227925300598145, "loss": 3.6011, "rewards/accuracies": 0.75, "rewards/chosen": -29.248886108398438, "rewards/margins": 3.9790406227111816, "rewards/rejected": -33.227928161621094, "step": 1794 }, { "epoch": 0.24441721132897604, "grad_norm": 46.98700040557917, "learning_rate": 7.502879145576737e-07, "logits/chosen": 9.254291534423828, "logits/rejected": 10.877067565917969, "logps/chosen": -2.6828956604003906, "logps/rejected": -3.580575466156006, "loss": 3.8736, "rewards/accuracies": 1.0, "rewards/chosen": -26.828956604003906, "rewards/margins": 8.976795196533203, "rewards/rejected": -35.80575180053711, "step": 1795 }, { "epoch": 0.2445533769063181, "grad_norm": 45.14285788807105, "learning_rate": 7.501960715179208e-07, "logits/chosen": 11.001566886901855, "logits/rejected": 11.762211799621582, "logps/chosen": -3.2243194580078125, "logps/rejected": -3.514406204223633, "loss": 4.0914, "rewards/accuracies": 0.75, "rewards/chosen": -32.243194580078125, "rewards/margins": 2.9008688926696777, "rewards/rejected": -35.14406204223633, "step": 1796 }, { "epoch": 0.24468954248366012, "grad_norm": 39.19022458604284, "learning_rate": 7.50104149348471e-07, "logits/chosen": 11.076416015625, "logits/rejected": 12.108524322509766, "logps/chosen": -3.1595396995544434, "logps/rejected": -3.3288121223449707, "loss": 3.6377, "rewards/accuracies": 0.75, "rewards/chosen": -31.59539794921875, "rewards/margins": 1.6927227973937988, "rewards/rejected": -33.28812026977539, "step": 1797 }, { "epoch": 0.24482570806100218, "grad_norm": 50.28241481818251, "learning_rate": 7.500121480700943e-07, "logits/chosen": 10.639142990112305, "logits/rejected": 11.572948455810547, "logps/chosen": -3.451314926147461, "logps/rejected": -3.5341286659240723, "loss": 4.0763, "rewards/accuracies": 0.75, "rewards/chosen": -34.51314926147461, "rewards/margins": 0.8281373977661133, "rewards/rejected": -35.341285705566406, "step": 1798 }, { "epoch": 0.24496187363834424, "grad_norm": 46.504333177280266, "learning_rate": 7.499200677035798e-07, "logits/chosen": 11.112197875976562, "logits/rejected": 11.377762794494629, "logps/chosen": -3.3683037757873535, "logps/rejected": -3.5492045879364014, "loss": 4.4763, "rewards/accuracies": 0.75, "rewards/chosen": -33.68303680419922, "rewards/margins": 1.8090085983276367, "rewards/rejected": -35.492042541503906, "step": 1799 }, { "epoch": 0.24509803921568626, "grad_norm": 42.10973596079227, "learning_rate": 7.498279082697335e-07, "logits/chosen": 11.055283546447754, "logits/rejected": 10.310532569885254, "logps/chosen": -3.4262101650238037, "logps/rejected": -3.61014986038208, "loss": 4.0409, "rewards/accuracies": 0.5, "rewards/chosen": -34.26210021972656, "rewards/margins": 1.8393993377685547, "rewards/rejected": -36.10150146484375, "step": 1800 }, { "epoch": 0.24523420479302832, "grad_norm": 66.17924551718491, "learning_rate": 7.497356697893795e-07, "logits/chosen": 11.584994316101074, "logits/rejected": 12.366267204284668, "logps/chosen": -3.2126591205596924, "logps/rejected": -3.083048105239868, "loss": 3.3448, "rewards/accuracies": 0.25, "rewards/chosen": -32.126590728759766, "rewards/margins": -1.2961101531982422, "rewards/rejected": -30.830480575561523, "step": 1801 }, { "epoch": 0.24537037037037038, "grad_norm": 56.1467111007096, "learning_rate": 7.496433522833602e-07, "logits/chosen": 11.392831802368164, "logits/rejected": 11.116937637329102, "logps/chosen": -2.954639196395874, "logps/rejected": -3.2873682975769043, "loss": 4.4078, "rewards/accuracies": 1.0, "rewards/chosen": -29.54639434814453, "rewards/margins": 3.327291965484619, "rewards/rejected": -32.873687744140625, "step": 1802 }, { "epoch": 0.2455065359477124, "grad_norm": 41.252976993514025, "learning_rate": 7.49550955772535e-07, "logits/chosen": 11.276347160339355, "logits/rejected": 10.573877334594727, "logps/chosen": -3.179567813873291, "logps/rejected": -3.2691545486450195, "loss": 4.2001, "rewards/accuracies": 0.5, "rewards/chosen": -31.795679092407227, "rewards/margins": 0.8958678245544434, "rewards/rejected": -32.69154739379883, "step": 1803 }, { "epoch": 0.24564270152505446, "grad_norm": 43.13714719895776, "learning_rate": 7.494584802777821e-07, "logits/chosen": 10.884084701538086, "logits/rejected": 11.249504089355469, "logps/chosen": -2.70609450340271, "logps/rejected": -2.829014778137207, "loss": 4.2412, "rewards/accuracies": 0.5, "rewards/chosen": -27.060945510864258, "rewards/margins": 1.2292022705078125, "rewards/rejected": -28.290149688720703, "step": 1804 }, { "epoch": 0.24577886710239652, "grad_norm": 42.0354925567953, "learning_rate": 7.493659258199969e-07, "logits/chosen": 10.89360237121582, "logits/rejected": 11.945589065551758, "logps/chosen": -3.1445789337158203, "logps/rejected": -3.4207448959350586, "loss": 4.0428, "rewards/accuracies": 0.75, "rewards/chosen": -31.44578742980957, "rewards/margins": 2.761659622192383, "rewards/rejected": -34.20745086669922, "step": 1805 }, { "epoch": 0.24591503267973855, "grad_norm": 42.13832937737018, "learning_rate": 7.492732924200927e-07, "logits/chosen": 11.896068572998047, "logits/rejected": 11.12497615814209, "logps/chosen": -2.995847225189209, "logps/rejected": -3.717725992202759, "loss": 3.6566, "rewards/accuracies": 1.0, "rewards/chosen": -29.958473205566406, "rewards/margins": 7.218786239624023, "rewards/rejected": -37.17726135253906, "step": 1806 }, { "epoch": 0.2460511982570806, "grad_norm": 46.30487790154749, "learning_rate": 7.491805800990011e-07, "logits/chosen": 11.18077278137207, "logits/rejected": 11.653064727783203, "logps/chosen": -2.8695178031921387, "logps/rejected": -3.2240500450134277, "loss": 4.3164, "rewards/accuracies": 0.75, "rewards/chosen": -28.69517707824707, "rewards/margins": 3.545322895050049, "rewards/rejected": -32.240501403808594, "step": 1807 }, { "epoch": 0.24618736383442266, "grad_norm": 98.80620797232197, "learning_rate": 7.490877888776712e-07, "logits/chosen": 11.121118545532227, "logits/rejected": 11.085458755493164, "logps/chosen": -3.0644469261169434, "logps/rejected": -3.5494203567504883, "loss": 4.4763, "rewards/accuracies": 0.75, "rewards/chosen": -30.64447021484375, "rewards/margins": 4.849734783172607, "rewards/rejected": -35.494205474853516, "step": 1808 }, { "epoch": 0.24632352941176472, "grad_norm": 43.543667410322556, "learning_rate": 7.489949187770695e-07, "logits/chosen": 11.23291015625, "logits/rejected": 10.906952857971191, "logps/chosen": -3.190316677093506, "logps/rejected": -3.0466456413269043, "loss": 3.5533, "rewards/accuracies": 0.25, "rewards/chosen": -31.90316390991211, "rewards/margins": -1.4367103576660156, "rewards/rejected": -30.466455459594727, "step": 1809 }, { "epoch": 0.24645969498910675, "grad_norm": 42.59380140086508, "learning_rate": 7.489019698181813e-07, "logits/chosen": 10.667098045349121, "logits/rejected": 10.568184852600098, "logps/chosen": -2.7193856239318848, "logps/rejected": -2.7652270793914795, "loss": 3.9481, "rewards/accuracies": 0.75, "rewards/chosen": -27.19385528564453, "rewards/margins": 0.45841550827026367, "rewards/rejected": -27.652271270751953, "step": 1810 }, { "epoch": 0.2465958605664488, "grad_norm": 47.140288907020704, "learning_rate": 7.48808942022009e-07, "logits/chosen": 11.640802383422852, "logits/rejected": 12.580406188964844, "logps/chosen": -3.273421287536621, "logps/rejected": -3.607705593109131, "loss": 3.9905, "rewards/accuracies": 0.75, "rewards/chosen": -32.734214782714844, "rewards/margins": 3.3428425788879395, "rewards/rejected": -36.077056884765625, "step": 1811 }, { "epoch": 0.24673202614379086, "grad_norm": 46.85258124529804, "learning_rate": 7.487158354095729e-07, "logits/chosen": 10.368759155273438, "logits/rejected": 10.66466999053955, "logps/chosen": -3.0763778686523438, "logps/rejected": -3.170710802078247, "loss": 4.4989, "rewards/accuracies": 0.75, "rewards/chosen": -30.763778686523438, "rewards/margins": 0.9433302879333496, "rewards/rejected": -31.707107543945312, "step": 1812 }, { "epoch": 0.2468681917211329, "grad_norm": 47.059173793914105, "learning_rate": 7.486226500019112e-07, "logits/chosen": 10.137550354003906, "logits/rejected": 11.315290451049805, "logps/chosen": -2.8961498737335205, "logps/rejected": -3.390824556350708, "loss": 4.3871, "rewards/accuracies": 0.75, "rewards/chosen": -28.961498260498047, "rewards/margins": 4.946748733520508, "rewards/rejected": -33.90824890136719, "step": 1813 }, { "epoch": 0.24700435729847495, "grad_norm": 47.338152143808806, "learning_rate": 7.485293858200801e-07, "logits/chosen": 11.441173553466797, "logits/rejected": 11.446409225463867, "logps/chosen": -3.1684563159942627, "logps/rejected": -3.1714835166931152, "loss": 4.3, "rewards/accuracies": 0.25, "rewards/chosen": -31.68456268310547, "rewards/margins": 0.03027057647705078, "rewards/rejected": -31.714834213256836, "step": 1814 }, { "epoch": 0.247140522875817, "grad_norm": 46.967756554812226, "learning_rate": 7.484360428851532e-07, "logits/chosen": 10.534808158874512, "logits/rejected": 10.706074714660645, "logps/chosen": -2.811016082763672, "logps/rejected": -3.077510356903076, "loss": 4.2044, "rewards/accuracies": 1.0, "rewards/chosen": -28.11016273498535, "rewards/margins": 2.664940357208252, "rewards/rejected": -30.775102615356445, "step": 1815 }, { "epoch": 0.24727668845315903, "grad_norm": 44.94104571627108, "learning_rate": 7.483426212182223e-07, "logits/chosen": 9.898669242858887, "logits/rejected": 12.11054515838623, "logps/chosen": -3.0378665924072266, "logps/rejected": -3.4783995151519775, "loss": 3.4607, "rewards/accuracies": 0.75, "rewards/chosen": -30.378665924072266, "rewards/margins": 4.405330181121826, "rewards/rejected": -34.78399658203125, "step": 1816 }, { "epoch": 0.2474128540305011, "grad_norm": 48.764776521343286, "learning_rate": 7.482491208403967e-07, "logits/chosen": 11.19146728515625, "logits/rejected": 10.897554397583008, "logps/chosen": -3.4940571784973145, "logps/rejected": -3.670335292816162, "loss": 4.4887, "rewards/accuracies": 1.0, "rewards/chosen": -34.94056701660156, "rewards/margins": 1.7627811431884766, "rewards/rejected": -36.70335006713867, "step": 1817 }, { "epoch": 0.24754901960784315, "grad_norm": 50.43018651972567, "learning_rate": 7.481555417728035e-07, "logits/chosen": 11.820890426635742, "logits/rejected": 11.677523612976074, "logps/chosen": -3.2526235580444336, "logps/rejected": -3.6348958015441895, "loss": 4.0487, "rewards/accuracies": 1.0, "rewards/chosen": -32.52623748779297, "rewards/margins": 3.8227224349975586, "rewards/rejected": -36.34895706176758, "step": 1818 }, { "epoch": 0.24768518518518517, "grad_norm": 48.50948460641658, "learning_rate": 7.480618840365879e-07, "logits/chosen": 10.943262100219727, "logits/rejected": 11.370609283447266, "logps/chosen": -3.4089293479919434, "logps/rejected": -3.3224377632141113, "loss": 4.5143, "rewards/accuracies": 0.25, "rewards/chosen": -34.08929443359375, "rewards/margins": -0.8649144172668457, "rewards/rejected": -33.22438049316406, "step": 1819 }, { "epoch": 0.24782135076252723, "grad_norm": 48.01482221547562, "learning_rate": 7.479681476529123e-07, "logits/chosen": 10.536479949951172, "logits/rejected": 11.24127197265625, "logps/chosen": -2.5444512367248535, "logps/rejected": -2.8985652923583984, "loss": 3.4178, "rewards/accuracies": 0.75, "rewards/chosen": -25.44451332092285, "rewards/margins": 3.541139602661133, "rewards/rejected": -28.985652923583984, "step": 1820 }, { "epoch": 0.2479575163398693, "grad_norm": 42.093822269891426, "learning_rate": 7.478743326429576e-07, "logits/chosen": 10.913297653198242, "logits/rejected": 9.995853424072266, "logps/chosen": -3.570664405822754, "logps/rejected": -3.2234082221984863, "loss": 3.8353, "rewards/accuracies": 0.0, "rewards/chosen": -35.706642150878906, "rewards/margins": -3.472562313079834, "rewards/rejected": -32.23408126831055, "step": 1821 }, { "epoch": 0.24809368191721132, "grad_norm": 49.63955559965905, "learning_rate": 7.477804390279217e-07, "logits/chosen": 10.886558532714844, "logits/rejected": 11.660863876342773, "logps/chosen": -3.5532479286193848, "logps/rejected": -3.6461589336395264, "loss": 3.8462, "rewards/accuracies": 0.25, "rewards/chosen": -35.53247833251953, "rewards/margins": 0.9291138648986816, "rewards/rejected": -36.46158981323242, "step": 1822 }, { "epoch": 0.24822984749455337, "grad_norm": 70.38614671628335, "learning_rate": 7.47686466829021e-07, "logits/chosen": 12.01966667175293, "logits/rejected": 11.324209213256836, "logps/chosen": -3.215996265411377, "logps/rejected": -3.3204047679901123, "loss": 4.9129, "rewards/accuracies": 0.25, "rewards/chosen": -32.15996170043945, "rewards/margins": 1.0440864562988281, "rewards/rejected": -33.20404815673828, "step": 1823 }, { "epoch": 0.24836601307189543, "grad_norm": 55.454499284293554, "learning_rate": 7.47592416067489e-07, "logits/chosen": 10.631431579589844, "logits/rejected": 10.743180274963379, "logps/chosen": -3.2043750286102295, "logps/rejected": -3.173656702041626, "loss": 3.9991, "rewards/accuracies": 0.5, "rewards/chosen": -32.04375076293945, "rewards/margins": -0.30718469619750977, "rewards/rejected": -31.73656463623047, "step": 1824 }, { "epoch": 0.24850217864923746, "grad_norm": 55.187828096206836, "learning_rate": 7.474982867645774e-07, "logits/chosen": 10.785867691040039, "logits/rejected": 10.773126602172852, "logps/chosen": -2.945234775543213, "logps/rejected": -3.256573438644409, "loss": 3.8171, "rewards/accuracies": 0.75, "rewards/chosen": -29.452348709106445, "rewards/margins": 3.1133880615234375, "rewards/rejected": -32.56573486328125, "step": 1825 }, { "epoch": 0.24863834422657952, "grad_norm": 51.7476498939853, "learning_rate": 7.474040789415554e-07, "logits/chosen": 11.988836288452148, "logits/rejected": 11.639792442321777, "logps/chosen": -3.185595989227295, "logps/rejected": -3.1957244873046875, "loss": 4.0881, "rewards/accuracies": 0.5, "rewards/chosen": -31.855960845947266, "rewards/margins": 0.10128498077392578, "rewards/rejected": -31.957244873046875, "step": 1826 }, { "epoch": 0.24877450980392157, "grad_norm": 48.30418323987548, "learning_rate": 7.473097926197102e-07, "logits/chosen": 9.067937850952148, "logits/rejected": 11.51120376586914, "logps/chosen": -3.034534454345703, "logps/rejected": -3.604229688644409, "loss": 3.9832, "rewards/accuracies": 1.0, "rewards/chosen": -30.34534454345703, "rewards/margins": 5.696952819824219, "rewards/rejected": -36.04229736328125, "step": 1827 }, { "epoch": 0.24891067538126363, "grad_norm": 45.851636048498285, "learning_rate": 7.472154278203463e-07, "logits/chosen": 9.97997760772705, "logits/rejected": 10.715929985046387, "logps/chosen": -2.768144369125366, "logps/rejected": -3.491387367248535, "loss": 3.8488, "rewards/accuracies": 1.0, "rewards/chosen": -27.68144416809082, "rewards/margins": 7.232429504394531, "rewards/rejected": -34.913875579833984, "step": 1828 }, { "epoch": 0.24904684095860566, "grad_norm": 56.84781909835449, "learning_rate": 7.471209845647865e-07, "logits/chosen": 9.930768966674805, "logits/rejected": 10.349720001220703, "logps/chosen": -2.5102508068084717, "logps/rejected": -2.7836506366729736, "loss": 3.2339, "rewards/accuracies": 0.75, "rewards/chosen": -25.102508544921875, "rewards/margins": 2.733999252319336, "rewards/rejected": -27.836505889892578, "step": 1829 }, { "epoch": 0.24918300653594772, "grad_norm": 55.06020720677077, "learning_rate": 7.470264628743709e-07, "logits/chosen": 10.863630294799805, "logits/rejected": 11.202025413513184, "logps/chosen": -2.8679745197296143, "logps/rejected": -3.052912712097168, "loss": 4.4655, "rewards/accuracies": 0.75, "rewards/chosen": -28.679744720458984, "rewards/margins": 1.849381923675537, "rewards/rejected": -30.529125213623047, "step": 1830 }, { "epoch": 0.24931917211328977, "grad_norm": 46.08948756911176, "learning_rate": 7.469318627704573e-07, "logits/chosen": 9.441263198852539, "logits/rejected": 11.709915161132812, "logps/chosen": -2.708922863006592, "logps/rejected": -3.2855865955352783, "loss": 3.6964, "rewards/accuracies": 0.75, "rewards/chosen": -27.089229583740234, "rewards/margins": 5.766636371612549, "rewards/rejected": -32.855865478515625, "step": 1831 }, { "epoch": 0.2494553376906318, "grad_norm": 42.329834275929485, "learning_rate": 7.468371842744218e-07, "logits/chosen": 10.890876770019531, "logits/rejected": 10.836701393127441, "logps/chosen": -2.8406307697296143, "logps/rejected": -3.157744884490967, "loss": 3.7079, "rewards/accuracies": 0.75, "rewards/chosen": -28.406307220458984, "rewards/margins": 3.1711416244506836, "rewards/rejected": -31.577449798583984, "step": 1832 }, { "epoch": 0.24959150326797386, "grad_norm": 46.01978421835885, "learning_rate": 7.467424274076574e-07, "logits/chosen": 9.171030044555664, "logits/rejected": 9.569849014282227, "logps/chosen": -2.854454517364502, "logps/rejected": -3.166867733001709, "loss": 3.7665, "rewards/accuracies": 0.75, "rewards/chosen": -28.544544219970703, "rewards/margins": 3.124131202697754, "rewards/rejected": -31.668676376342773, "step": 1833 }, { "epoch": 0.24972766884531591, "grad_norm": 44.48827258587138, "learning_rate": 7.466475921915753e-07, "logits/chosen": 9.158990859985352, "logits/rejected": 9.12511157989502, "logps/chosen": -2.698441982269287, "logps/rejected": -2.8421690464019775, "loss": 3.8104, "rewards/accuracies": 1.0, "rewards/chosen": -26.984416961669922, "rewards/margins": 1.4372735023498535, "rewards/rejected": -28.42169189453125, "step": 1834 }, { "epoch": 0.24986383442265794, "grad_norm": 48.28105687751508, "learning_rate": 7.465526786476044e-07, "logits/chosen": 9.159280776977539, "logits/rejected": 10.216215133666992, "logps/chosen": -2.9341580867767334, "logps/rejected": -3.3621463775634766, "loss": 4.3297, "rewards/accuracies": 1.0, "rewards/chosen": -29.341581344604492, "rewards/margins": 4.279884338378906, "rewards/rejected": -33.62146759033203, "step": 1835 }, { "epoch": 0.25, "grad_norm": 49.191169852181254, "learning_rate": 7.464576867971911e-07, "logits/chosen": 10.711397171020508, "logits/rejected": 11.27529239654541, "logps/chosen": -3.3415186405181885, "logps/rejected": -3.2346980571746826, "loss": 4.0027, "rewards/accuracies": 0.25, "rewards/chosen": -33.415184020996094, "rewards/margins": -1.0682058334350586, "rewards/rejected": -32.346981048583984, "step": 1836 }, { "epoch": 0.25013616557734203, "grad_norm": 45.14974194855605, "learning_rate": 7.463626166617996e-07, "logits/chosen": 10.37271499633789, "logits/rejected": 11.408161163330078, "logps/chosen": -2.721623420715332, "logps/rejected": -3.096604824066162, "loss": 3.5741, "rewards/accuracies": 0.5, "rewards/chosen": -27.21623420715332, "rewards/margins": 3.7498154640197754, "rewards/rejected": -30.966049194335938, "step": 1837 }, { "epoch": 0.2502723311546841, "grad_norm": 54.98700993934836, "learning_rate": 7.462674682629119e-07, "logits/chosen": 9.843291282653809, "logits/rejected": 11.418045043945312, "logps/chosen": -2.73388409614563, "logps/rejected": -3.155658006668091, "loss": 4.2408, "rewards/accuracies": 1.0, "rewards/chosen": -27.338842391967773, "rewards/margins": 4.217738628387451, "rewards/rejected": -31.556581497192383, "step": 1838 }, { "epoch": 0.25040849673202614, "grad_norm": 56.002698883416905, "learning_rate": 7.461722416220273e-07, "logits/chosen": 11.07461929321289, "logits/rejected": 10.663230895996094, "logps/chosen": -2.9302258491516113, "logps/rejected": -2.909287929534912, "loss": 4.3725, "rewards/accuracies": 0.5, "rewards/chosen": -29.302257537841797, "rewards/margins": -0.2093801498413086, "rewards/rejected": -29.092876434326172, "step": 1839 }, { "epoch": 0.25054466230936817, "grad_norm": 44.630563264291396, "learning_rate": 7.460769367606632e-07, "logits/chosen": 9.214248657226562, "logits/rejected": 11.200040817260742, "logps/chosen": -2.375450849533081, "logps/rejected": -2.893383502960205, "loss": 3.9065, "rewards/accuracies": 0.75, "rewards/chosen": -23.75450897216797, "rewards/margins": 5.17932653427124, "rewards/rejected": -28.933835983276367, "step": 1840 }, { "epoch": 0.25068082788671026, "grad_norm": 45.25791932278719, "learning_rate": 7.459815537003548e-07, "logits/chosen": 9.984336853027344, "logits/rejected": 10.78461742401123, "logps/chosen": -3.162787437438965, "logps/rejected": -3.6032118797302246, "loss": 4.4541, "rewards/accuracies": 1.0, "rewards/chosen": -31.62787437438965, "rewards/margins": 4.404246807098389, "rewards/rejected": -36.03211975097656, "step": 1841 }, { "epoch": 0.2508169934640523, "grad_norm": 55.574353455785065, "learning_rate": 7.458860924626541e-07, "logits/chosen": 10.240072250366211, "logits/rejected": 10.360841751098633, "logps/chosen": -3.1802191734313965, "logps/rejected": -3.3040924072265625, "loss": 4.617, "rewards/accuracies": 0.75, "rewards/chosen": -31.80219078063965, "rewards/margins": 1.2387323379516602, "rewards/rejected": -33.040924072265625, "step": 1842 }, { "epoch": 0.2509531590413943, "grad_norm": 45.49044031640759, "learning_rate": 7.457905530691319e-07, "logits/chosen": 9.935361862182617, "logits/rejected": 11.495223045349121, "logps/chosen": -3.294844150543213, "logps/rejected": -3.591263771057129, "loss": 3.9839, "rewards/accuracies": 0.75, "rewards/chosen": -32.94844055175781, "rewards/margins": 2.964198112487793, "rewards/rejected": -35.912635803222656, "step": 1843 }, { "epoch": 0.2510893246187364, "grad_norm": 45.40903018922701, "learning_rate": 7.456949355413759e-07, "logits/chosen": 11.690224647521973, "logits/rejected": 11.837270736694336, "logps/chosen": -3.2368593215942383, "logps/rejected": -3.414276599884033, "loss": 3.9993, "rewards/accuracies": 1.0, "rewards/chosen": -32.368595123291016, "rewards/margins": 1.7741708755493164, "rewards/rejected": -34.14276885986328, "step": 1844 }, { "epoch": 0.2512254901960784, "grad_norm": 45.169918186406655, "learning_rate": 7.455992399009917e-07, "logits/chosen": 10.856512069702148, "logits/rejected": 10.065845489501953, "logps/chosen": -3.3325881958007812, "logps/rejected": -3.2392497062683105, "loss": 4.3966, "rewards/accuracies": 0.5, "rewards/chosen": -33.32588195800781, "rewards/margins": -0.9333834648132324, "rewards/rejected": -32.392494201660156, "step": 1845 }, { "epoch": 0.25136165577342046, "grad_norm": 56.89420993855234, "learning_rate": 7.455034661696023e-07, "logits/chosen": 10.416910171508789, "logits/rejected": 11.188560485839844, "logps/chosen": -3.422791004180908, "logps/rejected": -3.306532382965088, "loss": 3.9867, "rewards/accuracies": 0.25, "rewards/chosen": -34.227909088134766, "rewards/margins": -1.1625871658325195, "rewards/rejected": -33.06532287597656, "step": 1846 }, { "epoch": 0.25149782135076254, "grad_norm": 51.81889900010299, "learning_rate": 7.454076143688489e-07, "logits/chosen": 10.517385482788086, "logits/rejected": 10.123753547668457, "logps/chosen": -3.426887035369873, "logps/rejected": -3.2836577892303467, "loss": 4.2033, "rewards/accuracies": 0.25, "rewards/chosen": -34.26887130737305, "rewards/margins": -1.43229341506958, "rewards/rejected": -32.836578369140625, "step": 1847 }, { "epoch": 0.25163398692810457, "grad_norm": 43.080576875332085, "learning_rate": 7.453116845203898e-07, "logits/chosen": 11.701702117919922, "logits/rejected": 11.27387809753418, "logps/chosen": -3.629629611968994, "logps/rejected": -3.611487865447998, "loss": 4.1619, "rewards/accuracies": 0.25, "rewards/chosen": -36.296295166015625, "rewards/margins": -0.18141460418701172, "rewards/rejected": -36.11488342285156, "step": 1848 }, { "epoch": 0.2517701525054466, "grad_norm": 47.522173002105305, "learning_rate": 7.452156766459013e-07, "logits/chosen": 9.282890319824219, "logits/rejected": 10.20258903503418, "logps/chosen": -3.2399327754974365, "logps/rejected": -3.6598286628723145, "loss": 4.3919, "rewards/accuracies": 1.0, "rewards/chosen": -32.39932632446289, "rewards/margins": 4.198959827423096, "rewards/rejected": -36.598289489746094, "step": 1849 }, { "epoch": 0.2519063180827887, "grad_norm": 46.336917476028226, "learning_rate": 7.451195907670769e-07, "logits/chosen": 11.196414947509766, "logits/rejected": 9.963388442993164, "logps/chosen": -3.5428600311279297, "logps/rejected": -3.311751365661621, "loss": 4.3736, "rewards/accuracies": 0.25, "rewards/chosen": -35.42859649658203, "rewards/margins": -2.3110852241516113, "rewards/rejected": -33.117515563964844, "step": 1850 }, { "epoch": 0.2520424836601307, "grad_norm": 50.59451236888898, "learning_rate": 7.450234269056284e-07, "logits/chosen": 10.682638168334961, "logits/rejected": 10.89657211303711, "logps/chosen": -3.352735996246338, "logps/rejected": -3.687079429626465, "loss": 3.9865, "rewards/accuracies": 1.0, "rewards/chosen": -33.52735900878906, "rewards/margins": 3.3434362411499023, "rewards/rejected": -36.87079620361328, "step": 1851 }, { "epoch": 0.25217864923747274, "grad_norm": 52.71639233113721, "learning_rate": 7.449271850832845e-07, "logits/chosen": 10.062429428100586, "logits/rejected": 10.579879760742188, "logps/chosen": -3.0567548274993896, "logps/rejected": -3.3524935245513916, "loss": 4.0746, "rewards/accuracies": 1.0, "rewards/chosen": -30.567550659179688, "rewards/margins": 2.957386016845703, "rewards/rejected": -33.524932861328125, "step": 1852 }, { "epoch": 0.2523148148148148, "grad_norm": 49.33482799315466, "learning_rate": 7.448308653217919e-07, "logits/chosen": 9.580796241760254, "logits/rejected": 10.773762702941895, "logps/chosen": -3.0813663005828857, "logps/rejected": -3.2721543312072754, "loss": 4.5225, "rewards/accuracies": 0.75, "rewards/chosen": -30.813661575317383, "rewards/margins": 1.907881259918213, "rewards/rejected": -32.72154235839844, "step": 1853 }, { "epoch": 0.25245098039215685, "grad_norm": 51.419159820761635, "learning_rate": 7.447344676429149e-07, "logits/chosen": 10.291664123535156, "logits/rejected": 11.14995002746582, "logps/chosen": -3.404958963394165, "logps/rejected": -3.2938783168792725, "loss": 4.556, "rewards/accuracies": 0.5, "rewards/chosen": -34.049591064453125, "rewards/margins": -1.1108055114746094, "rewards/rejected": -32.93878173828125, "step": 1854 }, { "epoch": 0.2525871459694989, "grad_norm": 50.42245320809165, "learning_rate": 7.446379920684354e-07, "logits/chosen": 9.806678771972656, "logits/rejected": 10.911354064941406, "logps/chosen": -2.980863094329834, "logps/rejected": -3.523303747177124, "loss": 3.8048, "rewards/accuracies": 1.0, "rewards/chosen": -29.808631896972656, "rewards/margins": 5.424403667449951, "rewards/rejected": -35.233036041259766, "step": 1855 }, { "epoch": 0.25272331154684097, "grad_norm": 50.683222226381744, "learning_rate": 7.445414386201527e-07, "logits/chosen": 9.32080364227295, "logits/rejected": 11.174911499023438, "logps/chosen": -3.108257532119751, "logps/rejected": -3.3391149044036865, "loss": 4.4519, "rewards/accuracies": 0.5, "rewards/chosen": -31.082576751708984, "rewards/margins": 2.3085732460021973, "rewards/rejected": -33.39114761352539, "step": 1856 }, { "epoch": 0.252859477124183, "grad_norm": 49.46308916478209, "learning_rate": 7.44444807319884e-07, "logits/chosen": 10.291067123413086, "logits/rejected": 10.459285736083984, "logps/chosen": -3.182506561279297, "logps/rejected": -3.237787961959839, "loss": 4.1446, "rewards/accuracies": 0.5, "rewards/chosen": -31.82506561279297, "rewards/margins": 0.5528149604797363, "rewards/rejected": -32.37788009643555, "step": 1857 }, { "epoch": 0.2529956427015251, "grad_norm": 49.809827525538196, "learning_rate": 7.443480981894637e-07, "logits/chosen": 11.350669860839844, "logits/rejected": 10.387088775634766, "logps/chosen": -3.361234664916992, "logps/rejected": -3.429083824157715, "loss": 4.2206, "rewards/accuracies": 0.75, "rewards/chosen": -33.61234664916992, "rewards/margins": 0.6784906387329102, "rewards/rejected": -34.29084014892578, "step": 1858 }, { "epoch": 0.2531318082788671, "grad_norm": 53.08962309609622, "learning_rate": 7.442513112507445e-07, "logits/chosen": 10.084037780761719, "logits/rejected": 11.794548034667969, "logps/chosen": -3.322629451751709, "logps/rejected": -3.291201591491699, "loss": 3.9754, "rewards/accuracies": 0.5, "rewards/chosen": -33.226295471191406, "rewards/margins": -0.31427764892578125, "rewards/rejected": -32.912017822265625, "step": 1859 }, { "epoch": 0.25326797385620914, "grad_norm": 53.05586725064102, "learning_rate": 7.441544465255956e-07, "logits/chosen": 11.152490615844727, "logits/rejected": 10.659205436706543, "logps/chosen": -3.0627574920654297, "logps/rejected": -3.1679134368896484, "loss": 3.9687, "rewards/accuracies": 0.5, "rewards/chosen": -30.627573013305664, "rewards/margins": 1.0515637397766113, "rewards/rejected": -31.679136276245117, "step": 1860 }, { "epoch": 0.2534041394335512, "grad_norm": 66.61998649756544, "learning_rate": 7.44057504035905e-07, "logits/chosen": 10.205988883972168, "logits/rejected": 10.022405624389648, "logps/chosen": -3.243082284927368, "logps/rejected": -3.30476713180542, "loss": 4.6673, "rewards/accuracies": 0.5, "rewards/chosen": -32.430824279785156, "rewards/margins": 0.6168479919433594, "rewards/rejected": -33.047672271728516, "step": 1861 }, { "epoch": 0.25354030501089325, "grad_norm": 51.87543475342798, "learning_rate": 7.439604838035771e-07, "logits/chosen": 8.829254150390625, "logits/rejected": 9.505147933959961, "logps/chosen": -2.774857997894287, "logps/rejected": -3.148880958557129, "loss": 4.4154, "rewards/accuracies": 0.75, "rewards/chosen": -27.748580932617188, "rewards/margins": 3.7402286529541016, "rewards/rejected": -31.48880958557129, "step": 1862 }, { "epoch": 0.2536764705882353, "grad_norm": 44.83229627370964, "learning_rate": 7.438633858505348e-07, "logits/chosen": 10.364736557006836, "logits/rejected": 11.303144454956055, "logps/chosen": -3.0327060222625732, "logps/rejected": -3.345813751220703, "loss": 3.6849, "rewards/accuracies": 0.75, "rewards/chosen": -30.327058792114258, "rewards/margins": 3.131080150604248, "rewards/rejected": -33.4581413269043, "step": 1863 }, { "epoch": 0.25381263616557737, "grad_norm": 52.35828668288705, "learning_rate": 7.437662101987181e-07, "logits/chosen": 8.488842010498047, "logits/rejected": 9.361648559570312, "logps/chosen": -3.0102686882019043, "logps/rejected": -3.4411191940307617, "loss": 4.6536, "rewards/accuracies": 1.0, "rewards/chosen": -30.102685928344727, "rewards/margins": 4.308504581451416, "rewards/rejected": -34.41119384765625, "step": 1864 }, { "epoch": 0.2539488017429194, "grad_norm": 50.221202765105375, "learning_rate": 7.436689568700845e-07, "logits/chosen": 10.375836372375488, "logits/rejected": 11.82626724243164, "logps/chosen": -3.126542091369629, "logps/rejected": -3.6598682403564453, "loss": 3.7614, "rewards/accuracies": 1.0, "rewards/chosen": -31.265422821044922, "rewards/margins": 5.333258628845215, "rewards/rejected": -36.59868240356445, "step": 1865 }, { "epoch": 0.2540849673202614, "grad_norm": 46.37186988793729, "learning_rate": 7.435716258866093e-07, "logits/chosen": 11.680570602416992, "logits/rejected": 10.805139541625977, "logps/chosen": -3.581463575363159, "logps/rejected": -3.5041427612304688, "loss": 4.2388, "rewards/accuracies": 0.25, "rewards/chosen": -35.81463623046875, "rewards/margins": -0.7732081413269043, "rewards/rejected": -35.04142761230469, "step": 1866 }, { "epoch": 0.2542211328976035, "grad_norm": 71.01846663446719, "learning_rate": 7.434742172702854e-07, "logits/chosen": 9.602170944213867, "logits/rejected": 11.191556930541992, "logps/chosen": -3.633450508117676, "logps/rejected": -3.6912074089050293, "loss": 4.1372, "rewards/accuracies": 0.5, "rewards/chosen": -36.33450698852539, "rewards/margins": 0.5775671005249023, "rewards/rejected": -36.91207504272461, "step": 1867 }, { "epoch": 0.25435729847494554, "grad_norm": 58.11924769515684, "learning_rate": 7.433767310431228e-07, "logits/chosen": 10.482129096984863, "logits/rejected": 10.905062675476074, "logps/chosen": -2.899278163909912, "logps/rejected": -3.2625820636749268, "loss": 3.902, "rewards/accuracies": 0.75, "rewards/chosen": -28.992778778076172, "rewards/margins": 3.633039951324463, "rewards/rejected": -32.625816345214844, "step": 1868 }, { "epoch": 0.25449346405228757, "grad_norm": 88.92457391814457, "learning_rate": 7.432791672271495e-07, "logits/chosen": 10.926553726196289, "logits/rejected": 10.936155319213867, "logps/chosen": -3.1975767612457275, "logps/rejected": -3.1121935844421387, "loss": 4.3891, "rewards/accuracies": 0.25, "rewards/chosen": -31.975767135620117, "rewards/margins": -0.8538317680358887, "rewards/rejected": -31.12193489074707, "step": 1869 }, { "epoch": 0.25462962962962965, "grad_norm": 45.194972214654555, "learning_rate": 7.431815258444107e-07, "logits/chosen": 9.292102813720703, "logits/rejected": 10.890199661254883, "logps/chosen": -2.5499939918518066, "logps/rejected": -3.0715439319610596, "loss": 3.6969, "rewards/accuracies": 1.0, "rewards/chosen": -25.49993896484375, "rewards/margins": 5.2154998779296875, "rewards/rejected": -30.715438842773438, "step": 1870 }, { "epoch": 0.2547657952069717, "grad_norm": 45.533940856254894, "learning_rate": 7.430838069169695e-07, "logits/chosen": 10.239259719848633, "logits/rejected": 11.344964981079102, "logps/chosen": -2.928346633911133, "logps/rejected": -3.268681287765503, "loss": 3.9316, "rewards/accuracies": 1.0, "rewards/chosen": -29.283466339111328, "rewards/margins": 3.403348922729492, "rewards/rejected": -32.68681335449219, "step": 1871 }, { "epoch": 0.2549019607843137, "grad_norm": 44.4000576223676, "learning_rate": 7.42986010466906e-07, "logits/chosen": 10.385176658630371, "logits/rejected": 10.884385108947754, "logps/chosen": -3.1280970573425293, "logps/rejected": -3.269073009490967, "loss": 3.2529, "rewards/accuracies": 0.75, "rewards/chosen": -31.280973434448242, "rewards/margins": 1.4097561836242676, "rewards/rejected": -32.69072723388672, "step": 1872 }, { "epoch": 0.2550381263616558, "grad_norm": 43.477689497988855, "learning_rate": 7.428881365163183e-07, "logits/chosen": 8.825302124023438, "logits/rejected": 10.319021224975586, "logps/chosen": -2.9438862800598145, "logps/rejected": -3.170793056488037, "loss": 3.5202, "rewards/accuracies": 0.75, "rewards/chosen": -29.438861846923828, "rewards/margins": 2.269068717956543, "rewards/rejected": -31.707931518554688, "step": 1873 }, { "epoch": 0.2551742919389978, "grad_norm": 43.34521035946541, "learning_rate": 7.427901850873219e-07, "logits/chosen": 9.371599197387695, "logits/rejected": 11.118099212646484, "logps/chosen": -2.8339359760284424, "logps/rejected": -3.235079526901245, "loss": 3.9087, "rewards/accuracies": 0.75, "rewards/chosen": -28.339359283447266, "rewards/margins": 4.011435508728027, "rewards/rejected": -32.35079574584961, "step": 1874 }, { "epoch": 0.25531045751633985, "grad_norm": 135.8722936295576, "learning_rate": 7.426921562020497e-07, "logits/chosen": 10.760873794555664, "logits/rejected": 10.791472434997559, "logps/chosen": -2.6491494178771973, "logps/rejected": -3.028286933898926, "loss": 6.012, "rewards/accuracies": 0.75, "rewards/chosen": -26.491493225097656, "rewards/margins": 3.7913737297058105, "rewards/rejected": -30.282867431640625, "step": 1875 }, { "epoch": 0.25544662309368193, "grad_norm": 44.45201463555951, "learning_rate": 7.42594049882652e-07, "logits/chosen": 11.557104110717773, "logits/rejected": 11.554311752319336, "logps/chosen": -3.187375545501709, "logps/rejected": -3.077054500579834, "loss": 4.0246, "rewards/accuracies": 0.5, "rewards/chosen": -31.873756408691406, "rewards/margins": -1.1032123565673828, "rewards/rejected": -30.770545959472656, "step": 1876 }, { "epoch": 0.25558278867102396, "grad_norm": 64.23527959226034, "learning_rate": 7.424958661512968e-07, "logits/chosen": 10.121944427490234, "logits/rejected": 11.712068557739258, "logps/chosen": -2.6709213256835938, "logps/rejected": -3.3225386142730713, "loss": 4.4469, "rewards/accuracies": 1.0, "rewards/chosen": -26.709211349487305, "rewards/margins": 6.51617431640625, "rewards/rejected": -33.22538757324219, "step": 1877 }, { "epoch": 0.255718954248366, "grad_norm": 48.01298474095796, "learning_rate": 7.423976050301696e-07, "logits/chosen": 11.418291091918945, "logits/rejected": 11.776487350463867, "logps/chosen": -3.5787065029144287, "logps/rejected": -3.7634167671203613, "loss": 3.7996, "rewards/accuracies": 0.5, "rewards/chosen": -35.78706359863281, "rewards/margins": 1.8471031188964844, "rewards/rejected": -37.63417053222656, "step": 1878 }, { "epoch": 0.2558551198257081, "grad_norm": 44.842733404894005, "learning_rate": 7.422992665414732e-07, "logits/chosen": 10.190155029296875, "logits/rejected": 11.687297821044922, "logps/chosen": -2.6689257621765137, "logps/rejected": -2.9317216873168945, "loss": 4.0104, "rewards/accuracies": 0.75, "rewards/chosen": -26.689258575439453, "rewards/margins": 2.627958297729492, "rewards/rejected": -29.317214965820312, "step": 1879 }, { "epoch": 0.2559912854030501, "grad_norm": 47.88346519901237, "learning_rate": 7.422008507074281e-07, "logits/chosen": 11.62537956237793, "logits/rejected": 11.5556058883667, "logps/chosen": -3.2268829345703125, "logps/rejected": -3.151745319366455, "loss": 4.3046, "rewards/accuracies": 0.25, "rewards/chosen": -32.268829345703125, "rewards/margins": -0.7513761520385742, "rewards/rejected": -31.517452239990234, "step": 1880 }, { "epoch": 0.25612745098039214, "grad_norm": 48.9144659036769, "learning_rate": 7.42102357550272e-07, "logits/chosen": 11.652915954589844, "logits/rejected": 10.456472396850586, "logps/chosen": -3.2460215091705322, "logps/rejected": -3.066943645477295, "loss": 4.3901, "rewards/accuracies": 0.25, "rewards/chosen": -32.4602165222168, "rewards/margins": -1.790781021118164, "rewards/rejected": -30.669435501098633, "step": 1881 }, { "epoch": 0.2562636165577342, "grad_norm": 46.127076991635874, "learning_rate": 7.420037870922605e-07, "logits/chosen": 10.467248916625977, "logits/rejected": 11.6276273727417, "logps/chosen": -2.9450149536132812, "logps/rejected": -3.111597776412964, "loss": 3.9859, "rewards/accuracies": 0.75, "rewards/chosen": -29.450149536132812, "rewards/margins": 1.6658291816711426, "rewards/rejected": -31.115978240966797, "step": 1882 }, { "epoch": 0.25639978213507625, "grad_norm": 47.7164648062295, "learning_rate": 7.419051393556663e-07, "logits/chosen": 10.382806777954102, "logits/rejected": 11.284890174865723, "logps/chosen": -3.121403694152832, "logps/rejected": -3.300563335418701, "loss": 4.1543, "rewards/accuracies": 0.5, "rewards/chosen": -31.21403694152832, "rewards/margins": 1.7915968894958496, "rewards/rejected": -33.00563430786133, "step": 1883 }, { "epoch": 0.2565359477124183, "grad_norm": 45.82483811325503, "learning_rate": 7.418064143627796e-07, "logits/chosen": 10.791792869567871, "logits/rejected": 11.332439422607422, "logps/chosen": -2.965831995010376, "logps/rejected": -3.263806104660034, "loss": 3.9177, "rewards/accuracies": 1.0, "rewards/chosen": -29.6583194732666, "rewards/margins": 2.9797420501708984, "rewards/rejected": -32.6380615234375, "step": 1884 }, { "epoch": 0.25667211328976036, "grad_norm": 52.99663193573438, "learning_rate": 7.417076121359081e-07, "logits/chosen": 11.414856910705566, "logits/rejected": 12.322945594787598, "logps/chosen": -3.462568521499634, "logps/rejected": -3.4250378608703613, "loss": 4.6037, "rewards/accuracies": 0.5, "rewards/chosen": -34.62568664550781, "rewards/margins": -0.3753061294555664, "rewards/rejected": -34.25038146972656, "step": 1885 }, { "epoch": 0.2568082788671024, "grad_norm": 50.23623808140653, "learning_rate": 7.416087326973771e-07, "logits/chosen": 11.847771644592285, "logits/rejected": 12.256332397460938, "logps/chosen": -3.4612390995025635, "logps/rejected": -3.876528739929199, "loss": 4.3877, "rewards/accuracies": 1.0, "rewards/chosen": -34.612388610839844, "rewards/margins": 4.152896404266357, "rewards/rejected": -38.765289306640625, "step": 1886 }, { "epoch": 0.2569444444444444, "grad_norm": 49.073686629812045, "learning_rate": 7.415097760695292e-07, "logits/chosen": 10.184806823730469, "logits/rejected": 10.932584762573242, "logps/chosen": -3.1653451919555664, "logps/rejected": -3.6028804779052734, "loss": 3.5337, "rewards/accuracies": 0.25, "rewards/chosen": -31.65345001220703, "rewards/margins": 4.375356197357178, "rewards/rejected": -36.028804779052734, "step": 1887 }, { "epoch": 0.2570806100217865, "grad_norm": 78.99360190180282, "learning_rate": 7.414107422747245e-07, "logits/chosen": 9.639822006225586, "logits/rejected": 11.503527641296387, "logps/chosen": -3.0837135314941406, "logps/rejected": -3.6840524673461914, "loss": 3.7901, "rewards/accuracies": 1.0, "rewards/chosen": -30.837135314941406, "rewards/margins": 6.003391265869141, "rewards/rejected": -36.84052658081055, "step": 1888 }, { "epoch": 0.25721677559912853, "grad_norm": 45.39290295091549, "learning_rate": 7.413116313353404e-07, "logits/chosen": 11.385076522827148, "logits/rejected": 11.932779312133789, "logps/chosen": -3.1792244911193848, "logps/rejected": -3.501873016357422, "loss": 4.5199, "rewards/accuracies": 0.75, "rewards/chosen": -31.792247772216797, "rewards/margins": 3.2264842987060547, "rewards/rejected": -35.01873016357422, "step": 1889 }, { "epoch": 0.25735294117647056, "grad_norm": 44.94252975298563, "learning_rate": 7.412124432737719e-07, "logits/chosen": 11.279555320739746, "logits/rejected": 12.315114974975586, "logps/chosen": -3.4983770847320557, "logps/rejected": -3.643634796142578, "loss": 4.2234, "rewards/accuracies": 1.0, "rewards/chosen": -34.98377227783203, "rewards/margins": 1.4525775909423828, "rewards/rejected": -36.43634796142578, "step": 1890 }, { "epoch": 0.25748910675381265, "grad_norm": 48.066813972503766, "learning_rate": 7.411131781124313e-07, "logits/chosen": 10.978673934936523, "logits/rejected": 11.7279052734375, "logps/chosen": -3.274451971054077, "logps/rejected": -3.607700824737549, "loss": 4.2603, "rewards/accuracies": 0.5, "rewards/chosen": -32.7445182800293, "rewards/margins": 3.332489490509033, "rewards/rejected": -36.07700729370117, "step": 1891 }, { "epoch": 0.2576252723311547, "grad_norm": 45.653301387274205, "learning_rate": 7.410138358737485e-07, "logits/chosen": 11.565442085266113, "logits/rejected": 10.56944465637207, "logps/chosen": -3.406538724899292, "logps/rejected": -3.1911075115203857, "loss": 4.6514, "rewards/accuracies": 0.0, "rewards/chosen": -34.06538772583008, "rewards/margins": -2.1543116569519043, "rewards/rejected": -31.911075592041016, "step": 1892 }, { "epoch": 0.2577614379084967, "grad_norm": 50.84731417567403, "learning_rate": 7.409144165801706e-07, "logits/chosen": 11.237967491149902, "logits/rejected": 12.366127014160156, "logps/chosen": -2.982276439666748, "logps/rejected": -3.353353500366211, "loss": 4.0146, "rewards/accuracies": 0.75, "rewards/chosen": -29.822765350341797, "rewards/margins": 3.7107725143432617, "rewards/rejected": -33.533538818359375, "step": 1893 }, { "epoch": 0.2578976034858388, "grad_norm": 44.094321415303696, "learning_rate": 7.408149202541622e-07, "logits/chosen": 11.197824478149414, "logits/rejected": 10.86886215209961, "logps/chosen": -3.526057243347168, "logps/rejected": -3.4982542991638184, "loss": 3.8788, "rewards/accuracies": 0.5, "rewards/chosen": -35.26057052612305, "rewards/margins": -0.27802562713623047, "rewards/rejected": -34.9825439453125, "step": 1894 }, { "epoch": 0.2580337690631808, "grad_norm": 44.90597726646376, "learning_rate": 7.407153469182054e-07, "logits/chosen": 11.72994613647461, "logits/rejected": 12.529342651367188, "logps/chosen": -2.9801464080810547, "logps/rejected": -3.1762032508850098, "loss": 3.7738, "rewards/accuracies": 0.75, "rewards/chosen": -29.80146598815918, "rewards/margins": 1.960568904876709, "rewards/rejected": -31.762033462524414, "step": 1895 }, { "epoch": 0.2581699346405229, "grad_norm": 51.09293493117617, "learning_rate": 7.406156965947996e-07, "logits/chosen": 10.492151260375977, "logits/rejected": 11.26413345336914, "logps/chosen": -3.400059223175049, "logps/rejected": -2.9787521362304688, "loss": 4.4694, "rewards/accuracies": 0.25, "rewards/chosen": -34.00059127807617, "rewards/margins": -4.213069915771484, "rewards/rejected": -29.787521362304688, "step": 1896 }, { "epoch": 0.25830610021786493, "grad_norm": 48.3600753656039, "learning_rate": 7.405159693064617e-07, "logits/chosen": 10.666298866271973, "logits/rejected": 10.789175033569336, "logps/chosen": -3.1801328659057617, "logps/rejected": -3.384809970855713, "loss": 3.9171, "rewards/accuracies": 0.5, "rewards/chosen": -31.801328659057617, "rewards/margins": 2.0467710494995117, "rewards/rejected": -33.84809875488281, "step": 1897 }, { "epoch": 0.25844226579520696, "grad_norm": 43.93397363326666, "learning_rate": 7.404161650757256e-07, "logits/chosen": 9.954275131225586, "logits/rejected": 10.787437438964844, "logps/chosen": -2.857029676437378, "logps/rejected": -3.1194405555725098, "loss": 4.086, "rewards/accuracies": 0.75, "rewards/chosen": -28.570297241210938, "rewards/margins": 2.624107837677002, "rewards/rejected": -31.19440460205078, "step": 1898 }, { "epoch": 0.25857843137254904, "grad_norm": 50.48557955983177, "learning_rate": 7.403162839251433e-07, "logits/chosen": 10.426414489746094, "logits/rejected": 9.467998504638672, "logps/chosen": -3.2622575759887695, "logps/rejected": -3.340651035308838, "loss": 4.0422, "rewards/accuracies": 0.75, "rewards/chosen": -32.62257385253906, "rewards/margins": 0.7839360237121582, "rewards/rejected": -33.40650939941406, "step": 1899 }, { "epoch": 0.2587145969498911, "grad_norm": 55.52732254776161, "learning_rate": 7.402163258772834e-07, "logits/chosen": 11.329968452453613, "logits/rejected": 11.695143699645996, "logps/chosen": -3.1323037147521973, "logps/rejected": -3.522390127182007, "loss": 4.4733, "rewards/accuracies": 1.0, "rewards/chosen": -31.323036193847656, "rewards/margins": 3.900864601135254, "rewards/rejected": -35.223899841308594, "step": 1900 }, { "epoch": 0.2588507625272331, "grad_norm": 47.861976160162634, "learning_rate": 7.401162909547324e-07, "logits/chosen": 9.85041618347168, "logits/rejected": 11.110230445861816, "logps/chosen": -3.0416817665100098, "logps/rejected": -3.959524631500244, "loss": 4.4526, "rewards/accuracies": 1.0, "rewards/chosen": -30.416820526123047, "rewards/margins": 9.178426742553711, "rewards/rejected": -39.595245361328125, "step": 1901 }, { "epoch": 0.2589869281045752, "grad_norm": 43.69375439807787, "learning_rate": 7.400161791800942e-07, "logits/chosen": 10.852502822875977, "logits/rejected": 10.195852279663086, "logps/chosen": -3.600687026977539, "logps/rejected": -3.412755012512207, "loss": 4.2022, "rewards/accuracies": 0.25, "rewards/chosen": -36.00687026977539, "rewards/margins": -1.879321575164795, "rewards/rejected": -34.12754821777344, "step": 1902 }, { "epoch": 0.2591230936819172, "grad_norm": 47.34779449942613, "learning_rate": 7.399159905759895e-07, "logits/chosen": 12.075517654418945, "logits/rejected": 11.9478759765625, "logps/chosen": -3.7133290767669678, "logps/rejected": -3.919111490249634, "loss": 4.4541, "rewards/accuracies": 0.5, "rewards/chosen": -37.1332893371582, "rewards/margins": 2.057823657989502, "rewards/rejected": -39.19111251831055, "step": 1903 }, { "epoch": 0.25925925925925924, "grad_norm": 50.84981783554486, "learning_rate": 7.398157251650571e-07, "logits/chosen": 11.097879409790039, "logits/rejected": 11.593050003051758, "logps/chosen": -2.9410972595214844, "logps/rejected": -3.1940841674804688, "loss": 4.222, "rewards/accuracies": 0.5, "rewards/chosen": -29.410972595214844, "rewards/margins": 2.5298690795898438, "rewards/rejected": -31.940841674804688, "step": 1904 }, { "epoch": 0.25939542483660133, "grad_norm": 42.19802452297107, "learning_rate": 7.397153829699526e-07, "logits/chosen": 11.618659019470215, "logits/rejected": 11.70358943939209, "logps/chosen": -3.1182432174682617, "logps/rejected": -3.365180015563965, "loss": 4.0167, "rewards/accuracies": 0.75, "rewards/chosen": -31.18243408203125, "rewards/margins": 2.4693660736083984, "rewards/rejected": -33.65180206298828, "step": 1905 }, { "epoch": 0.25953159041394336, "grad_norm": 43.337177914122265, "learning_rate": 7.396149640133492e-07, "logits/chosen": 11.183795928955078, "logits/rejected": 11.620979309082031, "logps/chosen": -2.9766812324523926, "logps/rejected": -3.4380576610565186, "loss": 4.0418, "rewards/accuracies": 1.0, "rewards/chosen": -29.766813278198242, "rewards/margins": 4.613765716552734, "rewards/rejected": -34.380577087402344, "step": 1906 }, { "epoch": 0.2596677559912854, "grad_norm": 41.49460391429603, "learning_rate": 7.395144683179375e-07, "logits/chosen": 10.603437423706055, "logits/rejected": 11.289667129516602, "logps/chosen": -3.286458969116211, "logps/rejected": -3.5727126598358154, "loss": 4.2756, "rewards/accuracies": 1.0, "rewards/chosen": -32.864585876464844, "rewards/margins": 2.8625388145446777, "rewards/rejected": -35.72712707519531, "step": 1907 }, { "epoch": 0.25980392156862747, "grad_norm": 41.42583583873089, "learning_rate": 7.394138959064251e-07, "logits/chosen": 12.213644981384277, "logits/rejected": 11.236995697021484, "logps/chosen": -3.1805684566497803, "logps/rejected": -3.540592670440674, "loss": 4.091, "rewards/accuracies": 0.75, "rewards/chosen": -31.805683135986328, "rewards/margins": 3.6002440452575684, "rewards/rejected": -35.40592956542969, "step": 1908 }, { "epoch": 0.2599400871459695, "grad_norm": 43.497874888207015, "learning_rate": 7.393132468015374e-07, "logits/chosen": 10.54327392578125, "logits/rejected": 12.086273193359375, "logps/chosen": -3.0323140621185303, "logps/rejected": -3.6830525398254395, "loss": 4.0488, "rewards/accuracies": 1.0, "rewards/chosen": -30.323143005371094, "rewards/margins": 6.507382869720459, "rewards/rejected": -36.83052444458008, "step": 1909 }, { "epoch": 0.26007625272331153, "grad_norm": 45.75069834285366, "learning_rate": 7.392125210260167e-07, "logits/chosen": 11.85232925415039, "logits/rejected": 11.803266525268555, "logps/chosen": -3.272171974182129, "logps/rejected": -3.3351643085479736, "loss": 4.3271, "rewards/accuracies": 0.5, "rewards/chosen": -32.72172164916992, "rewards/margins": 0.6299242973327637, "rewards/rejected": -33.35164260864258, "step": 1910 }, { "epoch": 0.2602124183006536, "grad_norm": 40.92618361547631, "learning_rate": 7.391117186026229e-07, "logits/chosen": 12.269210815429688, "logits/rejected": 11.823365211486816, "logps/chosen": -3.582414150238037, "logps/rejected": -3.476130962371826, "loss": 3.8133, "rewards/accuracies": 0.0, "rewards/chosen": -35.82414245605469, "rewards/margins": -1.0628294944763184, "rewards/rejected": -34.761314392089844, "step": 1911 }, { "epoch": 0.26034858387799564, "grad_norm": 45.68299834646856, "learning_rate": 7.390108395541333e-07, "logits/chosen": 11.171857833862305, "logits/rejected": 11.226956367492676, "logps/chosen": -3.3709092140197754, "logps/rejected": -3.4710006713867188, "loss": 4.393, "rewards/accuracies": 0.75, "rewards/chosen": -33.70909118652344, "rewards/margins": 1.0009160041809082, "rewards/rejected": -34.71000671386719, "step": 1912 }, { "epoch": 0.26048474945533767, "grad_norm": 48.39027441148834, "learning_rate": 7.38909883903342e-07, "logits/chosen": 11.084107398986816, "logits/rejected": 10.169095993041992, "logps/chosen": -3.2460598945617676, "logps/rejected": -3.195171356201172, "loss": 4.2806, "rewards/accuracies": 0.25, "rewards/chosen": -32.46059799194336, "rewards/margins": -0.5088834762573242, "rewards/rejected": -31.95171546936035, "step": 1913 }, { "epoch": 0.26062091503267976, "grad_norm": 45.06322634429928, "learning_rate": 7.388088516730611e-07, "logits/chosen": 10.637449264526367, "logits/rejected": 11.873868942260742, "logps/chosen": -3.2925586700439453, "logps/rejected": -3.6819746494293213, "loss": 3.8844, "rewards/accuracies": 0.75, "rewards/chosen": -32.92559051513672, "rewards/margins": 3.894158363342285, "rewards/rejected": -36.81974792480469, "step": 1914 }, { "epoch": 0.2607570806100218, "grad_norm": 42.87424629303356, "learning_rate": 7.387077428861194e-07, "logits/chosen": 10.548568725585938, "logits/rejected": 12.533720016479492, "logps/chosen": -3.3488879203796387, "logps/rejected": -3.6752471923828125, "loss": 3.7783, "rewards/accuracies": 1.0, "rewards/chosen": -33.48887634277344, "rewards/margins": 3.2635903358459473, "rewards/rejected": -36.752471923828125, "step": 1915 }, { "epoch": 0.2608932461873638, "grad_norm": 47.504425046355024, "learning_rate": 7.386065575653637e-07, "logits/chosen": 11.559324264526367, "logits/rejected": 10.745697021484375, "logps/chosen": -3.500314235687256, "logps/rejected": -3.4255971908569336, "loss": 3.7166, "rewards/accuracies": 0.5, "rewards/chosen": -35.003143310546875, "rewards/margins": -0.7471694946289062, "rewards/rejected": -34.25597381591797, "step": 1916 }, { "epoch": 0.2610294117647059, "grad_norm": 43.59588603998652, "learning_rate": 7.385052957336571e-07, "logits/chosen": 10.552613258361816, "logits/rejected": 11.256364822387695, "logps/chosen": -2.9104199409484863, "logps/rejected": -3.1505236625671387, "loss": 4.0366, "rewards/accuracies": 1.0, "rewards/chosen": -29.104198455810547, "rewards/margins": 2.4010372161865234, "rewards/rejected": -31.50523567199707, "step": 1917 }, { "epoch": 0.2611655773420479, "grad_norm": 58.11116544605917, "learning_rate": 7.38403957413881e-07, "logits/chosen": 10.885997772216797, "logits/rejected": 11.436838150024414, "logps/chosen": -3.359867572784424, "logps/rejected": -3.4859066009521484, "loss": 4.0961, "rewards/accuracies": 0.5, "rewards/chosen": -33.59867477416992, "rewards/margins": 1.2603917121887207, "rewards/rejected": -34.85906982421875, "step": 1918 }, { "epoch": 0.26130174291938996, "grad_norm": 51.987060084584094, "learning_rate": 7.383025426289333e-07, "logits/chosen": 10.304141998291016, "logits/rejected": 11.446281433105469, "logps/chosen": -3.161201000213623, "logps/rejected": -3.314120292663574, "loss": 3.7223, "rewards/accuracies": 0.75, "rewards/chosen": -31.61200714111328, "rewards/margins": 1.5291962623596191, "rewards/rejected": -33.141204833984375, "step": 1919 }, { "epoch": 0.26143790849673204, "grad_norm": 41.3886082437372, "learning_rate": 7.382010514017297e-07, "logits/chosen": 10.805068969726562, "logits/rejected": 11.322477340698242, "logps/chosen": -3.1822407245635986, "logps/rejected": -3.3005950450897217, "loss": 3.4836, "rewards/accuracies": 0.5, "rewards/chosen": -31.822406768798828, "rewards/margins": 1.1835432052612305, "rewards/rejected": -33.005950927734375, "step": 1920 }, { "epoch": 0.26157407407407407, "grad_norm": 44.25459101754852, "learning_rate": 7.38099483755203e-07, "logits/chosen": 9.70238208770752, "logits/rejected": 11.090692520141602, "logps/chosen": -3.013317108154297, "logps/rejected": -3.4221363067626953, "loss": 3.9179, "rewards/accuracies": 0.75, "rewards/chosen": -30.1331729888916, "rewards/margins": 4.088190078735352, "rewards/rejected": -34.22136306762695, "step": 1921 }, { "epoch": 0.2617102396514161, "grad_norm": 46.90880833533055, "learning_rate": 7.379978397123031e-07, "logits/chosen": 11.590590476989746, "logits/rejected": 10.958131790161133, "logps/chosen": -3.00045108795166, "logps/rejected": -3.302682399749756, "loss": 4.2724, "rewards/accuracies": 0.75, "rewards/chosen": -30.004512786865234, "rewards/margins": 3.0223140716552734, "rewards/rejected": -33.026824951171875, "step": 1922 }, { "epoch": 0.2618464052287582, "grad_norm": 51.32054853106813, "learning_rate": 7.378961192959975e-07, "logits/chosen": 9.900537490844727, "logits/rejected": 10.150163650512695, "logps/chosen": -3.1377556324005127, "logps/rejected": -3.071892738342285, "loss": 4.7465, "rewards/accuracies": 0.75, "rewards/chosen": -31.37755584716797, "rewards/margins": -0.65863037109375, "rewards/rejected": -30.71892547607422, "step": 1923 }, { "epoch": 0.2619825708061002, "grad_norm": 48.58937718865156, "learning_rate": 7.377943225292707e-07, "logits/chosen": 10.774504661560059, "logits/rejected": 10.36724853515625, "logps/chosen": -2.891171932220459, "logps/rejected": -2.720447063446045, "loss": 4.3563, "rewards/accuracies": 0.25, "rewards/chosen": -28.911720275878906, "rewards/margins": -1.7072501182556152, "rewards/rejected": -27.2044677734375, "step": 1924 }, { "epoch": 0.26211873638344224, "grad_norm": 42.16402024707081, "learning_rate": 7.376924494351243e-07, "logits/chosen": 11.178252220153809, "logits/rejected": 10.44607925415039, "logps/chosen": -3.6892080307006836, "logps/rejected": -3.2845003604888916, "loss": 3.9082, "rewards/accuracies": 0.0, "rewards/chosen": -36.89208221435547, "rewards/margins": -4.047076225280762, "rewards/rejected": -32.845001220703125, "step": 1925 }, { "epoch": 0.2622549019607843, "grad_norm": 42.14490969270481, "learning_rate": 7.375905000365777e-07, "logits/chosen": 10.32080078125, "logits/rejected": 11.71328353881836, "logps/chosen": -2.834524631500244, "logps/rejected": -3.253293514251709, "loss": 3.5611, "rewards/accuracies": 1.0, "rewards/chosen": -28.34524917602539, "rewards/margins": 4.187686443328857, "rewards/rejected": -32.532936096191406, "step": 1926 }, { "epoch": 0.26239106753812635, "grad_norm": 45.95149117442776, "learning_rate": 7.37488474356667e-07, "logits/chosen": 11.086248397827148, "logits/rejected": 10.790885925292969, "logps/chosen": -3.070807933807373, "logps/rejected": -2.9819116592407227, "loss": 4.2164, "rewards/accuracies": 0.5, "rewards/chosen": -30.708078384399414, "rewards/margins": -0.8889608383178711, "rewards/rejected": -29.81911849975586, "step": 1927 }, { "epoch": 0.2625272331154684, "grad_norm": 46.719979367964704, "learning_rate": 7.373863724184457e-07, "logits/chosen": 9.943809509277344, "logits/rejected": 10.716690063476562, "logps/chosen": -2.99562931060791, "logps/rejected": -3.2081243991851807, "loss": 4.3417, "rewards/accuracies": 0.5, "rewards/chosen": -29.956295013427734, "rewards/margins": 2.124950885772705, "rewards/rejected": -32.08124542236328, "step": 1928 }, { "epoch": 0.26266339869281047, "grad_norm": 51.828690113627424, "learning_rate": 7.37284194244985e-07, "logits/chosen": 12.322158813476562, "logits/rejected": 12.415163040161133, "logps/chosen": -3.6707069873809814, "logps/rejected": -3.844012975692749, "loss": 4.441, "rewards/accuracies": 0.75, "rewards/chosen": -36.707069396972656, "rewards/margins": 1.733057975769043, "rewards/rejected": -38.440128326416016, "step": 1929 }, { "epoch": 0.2627995642701525, "grad_norm": 42.48964999753321, "learning_rate": 7.371819398593723e-07, "logits/chosen": 11.515205383300781, "logits/rejected": 11.6943359375, "logps/chosen": -2.921006202697754, "logps/rejected": -3.316303253173828, "loss": 3.6245, "rewards/accuracies": 0.75, "rewards/chosen": -29.210063934326172, "rewards/margins": 3.9529685974121094, "rewards/rejected": -33.16303253173828, "step": 1930 }, { "epoch": 0.2629357298474945, "grad_norm": 43.09948084803391, "learning_rate": 7.370796092847132e-07, "logits/chosen": 10.679847717285156, "logits/rejected": 11.892776489257812, "logps/chosen": -3.2442328929901123, "logps/rejected": -3.757384777069092, "loss": 4.0744, "rewards/accuracies": 0.75, "rewards/chosen": -32.44232940673828, "rewards/margins": 5.1315178871154785, "rewards/rejected": -37.57384490966797, "step": 1931 }, { "epoch": 0.2630718954248366, "grad_norm": 50.45306137984989, "learning_rate": 7.369772025441301e-07, "logits/chosen": 11.023937225341797, "logits/rejected": 10.523144721984863, "logps/chosen": -2.6944074630737305, "logps/rejected": -3.0464460849761963, "loss": 4.2221, "rewards/accuracies": 1.0, "rewards/chosen": -26.944076538085938, "rewards/margins": 3.5203847885131836, "rewards/rejected": -30.464460372924805, "step": 1932 }, { "epoch": 0.26320806100217864, "grad_norm": 49.641526258851876, "learning_rate": 7.368747196607626e-07, "logits/chosen": 11.767099380493164, "logits/rejected": 12.172469139099121, "logps/chosen": -3.5189132690429688, "logps/rejected": -3.533212184906006, "loss": 4.0755, "rewards/accuracies": 0.75, "rewards/chosen": -35.18913269042969, "rewards/margins": 0.14299249649047852, "rewards/rejected": -35.332122802734375, "step": 1933 }, { "epoch": 0.2633442265795207, "grad_norm": 44.71599556119794, "learning_rate": 7.367721606577676e-07, "logits/chosen": 9.744300842285156, "logits/rejected": 10.889297485351562, "logps/chosen": -2.8966875076293945, "logps/rejected": -3.101691246032715, "loss": 4.0238, "rewards/accuracies": 0.5, "rewards/chosen": -28.966875076293945, "rewards/margins": 2.0500354766845703, "rewards/rejected": -31.016910552978516, "step": 1934 }, { "epoch": 0.26348039215686275, "grad_norm": 67.65000141346, "learning_rate": 7.36669525558319e-07, "logits/chosen": 10.748705863952637, "logits/rejected": 10.787220001220703, "logps/chosen": -3.267484664916992, "logps/rejected": -3.1253461837768555, "loss": 3.7969, "rewards/accuracies": 0.25, "rewards/chosen": -32.67484664916992, "rewards/margins": -1.4213838577270508, "rewards/rejected": -31.253463745117188, "step": 1935 }, { "epoch": 0.2636165577342048, "grad_norm": 41.37438017449015, "learning_rate": 7.365668143856082e-07, "logits/chosen": 10.456884384155273, "logits/rejected": 10.797630310058594, "logps/chosen": -2.985589027404785, "logps/rejected": -3.591304302215576, "loss": 3.9671, "rewards/accuracies": 0.75, "rewards/chosen": -29.855892181396484, "rewards/margins": 6.057150840759277, "rewards/rejected": -35.91304397583008, "step": 1936 }, { "epoch": 0.26375272331154687, "grad_norm": 41.90855041796471, "learning_rate": 7.364640271628437e-07, "logits/chosen": 10.158380508422852, "logits/rejected": 10.473814964294434, "logps/chosen": -2.9930427074432373, "logps/rejected": -3.1471810340881348, "loss": 4.2422, "rewards/accuracies": 0.75, "rewards/chosen": -29.93042755126953, "rewards/margins": 1.5413818359375, "rewards/rejected": -31.47180938720703, "step": 1937 }, { "epoch": 0.2638888888888889, "grad_norm": 48.792179154914734, "learning_rate": 7.363611639132509e-07, "logits/chosen": 11.242393493652344, "logits/rejected": 11.230393409729004, "logps/chosen": -3.0057361125946045, "logps/rejected": -3.2971653938293457, "loss": 3.8318, "rewards/accuracies": 0.75, "rewards/chosen": -30.057361602783203, "rewards/margins": 2.914290428161621, "rewards/rejected": -32.97165298461914, "step": 1938 }, { "epoch": 0.2640250544662309, "grad_norm": 48.54620775866392, "learning_rate": 7.362582246600728e-07, "logits/chosen": 11.542694091796875, "logits/rejected": 11.671527862548828, "logps/chosen": -3.2166907787323, "logps/rejected": -3.289823532104492, "loss": 3.779, "rewards/accuracies": 0.5, "rewards/chosen": -32.166908264160156, "rewards/margins": 0.7313261032104492, "rewards/rejected": -32.898231506347656, "step": 1939 }, { "epoch": 0.264161220043573, "grad_norm": 40.02402628052584, "learning_rate": 7.361552094265693e-07, "logits/chosen": 9.897130966186523, "logits/rejected": 10.393539428710938, "logps/chosen": -3.2364401817321777, "logps/rejected": -3.1131157875061035, "loss": 3.9958, "rewards/accuracies": 0.5, "rewards/chosen": -32.364402770996094, "rewards/margins": -1.233245849609375, "rewards/rejected": -31.13115692138672, "step": 1940 }, { "epoch": 0.26429738562091504, "grad_norm": 44.56839505572319, "learning_rate": 7.360521182360175e-07, "logits/chosen": 11.432769775390625, "logits/rejected": 11.426654815673828, "logps/chosen": -3.3029062747955322, "logps/rejected": -3.0811541080474854, "loss": 4.6256, "rewards/accuracies": 0.25, "rewards/chosen": -33.02906036376953, "rewards/margins": -2.217519760131836, "rewards/rejected": -30.811540603637695, "step": 1941 }, { "epoch": 0.26443355119825707, "grad_norm": 50.57690623529536, "learning_rate": 7.359489511117117e-07, "logits/chosen": 10.044836044311523, "logits/rejected": 11.057403564453125, "logps/chosen": -3.0521626472473145, "logps/rejected": -3.2504072189331055, "loss": 3.7778, "rewards/accuracies": 0.75, "rewards/chosen": -30.521625518798828, "rewards/margins": 1.9824447631835938, "rewards/rejected": -32.50407409667969, "step": 1942 }, { "epoch": 0.26456971677559915, "grad_norm": 60.896721481722096, "learning_rate": 7.358457080769634e-07, "logits/chosen": 10.838041305541992, "logits/rejected": 11.541776657104492, "logps/chosen": -3.4183030128479004, "logps/rejected": -3.554964303970337, "loss": 3.4163, "rewards/accuracies": 0.5, "rewards/chosen": -34.18302917480469, "rewards/margins": 1.366614818572998, "rewards/rejected": -35.549644470214844, "step": 1943 }, { "epoch": 0.2647058823529412, "grad_norm": 45.01549579834046, "learning_rate": 7.357423891551014e-07, "logits/chosen": 11.013090133666992, "logits/rejected": 12.014589309692383, "logps/chosen": -3.216879367828369, "logps/rejected": -3.6537232398986816, "loss": 4.3068, "rewards/accuracies": 0.5, "rewards/chosen": -32.168792724609375, "rewards/margins": 4.3684401512146, "rewards/rejected": -36.5372314453125, "step": 1944 }, { "epoch": 0.2648420479302832, "grad_norm": 41.19541267124541, "learning_rate": 7.356389943694711e-07, "logits/chosen": 11.885333061218262, "logits/rejected": 12.05679702758789, "logps/chosen": -3.070866107940674, "logps/rejected": -3.2641701698303223, "loss": 3.8189, "rewards/accuracies": 0.75, "rewards/chosen": -30.708663940429688, "rewards/margins": 1.9330387115478516, "rewards/rejected": -32.641700744628906, "step": 1945 }, { "epoch": 0.2649782135076253, "grad_norm": 41.67786804442986, "learning_rate": 7.355355237434357e-07, "logits/chosen": 10.496757507324219, "logits/rejected": 11.040445327758789, "logps/chosen": -3.081594467163086, "logps/rejected": -3.11314058303833, "loss": 4.0063, "rewards/accuracies": 0.25, "rewards/chosen": -30.815946578979492, "rewards/margins": 0.315460205078125, "rewards/rejected": -31.131404876708984, "step": 1946 }, { "epoch": 0.2651143790849673, "grad_norm": 52.67709476988354, "learning_rate": 7.354319773003752e-07, "logits/chosen": 11.208634376525879, "logits/rejected": 10.925348281860352, "logps/chosen": -3.313575267791748, "logps/rejected": -3.246877908706665, "loss": 4.3658, "rewards/accuracies": 0.5, "rewards/chosen": -33.1357536315918, "rewards/margins": -0.6669750213623047, "rewards/rejected": -32.468780517578125, "step": 1947 }, { "epoch": 0.26525054466230935, "grad_norm": 50.17301722710208, "learning_rate": 7.353283550636866e-07, "logits/chosen": 10.756460189819336, "logits/rejected": 11.380561828613281, "logps/chosen": -2.8155250549316406, "logps/rejected": -3.283595323562622, "loss": 4.8066, "rewards/accuracies": 0.75, "rewards/chosen": -28.155250549316406, "rewards/margins": 4.680703163146973, "rewards/rejected": -32.83595275878906, "step": 1948 }, { "epoch": 0.26538671023965144, "grad_norm": 37.70268944725403, "learning_rate": 7.352246570567844e-07, "logits/chosen": 11.207786560058594, "logits/rejected": 12.167495727539062, "logps/chosen": -3.041135549545288, "logps/rejected": -3.4419193267822266, "loss": 3.4051, "rewards/accuracies": 1.0, "rewards/chosen": -30.411354064941406, "rewards/margins": 4.007838249206543, "rewards/rejected": -34.419193267822266, "step": 1949 }, { "epoch": 0.26552287581699346, "grad_norm": 40.30385641062094, "learning_rate": 7.351208833031001e-07, "logits/chosen": 10.369333267211914, "logits/rejected": 10.213593482971191, "logps/chosen": -3.271489381790161, "logps/rejected": -3.3116469383239746, "loss": 3.9237, "rewards/accuracies": 0.75, "rewards/chosen": -32.71489334106445, "rewards/margins": 0.40157556533813477, "rewards/rejected": -33.11647033691406, "step": 1950 }, { "epoch": 0.2656590413943355, "grad_norm": 41.68850617821478, "learning_rate": 7.350170338260817e-07, "logits/chosen": 11.147974014282227, "logits/rejected": 10.905811309814453, "logps/chosen": -3.1556758880615234, "logps/rejected": -3.232023000717163, "loss": 3.8776, "rewards/accuracies": 0.25, "rewards/chosen": -31.556758880615234, "rewards/margins": 0.7634711265563965, "rewards/rejected": -32.320228576660156, "step": 1951 }, { "epoch": 0.2657952069716776, "grad_norm": 41.25879457550484, "learning_rate": 7.349131086491954e-07, "logits/chosen": 10.304649353027344, "logits/rejected": 11.139057159423828, "logps/chosen": -3.262758493423462, "logps/rejected": -3.7361466884613037, "loss": 3.8247, "rewards/accuracies": 0.75, "rewards/chosen": -32.627586364746094, "rewards/margins": 4.733881950378418, "rewards/rejected": -37.36146545410156, "step": 1952 }, { "epoch": 0.2659313725490196, "grad_norm": 45.24767622259315, "learning_rate": 7.348091077959239e-07, "logits/chosen": 11.328432083129883, "logits/rejected": 10.54903507232666, "logps/chosen": -3.5674164295196533, "logps/rejected": -3.430776596069336, "loss": 3.7225, "rewards/accuracies": 0.5, "rewards/chosen": -35.674163818359375, "rewards/margins": -1.366396427154541, "rewards/rejected": -34.307769775390625, "step": 1953 }, { "epoch": 0.26606753812636164, "grad_norm": 43.338570600123695, "learning_rate": 7.347050312897669e-07, "logits/chosen": 10.692558288574219, "logits/rejected": 12.192340850830078, "logps/chosen": -3.2469420433044434, "logps/rejected": -3.5228185653686523, "loss": 4.4917, "rewards/accuracies": 0.75, "rewards/chosen": -32.46942138671875, "rewards/margins": 2.7587661743164062, "rewards/rejected": -35.228187561035156, "step": 1954 }, { "epoch": 0.2662037037037037, "grad_norm": 47.502738127611764, "learning_rate": 7.346008791542412e-07, "logits/chosen": 10.210047721862793, "logits/rejected": 11.451488494873047, "logps/chosen": -3.1690192222595215, "logps/rejected": -3.586641311645508, "loss": 4.2376, "rewards/accuracies": 1.0, "rewards/chosen": -31.69019317626953, "rewards/margins": 4.17622184753418, "rewards/rejected": -35.866416931152344, "step": 1955 }, { "epoch": 0.26633986928104575, "grad_norm": 44.86929166247218, "learning_rate": 7.344966514128813e-07, "logits/chosen": 10.822517395019531, "logits/rejected": 10.050943374633789, "logps/chosen": -3.3206067085266113, "logps/rejected": -3.063610315322876, "loss": 4.0355, "rewards/accuracies": 0.25, "rewards/chosen": -33.20606994628906, "rewards/margins": -2.5699663162231445, "rewards/rejected": -30.63610076904297, "step": 1956 }, { "epoch": 0.2664760348583878, "grad_norm": 64.51191652750806, "learning_rate": 7.343923480892378e-07, "logits/chosen": 10.140865325927734, "logits/rejected": 10.37217903137207, "logps/chosen": -3.0116569995880127, "logps/rejected": -2.955965518951416, "loss": 4.5537, "rewards/accuracies": 0.25, "rewards/chosen": -30.11656951904297, "rewards/margins": -0.5569148063659668, "rewards/rejected": -29.559654235839844, "step": 1957 }, { "epoch": 0.26661220043572986, "grad_norm": 39.88397217312072, "learning_rate": 7.342879692068793e-07, "logits/chosen": 9.779886245727539, "logits/rejected": 11.545888900756836, "logps/chosen": -3.260040521621704, "logps/rejected": -3.6288132667541504, "loss": 3.6572, "rewards/accuracies": 0.75, "rewards/chosen": -32.60040283203125, "rewards/margins": 3.687730312347412, "rewards/rejected": -36.28813171386719, "step": 1958 }, { "epoch": 0.2667483660130719, "grad_norm": 52.69801954155437, "learning_rate": 7.341835147893908e-07, "logits/chosen": 10.585992813110352, "logits/rejected": 10.624970436096191, "logps/chosen": -3.27888822555542, "logps/rejected": -3.3866872787475586, "loss": 4.3279, "rewards/accuracies": 1.0, "rewards/chosen": -32.788883209228516, "rewards/margins": 1.0779905319213867, "rewards/rejected": -33.86687088012695, "step": 1959 }, { "epoch": 0.2668845315904139, "grad_norm": 41.62438174738945, "learning_rate": 7.340789848603748e-07, "logits/chosen": 10.593037605285645, "logits/rejected": 11.407651901245117, "logps/chosen": -3.0248894691467285, "logps/rejected": -3.2251715660095215, "loss": 3.9879, "rewards/accuracies": 0.75, "rewards/chosen": -30.24889373779297, "rewards/margins": 2.002821922302246, "rewards/rejected": -32.25171661376953, "step": 1960 }, { "epoch": 0.267020697167756, "grad_norm": 45.10019591202697, "learning_rate": 7.339743794434506e-07, "logits/chosen": 10.356738090515137, "logits/rejected": 11.190092086791992, "logps/chosen": -3.2377004623413086, "logps/rejected": -3.448542833328247, "loss": 3.9558, "rewards/accuracies": 0.75, "rewards/chosen": -32.37700271606445, "rewards/margins": 2.108424186706543, "rewards/rejected": -34.48542785644531, "step": 1961 }, { "epoch": 0.26715686274509803, "grad_norm": 49.32353268346963, "learning_rate": 7.338696985622547e-07, "logits/chosen": 10.648697853088379, "logits/rejected": 10.554753303527832, "logps/chosen": -3.7098495960235596, "logps/rejected": -3.599928379058838, "loss": 4.527, "rewards/accuracies": 0.25, "rewards/chosen": -37.09849548339844, "rewards/margins": -1.0992145538330078, "rewards/rejected": -35.99928283691406, "step": 1962 }, { "epoch": 0.26729302832244006, "grad_norm": 40.874745231353224, "learning_rate": 7.337649422404406e-07, "logits/chosen": 10.28596019744873, "logits/rejected": 9.714948654174805, "logps/chosen": -2.9750635623931885, "logps/rejected": -2.977287530899048, "loss": 4.0203, "rewards/accuracies": 0.5, "rewards/chosen": -29.750635147094727, "rewards/margins": 0.022240161895751953, "rewards/rejected": -29.77287483215332, "step": 1963 }, { "epoch": 0.26742919389978215, "grad_norm": 38.54429371232205, "learning_rate": 7.33660110501679e-07, "logits/chosen": 10.095830917358398, "logits/rejected": 10.990468978881836, "logps/chosen": -2.879361152648926, "logps/rejected": -3.3605360984802246, "loss": 3.9344, "rewards/accuracies": 0.75, "rewards/chosen": -28.793611526489258, "rewards/margins": 4.811748027801514, "rewards/rejected": -33.60536193847656, "step": 1964 }, { "epoch": 0.2675653594771242, "grad_norm": 40.02683159144431, "learning_rate": 7.335552033696572e-07, "logits/chosen": 10.053153991699219, "logits/rejected": 10.197141647338867, "logps/chosen": -2.8704230785369873, "logps/rejected": -3.0564498901367188, "loss": 4.0526, "rewards/accuracies": 0.5, "rewards/chosen": -28.70423126220703, "rewards/margins": 1.8602681159973145, "rewards/rejected": -30.564498901367188, "step": 1965 }, { "epoch": 0.2677015250544662, "grad_norm": 43.09395983460282, "learning_rate": 7.334502208680801e-07, "logits/chosen": 10.176595687866211, "logits/rejected": 10.43348503112793, "logps/chosen": -3.392711639404297, "logps/rejected": -3.5599589347839355, "loss": 4.5195, "rewards/accuracies": 0.5, "rewards/chosen": -33.92711639404297, "rewards/margins": 1.6724739074707031, "rewards/rejected": -35.59959030151367, "step": 1966 }, { "epoch": 0.2678376906318083, "grad_norm": 45.131534452200476, "learning_rate": 7.333451630206692e-07, "logits/chosen": 10.317317008972168, "logits/rejected": 10.534451484680176, "logps/chosen": -3.165498733520508, "logps/rejected": -3.1120457649230957, "loss": 3.4413, "rewards/accuracies": 0.5, "rewards/chosen": -31.654987335205078, "rewards/margins": -0.5345301628112793, "rewards/rejected": -31.120458602905273, "step": 1967 }, { "epoch": 0.2679738562091503, "grad_norm": 42.5126101524061, "learning_rate": 7.332400298511633e-07, "logits/chosen": 9.708734512329102, "logits/rejected": 10.70213508605957, "logps/chosen": -3.00101900100708, "logps/rejected": -3.1323418617248535, "loss": 3.8574, "rewards/accuracies": 0.75, "rewards/chosen": -30.01018714904785, "rewards/margins": 1.3132305145263672, "rewards/rejected": -31.32341766357422, "step": 1968 }, { "epoch": 0.26811002178649235, "grad_norm": 42.12485098708811, "learning_rate": 7.33134821383318e-07, "logits/chosen": 10.17334270477295, "logits/rejected": 10.439173698425293, "logps/chosen": -3.1371922492980957, "logps/rejected": -3.0636978149414062, "loss": 4.1608, "rewards/accuracies": 0.25, "rewards/chosen": -31.37192153930664, "rewards/margins": -0.7349433898925781, "rewards/rejected": -30.636978149414062, "step": 1969 }, { "epoch": 0.26824618736383443, "grad_norm": 43.691620612762485, "learning_rate": 7.330295376409061e-07, "logits/chosen": 10.067503929138184, "logits/rejected": 10.315563201904297, "logps/chosen": -2.694467067718506, "logps/rejected": -2.9869327545166016, "loss": 4.0636, "rewards/accuracies": 0.75, "rewards/chosen": -26.944673538208008, "rewards/margins": 2.9246554374694824, "rewards/rejected": -29.869327545166016, "step": 1970 }, { "epoch": 0.26838235294117646, "grad_norm": 41.91160310407085, "learning_rate": 7.329241786477175e-07, "logits/chosen": 9.906646728515625, "logits/rejected": 10.435741424560547, "logps/chosen": -3.1063895225524902, "logps/rejected": -3.231039524078369, "loss": 4.2754, "rewards/accuracies": 0.5, "rewards/chosen": -31.06389617919922, "rewards/margins": 1.2464966773986816, "rewards/rejected": -32.310394287109375, "step": 1971 }, { "epoch": 0.26851851851851855, "grad_norm": 42.80554314493543, "learning_rate": 7.328187444275586e-07, "logits/chosen": 10.847471237182617, "logits/rejected": 10.652091979980469, "logps/chosen": -2.8586983680725098, "logps/rejected": -2.9718685150146484, "loss": 3.9054, "rewards/accuracies": 0.75, "rewards/chosen": -28.58698272705078, "rewards/margins": 1.1317028999328613, "rewards/rejected": -29.718685150146484, "step": 1972 }, { "epoch": 0.2686546840958606, "grad_norm": 41.933319321246564, "learning_rate": 7.327132350042533e-07, "logits/chosen": 10.115180969238281, "logits/rejected": 10.207736015319824, "logps/chosen": -2.871335983276367, "logps/rejected": -3.17295503616333, "loss": 4.1409, "rewards/accuracies": 0.75, "rewards/chosen": -28.713361740112305, "rewards/margins": 3.016188621520996, "rewards/rejected": -31.729549407958984, "step": 1973 }, { "epoch": 0.2687908496732026, "grad_norm": 43.99959415406064, "learning_rate": 7.326076504016424e-07, "logits/chosen": 10.884143829345703, "logits/rejected": 10.303465843200684, "logps/chosen": -3.4953935146331787, "logps/rejected": -3.3167076110839844, "loss": 4.3669, "rewards/accuracies": 0.0, "rewards/chosen": -34.95393371582031, "rewards/margins": -1.7868585586547852, "rewards/rejected": -33.167076110839844, "step": 1974 }, { "epoch": 0.2689270152505447, "grad_norm": 39.913330377726695, "learning_rate": 7.325019906435834e-07, "logits/chosen": 10.30241584777832, "logits/rejected": 9.482256889343262, "logps/chosen": -2.7758383750915527, "logps/rejected": -3.137592077255249, "loss": 3.6889, "rewards/accuracies": 1.0, "rewards/chosen": -27.758386611938477, "rewards/margins": 3.6175360679626465, "rewards/rejected": -31.37592124938965, "step": 1975 }, { "epoch": 0.2690631808278867, "grad_norm": 45.71023316345587, "learning_rate": 7.323962557539512e-07, "logits/chosen": 8.964893341064453, "logits/rejected": 7.671968460083008, "logps/chosen": -2.8508262634277344, "logps/rejected": -2.644958019256592, "loss": 4.3723, "rewards/accuracies": 0.5, "rewards/chosen": -28.508262634277344, "rewards/margins": -2.0586838722229004, "rewards/rejected": -26.449581146240234, "step": 1976 }, { "epoch": 0.26919934640522875, "grad_norm": 44.835332904301914, "learning_rate": 7.322904457566373e-07, "logits/chosen": 10.230241775512695, "logits/rejected": 10.967496871948242, "logps/chosen": -3.1908345222473145, "logps/rejected": -3.357865333557129, "loss": 4.834, "rewards/accuracies": 0.75, "rewards/chosen": -31.90834617614746, "rewards/margins": 1.670309066772461, "rewards/rejected": -33.57865524291992, "step": 1977 }, { "epoch": 0.26933551198257083, "grad_norm": 43.655514344677826, "learning_rate": 7.321845606755506e-07, "logits/chosen": 10.805900573730469, "logits/rejected": 10.61413288116455, "logps/chosen": -3.0552010536193848, "logps/rejected": -2.9037551879882812, "loss": 3.6898, "rewards/accuracies": 0.25, "rewards/chosen": -30.552011489868164, "rewards/margins": -1.514460563659668, "rewards/rejected": -29.037551879882812, "step": 1978 }, { "epoch": 0.26947167755991286, "grad_norm": 44.222695593753656, "learning_rate": 7.320786005346164e-07, "logits/chosen": 10.124856948852539, "logits/rejected": 10.650636672973633, "logps/chosen": -2.711904525756836, "logps/rejected": -3.1121838092803955, "loss": 4.4177, "rewards/accuracies": 0.75, "rewards/chosen": -27.119047164916992, "rewards/margins": 4.002790927886963, "rewards/rejected": -31.121837615966797, "step": 1979 }, { "epoch": 0.2696078431372549, "grad_norm": 44.89558093795129, "learning_rate": 7.319725653577776e-07, "logits/chosen": 10.99325180053711, "logits/rejected": 10.797346115112305, "logps/chosen": -3.4399569034576416, "logps/rejected": -2.9325313568115234, "loss": 4.2331, "rewards/accuracies": 0.0, "rewards/chosen": -34.39957046508789, "rewards/margins": -5.074255466461182, "rewards/rejected": -29.325313568115234, "step": 1980 }, { "epoch": 0.269744008714597, "grad_norm": 47.42179540346833, "learning_rate": 7.318664551689935e-07, "logits/chosen": 9.508293151855469, "logits/rejected": 9.09864616394043, "logps/chosen": -3.0890679359436035, "logps/rejected": -3.0644819736480713, "loss": 3.7178, "rewards/accuracies": 0.5, "rewards/chosen": -30.89068031311035, "rewards/margins": -0.24586200714111328, "rewards/rejected": -30.644817352294922, "step": 1981 }, { "epoch": 0.269880174291939, "grad_norm": 42.77163615251112, "learning_rate": 7.317602699922404e-07, "logits/chosen": 10.833158493041992, "logits/rejected": 9.887293815612793, "logps/chosen": -3.7263896465301514, "logps/rejected": -3.28466796875, "loss": 4.1738, "rewards/accuracies": 0.0, "rewards/chosen": -37.26389694213867, "rewards/margins": -4.417214393615723, "rewards/rejected": -32.8466796875, "step": 1982 }, { "epoch": 0.27001633986928103, "grad_norm": 47.74780676292056, "learning_rate": 7.316540098515122e-07, "logits/chosen": 10.428991317749023, "logits/rejected": 11.180028915405273, "logps/chosen": -3.1120715141296387, "logps/rejected": -3.2746400833129883, "loss": 3.7643, "rewards/accuracies": 0.75, "rewards/chosen": -31.120716094970703, "rewards/margins": 1.6256847381591797, "rewards/rejected": -32.74639892578125, "step": 1983 }, { "epoch": 0.2701525054466231, "grad_norm": 40.84545827020503, "learning_rate": 7.315476747708189e-07, "logits/chosen": 10.619626998901367, "logits/rejected": 9.789139747619629, "logps/chosen": -3.3132967948913574, "logps/rejected": -2.9358460903167725, "loss": 4.525, "rewards/accuracies": 0.25, "rewards/chosen": -33.132965087890625, "rewards/margins": -3.774507999420166, "rewards/rejected": -29.35845947265625, "step": 1984 }, { "epoch": 0.27028867102396514, "grad_norm": 39.97442201782205, "learning_rate": 7.314412647741879e-07, "logits/chosen": 9.659608840942383, "logits/rejected": 10.606571197509766, "logps/chosen": -3.168344497680664, "logps/rejected": -3.29693865776062, "loss": 4.0777, "rewards/accuracies": 0.5, "rewards/chosen": -31.68344497680664, "rewards/margins": 1.2859420776367188, "rewards/rejected": -32.96938705444336, "step": 1985 }, { "epoch": 0.2704248366013072, "grad_norm": 43.8513289289414, "learning_rate": 7.313347798856632e-07, "logits/chosen": 9.986701965332031, "logits/rejected": 9.459875106811523, "logps/chosen": -3.0685548782348633, "logps/rejected": -3.3784451484680176, "loss": 4.2791, "rewards/accuracies": 0.75, "rewards/chosen": -30.685548782348633, "rewards/margins": 3.09890079498291, "rewards/rejected": -33.78445053100586, "step": 1986 }, { "epoch": 0.27056100217864926, "grad_norm": 37.04275785838416, "learning_rate": 7.312282201293063e-07, "logits/chosen": 10.090629577636719, "logits/rejected": 10.689542770385742, "logps/chosen": -2.819549798965454, "logps/rejected": -3.354344606399536, "loss": 3.6009, "rewards/accuracies": 1.0, "rewards/chosen": -28.195499420166016, "rewards/margins": 5.3479485511779785, "rewards/rejected": -33.5434455871582, "step": 1987 }, { "epoch": 0.2706971677559913, "grad_norm": 41.24358383414636, "learning_rate": 7.311215855291952e-07, "logits/chosen": 10.09316635131836, "logits/rejected": 9.884307861328125, "logps/chosen": -3.273221492767334, "logps/rejected": -3.4932985305786133, "loss": 3.3531, "rewards/accuracies": 0.75, "rewards/chosen": -32.732215881347656, "rewards/margins": 2.200770378112793, "rewards/rejected": -34.9329833984375, "step": 1988 }, { "epoch": 0.2708333333333333, "grad_norm": 42.566697057859464, "learning_rate": 7.310148761094246e-07, "logits/chosen": 10.306785583496094, "logits/rejected": 9.908737182617188, "logps/chosen": -3.1456522941589355, "logps/rejected": -3.227985382080078, "loss": 4.2198, "rewards/accuracies": 0.75, "rewards/chosen": -31.456520080566406, "rewards/margins": 0.8233318328857422, "rewards/rejected": -32.27985382080078, "step": 1989 }, { "epoch": 0.2709694989106754, "grad_norm": 40.46945153719614, "learning_rate": 7.309080918941068e-07, "logits/chosen": 9.485983848571777, "logits/rejected": 10.327064514160156, "logps/chosen": -3.4530348777770996, "logps/rejected": -3.4129421710968018, "loss": 4.2714, "rewards/accuracies": 0.5, "rewards/chosen": -34.53034973144531, "rewards/margins": -0.4009284973144531, "rewards/rejected": -34.12942123413086, "step": 1990 }, { "epoch": 0.27110566448801743, "grad_norm": 46.76377522431343, "learning_rate": 7.308012329073701e-07, "logits/chosen": 9.273896217346191, "logits/rejected": 10.113718032836914, "logps/chosen": -3.0194060802459717, "logps/rejected": -3.2451469898223877, "loss": 4.0877, "rewards/accuracies": 0.75, "rewards/chosen": -30.194061279296875, "rewards/margins": 2.2574081420898438, "rewards/rejected": -32.45146942138672, "step": 1991 }, { "epoch": 0.27124183006535946, "grad_norm": 48.82495029584682, "learning_rate": 7.306942991733605e-07, "logits/chosen": 10.49443531036377, "logits/rejected": 10.602264404296875, "logps/chosen": -3.3928542137145996, "logps/rejected": -3.5589213371276855, "loss": 4.4201, "rewards/accuracies": 0.5, "rewards/chosen": -33.92854309082031, "rewards/margins": 1.6606721878051758, "rewards/rejected": -35.58921432495117, "step": 1992 }, { "epoch": 0.27137799564270154, "grad_norm": 42.494417590434004, "learning_rate": 7.305872907162405e-07, "logits/chosen": 9.792900085449219, "logits/rejected": 11.067964553833008, "logps/chosen": -3.3646492958068848, "logps/rejected": -3.726170063018799, "loss": 4.2094, "rewards/accuracies": 1.0, "rewards/chosen": -33.64649200439453, "rewards/margins": 3.6152071952819824, "rewards/rejected": -37.26170349121094, "step": 1993 }, { "epoch": 0.27151416122004357, "grad_norm": 47.03967807477871, "learning_rate": 7.304802075601893e-07, "logits/chosen": 9.83702278137207, "logits/rejected": 10.449195861816406, "logps/chosen": -3.1987204551696777, "logps/rejected": -3.5032076835632324, "loss": 3.9694, "rewards/accuracies": 0.5, "rewards/chosen": -31.987201690673828, "rewards/margins": 3.0448737144470215, "rewards/rejected": -35.03207778930664, "step": 1994 }, { "epoch": 0.2716503267973856, "grad_norm": 43.23677336095055, "learning_rate": 7.303730497294035e-07, "logits/chosen": 10.973318099975586, "logits/rejected": 10.799219131469727, "logps/chosen": -3.153029680252075, "logps/rejected": -3.1357946395874023, "loss": 3.8822, "rewards/accuracies": 0.5, "rewards/chosen": -31.530296325683594, "rewards/margins": -0.17235040664672852, "rewards/rejected": -31.357946395874023, "step": 1995 }, { "epoch": 0.2717864923747277, "grad_norm": 52.42525702433401, "learning_rate": 7.302658172480963e-07, "logits/chosen": 10.78586196899414, "logits/rejected": 10.633350372314453, "logps/chosen": -3.108877182006836, "logps/rejected": -3.617509126663208, "loss": 4.5184, "rewards/accuracies": 1.0, "rewards/chosen": -31.08877182006836, "rewards/margins": 5.0863165855407715, "rewards/rejected": -36.175086975097656, "step": 1996 }, { "epoch": 0.2719226579520697, "grad_norm": 44.573575908482944, "learning_rate": 7.301585101404976e-07, "logits/chosen": 9.817386627197266, "logits/rejected": 11.483367919921875, "logps/chosen": -3.1455931663513184, "logps/rejected": -3.5418155193328857, "loss": 3.7266, "rewards/accuracies": 1.0, "rewards/chosen": -31.4559326171875, "rewards/margins": 3.962224006652832, "rewards/rejected": -35.418155670166016, "step": 1997 }, { "epoch": 0.27205882352941174, "grad_norm": 54.29143578851819, "learning_rate": 7.300511284308545e-07, "logits/chosen": 10.87348747253418, "logits/rejected": 10.243392944335938, "logps/chosen": -3.548027515411377, "logps/rejected": -3.3053839206695557, "loss": 3.6583, "rewards/accuracies": 0.25, "rewards/chosen": -35.48027420043945, "rewards/margins": -2.4264354705810547, "rewards/rejected": -33.05384063720703, "step": 1998 }, { "epoch": 0.2721949891067538, "grad_norm": 47.287857194535974, "learning_rate": 7.299436721434305e-07, "logits/chosen": 10.180253982543945, "logits/rejected": 10.116153717041016, "logps/chosen": -3.2518091201782227, "logps/rejected": -3.4091742038726807, "loss": 3.9299, "rewards/accuracies": 0.75, "rewards/chosen": -32.51809310913086, "rewards/margins": 1.5736498832702637, "rewards/rejected": -34.09174346923828, "step": 1999 }, { "epoch": 0.27233115468409586, "grad_norm": 45.55884499436885, "learning_rate": 7.298361413025068e-07, "logits/chosen": 10.132156372070312, "logits/rejected": 9.913487434387207, "logps/chosen": -3.138246536254883, "logps/rejected": -3.394822120666504, "loss": 3.8013, "rewards/accuracies": 0.75, "rewards/chosen": -31.382465362548828, "rewards/margins": 2.565756320953369, "rewards/rejected": -33.948219299316406, "step": 2000 }, { "epoch": 0.2724673202614379, "grad_norm": 40.157304576952406, "learning_rate": 7.297285359323802e-07, "logits/chosen": 9.693090438842773, "logits/rejected": 10.29661750793457, "logps/chosen": -2.972529888153076, "logps/rejected": -3.209840774536133, "loss": 3.6283, "rewards/accuracies": 0.5, "rewards/chosen": -29.725299835205078, "rewards/margins": 2.37310791015625, "rewards/rejected": -32.098411560058594, "step": 2001 }, { "epoch": 0.27260348583877997, "grad_norm": 40.71341098826601, "learning_rate": 7.296208560573654e-07, "logits/chosen": 10.649511337280273, "logits/rejected": 10.458353042602539, "logps/chosen": -3.568756580352783, "logps/rejected": -3.492060661315918, "loss": 3.7266, "rewards/accuracies": 0.5, "rewards/chosen": -35.687564849853516, "rewards/margins": -0.7669601440429688, "rewards/rejected": -34.92060470581055, "step": 2002 }, { "epoch": 0.272739651416122, "grad_norm": 46.451197832555906, "learning_rate": 7.295131017017936e-07, "logits/chosen": 9.461191177368164, "logits/rejected": 11.329160690307617, "logps/chosen": -2.8933024406433105, "logps/rejected": -3.2745752334594727, "loss": 3.4869, "rewards/accuracies": 0.75, "rewards/chosen": -28.933025360107422, "rewards/margins": 3.8127284049987793, "rewards/rejected": -32.74575424194336, "step": 2003 }, { "epoch": 0.272875816993464, "grad_norm": 43.69943439031731, "learning_rate": 7.294052728900126e-07, "logits/chosen": 9.973665237426758, "logits/rejected": 9.059999465942383, "logps/chosen": -3.3557519912719727, "logps/rejected": -2.99973464012146, "loss": 3.689, "rewards/accuracies": 0.25, "rewards/chosen": -33.557518005371094, "rewards/margins": -3.560173511505127, "rewards/rejected": -29.997346878051758, "step": 2004 }, { "epoch": 0.2730119825708061, "grad_norm": 47.43174643068363, "learning_rate": 7.292973696463875e-07, "logits/chosen": 9.6735200881958, "logits/rejected": 9.510699272155762, "logps/chosen": -2.7324771881103516, "logps/rejected": -2.746154308319092, "loss": 4.4607, "rewards/accuracies": 0.25, "rewards/chosen": -27.32477378845215, "rewards/margins": 0.13677024841308594, "rewards/rejected": -27.461544036865234, "step": 2005 }, { "epoch": 0.27314814814814814, "grad_norm": 62.00145748908563, "learning_rate": 7.291893919952995e-07, "logits/chosen": 9.249317169189453, "logits/rejected": 10.347532272338867, "logps/chosen": -2.7424559593200684, "logps/rejected": -3.2874679565429688, "loss": 4.6572, "rewards/accuracies": 1.0, "rewards/chosen": -27.424560546875, "rewards/margins": 5.450119972229004, "rewards/rejected": -32.87467956542969, "step": 2006 }, { "epoch": 0.27328431372549017, "grad_norm": 41.89472899817343, "learning_rate": 7.290813399611475e-07, "logits/chosen": 8.99233627319336, "logits/rejected": 9.709003448486328, "logps/chosen": -3.0976197719573975, "logps/rejected": -3.03520131111145, "loss": 4.0718, "rewards/accuracies": 0.5, "rewards/chosen": -30.976198196411133, "rewards/margins": -0.6241841316223145, "rewards/rejected": -30.352012634277344, "step": 2007 }, { "epoch": 0.27342047930283225, "grad_norm": 52.080190264395334, "learning_rate": 7.289732135683462e-07, "logits/chosen": 9.619110107421875, "logits/rejected": 10.332351684570312, "logps/chosen": -3.2030458450317383, "logps/rejected": -3.266364574432373, "loss": 4.6331, "rewards/accuracies": 0.5, "rewards/chosen": -32.03045654296875, "rewards/margins": 0.6331882476806641, "rewards/rejected": -32.66364669799805, "step": 2008 }, { "epoch": 0.2735566448801743, "grad_norm": 41.149814816490284, "learning_rate": 7.288650128413282e-07, "logits/chosen": 9.087930679321289, "logits/rejected": 9.485458374023438, "logps/chosen": -2.969280958175659, "logps/rejected": -3.079784393310547, "loss": 4.4237, "rewards/accuracies": 0.75, "rewards/chosen": -29.69281005859375, "rewards/margins": 1.1050338745117188, "rewards/rejected": -30.79784393310547, "step": 2009 }, { "epoch": 0.27369281045751637, "grad_norm": 42.84757515853534, "learning_rate": 7.287567378045421e-07, "logits/chosen": 9.351062774658203, "logits/rejected": 8.88620376586914, "logps/chosen": -3.159511089324951, "logps/rejected": -3.169571876525879, "loss": 3.8098, "rewards/accuracies": 0.5, "rewards/chosen": -31.595109939575195, "rewards/margins": 0.10060882568359375, "rewards/rejected": -31.695720672607422, "step": 2010 }, { "epoch": 0.2738289760348584, "grad_norm": 40.954328580219716, "learning_rate": 7.286483884824534e-07, "logits/chosen": 8.856693267822266, "logits/rejected": 8.800870895385742, "logps/chosen": -2.9414143562316895, "logps/rejected": -2.9524874687194824, "loss": 4.0293, "rewards/accuracies": 0.5, "rewards/chosen": -29.41414451599121, "rewards/margins": 0.11073064804077148, "rewards/rejected": -29.52487564086914, "step": 2011 }, { "epoch": 0.2739651416122004, "grad_norm": 119.68904825819683, "learning_rate": 7.285399648995449e-07, "logits/chosen": 10.026503562927246, "logits/rejected": 10.70901870727539, "logps/chosen": -3.173490285873413, "logps/rejected": -3.310499668121338, "loss": 5.9392, "rewards/accuracies": 0.5, "rewards/chosen": -31.73490333557129, "rewards/margins": 1.3700942993164062, "rewards/rejected": -33.10499954223633, "step": 2012 }, { "epoch": 0.2741013071895425, "grad_norm": 44.74714626269058, "learning_rate": 7.284314670803156e-07, "logits/chosen": 9.508905410766602, "logits/rejected": 9.06275749206543, "logps/chosen": -2.9808883666992188, "logps/rejected": -2.9762139320373535, "loss": 4.2447, "rewards/accuracies": 0.5, "rewards/chosen": -29.808883666992188, "rewards/margins": -0.04674577713012695, "rewards/rejected": -29.76213836669922, "step": 2013 }, { "epoch": 0.27423747276688454, "grad_norm": 44.547430515055524, "learning_rate": 7.283228950492812e-07, "logits/chosen": 10.47416877746582, "logits/rejected": 9.291886329650879, "logps/chosen": -3.2205581665039062, "logps/rejected": -2.999631404876709, "loss": 4.114, "rewards/accuracies": 0.25, "rewards/chosen": -32.20558166503906, "rewards/margins": -2.209270477294922, "rewards/rejected": -29.99631118774414, "step": 2014 }, { "epoch": 0.27437363834422657, "grad_norm": 43.94598533028935, "learning_rate": 7.28214248830975e-07, "logits/chosen": 9.001480102539062, "logits/rejected": 9.952157974243164, "logps/chosen": -3.0203804969787598, "logps/rejected": -3.1937851905822754, "loss": 4.5691, "rewards/accuracies": 0.75, "rewards/chosen": -30.203805923461914, "rewards/margins": 1.734046459197998, "rewards/rejected": -31.93785285949707, "step": 2015 }, { "epoch": 0.27450980392156865, "grad_norm": 42.79740801465069, "learning_rate": 7.28105528449946e-07, "logits/chosen": 10.33837890625, "logits/rejected": 10.662575721740723, "logps/chosen": -2.9630420207977295, "logps/rejected": -3.4840035438537598, "loss": 2.9987, "rewards/accuracies": 0.75, "rewards/chosen": -29.630420684814453, "rewards/margins": 5.209615230560303, "rewards/rejected": -34.84003448486328, "step": 2016 }, { "epoch": 0.2746459694989107, "grad_norm": 47.16676872738258, "learning_rate": 7.279967339307608e-07, "logits/chosen": 10.349387168884277, "logits/rejected": 10.91789436340332, "logps/chosen": -3.249378204345703, "logps/rejected": -3.5521154403686523, "loss": 3.7698, "rewards/accuracies": 0.75, "rewards/chosen": -32.49378204345703, "rewards/margins": 3.027374267578125, "rewards/rejected": -35.521156311035156, "step": 2017 }, { "epoch": 0.2747821350762527, "grad_norm": 40.102987327794494, "learning_rate": 7.278878652980024e-07, "logits/chosen": 8.722156524658203, "logits/rejected": 9.65837574005127, "logps/chosen": -2.6574790477752686, "logps/rejected": -3.2280397415161133, "loss": 3.8101, "rewards/accuracies": 0.75, "rewards/chosen": -26.574790954589844, "rewards/margins": 5.705606937408447, "rewards/rejected": -32.2803955078125, "step": 2018 }, { "epoch": 0.2749183006535948, "grad_norm": 41.40604653961379, "learning_rate": 7.277789225762704e-07, "logits/chosen": 9.486993789672852, "logits/rejected": 11.246912002563477, "logps/chosen": -2.7731263637542725, "logps/rejected": -3.2429919242858887, "loss": 4.1459, "rewards/accuracies": 0.75, "rewards/chosen": -27.731266021728516, "rewards/margins": 4.698655605316162, "rewards/rejected": -32.4299201965332, "step": 2019 }, { "epoch": 0.2750544662309368, "grad_norm": 44.232320107623785, "learning_rate": 7.276699057901815e-07, "logits/chosen": 10.79372787475586, "logits/rejected": 10.031235694885254, "logps/chosen": -3.4097278118133545, "logps/rejected": -3.420431613922119, "loss": 3.6114, "rewards/accuracies": 0.5, "rewards/chosen": -34.09727478027344, "rewards/margins": 0.10703706741333008, "rewards/rejected": -34.204315185546875, "step": 2020 }, { "epoch": 0.27519063180827885, "grad_norm": 56.47572556967111, "learning_rate": 7.275608149643687e-07, "logits/chosen": 9.985742568969727, "logits/rejected": 10.36294174194336, "logps/chosen": -3.0621438026428223, "logps/rejected": -3.0677847862243652, "loss": 3.992, "rewards/accuracies": 0.5, "rewards/chosen": -30.62143898010254, "rewards/margins": 0.05641031265258789, "rewards/rejected": -30.677846908569336, "step": 2021 }, { "epoch": 0.27532679738562094, "grad_norm": 47.918064702838144, "learning_rate": 7.274516501234822e-07, "logits/chosen": 10.050941467285156, "logits/rejected": 10.402904510498047, "logps/chosen": -3.078103542327881, "logps/rejected": -3.2431113719940186, "loss": 4.457, "rewards/accuracies": 0.5, "rewards/chosen": -30.781034469604492, "rewards/margins": 1.6500792503356934, "rewards/rejected": -32.431114196777344, "step": 2022 }, { "epoch": 0.27546296296296297, "grad_norm": 40.03394894383659, "learning_rate": 7.273424112921887e-07, "logits/chosen": 9.745207786560059, "logits/rejected": 10.29731559753418, "logps/chosen": -3.2433362007141113, "logps/rejected": -3.508756637573242, "loss": 3.638, "rewards/accuracies": 1.0, "rewards/chosen": -32.43336486816406, "rewards/margins": 2.654204845428467, "rewards/rejected": -35.08757019042969, "step": 2023 }, { "epoch": 0.275599128540305, "grad_norm": 41.812207836190545, "learning_rate": 7.272330984951714e-07, "logits/chosen": 10.065227508544922, "logits/rejected": 10.09495735168457, "logps/chosen": -3.1843888759613037, "logps/rejected": -3.3878982067108154, "loss": 3.8694, "rewards/accuracies": 0.5, "rewards/chosen": -31.843887329101562, "rewards/margins": 2.035095691680908, "rewards/rejected": -33.87898254394531, "step": 2024 }, { "epoch": 0.2757352941176471, "grad_norm": 56.927895263938616, "learning_rate": 7.271237117571306e-07, "logits/chosen": 10.161417007446289, "logits/rejected": 11.861520767211914, "logps/chosen": -3.3486366271972656, "logps/rejected": -3.8758697509765625, "loss": 3.8497, "rewards/accuracies": 0.75, "rewards/chosen": -33.486366271972656, "rewards/margins": 5.272334098815918, "rewards/rejected": -38.758697509765625, "step": 2025 }, { "epoch": 0.2758714596949891, "grad_norm": 41.303926892893585, "learning_rate": 7.27014251102783e-07, "logits/chosen": 11.092942237854004, "logits/rejected": 11.345327377319336, "logps/chosen": -3.5436034202575684, "logps/rejected": -3.9519639015197754, "loss": 4.0182, "rewards/accuracies": 1.0, "rewards/chosen": -35.43603515625, "rewards/margins": 4.0836029052734375, "rewards/rejected": -39.51963806152344, "step": 2026 }, { "epoch": 0.27600762527233114, "grad_norm": 48.71708457212026, "learning_rate": 7.269047165568623e-07, "logits/chosen": 11.486516952514648, "logits/rejected": 11.087610244750977, "logps/chosen": -3.1951918601989746, "logps/rejected": -3.243177890777588, "loss": 4.2033, "rewards/accuracies": 0.5, "rewards/chosen": -31.95191764831543, "rewards/margins": 0.4798617362976074, "rewards/rejected": -32.43177795410156, "step": 2027 }, { "epoch": 0.2761437908496732, "grad_norm": 48.72380909358682, "learning_rate": 7.267951081441188e-07, "logits/chosen": 10.484138488769531, "logits/rejected": 10.439075469970703, "logps/chosen": -3.459653377532959, "logps/rejected": -3.62625789642334, "loss": 4.4421, "rewards/accuracies": 0.75, "rewards/chosen": -34.596534729003906, "rewards/margins": 1.6660447120666504, "rewards/rejected": -36.262577056884766, "step": 2028 }, { "epoch": 0.27627995642701525, "grad_norm": 45.676882085597654, "learning_rate": 7.266854258893191e-07, "logits/chosen": 10.189546585083008, "logits/rejected": 10.0985107421875, "logps/chosen": -3.0454375743865967, "logps/rejected": -3.1282405853271484, "loss": 3.949, "rewards/accuracies": 0.75, "rewards/chosen": -30.454376220703125, "rewards/margins": 0.8280305862426758, "rewards/rejected": -31.282405853271484, "step": 2029 }, { "epoch": 0.2764161220043573, "grad_norm": 47.28290430043637, "learning_rate": 7.26575669817247e-07, "logits/chosen": 11.048019409179688, "logits/rejected": 10.073013305664062, "logps/chosen": -3.415586471557617, "logps/rejected": -3.2660329341888428, "loss": 4.0005, "rewards/accuracies": 0.5, "rewards/chosen": -34.15586471557617, "rewards/margins": -1.4955368041992188, "rewards/rejected": -32.66033172607422, "step": 2030 }, { "epoch": 0.27655228758169936, "grad_norm": 55.068869790579264, "learning_rate": 7.264658399527031e-07, "logits/chosen": 10.813800811767578, "logits/rejected": 11.004657745361328, "logps/chosen": -3.4854390621185303, "logps/rejected": -3.28957462310791, "loss": 4.2229, "rewards/accuracies": 0.0, "rewards/chosen": -34.85438919067383, "rewards/margins": -1.9586434364318848, "rewards/rejected": -32.89574432373047, "step": 2031 }, { "epoch": 0.2766884531590414, "grad_norm": 46.703648760997226, "learning_rate": 7.263559363205038e-07, "logits/chosen": 10.128730773925781, "logits/rejected": 10.276580810546875, "logps/chosen": -2.692640781402588, "logps/rejected": -2.816324472427368, "loss": 3.8414, "rewards/accuracies": 0.75, "rewards/chosen": -26.926408767700195, "rewards/margins": 1.2368354797363281, "rewards/rejected": -28.163244247436523, "step": 2032 }, { "epoch": 0.2768246187363834, "grad_norm": 46.745271072199785, "learning_rate": 7.26245958945483e-07, "logits/chosen": 11.4893217086792, "logits/rejected": 11.659765243530273, "logps/chosen": -3.575401782989502, "logps/rejected": -3.632902145385742, "loss": 4.2156, "rewards/accuracies": 0.75, "rewards/chosen": -35.7540168762207, "rewards/margins": 0.5750026702880859, "rewards/rejected": -36.329017639160156, "step": 2033 }, { "epoch": 0.2769607843137255, "grad_norm": 55.53816492515712, "learning_rate": 7.261359078524912e-07, "logits/chosen": 11.160947799682617, "logits/rejected": 11.408221244812012, "logps/chosen": -3.3920836448669434, "logps/rejected": -3.6192288398742676, "loss": 3.9946, "rewards/accuracies": 0.75, "rewards/chosen": -33.92083740234375, "rewards/margins": 2.2714505195617676, "rewards/rejected": -36.192283630371094, "step": 2034 }, { "epoch": 0.27709694989106753, "grad_norm": 44.74991135960051, "learning_rate": 7.260257830663949e-07, "logits/chosen": 10.565815925598145, "logits/rejected": 11.86630916595459, "logps/chosen": -2.9973180294036865, "logps/rejected": -3.646707057952881, "loss": 4.1079, "rewards/accuracies": 0.75, "rewards/chosen": -29.973182678222656, "rewards/margins": 6.493890285491943, "rewards/rejected": -36.467071533203125, "step": 2035 }, { "epoch": 0.27723311546840956, "grad_norm": 46.24081637654435, "learning_rate": 7.259155846120781e-07, "logits/chosen": 10.796895980834961, "logits/rejected": 11.767656326293945, "logps/chosen": -3.170259952545166, "logps/rejected": -3.4192090034484863, "loss": 3.964, "rewards/accuracies": 0.75, "rewards/chosen": -31.702600479125977, "rewards/margins": 2.4894886016845703, "rewards/rejected": -34.19208908081055, "step": 2036 }, { "epoch": 0.27736928104575165, "grad_norm": 45.477265381330405, "learning_rate": 7.258053125144409e-07, "logits/chosen": 11.539351463317871, "logits/rejected": 11.533016204833984, "logps/chosen": -3.104193687438965, "logps/rejected": -3.3890044689178467, "loss": 3.648, "rewards/accuracies": 1.0, "rewards/chosen": -31.04193687438965, "rewards/margins": 2.8481078147888184, "rewards/rejected": -33.890045166015625, "step": 2037 }, { "epoch": 0.2775054466230937, "grad_norm": 44.982317173642606, "learning_rate": 7.256949667984003e-07, "logits/chosen": 10.692113876342773, "logits/rejected": 10.723516464233398, "logps/chosen": -3.327004909515381, "logps/rejected": -3.437248706817627, "loss": 3.9559, "rewards/accuracies": 0.5, "rewards/chosen": -33.270050048828125, "rewards/margins": 1.102437973022461, "rewards/rejected": -34.37248611450195, "step": 2038 }, { "epoch": 0.2776416122004357, "grad_norm": 45.203767725678496, "learning_rate": 7.255845474888895e-07, "logits/chosen": 11.353498458862305, "logits/rejected": 10.650814056396484, "logps/chosen": -3.7698981761932373, "logps/rejected": -3.865610122680664, "loss": 4.1098, "rewards/accuracies": 0.75, "rewards/chosen": -37.69898223876953, "rewards/margins": 0.9571223258972168, "rewards/rejected": -38.65610122680664, "step": 2039 }, { "epoch": 0.2777777777777778, "grad_norm": 42.682973431685326, "learning_rate": 7.254740546108591e-07, "logits/chosen": 10.082803726196289, "logits/rejected": 10.665252685546875, "logps/chosen": -3.0956132411956787, "logps/rejected": -3.2210779190063477, "loss": 3.7555, "rewards/accuracies": 0.75, "rewards/chosen": -30.956132888793945, "rewards/margins": 1.2546448707580566, "rewards/rejected": -32.210777282714844, "step": 2040 }, { "epoch": 0.2779139433551198, "grad_norm": 46.26997236790007, "learning_rate": 7.253634881892755e-07, "logits/chosen": 9.497200965881348, "logits/rejected": 10.458972930908203, "logps/chosen": -2.879244327545166, "logps/rejected": -3.2354652881622314, "loss": 4.0311, "rewards/accuracies": 1.0, "rewards/chosen": -28.792442321777344, "rewards/margins": 3.562209129333496, "rewards/rejected": -32.354652404785156, "step": 2041 }, { "epoch": 0.27805010893246185, "grad_norm": 44.85806558939768, "learning_rate": 7.252528482491224e-07, "logits/chosen": 11.165983200073242, "logits/rejected": 11.578134536743164, "logps/chosen": -3.3943305015563965, "logps/rejected": -3.5085346698760986, "loss": 3.4084, "rewards/accuracies": 0.75, "rewards/chosen": -33.94330596923828, "rewards/margins": 1.1420412063598633, "rewards/rejected": -35.08534622192383, "step": 2042 }, { "epoch": 0.27818627450980393, "grad_norm": 45.1318450362185, "learning_rate": 7.251421348153996e-07, "logits/chosen": 10.2318754196167, "logits/rejected": 11.471602439880371, "logps/chosen": -3.2223541736602783, "logps/rejected": -3.6753122806549072, "loss": 3.3286, "rewards/accuracies": 1.0, "rewards/chosen": -32.223541259765625, "rewards/margins": 4.529581069946289, "rewards/rejected": -36.75312042236328, "step": 2043 }, { "epoch": 0.27832244008714596, "grad_norm": 57.80631110689802, "learning_rate": 7.250313479131238e-07, "logits/chosen": 10.987954139709473, "logits/rejected": 11.144309997558594, "logps/chosen": -3.485874891281128, "logps/rejected": -3.737119197845459, "loss": 3.8731, "rewards/accuracies": 0.75, "rewards/chosen": -34.85874557495117, "rewards/margins": 2.512441635131836, "rewards/rejected": -37.37118911743164, "step": 2044 }, { "epoch": 0.278458605664488, "grad_norm": 48.884888897014584, "learning_rate": 7.249204875673282e-07, "logits/chosen": 10.34976577758789, "logits/rejected": 10.738727569580078, "logps/chosen": -3.139967441558838, "logps/rejected": -3.2270936965942383, "loss": 4.0746, "rewards/accuracies": 0.5, "rewards/chosen": -31.399675369262695, "rewards/margins": 0.8712620735168457, "rewards/rejected": -32.270938873291016, "step": 2045 }, { "epoch": 0.2785947712418301, "grad_norm": 49.75695300755649, "learning_rate": 7.248095538030626e-07, "logits/chosen": 11.787881851196289, "logits/rejected": 11.092042922973633, "logps/chosen": -3.2812492847442627, "logps/rejected": -3.4771904945373535, "loss": 4.4969, "rewards/accuracies": 0.5, "rewards/chosen": -32.81249237060547, "rewards/margins": 1.9594106674194336, "rewards/rejected": -34.77190399169922, "step": 2046 }, { "epoch": 0.2787309368191721, "grad_norm": 45.08387816393598, "learning_rate": 7.246985466453934e-07, "logits/chosen": 10.927677154541016, "logits/rejected": 10.590829849243164, "logps/chosen": -3.8233256340026855, "logps/rejected": -3.648876667022705, "loss": 4.0516, "rewards/accuracies": 0.5, "rewards/chosen": -38.233253479003906, "rewards/margins": -1.7444887161254883, "rewards/rejected": -36.488765716552734, "step": 2047 }, { "epoch": 0.2788671023965142, "grad_norm": 40.36223955115329, "learning_rate": 7.245874661194037e-07, "logits/chosen": 9.752860069274902, "logits/rejected": 11.164066314697266, "logps/chosen": -3.012495279312134, "logps/rejected": -2.989396333694458, "loss": 3.7432, "rewards/accuracies": 0.25, "rewards/chosen": -30.12495231628418, "rewards/margins": -0.23098993301391602, "rewards/rejected": -29.893962860107422, "step": 2048 }, { "epoch": 0.2790032679738562, "grad_norm": 48.42623687094657, "learning_rate": 7.244763122501928e-07, "logits/chosen": 10.212675094604492, "logits/rejected": 11.188398361206055, "logps/chosen": -3.0217511653900146, "logps/rejected": -3.387882709503174, "loss": 3.8944, "rewards/accuracies": 0.75, "rewards/chosen": -30.217510223388672, "rewards/margins": 3.6613149642944336, "rewards/rejected": -33.87882614135742, "step": 2049 }, { "epoch": 0.27913943355119825, "grad_norm": 41.717332604201914, "learning_rate": 7.243650850628771e-07, "logits/chosen": 10.989557266235352, "logits/rejected": 11.168245315551758, "logps/chosen": -3.445178270339966, "logps/rejected": -3.3392205238342285, "loss": 3.8432, "rewards/accuracies": 0.5, "rewards/chosen": -34.4517822265625, "rewards/margins": -1.0595793724060059, "rewards/rejected": -33.39220428466797, "step": 2050 }, { "epoch": 0.27927559912854033, "grad_norm": 43.11377187482457, "learning_rate": 7.242537845825891e-07, "logits/chosen": 11.366989135742188, "logits/rejected": 10.947468757629395, "logps/chosen": -3.7931761741638184, "logps/rejected": -3.3737363815307617, "loss": 3.9688, "rewards/accuracies": 0.0, "rewards/chosen": -37.9317626953125, "rewards/margins": -4.19439697265625, "rewards/rejected": -33.73736572265625, "step": 2051 }, { "epoch": 0.27941176470588236, "grad_norm": 44.3163937090065, "learning_rate": 7.241424108344784e-07, "logits/chosen": 10.911855697631836, "logits/rejected": 11.49569320678711, "logps/chosen": -3.7878801822662354, "logps/rejected": -3.8651952743530273, "loss": 4.1921, "rewards/accuracies": 0.75, "rewards/chosen": -37.87879943847656, "rewards/margins": 0.7731513977050781, "rewards/rejected": -38.651954650878906, "step": 2052 }, { "epoch": 0.2795479302832244, "grad_norm": 44.816458159384524, "learning_rate": 7.240309638437104e-07, "logits/chosen": 11.313295364379883, "logits/rejected": 11.891971588134766, "logps/chosen": -3.64695405960083, "logps/rejected": -3.903639316558838, "loss": 4.404, "rewards/accuracies": 0.5, "rewards/chosen": -36.469539642333984, "rewards/margins": 2.5668506622314453, "rewards/rejected": -39.03639221191406, "step": 2053 }, { "epoch": 0.2796840958605665, "grad_norm": 46.73851675471098, "learning_rate": 7.239194436354677e-07, "logits/chosen": 10.866466522216797, "logits/rejected": 10.660722732543945, "logps/chosen": -3.359196901321411, "logps/rejected": -3.4798851013183594, "loss": 4.4227, "rewards/accuracies": 0.75, "rewards/chosen": -33.59196853637695, "rewards/margins": 1.2068824768066406, "rewards/rejected": -34.798851013183594, "step": 2054 }, { "epoch": 0.2798202614379085, "grad_norm": 44.44116523611023, "learning_rate": 7.238078502349491e-07, "logits/chosen": 10.678529739379883, "logits/rejected": 10.90660285949707, "logps/chosen": -3.5138468742370605, "logps/rejected": -3.6071324348449707, "loss": 4.309, "rewards/accuracies": 0.75, "rewards/chosen": -35.138465881347656, "rewards/margins": 0.9328560829162598, "rewards/rejected": -36.07132339477539, "step": 2055 }, { "epoch": 0.27995642701525053, "grad_norm": 41.684828125335855, "learning_rate": 7.236961836673701e-07, "logits/chosen": 12.222993850708008, "logits/rejected": 12.460819244384766, "logps/chosen": -3.411181926727295, "logps/rejected": -3.1839756965637207, "loss": 3.941, "rewards/accuracies": 0.5, "rewards/chosen": -34.111820220947266, "rewards/margins": -2.2720651626586914, "rewards/rejected": -31.83975601196289, "step": 2056 }, { "epoch": 0.2800925925925926, "grad_norm": 37.779701974886486, "learning_rate": 7.23584443957963e-07, "logits/chosen": 11.328814506530762, "logits/rejected": 11.81326961517334, "logps/chosen": -3.5789999961853027, "logps/rejected": -3.616410255432129, "loss": 3.9762, "rewards/accuracies": 0.75, "rewards/chosen": -35.790000915527344, "rewards/margins": 0.37410402297973633, "rewards/rejected": -36.164100646972656, "step": 2057 }, { "epoch": 0.28022875816993464, "grad_norm": 47.45658283732515, "learning_rate": 7.234726311319757e-07, "logits/chosen": 10.872186660766602, "logits/rejected": 11.738412857055664, "logps/chosen": -3.464928150177002, "logps/rejected": -3.6447415351867676, "loss": 4.6981, "rewards/accuracies": 0.5, "rewards/chosen": -34.6492805480957, "rewards/margins": 1.7981328964233398, "rewards/rejected": -36.44741439819336, "step": 2058 }, { "epoch": 0.2803649237472767, "grad_norm": 41.73577117179903, "learning_rate": 7.233607452146737e-07, "logits/chosen": 11.795262336730957, "logits/rejected": 11.516353607177734, "logps/chosen": -3.2608413696289062, "logps/rejected": -3.3691110610961914, "loss": 4.2638, "rewards/accuracies": 0.75, "rewards/chosen": -32.60841369628906, "rewards/margins": 1.0826983451843262, "rewards/rejected": -33.69110870361328, "step": 2059 }, { "epoch": 0.28050108932461876, "grad_norm": 42.50102357176081, "learning_rate": 7.232487862313382e-07, "logits/chosen": 11.085987091064453, "logits/rejected": 11.950462341308594, "logps/chosen": -3.4711592197418213, "logps/rejected": -3.706508159637451, "loss": 4.221, "rewards/accuracies": 0.75, "rewards/chosen": -34.71159362792969, "rewards/margins": 2.353489398956299, "rewards/rejected": -37.06508255004883, "step": 2060 }, { "epoch": 0.2806372549019608, "grad_norm": 38.90315756865195, "learning_rate": 7.231367542072677e-07, "logits/chosen": 11.63935661315918, "logits/rejected": 12.251663208007812, "logps/chosen": -3.490365982055664, "logps/rejected": -3.713557243347168, "loss": 3.6925, "rewards/accuracies": 0.5, "rewards/chosen": -34.90365982055664, "rewards/margins": 2.231914520263672, "rewards/rejected": -37.13557434082031, "step": 2061 }, { "epoch": 0.2807734204793028, "grad_norm": 39.76156247227579, "learning_rate": 7.230246491677762e-07, "logits/chosen": 11.465438842773438, "logits/rejected": 11.80034065246582, "logps/chosen": -3.1738381385803223, "logps/rejected": -3.5051703453063965, "loss": 3.8128, "rewards/accuracies": 0.5, "rewards/chosen": -31.73838233947754, "rewards/margins": 3.3133225440979004, "rewards/rejected": -35.05170440673828, "step": 2062 }, { "epoch": 0.2809095860566449, "grad_norm": 42.453357483521415, "learning_rate": 7.229124711381952e-07, "logits/chosen": 11.759775161743164, "logits/rejected": 11.697175025939941, "logps/chosen": -3.750964641571045, "logps/rejected": -3.865787982940674, "loss": 3.9083, "rewards/accuracies": 0.75, "rewards/chosen": -37.5096435546875, "rewards/margins": 1.1482343673706055, "rewards/rejected": -38.65787887573242, "step": 2063 }, { "epoch": 0.28104575163398693, "grad_norm": 55.58661424990483, "learning_rate": 7.228002201438723e-07, "logits/chosen": 10.610663414001465, "logits/rejected": 10.974113464355469, "logps/chosen": -3.410808563232422, "logps/rejected": -3.530812978744507, "loss": 3.9846, "rewards/accuracies": 0.5, "rewards/chosen": -34.10808563232422, "rewards/margins": 1.2000441551208496, "rewards/rejected": -35.30813217163086, "step": 2064 }, { "epoch": 0.28118191721132896, "grad_norm": 43.451136985672726, "learning_rate": 7.226878962101712e-07, "logits/chosen": 12.302106857299805, "logits/rejected": 12.141172409057617, "logps/chosen": -3.665592670440674, "logps/rejected": -3.789672374725342, "loss": 4.301, "rewards/accuracies": 0.75, "rewards/chosen": -36.65592956542969, "rewards/margins": 1.2407951354980469, "rewards/rejected": -37.89672088623047, "step": 2065 }, { "epoch": 0.28131808278867104, "grad_norm": 44.6707946502697, "learning_rate": 7.225754993624727e-07, "logits/chosen": 11.12503433227539, "logits/rejected": 11.502676010131836, "logps/chosen": -3.1889214515686035, "logps/rejected": -3.429274320602417, "loss": 3.9257, "rewards/accuracies": 0.5, "rewards/chosen": -31.88921546936035, "rewards/margins": 2.40352725982666, "rewards/rejected": -34.29274368286133, "step": 2066 }, { "epoch": 0.28145424836601307, "grad_norm": 48.705213110443694, "learning_rate": 7.224630296261736e-07, "logits/chosen": 10.310937881469727, "logits/rejected": 10.727593421936035, "logps/chosen": -3.474220037460327, "logps/rejected": -3.450951099395752, "loss": 4.455, "rewards/accuracies": 0.25, "rewards/chosen": -34.7421989440918, "rewards/margins": -0.23268890380859375, "rewards/rejected": -34.50951385498047, "step": 2067 }, { "epoch": 0.2815904139433551, "grad_norm": 41.91479032957029, "learning_rate": 7.223504870266875e-07, "logits/chosen": 11.784817695617676, "logits/rejected": 11.208290100097656, "logps/chosen": -3.4656386375427246, "logps/rejected": -3.173889398574829, "loss": 4.4404, "rewards/accuracies": 0.0, "rewards/chosen": -34.65638732910156, "rewards/margins": -2.9174904823303223, "rewards/rejected": -31.738895416259766, "step": 2068 }, { "epoch": 0.2817265795206972, "grad_norm": 40.88537963680473, "learning_rate": 7.222378715894442e-07, "logits/chosen": 11.473581314086914, "logits/rejected": 11.399474143981934, "logps/chosen": -3.265981674194336, "logps/rejected": -3.4489426612854004, "loss": 3.8403, "rewards/accuracies": 0.5, "rewards/chosen": -32.65981674194336, "rewards/margins": 1.8296117782592773, "rewards/rejected": -34.48942565917969, "step": 2069 }, { "epoch": 0.2818627450980392, "grad_norm": 45.434483314516235, "learning_rate": 7.221251833398902e-07, "logits/chosen": 10.279817581176758, "logits/rejected": 10.267242431640625, "logps/chosen": -3.1655020713806152, "logps/rejected": -3.1746463775634766, "loss": 4.1094, "rewards/accuracies": 0.5, "rewards/chosen": -31.65502166748047, "rewards/margins": 0.09144258499145508, "rewards/rejected": -31.746461868286133, "step": 2070 }, { "epoch": 0.28199891067538124, "grad_norm": 50.314801888185656, "learning_rate": 7.220124223034883e-07, "logits/chosen": 10.630815505981445, "logits/rejected": 10.557682037353516, "logps/chosen": -3.4332311153411865, "logps/rejected": -3.321901321411133, "loss": 4.0195, "rewards/accuracies": 0.25, "rewards/chosen": -34.332313537597656, "rewards/margins": -1.1132969856262207, "rewards/rejected": -33.21901321411133, "step": 2071 }, { "epoch": 0.2821350762527233, "grad_norm": 43.92362262309567, "learning_rate": 7.218995885057179e-07, "logits/chosen": 11.541135787963867, "logits/rejected": 11.75316047668457, "logps/chosen": -3.3261148929595947, "logps/rejected": -3.57920241355896, "loss": 4.4623, "rewards/accuracies": 0.5, "rewards/chosen": -33.261146545410156, "rewards/margins": 2.530876636505127, "rewards/rejected": -35.792022705078125, "step": 2072 }, { "epoch": 0.28227124183006536, "grad_norm": 44.70565013143902, "learning_rate": 7.217866819720745e-07, "logits/chosen": 11.094372749328613, "logits/rejected": 11.93753719329834, "logps/chosen": -3.3920278549194336, "logps/rejected": -3.483088970184326, "loss": 4.1194, "rewards/accuracies": 0.75, "rewards/chosen": -33.9202766418457, "rewards/margins": 0.9106130599975586, "rewards/rejected": -34.83088684082031, "step": 2073 }, { "epoch": 0.2824074074074074, "grad_norm": 42.49054602517869, "learning_rate": 7.216737027280704e-07, "logits/chosen": 11.566611289978027, "logits/rejected": 11.856651306152344, "logps/chosen": -3.5408201217651367, "logps/rejected": -3.1845157146453857, "loss": 4.2152, "rewards/accuracies": 0.0, "rewards/chosen": -35.408203125, "rewards/margins": -3.5630440711975098, "rewards/rejected": -31.845157623291016, "step": 2074 }, { "epoch": 0.28254357298474947, "grad_norm": 48.97971447329948, "learning_rate": 7.215606507992342e-07, "logits/chosen": 11.725763320922852, "logits/rejected": 11.923212051391602, "logps/chosen": -3.435180187225342, "logps/rejected": -3.5270843505859375, "loss": 4.5251, "rewards/accuracies": 0.75, "rewards/chosen": -34.351802825927734, "rewards/margins": 0.9190387725830078, "rewards/rejected": -35.27083969116211, "step": 2075 }, { "epoch": 0.2826797385620915, "grad_norm": 38.29491100519231, "learning_rate": 7.214475262111109e-07, "logits/chosen": 11.248998641967773, "logits/rejected": 11.711894989013672, "logps/chosen": -3.2605295181274414, "logps/rejected": -3.4658796787261963, "loss": 4.1614, "rewards/accuracies": 0.5, "rewards/chosen": -32.60529708862305, "rewards/margins": 2.053499698638916, "rewards/rejected": -34.65879821777344, "step": 2076 }, { "epoch": 0.2828159041394335, "grad_norm": 47.52475346166277, "learning_rate": 7.21334328989262e-07, "logits/chosen": 10.26494026184082, "logits/rejected": 10.991219520568848, "logps/chosen": -3.0820116996765137, "logps/rejected": -3.2998874187469482, "loss": 4.3866, "rewards/accuracies": 0.75, "rewards/chosen": -30.820117950439453, "rewards/margins": 2.178755760192871, "rewards/rejected": -32.998870849609375, "step": 2077 }, { "epoch": 0.2829520697167756, "grad_norm": 44.27895412122759, "learning_rate": 7.212210591592653e-07, "logits/chosen": 11.059854507446289, "logits/rejected": 11.847240447998047, "logps/chosen": -3.1412675380706787, "logps/rejected": -3.3664565086364746, "loss": 3.833, "rewards/accuracies": 0.5, "rewards/chosen": -31.412675857543945, "rewards/margins": 2.251889705657959, "rewards/rejected": -33.66456604003906, "step": 2078 }, { "epoch": 0.28308823529411764, "grad_norm": 47.67449695788681, "learning_rate": 7.21107716746715e-07, "logits/chosen": 10.08210277557373, "logits/rejected": 11.118005752563477, "logps/chosen": -2.576733112335205, "logps/rejected": -2.889458179473877, "loss": 3.6885, "rewards/accuracies": 0.75, "rewards/chosen": -25.767333984375, "rewards/margins": 3.127248764038086, "rewards/rejected": -28.894580841064453, "step": 2079 }, { "epoch": 0.28322440087145967, "grad_norm": 46.986537166162215, "learning_rate": 7.209943017772218e-07, "logits/chosen": 11.703779220581055, "logits/rejected": 12.075532913208008, "logps/chosen": -3.4780752658843994, "logps/rejected": -3.631012439727783, "loss": 4.3966, "rewards/accuracies": 0.5, "rewards/chosen": -34.78075408935547, "rewards/margins": 1.529374599456787, "rewards/rejected": -36.31012725830078, "step": 2080 }, { "epoch": 0.28336056644880175, "grad_norm": 44.31135649736584, "learning_rate": 7.208808142764128e-07, "logits/chosen": 9.957310676574707, "logits/rejected": 10.16744613647461, "logps/chosen": -2.6940999031066895, "logps/rejected": -3.0267648696899414, "loss": 4.3566, "rewards/accuracies": 0.75, "rewards/chosen": -26.941001892089844, "rewards/margins": 3.3266468048095703, "rewards/rejected": -30.26764678955078, "step": 2081 }, { "epoch": 0.2834967320261438, "grad_norm": 38.640979853163515, "learning_rate": 7.207672542699314e-07, "logits/chosen": 10.935186386108398, "logits/rejected": 10.981404304504395, "logps/chosen": -2.913482666015625, "logps/rejected": -3.2452445030212402, "loss": 3.9805, "rewards/accuracies": 0.75, "rewards/chosen": -29.13482666015625, "rewards/margins": 3.3176164627075195, "rewards/rejected": -32.45244216918945, "step": 2082 }, { "epoch": 0.2836328976034858, "grad_norm": 43.109318970527504, "learning_rate": 7.206536217834372e-07, "logits/chosen": 11.374443054199219, "logits/rejected": 11.129838943481445, "logps/chosen": -3.380274534225464, "logps/rejected": -3.3861021995544434, "loss": 4.3002, "rewards/accuracies": 0.25, "rewards/chosen": -33.8027458190918, "rewards/margins": 0.058277130126953125, "rewards/rejected": -33.86102294921875, "step": 2083 }, { "epoch": 0.2837690631808279, "grad_norm": 43.239723400218516, "learning_rate": 7.205399168426069e-07, "logits/chosen": 11.561668395996094, "logits/rejected": 10.568626403808594, "logps/chosen": -3.260404586791992, "logps/rejected": -3.0775461196899414, "loss": 4.4239, "rewards/accuracies": 0.25, "rewards/chosen": -32.60404586791992, "rewards/margins": -1.8285856246948242, "rewards/rejected": -30.775461196899414, "step": 2084 }, { "epoch": 0.2839052287581699, "grad_norm": 42.9574036487665, "learning_rate": 7.204261394731326e-07, "logits/chosen": 10.474336624145508, "logits/rejected": 11.271965026855469, "logps/chosen": -3.0620784759521484, "logps/rejected": -3.21087646484375, "loss": 4.5673, "rewards/accuracies": 0.75, "rewards/chosen": -30.620784759521484, "rewards/margins": 1.487978458404541, "rewards/rejected": -32.108760833740234, "step": 2085 }, { "epoch": 0.284041394335512, "grad_norm": 39.926222083647865, "learning_rate": 7.203122897007234e-07, "logits/chosen": 9.050244331359863, "logits/rejected": 10.838600158691406, "logps/chosen": -2.9394912719726562, "logps/rejected": -3.24251651763916, "loss": 3.3757, "rewards/accuracies": 0.75, "rewards/chosen": -29.39491081237793, "rewards/margins": 3.030254364013672, "rewards/rejected": -32.42516326904297, "step": 2086 }, { "epoch": 0.28417755991285404, "grad_norm": 44.22682511681191, "learning_rate": 7.201983675511046e-07, "logits/chosen": 9.689366340637207, "logits/rejected": 10.553110122680664, "logps/chosen": -2.8971099853515625, "logps/rejected": -3.3041250705718994, "loss": 4.4373, "rewards/accuracies": 1.0, "rewards/chosen": -28.971099853515625, "rewards/margins": 4.0701518058776855, "rewards/rejected": -33.04125213623047, "step": 2087 }, { "epoch": 0.28431372549019607, "grad_norm": 41.60873381013074, "learning_rate": 7.20084373050018e-07, "logits/chosen": 11.097051620483398, "logits/rejected": 11.30634593963623, "logps/chosen": -3.1041057109832764, "logps/rejected": -3.3328585624694824, "loss": 4.4895, "rewards/accuracies": 0.75, "rewards/chosen": -31.041057586669922, "rewards/margins": 2.2875285148620605, "rewards/rejected": -33.328582763671875, "step": 2088 }, { "epoch": 0.28444989106753815, "grad_norm": 43.80801713095213, "learning_rate": 7.199703062232214e-07, "logits/chosen": 11.335678100585938, "logits/rejected": 11.764324188232422, "logps/chosen": -3.318833112716675, "logps/rejected": -3.540250539779663, "loss": 3.7392, "rewards/accuracies": 0.75, "rewards/chosen": -33.188331604003906, "rewards/margins": 2.214174270629883, "rewards/rejected": -35.402503967285156, "step": 2089 }, { "epoch": 0.2845860566448802, "grad_norm": 48.262590118070655, "learning_rate": 7.198561670964892e-07, "logits/chosen": 10.183629989624023, "logits/rejected": 10.126514434814453, "logps/chosen": -3.1189095973968506, "logps/rejected": -3.1562891006469727, "loss": 3.9179, "rewards/accuracies": 0.5, "rewards/chosen": -31.189096450805664, "rewards/margins": 0.3737955093383789, "rewards/rejected": -31.56289291381836, "step": 2090 }, { "epoch": 0.2847222222222222, "grad_norm": 41.875518035255894, "learning_rate": 7.19741955695612e-07, "logits/chosen": 9.433666229248047, "logits/rejected": 11.278946876525879, "logps/chosen": -3.1665122509002686, "logps/rejected": -3.60078763961792, "loss": 4.1804, "rewards/accuracies": 0.75, "rewards/chosen": -31.665122985839844, "rewards/margins": 4.342754364013672, "rewards/rejected": -36.007877349853516, "step": 2091 }, { "epoch": 0.2848583877995643, "grad_norm": 43.1231189729355, "learning_rate": 7.19627672046397e-07, "logits/chosen": 10.282855033874512, "logits/rejected": 10.83995532989502, "logps/chosen": -3.348529815673828, "logps/rejected": -3.4631800651550293, "loss": 3.6761, "rewards/accuracies": 0.5, "rewards/chosen": -33.48529815673828, "rewards/margins": 1.1465015411376953, "rewards/rejected": -34.631797790527344, "step": 2092 }, { "epoch": 0.2849945533769063, "grad_norm": 45.48480648010974, "learning_rate": 7.195133161746675e-07, "logits/chosen": 11.396958351135254, "logits/rejected": 11.58237361907959, "logps/chosen": -3.1105504035949707, "logps/rejected": -3.383054733276367, "loss": 3.6241, "rewards/accuracies": 0.75, "rewards/chosen": -31.10550308227539, "rewards/margins": 2.725043773651123, "rewards/rejected": -33.83054733276367, "step": 2093 }, { "epoch": 0.28513071895424835, "grad_norm": 84.25720642658163, "learning_rate": 7.19398888106263e-07, "logits/chosen": 11.061416625976562, "logits/rejected": 10.781536102294922, "logps/chosen": -2.9572033882141113, "logps/rejected": -2.697122573852539, "loss": 4.4527, "rewards/accuracies": 0.25, "rewards/chosen": -29.572032928466797, "rewards/margins": -2.6008076667785645, "rewards/rejected": -26.97122573852539, "step": 2094 }, { "epoch": 0.28526688453159044, "grad_norm": 39.935040469894304, "learning_rate": 7.192843878670396e-07, "logits/chosen": 10.059043884277344, "logits/rejected": 10.38717269897461, "logps/chosen": -3.0521674156188965, "logps/rejected": -3.3307814598083496, "loss": 3.9711, "rewards/accuracies": 0.75, "rewards/chosen": -30.52167510986328, "rewards/margins": 2.7861404418945312, "rewards/rejected": -33.30781555175781, "step": 2095 }, { "epoch": 0.28540305010893247, "grad_norm": 40.61893559179885, "learning_rate": 7.191698154828694e-07, "logits/chosen": 11.02426528930664, "logits/rejected": 10.910818099975586, "logps/chosen": -3.145888566970825, "logps/rejected": -3.2198190689086914, "loss": 4.4113, "rewards/accuracies": 0.75, "rewards/chosen": -31.458887100219727, "rewards/margins": 0.7393021583557129, "rewards/rejected": -32.19818878173828, "step": 2096 }, { "epoch": 0.2855392156862745, "grad_norm": 37.991597548152185, "learning_rate": 7.190551709796413e-07, "logits/chosen": 11.358185768127441, "logits/rejected": 11.53367805480957, "logps/chosen": -3.0196428298950195, "logps/rejected": -3.3001182079315186, "loss": 3.8446, "rewards/accuracies": 1.0, "rewards/chosen": -30.196430206298828, "rewards/margins": 2.8047537803649902, "rewards/rejected": -33.001182556152344, "step": 2097 }, { "epoch": 0.2856753812636166, "grad_norm": 40.09452246754646, "learning_rate": 7.189404543832598e-07, "logits/chosen": 10.931800842285156, "logits/rejected": 11.442222595214844, "logps/chosen": -3.064166307449341, "logps/rejected": -3.5294840335845947, "loss": 3.9635, "rewards/accuracies": 1.0, "rewards/chosen": -30.641664505004883, "rewards/margins": 4.653175354003906, "rewards/rejected": -35.294837951660156, "step": 2098 }, { "epoch": 0.2858115468409586, "grad_norm": 37.9527944271494, "learning_rate": 7.188256657196463e-07, "logits/chosen": 10.971050262451172, "logits/rejected": 11.118017196655273, "logps/chosen": -3.44946551322937, "logps/rejected": -3.6371819972991943, "loss": 4.0999, "rewards/accuracies": 0.75, "rewards/chosen": -34.49465560913086, "rewards/margins": 1.8771629333496094, "rewards/rejected": -36.37181854248047, "step": 2099 }, { "epoch": 0.28594771241830064, "grad_norm": 40.20757989477272, "learning_rate": 7.187108050147382e-07, "logits/chosen": 10.846535682678223, "logits/rejected": 11.537679672241211, "logps/chosen": -3.658154010772705, "logps/rejected": -3.6575517654418945, "loss": 4.5742, "rewards/accuracies": 0.25, "rewards/chosen": -36.581539154052734, "rewards/margins": -0.006021976470947266, "rewards/rejected": -36.57551574707031, "step": 2100 }, { "epoch": 0.2860838779956427, "grad_norm": 45.98330829014444, "learning_rate": 7.185958722944893e-07, "logits/chosen": 11.256919860839844, "logits/rejected": 11.649002075195312, "logps/chosen": -3.713988780975342, "logps/rejected": -3.852154493331909, "loss": 4.2945, "rewards/accuracies": 0.75, "rewards/chosen": -37.139888763427734, "rewards/margins": 1.3816585540771484, "rewards/rejected": -38.52154541015625, "step": 2101 }, { "epoch": 0.28622004357298475, "grad_norm": 38.62721486045825, "learning_rate": 7.184808675848693e-07, "logits/chosen": 10.762407302856445, "logits/rejected": 11.205286979675293, "logps/chosen": -3.3836607933044434, "logps/rejected": -3.6304850578308105, "loss": 3.9832, "rewards/accuracies": 0.75, "rewards/chosen": -33.83660888671875, "rewards/margins": 2.4682421684265137, "rewards/rejected": -36.30485153198242, "step": 2102 }, { "epoch": 0.2863562091503268, "grad_norm": 44.3437360540905, "learning_rate": 7.183657909118648e-07, "logits/chosen": 11.001167297363281, "logits/rejected": 11.673547744750977, "logps/chosen": -2.988839626312256, "logps/rejected": -3.295783519744873, "loss": 3.5929, "rewards/accuracies": 0.75, "rewards/chosen": -29.888397216796875, "rewards/margins": 3.0694398880004883, "rewards/rejected": -32.95783615112305, "step": 2103 }, { "epoch": 0.28649237472766886, "grad_norm": 46.67958931003994, "learning_rate": 7.182506423014784e-07, "logits/chosen": 11.250234603881836, "logits/rejected": 11.05894660949707, "logps/chosen": -3.059098720550537, "logps/rejected": -3.2134385108947754, "loss": 3.7946, "rewards/accuracies": 0.75, "rewards/chosen": -30.590986251831055, "rewards/margins": 1.5433998107910156, "rewards/rejected": -32.13438415527344, "step": 2104 }, { "epoch": 0.2866285403050109, "grad_norm": 40.62928013125049, "learning_rate": 7.181354217797285e-07, "logits/chosen": 10.649439811706543, "logits/rejected": 10.65928840637207, "logps/chosen": -2.929161787033081, "logps/rejected": -3.182884693145752, "loss": 3.7713, "rewards/accuracies": 0.75, "rewards/chosen": -29.29161834716797, "rewards/margins": 2.5372276306152344, "rewards/rejected": -31.828845977783203, "step": 2105 }, { "epoch": 0.2867647058823529, "grad_norm": 42.30584955947376, "learning_rate": 7.180201293726503e-07, "logits/chosen": 11.838250160217285, "logits/rejected": 11.610699653625488, "logps/chosen": -3.278538703918457, "logps/rejected": -3.656475782394409, "loss": 4.4606, "rewards/accuracies": 0.75, "rewards/chosen": -32.78538513183594, "rewards/margins": 3.7793712615966797, "rewards/rejected": -36.56475830078125, "step": 2106 }, { "epoch": 0.286900871459695, "grad_norm": 46.43907182625571, "learning_rate": 7.179047651062951e-07, "logits/chosen": 11.277458190917969, "logits/rejected": 10.458108901977539, "logps/chosen": -3.1877803802490234, "logps/rejected": -2.920100688934326, "loss": 4.8021, "rewards/accuracies": 0.25, "rewards/chosen": -31.877803802490234, "rewards/margins": -2.676797389984131, "rewards/rejected": -29.20100975036621, "step": 2107 }, { "epoch": 0.28703703703703703, "grad_norm": 40.75133090554133, "learning_rate": 7.177893290067304e-07, "logits/chosen": 10.12508773803711, "logits/rejected": 11.908807754516602, "logps/chosen": -3.13189959526062, "logps/rejected": -3.77121901512146, "loss": 3.7768, "rewards/accuracies": 1.0, "rewards/chosen": -31.31899642944336, "rewards/margins": 6.393194675445557, "rewards/rejected": -37.712188720703125, "step": 2108 }, { "epoch": 0.28717320261437906, "grad_norm": 43.44663638333955, "learning_rate": 7.176738211000399e-07, "logits/chosen": 10.83641242980957, "logits/rejected": 11.517208099365234, "logps/chosen": -3.3669145107269287, "logps/rejected": -3.6483426094055176, "loss": 4.0651, "rewards/accuracies": 0.75, "rewards/chosen": -33.66914367675781, "rewards/margins": 2.8142824172973633, "rewards/rejected": -36.483428955078125, "step": 2109 }, { "epoch": 0.28730936819172115, "grad_norm": 50.35339774349421, "learning_rate": 7.175582414123237e-07, "logits/chosen": 10.611328125, "logits/rejected": 11.367691040039062, "logps/chosen": -3.416989803314209, "logps/rejected": -3.522331714630127, "loss": 3.6118, "rewards/accuracies": 0.75, "rewards/chosen": -34.169898986816406, "rewards/margins": 1.053421974182129, "rewards/rejected": -35.22331619262695, "step": 2110 }, { "epoch": 0.2874455337690632, "grad_norm": 49.33033412462002, "learning_rate": 7.174425899696978e-07, "logits/chosen": 9.900920867919922, "logits/rejected": 10.965543746948242, "logps/chosen": -3.1775014400482178, "logps/rejected": -3.145343542098999, "loss": 4.3997, "rewards/accuracies": 0.5, "rewards/chosen": -31.77501678466797, "rewards/margins": -0.3215808868408203, "rewards/rejected": -31.453433990478516, "step": 2111 }, { "epoch": 0.2875816993464052, "grad_norm": 44.02319607253423, "learning_rate": 7.173268667982947e-07, "logits/chosen": 11.298973083496094, "logits/rejected": 11.443231582641602, "logps/chosen": -3.231041193008423, "logps/rejected": -3.3207602500915527, "loss": 4.2348, "rewards/accuracies": 0.5, "rewards/chosen": -32.31040954589844, "rewards/margins": 0.8971924781799316, "rewards/rejected": -33.207603454589844, "step": 2112 }, { "epoch": 0.2877178649237473, "grad_norm": 35.81548956301975, "learning_rate": 7.172110719242631e-07, "logits/chosen": 10.548263549804688, "logits/rejected": 10.789844512939453, "logps/chosen": -3.475719928741455, "logps/rejected": -3.2922251224517822, "loss": 3.8841, "rewards/accuracies": 0.5, "rewards/chosen": -34.7572021484375, "rewards/margins": -1.8349475860595703, "rewards/rejected": -32.9222526550293, "step": 2113 }, { "epoch": 0.2878540305010893, "grad_norm": 40.530308579511484, "learning_rate": 7.170952053737676e-07, "logits/chosen": 10.952919006347656, "logits/rejected": 11.499550819396973, "logps/chosen": -3.3272902965545654, "logps/rejected": -3.381361961364746, "loss": 4.2919, "rewards/accuracies": 0.75, "rewards/chosen": -33.27290344238281, "rewards/margins": 0.5407166481018066, "rewards/rejected": -33.813621520996094, "step": 2114 }, { "epoch": 0.28799019607843135, "grad_norm": 41.65007288395572, "learning_rate": 7.169792671729894e-07, "logits/chosen": 11.20849609375, "logits/rejected": 10.897176742553711, "logps/chosen": -3.5355281829833984, "logps/rejected": -3.811264753341675, "loss": 3.9605, "rewards/accuracies": 0.75, "rewards/chosen": -35.355281829833984, "rewards/margins": 2.7573652267456055, "rewards/rejected": -38.112648010253906, "step": 2115 }, { "epoch": 0.28812636165577343, "grad_norm": 44.54434878906131, "learning_rate": 7.168632573481255e-07, "logits/chosen": 11.239093780517578, "logits/rejected": 11.79705810546875, "logps/chosen": -3.644123077392578, "logps/rejected": -3.540849447250366, "loss": 3.8597, "rewards/accuracies": 0.25, "rewards/chosen": -36.44123077392578, "rewards/margins": -1.0327339172363281, "rewards/rejected": -35.40849685668945, "step": 2116 }, { "epoch": 0.28826252723311546, "grad_norm": 41.45129858541342, "learning_rate": 7.167471759253894e-07, "logits/chosen": 11.794126510620117, "logits/rejected": 11.200887680053711, "logps/chosen": -3.4421396255493164, "logps/rejected": -3.532404899597168, "loss": 4.4109, "rewards/accuracies": 0.5, "rewards/chosen": -34.42139434814453, "rewards/margins": 0.9026503562927246, "rewards/rejected": -35.32404708862305, "step": 2117 }, { "epoch": 0.2883986928104575, "grad_norm": 43.32402172683255, "learning_rate": 7.166310229310107e-07, "logits/chosen": 10.356952667236328, "logits/rejected": 10.603716850280762, "logps/chosen": -3.0653786659240723, "logps/rejected": -3.299051284790039, "loss": 3.598, "rewards/accuracies": 0.75, "rewards/chosen": -30.653785705566406, "rewards/margins": 2.3367271423339844, "rewards/rejected": -32.990509033203125, "step": 2118 }, { "epoch": 0.2885348583877996, "grad_norm": 45.476711983061875, "learning_rate": 7.16514798391235e-07, "logits/chosen": 12.35400676727295, "logits/rejected": 12.997810363769531, "logps/chosen": -3.735137939453125, "logps/rejected": -3.9054336547851562, "loss": 4.0972, "rewards/accuracies": 0.5, "rewards/chosen": -37.35137939453125, "rewards/margins": 1.702956199645996, "rewards/rejected": -39.05433654785156, "step": 2119 }, { "epoch": 0.2886710239651416, "grad_norm": 50.125992707776746, "learning_rate": 7.163985023323244e-07, "logits/chosen": 11.199941635131836, "logits/rejected": 11.606159210205078, "logps/chosen": -3.713608980178833, "logps/rejected": -3.8456406593322754, "loss": 4.4564, "rewards/accuracies": 0.5, "rewards/chosen": -37.13608932495117, "rewards/margins": 1.3203163146972656, "rewards/rejected": -38.45640563964844, "step": 2120 }, { "epoch": 0.28880718954248363, "grad_norm": 44.3339341789182, "learning_rate": 7.162821347805567e-07, "logits/chosen": 11.751530647277832, "logits/rejected": 12.509910583496094, "logps/chosen": -3.7359678745269775, "logps/rejected": -3.7259879112243652, "loss": 3.9288, "rewards/accuracies": 0.25, "rewards/chosen": -37.35968017578125, "rewards/margins": -0.09980010986328125, "rewards/rejected": -37.25988006591797, "step": 2121 }, { "epoch": 0.2889433551198257, "grad_norm": 40.473237072632685, "learning_rate": 7.161656957622263e-07, "logits/chosen": 11.094980239868164, "logits/rejected": 10.696342468261719, "logps/chosen": -3.3093819618225098, "logps/rejected": -3.2783865928649902, "loss": 3.8546, "rewards/accuracies": 0.25, "rewards/chosen": -33.09381866455078, "rewards/margins": -0.3099541664123535, "rewards/rejected": -32.78386688232422, "step": 2122 }, { "epoch": 0.28907952069716775, "grad_norm": 58.34540205620165, "learning_rate": 7.160491853036434e-07, "logits/chosen": 10.842863082885742, "logits/rejected": 11.570045471191406, "logps/chosen": -3.190286874771118, "logps/rejected": -3.415738821029663, "loss": 4.5531, "rewards/accuracies": 1.0, "rewards/chosen": -31.902868270874023, "rewards/margins": 2.254519462585449, "rewards/rejected": -34.157386779785156, "step": 2123 }, { "epoch": 0.28921568627450983, "grad_norm": 38.36678001683997, "learning_rate": 7.159326034311347e-07, "logits/chosen": 11.34926700592041, "logits/rejected": 12.712688446044922, "logps/chosen": -3.3583078384399414, "logps/rejected": -3.5806188583374023, "loss": 3.7222, "rewards/accuracies": 0.75, "rewards/chosen": -33.58307647705078, "rewards/margins": 2.223109245300293, "rewards/rejected": -35.806190490722656, "step": 2124 }, { "epoch": 0.28935185185185186, "grad_norm": 39.03524317913246, "learning_rate": 7.158159501710426e-07, "logits/chosen": 10.847206115722656, "logits/rejected": 12.364921569824219, "logps/chosen": -3.4453086853027344, "logps/rejected": -3.6644835472106934, "loss": 4.035, "rewards/accuracies": 0.75, "rewards/chosen": -34.45309066772461, "rewards/margins": 2.191743850708008, "rewards/rejected": -36.644832611083984, "step": 2125 }, { "epoch": 0.2894880174291939, "grad_norm": 53.56320087865002, "learning_rate": 7.156992255497261e-07, "logits/chosen": 11.457379341125488, "logits/rejected": 11.983804702758789, "logps/chosen": -3.436474323272705, "logps/rejected": -3.731562376022339, "loss": 4.3517, "rewards/accuracies": 1.0, "rewards/chosen": -34.364742279052734, "rewards/margins": 2.950881004333496, "rewards/rejected": -37.31562042236328, "step": 2126 }, { "epoch": 0.289624183006536, "grad_norm": 36.155372460448966, "learning_rate": 7.155824295935599e-07, "logits/chosen": 11.269749641418457, "logits/rejected": 12.530998229980469, "logps/chosen": -3.483631134033203, "logps/rejected": -4.0015459060668945, "loss": 3.9947, "rewards/accuracies": 1.0, "rewards/chosen": -34.83631134033203, "rewards/margins": 5.17915153503418, "rewards/rejected": -40.01546096801758, "step": 2127 }, { "epoch": 0.289760348583878, "grad_norm": 43.21649829042449, "learning_rate": 7.154655623289353e-07, "logits/chosen": 11.416069030761719, "logits/rejected": 12.823644638061523, "logps/chosen": -3.3960719108581543, "logps/rejected": -3.823397397994995, "loss": 3.7906, "rewards/accuracies": 1.0, "rewards/chosen": -33.96072006225586, "rewards/margins": 4.27325439453125, "rewards/rejected": -38.233970642089844, "step": 2128 }, { "epoch": 0.28989651416122003, "grad_norm": 40.96229932038949, "learning_rate": 7.15348623782259e-07, "logits/chosen": 11.88625717163086, "logits/rejected": 12.017841339111328, "logps/chosen": -3.4861111640930176, "logps/rejected": -3.618159770965576, "loss": 3.9557, "rewards/accuracies": 0.25, "rewards/chosen": -34.86111068725586, "rewards/margins": 1.3204870223999023, "rewards/rejected": -36.18159866333008, "step": 2129 }, { "epoch": 0.2900326797385621, "grad_norm": 38.73264081881514, "learning_rate": 7.152316139799545e-07, "logits/chosen": 12.828542709350586, "logits/rejected": 12.15339183807373, "logps/chosen": -3.9872381687164307, "logps/rejected": -4.150778293609619, "loss": 3.5664, "rewards/accuracies": 0.75, "rewards/chosen": -39.87238311767578, "rewards/margins": 1.6354026794433594, "rewards/rejected": -41.507781982421875, "step": 2130 }, { "epoch": 0.29016884531590414, "grad_norm": 47.229036034959634, "learning_rate": 7.151145329484612e-07, "logits/chosen": 10.766645431518555, "logits/rejected": 10.520637512207031, "logps/chosen": -3.257369041442871, "logps/rejected": -3.1288838386535645, "loss": 4.4022, "rewards/accuracies": 0.5, "rewards/chosen": -32.573692321777344, "rewards/margins": -1.2848520278930664, "rewards/rejected": -31.288837432861328, "step": 2131 }, { "epoch": 0.2903050108932462, "grad_norm": 41.10027927288226, "learning_rate": 7.149973807142343e-07, "logits/chosen": 11.086037635803223, "logits/rejected": 12.158252716064453, "logps/chosen": -3.131845712661743, "logps/rejected": -3.3758442401885986, "loss": 3.9751, "rewards/accuracies": 1.0, "rewards/chosen": -31.318456649780273, "rewards/margins": 2.4399852752685547, "rewards/rejected": -33.75844192504883, "step": 2132 }, { "epoch": 0.29044117647058826, "grad_norm": 50.289292026046944, "learning_rate": 7.148801573037454e-07, "logits/chosen": 11.642463684082031, "logits/rejected": 11.439263343811035, "logps/chosen": -3.042816162109375, "logps/rejected": -3.2840805053710938, "loss": 4.5912, "rewards/accuracies": 0.75, "rewards/chosen": -30.42816162109375, "rewards/margins": 2.412642002105713, "rewards/rejected": -32.84080505371094, "step": 2133 }, { "epoch": 0.2905773420479303, "grad_norm": 45.25085210956221, "learning_rate": 7.147628627434823e-07, "logits/chosen": 11.402843475341797, "logits/rejected": 12.113582611083984, "logps/chosen": -3.4467804431915283, "logps/rejected": -3.5084228515625, "loss": 4.1333, "rewards/accuracies": 0.75, "rewards/chosen": -34.467803955078125, "rewards/margins": 0.6164255142211914, "rewards/rejected": -35.084228515625, "step": 2134 }, { "epoch": 0.2907135076252723, "grad_norm": 49.06181900521971, "learning_rate": 7.146454970599484e-07, "logits/chosen": 10.711010932922363, "logits/rejected": 11.70937728881836, "logps/chosen": -3.204557418823242, "logps/rejected": -3.614570140838623, "loss": 4.2063, "rewards/accuracies": 1.0, "rewards/chosen": -32.04557418823242, "rewards/margins": 4.100127696990967, "rewards/rejected": -36.14569854736328, "step": 2135 }, { "epoch": 0.2908496732026144, "grad_norm": 44.55102060554325, "learning_rate": 7.145280602796636e-07, "logits/chosen": 10.777437210083008, "logits/rejected": 10.906341552734375, "logps/chosen": -3.235637664794922, "logps/rejected": -3.2737081050872803, "loss": 3.6231, "rewards/accuracies": 0.5, "rewards/chosen": -32.35637664794922, "rewards/margins": 0.3807034492492676, "rewards/rejected": -32.73707962036133, "step": 2136 }, { "epoch": 0.29098583877995643, "grad_norm": 52.38537619170586, "learning_rate": 7.144105524291637e-07, "logits/chosen": 11.575006484985352, "logits/rejected": 11.466283798217773, "logps/chosen": -3.5028395652770996, "logps/rejected": -3.6821165084838867, "loss": 3.6343, "rewards/accuracies": 0.75, "rewards/chosen": -35.02839660644531, "rewards/margins": 1.7927699089050293, "rewards/rejected": -36.8211669921875, "step": 2137 }, { "epoch": 0.29112200435729846, "grad_norm": 42.85182582991005, "learning_rate": 7.142929735350005e-07, "logits/chosen": 10.946100234985352, "logits/rejected": 11.44464111328125, "logps/chosen": -3.844944477081299, "logps/rejected": -3.6457223892211914, "loss": 4.4804, "rewards/accuracies": 0.5, "rewards/chosen": -38.44944763183594, "rewards/margins": -1.9922208786010742, "rewards/rejected": -36.45722579956055, "step": 2138 }, { "epoch": 0.29125816993464054, "grad_norm": 45.40559741457608, "learning_rate": 7.141753236237419e-07, "logits/chosen": 12.11819076538086, "logits/rejected": 11.948097229003906, "logps/chosen": -3.4418764114379883, "logps/rejected": -3.343069314956665, "loss": 4.1474, "rewards/accuracies": 0.25, "rewards/chosen": -34.41876220703125, "rewards/margins": -0.9880695343017578, "rewards/rejected": -33.430694580078125, "step": 2139 }, { "epoch": 0.29139433551198257, "grad_norm": 48.603123687258204, "learning_rate": 7.140576027219719e-07, "logits/chosen": 10.89438247680664, "logits/rejected": 11.164520263671875, "logps/chosen": -3.4657645225524902, "logps/rejected": -3.689305305480957, "loss": 4.1121, "rewards/accuracies": 1.0, "rewards/chosen": -34.65764617919922, "rewards/margins": 2.235405445098877, "rewards/rejected": -36.89305114746094, "step": 2140 }, { "epoch": 0.2915305010893246, "grad_norm": 54.77500803436561, "learning_rate": 7.139398108562906e-07, "logits/chosen": 11.120420455932617, "logits/rejected": 12.02096939086914, "logps/chosen": -3.5398905277252197, "logps/rejected": -3.563241481781006, "loss": 4.3371, "rewards/accuracies": 0.75, "rewards/chosen": -35.39890670776367, "rewards/margins": 0.23350811004638672, "rewards/rejected": -35.632415771484375, "step": 2141 }, { "epoch": 0.2916666666666667, "grad_norm": 47.692572091555164, "learning_rate": 7.13821948053314e-07, "logits/chosen": 12.594966888427734, "logits/rejected": 11.532684326171875, "logps/chosen": -3.5656914710998535, "logps/rejected": -3.467740535736084, "loss": 4.4285, "rewards/accuracies": 0.5, "rewards/chosen": -35.65691375732422, "rewards/margins": -0.9795083999633789, "rewards/rejected": -34.677406311035156, "step": 2142 }, { "epoch": 0.2918028322440087, "grad_norm": 46.56652214306675, "learning_rate": 7.137040143396742e-07, "logits/chosen": 11.25766658782959, "logits/rejected": 11.89864730834961, "logps/chosen": -3.1605567932128906, "logps/rejected": -3.397843837738037, "loss": 4.0617, "rewards/accuracies": 1.0, "rewards/chosen": -31.605566024780273, "rewards/margins": 2.372870922088623, "rewards/rejected": -33.97843933105469, "step": 2143 }, { "epoch": 0.29193899782135074, "grad_norm": 52.95753286814193, "learning_rate": 7.135860097420192e-07, "logits/chosen": 11.96424388885498, "logits/rejected": 11.803581237792969, "logps/chosen": -3.366396188735962, "logps/rejected": -3.7466182708740234, "loss": 4.6047, "rewards/accuracies": 0.5, "rewards/chosen": -33.663963317871094, "rewards/margins": 3.8022193908691406, "rewards/rejected": -37.46617889404297, "step": 2144 }, { "epoch": 0.2920751633986928, "grad_norm": 56.064768433718235, "learning_rate": 7.134679342870133e-07, "logits/chosen": 11.211599349975586, "logits/rejected": 10.33771800994873, "logps/chosen": -3.250732898712158, "logps/rejected": -3.108293056488037, "loss": 4.4753, "rewards/accuracies": 0.25, "rewards/chosen": -32.50733184814453, "rewards/margins": -1.4243974685668945, "rewards/rejected": -31.082931518554688, "step": 2145 }, { "epoch": 0.29221132897603486, "grad_norm": 45.89628170730563, "learning_rate": 7.133497880013363e-07, "logits/chosen": 11.874767303466797, "logits/rejected": 12.426628112792969, "logps/chosen": -3.532115936279297, "logps/rejected": -3.8041887283325195, "loss": 4.1372, "rewards/accuracies": 0.75, "rewards/chosen": -35.32115936279297, "rewards/margins": 2.72072696685791, "rewards/rejected": -38.04188537597656, "step": 2146 }, { "epoch": 0.2923474945533769, "grad_norm": 47.63403554066358, "learning_rate": 7.132315709116845e-07, "logits/chosen": 12.418825149536133, "logits/rejected": 11.789264678955078, "logps/chosen": -3.6227707862854004, "logps/rejected": -3.7522311210632324, "loss": 4.5453, "rewards/accuracies": 0.75, "rewards/chosen": -36.22770690917969, "rewards/margins": 1.2946043014526367, "rewards/rejected": -37.52231216430664, "step": 2147 }, { "epoch": 0.29248366013071897, "grad_norm": 45.15949777793223, "learning_rate": 7.131132830447703e-07, "logits/chosen": 11.361207962036133, "logits/rejected": 12.378408432006836, "logps/chosen": -3.38693904876709, "logps/rejected": -3.94362473487854, "loss": 3.5973, "rewards/accuracies": 1.0, "rewards/chosen": -33.86939239501953, "rewards/margins": 5.566855430603027, "rewards/rejected": -39.43624496459961, "step": 2148 }, { "epoch": 0.292619825708061, "grad_norm": 45.70704697398613, "learning_rate": 7.129949244273212e-07, "logits/chosen": 12.804498672485352, "logits/rejected": 12.013507843017578, "logps/chosen": -3.862321376800537, "logps/rejected": -3.44283127784729, "loss": 4.267, "rewards/accuracies": 0.25, "rewards/chosen": -38.62321472167969, "rewards/margins": -4.19490385055542, "rewards/rejected": -34.428314208984375, "step": 2149 }, { "epoch": 0.292755991285403, "grad_norm": 46.81460082993493, "learning_rate": 7.128764950860819e-07, "logits/chosen": 12.653380393981934, "logits/rejected": 12.528322219848633, "logps/chosen": -3.5078303813934326, "logps/rejected": -3.7273988723754883, "loss": 4.0745, "rewards/accuracies": 0.75, "rewards/chosen": -35.078304290771484, "rewards/margins": 2.195683479309082, "rewards/rejected": -37.27398681640625, "step": 2150 }, { "epoch": 0.2928921568627451, "grad_norm": 45.34105041044355, "learning_rate": 7.127579950478123e-07, "logits/chosen": 11.664617538452148, "logits/rejected": 11.49687671661377, "logps/chosen": -3.337035655975342, "logps/rejected": -3.430974245071411, "loss": 3.7755, "rewards/accuracies": 0.25, "rewards/chosen": -33.370357513427734, "rewards/margins": 0.9393863677978516, "rewards/rejected": -34.30974197387695, "step": 2151 }, { "epoch": 0.29302832244008714, "grad_norm": 49.495633886325145, "learning_rate": 7.126394243392885e-07, "logits/chosen": 12.303411483764648, "logits/rejected": 12.802412986755371, "logps/chosen": -3.4318413734436035, "logps/rejected": -3.404146432876587, "loss": 4.3819, "rewards/accuracies": 0.5, "rewards/chosen": -34.31841278076172, "rewards/margins": -0.2769498825073242, "rewards/rejected": -34.041465759277344, "step": 2152 }, { "epoch": 0.29316448801742917, "grad_norm": 44.078238999988415, "learning_rate": 7.125207829873023e-07, "logits/chosen": 12.871500015258789, "logits/rejected": 12.317182540893555, "logps/chosen": -3.4808568954467773, "logps/rejected": -3.853529214859009, "loss": 3.9408, "rewards/accuracies": 0.75, "rewards/chosen": -34.808570861816406, "rewards/margins": 3.7267212867736816, "rewards/rejected": -38.5352897644043, "step": 2153 }, { "epoch": 0.29330065359477125, "grad_norm": 44.00788280385512, "learning_rate": 7.12402071018662e-07, "logits/chosen": 12.19473648071289, "logits/rejected": 12.62335205078125, "logps/chosen": -3.9673571586608887, "logps/rejected": -4.055758953094482, "loss": 4.1928, "rewards/accuracies": 0.5, "rewards/chosen": -39.67356872558594, "rewards/margins": 0.8840174674987793, "rewards/rejected": -40.557586669921875, "step": 2154 }, { "epoch": 0.2934368191721133, "grad_norm": 49.460013735773195, "learning_rate": 7.122832884601914e-07, "logits/chosen": 12.352034568786621, "logits/rejected": 12.755623817443848, "logps/chosen": -3.8285696506500244, "logps/rejected": -4.102464199066162, "loss": 3.5355, "rewards/accuracies": 1.0, "rewards/chosen": -38.28569412231445, "rewards/margins": 2.738945960998535, "rewards/rejected": -41.02464294433594, "step": 2155 }, { "epoch": 0.2935729847494553, "grad_norm": 46.408235508724324, "learning_rate": 7.121644353387303e-07, "logits/chosen": 11.41915512084961, "logits/rejected": 11.6802978515625, "logps/chosen": -3.1663310527801514, "logps/rejected": -3.555176258087158, "loss": 3.2526, "rewards/accuracies": 0.75, "rewards/chosen": -31.663311004638672, "rewards/margins": 3.8884520530700684, "rewards/rejected": -35.55176544189453, "step": 2156 }, { "epoch": 0.2937091503267974, "grad_norm": 44.201097683793854, "learning_rate": 7.120455116811347e-07, "logits/chosen": 12.784749031066895, "logits/rejected": 12.211257934570312, "logps/chosen": -3.4596071243286133, "logps/rejected": -3.5091638565063477, "loss": 4.1117, "rewards/accuracies": 0.5, "rewards/chosen": -34.596073150634766, "rewards/margins": 0.49556446075439453, "rewards/rejected": -35.091636657714844, "step": 2157 }, { "epoch": 0.2938453159041394, "grad_norm": 46.038703273891905, "learning_rate": 7.119265175142764e-07, "logits/chosen": 10.517646789550781, "logits/rejected": 12.484223365783691, "logps/chosen": -2.9328665733337402, "logps/rejected": -3.583615779876709, "loss": 3.7401, "rewards/accuracies": 0.75, "rewards/chosen": -29.328664779663086, "rewards/margins": 6.507493495941162, "rewards/rejected": -35.836158752441406, "step": 2158 }, { "epoch": 0.29398148148148145, "grad_norm": 51.942225282433455, "learning_rate": 7.11807452865043e-07, "logits/chosen": 11.619638442993164, "logits/rejected": 12.102627754211426, "logps/chosen": -3.5674490928649902, "logps/rejected": -3.8770294189453125, "loss": 3.999, "rewards/accuracies": 1.0, "rewards/chosen": -35.67449188232422, "rewards/margins": 3.0958008766174316, "rewards/rejected": -38.77029037475586, "step": 2159 }, { "epoch": 0.29411764705882354, "grad_norm": 42.610836592750545, "learning_rate": 7.116883177603383e-07, "logits/chosen": 10.194604873657227, "logits/rejected": 12.30607795715332, "logps/chosen": -3.126772880554199, "logps/rejected": -3.5482659339904785, "loss": 4.4747, "rewards/accuracies": 1.0, "rewards/chosen": -31.26772689819336, "rewards/margins": 4.214932441711426, "rewards/rejected": -35.48265838623047, "step": 2160 }, { "epoch": 0.29425381263616557, "grad_norm": 43.18589464020942, "learning_rate": 7.115691122270817e-07, "logits/chosen": 11.440234184265137, "logits/rejected": 11.926935195922852, "logps/chosen": -3.32841420173645, "logps/rejected": -3.365993022918701, "loss": 3.9964, "rewards/accuracies": 0.5, "rewards/chosen": -33.284141540527344, "rewards/margins": 0.37578773498535156, "rewards/rejected": -33.65993118286133, "step": 2161 }, { "epoch": 0.29438997821350765, "grad_norm": 43.71554158864352, "learning_rate": 7.114498362922086e-07, "logits/chosen": 11.102545738220215, "logits/rejected": 11.498800277709961, "logps/chosen": -3.1871604919433594, "logps/rejected": -3.3642783164978027, "loss": 4.2567, "rewards/accuracies": 1.0, "rewards/chosen": -31.87160301208496, "rewards/margins": 1.7711796760559082, "rewards/rejected": -33.642784118652344, "step": 2162 }, { "epoch": 0.2945261437908497, "grad_norm": 46.109026205586446, "learning_rate": 7.113304899826707e-07, "logits/chosen": 11.275935173034668, "logits/rejected": 12.109867095947266, "logps/chosen": -3.536393642425537, "logps/rejected": -3.8086400032043457, "loss": 4.5567, "rewards/accuracies": 0.5, "rewards/chosen": -35.36393737792969, "rewards/margins": 2.7224631309509277, "rewards/rejected": -38.086402893066406, "step": 2163 }, { "epoch": 0.2946623093681917, "grad_norm": 40.906832512035194, "learning_rate": 7.11211073325435e-07, "logits/chosen": 11.785430908203125, "logits/rejected": 11.600617408752441, "logps/chosen": -3.444911241531372, "logps/rejected": -3.5995442867279053, "loss": 3.74, "rewards/accuracies": 0.75, "rewards/chosen": -34.44911193847656, "rewards/margins": 1.546330451965332, "rewards/rejected": -35.995445251464844, "step": 2164 }, { "epoch": 0.2947984749455338, "grad_norm": 51.80727469521673, "learning_rate": 7.110915863474849e-07, "logits/chosen": 11.936923027038574, "logits/rejected": 12.082818984985352, "logps/chosen": -3.600212335586548, "logps/rejected": -3.597484588623047, "loss": 4.1615, "rewards/accuracies": 0.5, "rewards/chosen": -36.00212097167969, "rewards/margins": -0.02727508544921875, "rewards/rejected": -35.97484588623047, "step": 2165 }, { "epoch": 0.2949346405228758, "grad_norm": 48.012298030073914, "learning_rate": 7.109720290758192e-07, "logits/chosen": 11.420793533325195, "logits/rejected": 11.905410766601562, "logps/chosen": -3.1518921852111816, "logps/rejected": -3.436237335205078, "loss": 3.5018, "rewards/accuracies": 1.0, "rewards/chosen": -31.5189208984375, "rewards/margins": 2.8434529304504395, "rewards/rejected": -34.36237335205078, "step": 2166 }, { "epoch": 0.29507080610021785, "grad_norm": 40.14564854960349, "learning_rate": 7.108524015374531e-07, "logits/chosen": 12.557037353515625, "logits/rejected": 13.00014877319336, "logps/chosen": -3.558926582336426, "logps/rejected": -3.5187368392944336, "loss": 4.0431, "rewards/accuracies": 0.5, "rewards/chosen": -35.589263916015625, "rewards/margins": -0.4018983840942383, "rewards/rejected": -35.18737030029297, "step": 2167 }, { "epoch": 0.29520697167755994, "grad_norm": 42.36622872236855, "learning_rate": 7.107327037594173e-07, "logits/chosen": 11.8047456741333, "logits/rejected": 11.80318832397461, "logps/chosen": -3.576827049255371, "logps/rejected": -3.4307596683502197, "loss": 4.0474, "rewards/accuracies": 0.25, "rewards/chosen": -35.768272399902344, "rewards/margins": -1.4606742858886719, "rewards/rejected": -34.30759811401367, "step": 2168 }, { "epoch": 0.29534313725490197, "grad_norm": 42.63128644058613, "learning_rate": 7.106129357687586e-07, "logits/chosen": 12.360738754272461, "logits/rejected": 12.144678115844727, "logps/chosen": -3.754896879196167, "logps/rejected": -3.880734443664551, "loss": 3.9096, "rewards/accuracies": 0.75, "rewards/chosen": -37.54896926879883, "rewards/margins": 1.258376121520996, "rewards/rejected": -38.80734634399414, "step": 2169 }, { "epoch": 0.295479302832244, "grad_norm": 44.95685833819224, "learning_rate": 7.104930975925395e-07, "logits/chosen": 12.4281644821167, "logits/rejected": 12.515270233154297, "logps/chosen": -3.617206335067749, "logps/rejected": -3.7633137702941895, "loss": 3.9026, "rewards/accuracies": 0.75, "rewards/chosen": -36.17206573486328, "rewards/margins": 1.4610743522644043, "rewards/rejected": -37.633140563964844, "step": 2170 }, { "epoch": 0.2956154684095861, "grad_norm": 42.967157427133316, "learning_rate": 7.103731892578384e-07, "logits/chosen": 10.850759506225586, "logits/rejected": 12.081355094909668, "logps/chosen": -2.603253126144409, "logps/rejected": -3.1747658252716064, "loss": 4.0753, "rewards/accuracies": 1.0, "rewards/chosen": -26.03253173828125, "rewards/margins": 5.715126991271973, "rewards/rejected": -31.747657775878906, "step": 2171 }, { "epoch": 0.2957516339869281, "grad_norm": 45.333921898281126, "learning_rate": 7.102532107917496e-07, "logits/chosen": 10.86027717590332, "logits/rejected": 10.66284465789795, "logps/chosen": -3.0855281352996826, "logps/rejected": -3.3331973552703857, "loss": 3.6241, "rewards/accuracies": 0.75, "rewards/chosen": -30.855281829833984, "rewards/margins": 2.4766921997070312, "rewards/rejected": -33.331974029541016, "step": 2172 }, { "epoch": 0.29588779956427014, "grad_norm": 39.985837687037005, "learning_rate": 7.101331622213833e-07, "logits/chosen": 11.075302124023438, "logits/rejected": 13.735092163085938, "logps/chosen": -2.992584466934204, "logps/rejected": -3.570448875427246, "loss": 3.6788, "rewards/accuracies": 1.0, "rewards/chosen": -29.925846099853516, "rewards/margins": 5.778644561767578, "rewards/rejected": -35.704490661621094, "step": 2173 }, { "epoch": 0.2960239651416122, "grad_norm": 41.31671515432498, "learning_rate": 7.100130435738654e-07, "logits/chosen": 11.989072799682617, "logits/rejected": 12.259971618652344, "logps/chosen": -3.3406903743743896, "logps/rejected": -3.391403913497925, "loss": 4.0529, "rewards/accuracies": 0.25, "rewards/chosen": -33.40690231323242, "rewards/margins": 0.5071368217468262, "rewards/rejected": -33.914039611816406, "step": 2174 }, { "epoch": 0.29616013071895425, "grad_norm": 46.68631491192007, "learning_rate": 7.098928548763377e-07, "logits/chosen": 11.383489608764648, "logits/rejected": 12.190607070922852, "logps/chosen": -3.002804756164551, "logps/rejected": -3.4143166542053223, "loss": 4.4155, "rewards/accuracies": 1.0, "rewards/chosen": -30.028047561645508, "rewards/margins": 4.115119457244873, "rewards/rejected": -34.143165588378906, "step": 2175 }, { "epoch": 0.2962962962962963, "grad_norm": 42.33969737633237, "learning_rate": 7.097725961559579e-07, "logits/chosen": 13.080949783325195, "logits/rejected": 13.050538063049316, "logps/chosen": -3.5311245918273926, "logps/rejected": -3.427778720855713, "loss": 4.6189, "rewards/accuracies": 0.25, "rewards/chosen": -35.31124496459961, "rewards/margins": -1.0334596633911133, "rewards/rejected": -34.27778625488281, "step": 2176 }, { "epoch": 0.29643246187363836, "grad_norm": 44.36995906807047, "learning_rate": 7.096522674398993e-07, "logits/chosen": 11.507636070251465, "logits/rejected": 11.346823692321777, "logps/chosen": -3.0728139877319336, "logps/rejected": -3.1799755096435547, "loss": 4.2359, "rewards/accuracies": 0.75, "rewards/chosen": -30.728137969970703, "rewards/margins": 1.0716180801391602, "rewards/rejected": -31.79975700378418, "step": 2177 }, { "epoch": 0.2965686274509804, "grad_norm": 63.09271090158375, "learning_rate": 7.095318687553513e-07, "logits/chosen": 12.530038833618164, "logits/rejected": 12.371360778808594, "logps/chosen": -3.3590505123138428, "logps/rejected": -3.1528306007385254, "loss": 4.2723, "rewards/accuracies": 0.25, "rewards/chosen": -33.59050750732422, "rewards/margins": -2.0622000694274902, "rewards/rejected": -31.528305053710938, "step": 2178 }, { "epoch": 0.2967047930283224, "grad_norm": 44.29253945917983, "learning_rate": 7.094114001295188e-07, "logits/chosen": 11.611980438232422, "logits/rejected": 12.526317596435547, "logps/chosen": -2.9983620643615723, "logps/rejected": -3.8873515129089355, "loss": 3.3999, "rewards/accuracies": 1.0, "rewards/chosen": -29.983619689941406, "rewards/margins": 8.889894485473633, "rewards/rejected": -38.873512268066406, "step": 2179 }, { "epoch": 0.2968409586056645, "grad_norm": 39.43409189773087, "learning_rate": 7.092908615896231e-07, "logits/chosen": 11.607576370239258, "logits/rejected": 12.073282241821289, "logps/chosen": -2.870530605316162, "logps/rejected": -3.4591407775878906, "loss": 3.7139, "rewards/accuracies": 1.0, "rewards/chosen": -28.705307006835938, "rewards/margins": 5.886101245880127, "rewards/rejected": -34.591407775878906, "step": 2180 }, { "epoch": 0.29697712418300654, "grad_norm": 41.806100359096774, "learning_rate": 7.091702531629003e-07, "logits/chosen": 10.76162338256836, "logits/rejected": 12.565643310546875, "logps/chosen": -3.1876845359802246, "logps/rejected": -3.467766761779785, "loss": 4.1078, "rewards/accuracies": 1.0, "rewards/chosen": -31.876846313476562, "rewards/margins": 2.80082368850708, "rewards/rejected": -34.677669525146484, "step": 2181 }, { "epoch": 0.29711328976034856, "grad_norm": 51.61712459490878, "learning_rate": 7.090495748766035e-07, "logits/chosen": 11.431951522827148, "logits/rejected": 11.966522216796875, "logps/chosen": -3.0160090923309326, "logps/rejected": -3.2594683170318604, "loss": 4.2695, "rewards/accuracies": 0.75, "rewards/chosen": -30.160091400146484, "rewards/margins": 2.434591293334961, "rewards/rejected": -32.59468078613281, "step": 2182 }, { "epoch": 0.29724945533769065, "grad_norm": 45.44312167803479, "learning_rate": 7.089288267580004e-07, "logits/chosen": 11.044692993164062, "logits/rejected": 11.554593086242676, "logps/chosen": -3.3064985275268555, "logps/rejected": -3.452247142791748, "loss": 4.3089, "rewards/accuracies": 0.75, "rewards/chosen": -33.06498718261719, "rewards/margins": 1.4574861526489258, "rewards/rejected": -34.52246856689453, "step": 2183 }, { "epoch": 0.2973856209150327, "grad_norm": 48.294306901106, "learning_rate": 7.088080088343753e-07, "logits/chosen": 12.075957298278809, "logits/rejected": 11.908220291137695, "logps/chosen": -3.191380023956299, "logps/rejected": -3.482211112976074, "loss": 3.8758, "rewards/accuracies": 1.0, "rewards/chosen": -31.913803100585938, "rewards/margins": 2.908308506011963, "rewards/rejected": -34.82210922241211, "step": 2184 }, { "epoch": 0.2975217864923747, "grad_norm": 44.405174476883076, "learning_rate": 7.08687121133028e-07, "logits/chosen": 11.427204132080078, "logits/rejected": 11.93991756439209, "logps/chosen": -3.3625900745391846, "logps/rejected": -3.4591867923736572, "loss": 4.3206, "rewards/accuracies": 0.75, "rewards/chosen": -33.62590026855469, "rewards/margins": 0.9659676551818848, "rewards/rejected": -34.59186935424805, "step": 2185 }, { "epoch": 0.2976579520697168, "grad_norm": 42.29626092652742, "learning_rate": 7.08566163681274e-07, "logits/chosen": 13.001583099365234, "logits/rejected": 12.793737411499023, "logps/chosen": -3.2929744720458984, "logps/rejected": -3.5484490394592285, "loss": 3.7244, "rewards/accuracies": 1.0, "rewards/chosen": -32.92974853515625, "rewards/margins": 2.5547447204589844, "rewards/rejected": -35.48448944091797, "step": 2186 }, { "epoch": 0.2977941176470588, "grad_norm": 44.929909743756475, "learning_rate": 7.084451365064447e-07, "logits/chosen": 11.055093765258789, "logits/rejected": 11.549022674560547, "logps/chosen": -3.3893556594848633, "logps/rejected": -3.343925952911377, "loss": 4.6147, "rewards/accuracies": 0.5, "rewards/chosen": -33.893558502197266, "rewards/margins": -0.4542970657348633, "rewards/rejected": -33.43926239013672, "step": 2187 }, { "epoch": 0.29793028322440085, "grad_norm": 47.42567191461589, "learning_rate": 7.083240396358872e-07, "logits/chosen": 11.346616744995117, "logits/rejected": 10.660161018371582, "logps/chosen": -3.281723976135254, "logps/rejected": -3.4089345932006836, "loss": 4.2714, "rewards/accuracies": 0.75, "rewards/chosen": -32.81724166870117, "rewards/margins": 1.2721080780029297, "rewards/rejected": -34.08934783935547, "step": 2188 }, { "epoch": 0.29806644880174293, "grad_norm": 39.378214425225785, "learning_rate": 7.082028730969643e-07, "logits/chosen": 11.586642265319824, "logits/rejected": 11.195748329162598, "logps/chosen": -3.0004091262817383, "logps/rejected": -3.0681569576263428, "loss": 4.2197, "rewards/accuracies": 0.5, "rewards/chosen": -30.004093170166016, "rewards/margins": 0.6774754524230957, "rewards/rejected": -30.681568145751953, "step": 2189 }, { "epoch": 0.29820261437908496, "grad_norm": 44.36052701548159, "learning_rate": 7.080816369170545e-07, "logits/chosen": 12.140392303466797, "logits/rejected": 12.575546264648438, "logps/chosen": -3.5392231941223145, "logps/rejected": -3.660207986831665, "loss": 4.4164, "rewards/accuracies": 0.75, "rewards/chosen": -35.39223098754883, "rewards/margins": 1.2098474502563477, "rewards/rejected": -36.602081298828125, "step": 2190 }, { "epoch": 0.298338779956427, "grad_norm": 44.4337267441045, "learning_rate": 7.079603311235524e-07, "logits/chosen": 11.493304252624512, "logits/rejected": 11.63375473022461, "logps/chosen": -3.3272852897644043, "logps/rejected": -3.5480823516845703, "loss": 4.1173, "rewards/accuracies": 0.5, "rewards/chosen": -33.272850036621094, "rewards/margins": 2.2079696655273438, "rewards/rejected": -35.48081970214844, "step": 2191 }, { "epoch": 0.2984749455337691, "grad_norm": 41.13410381618422, "learning_rate": 7.078389557438677e-07, "logits/chosen": 11.353687286376953, "logits/rejected": 12.110076904296875, "logps/chosen": -3.66221022605896, "logps/rejected": -3.6616053581237793, "loss": 3.8847, "rewards/accuracies": 0.5, "rewards/chosen": -36.622100830078125, "rewards/margins": -0.006045818328857422, "rewards/rejected": -36.616058349609375, "step": 2192 }, { "epoch": 0.2986111111111111, "grad_norm": 49.484527201866086, "learning_rate": 7.077175108054265e-07, "logits/chosen": 12.648368835449219, "logits/rejected": 12.72071361541748, "logps/chosen": -3.5683035850524902, "logps/rejected": -3.4536314010620117, "loss": 3.8118, "rewards/accuracies": 0.5, "rewards/chosen": -35.68303680419922, "rewards/margins": -1.1467208862304688, "rewards/rejected": -34.53631591796875, "step": 2193 }, { "epoch": 0.29874727668845313, "grad_norm": 40.5345477539727, "learning_rate": 7.075959963356699e-07, "logits/chosen": 12.242118835449219, "logits/rejected": 11.974977493286133, "logps/chosen": -3.5078792572021484, "logps/rejected": -3.8223674297332764, "loss": 3.5427, "rewards/accuracies": 0.75, "rewards/chosen": -35.07879638671875, "rewards/margins": 3.144881248474121, "rewards/rejected": -38.22367477416992, "step": 2194 }, { "epoch": 0.2988834422657952, "grad_norm": 43.28193483828527, "learning_rate": 7.074744123620554e-07, "logits/chosen": 11.79769515991211, "logits/rejected": 12.858165740966797, "logps/chosen": -3.3066189289093018, "logps/rejected": -3.8100714683532715, "loss": 4.4185, "rewards/accuracies": 1.0, "rewards/chosen": -33.066192626953125, "rewards/margins": 5.034524440765381, "rewards/rejected": -38.10071563720703, "step": 2195 }, { "epoch": 0.29901960784313725, "grad_norm": 49.30526782500188, "learning_rate": 7.073527589120559e-07, "logits/chosen": 11.369089126586914, "logits/rejected": 12.220661163330078, "logps/chosen": -3.401165008544922, "logps/rejected": -3.9267823696136475, "loss": 4.0532, "rewards/accuracies": 1.0, "rewards/chosen": -34.01165008544922, "rewards/margins": 5.256173133850098, "rewards/rejected": -39.267822265625, "step": 2196 }, { "epoch": 0.2991557734204793, "grad_norm": 46.09619439295432, "learning_rate": 7.072310360131598e-07, "logits/chosen": 11.905502319335938, "logits/rejected": 11.558517456054688, "logps/chosen": -3.4348340034484863, "logps/rejected": -3.860487699508667, "loss": 4.2442, "rewards/accuracies": 0.75, "rewards/chosen": -34.34833908081055, "rewards/margins": 4.25653600692749, "rewards/rejected": -38.60487365722656, "step": 2197 }, { "epoch": 0.29929193899782136, "grad_norm": 41.011246247567534, "learning_rate": 7.071092436928715e-07, "logits/chosen": 12.566732406616211, "logits/rejected": 12.374313354492188, "logps/chosen": -3.5201497077941895, "logps/rejected": -3.449432611465454, "loss": 3.9211, "rewards/accuracies": 0.5, "rewards/chosen": -35.201499938964844, "rewards/margins": -0.7071719169616699, "rewards/rejected": -34.494327545166016, "step": 2198 }, { "epoch": 0.2994281045751634, "grad_norm": 455.788566895395, "learning_rate": 7.069873819787111e-07, "logits/chosen": 11.786993026733398, "logits/rejected": 12.121196746826172, "logps/chosen": -3.379725456237793, "logps/rejected": -3.4766643047332764, "loss": 3.7602, "rewards/accuracies": 0.5, "rewards/chosen": -33.7972526550293, "rewards/margins": 0.9693903923034668, "rewards/rejected": -34.76664352416992, "step": 2199 }, { "epoch": 0.2995642701525055, "grad_norm": 41.28506212734276, "learning_rate": 7.068654508982142e-07, "logits/chosen": 10.429443359375, "logits/rejected": 11.413662910461426, "logps/chosen": -3.227567195892334, "logps/rejected": -3.5602869987487793, "loss": 4.0655, "rewards/accuracies": 1.0, "rewards/chosen": -32.275672912597656, "rewards/margins": 3.3271965980529785, "rewards/rejected": -35.60287094116211, "step": 2200 }, { "epoch": 0.2997004357298475, "grad_norm": 56.25522826634461, "learning_rate": 7.06743450478932e-07, "logits/chosen": 11.066590309143066, "logits/rejected": 12.106194496154785, "logps/chosen": -3.2649855613708496, "logps/rejected": -3.2312521934509277, "loss": 4.0661, "rewards/accuracies": 0.5, "rewards/chosen": -32.64985656738281, "rewards/margins": -0.33733367919921875, "rewards/rejected": -32.312522888183594, "step": 2201 }, { "epoch": 0.29983660130718953, "grad_norm": 41.97925976252699, "learning_rate": 7.066213807484315e-07, "logits/chosen": 11.832862854003906, "logits/rejected": 11.871356964111328, "logps/chosen": -3.411085605621338, "logps/rejected": -3.5842862129211426, "loss": 3.7745, "rewards/accuracies": 0.5, "rewards/chosen": -34.11085510253906, "rewards/margins": 1.732004165649414, "rewards/rejected": -35.84286117553711, "step": 2202 }, { "epoch": 0.2999727668845316, "grad_norm": 45.532341625092165, "learning_rate": 7.064992417342956e-07, "logits/chosen": 12.212509155273438, "logits/rejected": 12.642821311950684, "logps/chosen": -3.44882869720459, "logps/rejected": -3.549659252166748, "loss": 3.4243, "rewards/accuracies": 0.5, "rewards/chosen": -34.48828887939453, "rewards/margins": 1.0083041191101074, "rewards/rejected": -35.49658966064453, "step": 2203 }, { "epoch": 0.30010893246187365, "grad_norm": 48.05463043280054, "learning_rate": 7.063770334641224e-07, "logits/chosen": 11.763494491577148, "logits/rejected": 12.351247787475586, "logps/chosen": -3.4690051078796387, "logps/rejected": -3.347529649734497, "loss": 4.5402, "rewards/accuracies": 0.25, "rewards/chosen": -34.6900520324707, "rewards/margins": -1.2147560119628906, "rewards/rejected": -33.47529602050781, "step": 2204 }, { "epoch": 0.3002450980392157, "grad_norm": 47.99802554312597, "learning_rate": 7.062547559655261e-07, "logits/chosen": 11.495158195495605, "logits/rejected": 10.930968284606934, "logps/chosen": -3.2103452682495117, "logps/rejected": -3.205887794494629, "loss": 4.657, "rewards/accuracies": 0.5, "rewards/chosen": -32.10345458984375, "rewards/margins": -0.044574737548828125, "rewards/rejected": -32.05887985229492, "step": 2205 }, { "epoch": 0.30038126361655776, "grad_norm": 41.371420293679925, "learning_rate": 7.06132409266136e-07, "logits/chosen": 11.618341445922852, "logits/rejected": 11.974729537963867, "logps/chosen": -2.993112802505493, "logps/rejected": -3.3991522789001465, "loss": 4.3443, "rewards/accuracies": 1.0, "rewards/chosen": -29.931127548217773, "rewards/margins": 4.060394287109375, "rewards/rejected": -33.99152374267578, "step": 2206 }, { "epoch": 0.3005174291938998, "grad_norm": 41.851595606347765, "learning_rate": 7.060099933935976e-07, "logits/chosen": 11.97009563446045, "logits/rejected": 12.263810157775879, "logps/chosen": -3.411898612976074, "logps/rejected": -3.3035168647766113, "loss": 4.1447, "rewards/accuracies": 0.25, "rewards/chosen": -34.118988037109375, "rewards/margins": -1.083817958831787, "rewards/rejected": -33.0351676940918, "step": 2207 }, { "epoch": 0.3006535947712418, "grad_norm": 41.14236906419816, "learning_rate": 7.058875083755718e-07, "logits/chosen": 11.444314956665039, "logits/rejected": 11.600354194641113, "logps/chosen": -3.3896918296813965, "logps/rejected": -3.268115520477295, "loss": 3.961, "rewards/accuracies": 0.25, "rewards/chosen": -33.89691925048828, "rewards/margins": -1.2157635688781738, "rewards/rejected": -32.681156158447266, "step": 2208 }, { "epoch": 0.3007897603485839, "grad_norm": 44.41957648951541, "learning_rate": 7.057649542397348e-07, "logits/chosen": 11.910636901855469, "logits/rejected": 11.958560943603516, "logps/chosen": -3.372896909713745, "logps/rejected": -3.2012243270874023, "loss": 4.2223, "rewards/accuracies": 0.5, "rewards/chosen": -33.72896957397461, "rewards/margins": -1.7167258262634277, "rewards/rejected": -32.012245178222656, "step": 2209 }, { "epoch": 0.30092592592592593, "grad_norm": 41.49523885488657, "learning_rate": 7.05642331013779e-07, "logits/chosen": 11.129810333251953, "logits/rejected": 12.08743667602539, "logps/chosen": -3.086038589477539, "logps/rejected": -3.4356470108032227, "loss": 3.5092, "rewards/accuracies": 1.0, "rewards/chosen": -30.860387802124023, "rewards/margins": 3.4960832595825195, "rewards/rejected": -34.35647201538086, "step": 2210 }, { "epoch": 0.30106209150326796, "grad_norm": 48.475586706046165, "learning_rate": 7.055196387254119e-07, "logits/chosen": 10.137299537658691, "logits/rejected": 11.885908126831055, "logps/chosen": -2.8376712799072266, "logps/rejected": -3.6186063289642334, "loss": 3.8941, "rewards/accuracies": 1.0, "rewards/chosen": -28.376712799072266, "rewards/margins": 7.809350967407227, "rewards/rejected": -36.186065673828125, "step": 2211 }, { "epoch": 0.30119825708061004, "grad_norm": 38.47633958982593, "learning_rate": 7.053968774023571e-07, "logits/chosen": 10.883031845092773, "logits/rejected": 9.975534439086914, "logps/chosen": -3.0951895713806152, "logps/rejected": -2.9786438941955566, "loss": 3.8612, "rewards/accuracies": 0.25, "rewards/chosen": -30.95189666748047, "rewards/margins": -1.1654582023620605, "rewards/rejected": -29.78643798828125, "step": 2212 }, { "epoch": 0.3013344226579521, "grad_norm": 42.93879787278341, "learning_rate": 7.052740470723535e-07, "logits/chosen": 9.817456245422363, "logits/rejected": 11.153144836425781, "logps/chosen": -3.017101764678955, "logps/rejected": -3.338008165359497, "loss": 3.9628, "rewards/accuracies": 0.75, "rewards/chosen": -30.171016693115234, "rewards/margins": 3.209064483642578, "rewards/rejected": -33.38008117675781, "step": 2213 }, { "epoch": 0.3014705882352941, "grad_norm": 41.55306919381877, "learning_rate": 7.051511477631554e-07, "logits/chosen": 11.048837661743164, "logits/rejected": 10.938943862915039, "logps/chosen": -3.004767894744873, "logps/rejected": -2.9879567623138428, "loss": 4.0339, "rewards/accuracies": 0.5, "rewards/chosen": -30.047679901123047, "rewards/margins": -0.16811180114746094, "rewards/rejected": -29.879568099975586, "step": 2214 }, { "epoch": 0.3016067538126362, "grad_norm": 40.61983683160194, "learning_rate": 7.050281795025331e-07, "logits/chosen": 10.48038387298584, "logits/rejected": 12.096956253051758, "logps/chosen": -2.9004628658294678, "logps/rejected": -3.4339871406555176, "loss": 3.6699, "rewards/accuracies": 0.75, "rewards/chosen": -29.004629135131836, "rewards/margins": 5.335242748260498, "rewards/rejected": -34.339874267578125, "step": 2215 }, { "epoch": 0.3017429193899782, "grad_norm": 44.6225309106237, "learning_rate": 7.049051423182721e-07, "logits/chosen": 10.461223602294922, "logits/rejected": 10.434820175170898, "logps/chosen": -3.367936611175537, "logps/rejected": -3.383641242980957, "loss": 4.2303, "rewards/accuracies": 0.25, "rewards/chosen": -33.67936706542969, "rewards/margins": 0.1570444107055664, "rewards/rejected": -33.83641052246094, "step": 2216 }, { "epoch": 0.30187908496732024, "grad_norm": 41.586701407920366, "learning_rate": 7.04782036238174e-07, "logits/chosen": 12.338605880737305, "logits/rejected": 11.330057144165039, "logps/chosen": -3.500772476196289, "logps/rejected": -3.4707953929901123, "loss": 4.1002, "rewards/accuracies": 0.5, "rewards/chosen": -35.00772476196289, "rewards/margins": -0.2997703552246094, "rewards/rejected": -34.70795440673828, "step": 2217 }, { "epoch": 0.30201525054466233, "grad_norm": 41.94846607117136, "learning_rate": 7.046588612900555e-07, "logits/chosen": 12.136343002319336, "logits/rejected": 12.331476211547852, "logps/chosen": -3.4803109169006348, "logps/rejected": -3.620713949203491, "loss": 4.4943, "rewards/accuracies": 0.5, "rewards/chosen": -34.8031120300293, "rewards/margins": 1.4040298461914062, "rewards/rejected": -36.20713806152344, "step": 2218 }, { "epoch": 0.30215141612200436, "grad_norm": 41.764544071158035, "learning_rate": 7.045356175017489e-07, "logits/chosen": 11.669296264648438, "logits/rejected": 11.598791122436523, "logps/chosen": -3.24076771736145, "logps/rejected": -3.2160823345184326, "loss": 4.36, "rewards/accuracies": 0.5, "rewards/chosen": -32.407676696777344, "rewards/margins": -0.2468547821044922, "rewards/rejected": -32.16082000732422, "step": 2219 }, { "epoch": 0.3022875816993464, "grad_norm": 39.075726066513255, "learning_rate": 7.044123049011022e-07, "logits/chosen": 12.261678695678711, "logits/rejected": 11.858728408813477, "logps/chosen": -3.62857723236084, "logps/rejected": -3.901265859603882, "loss": 3.8721, "rewards/accuracies": 0.75, "rewards/chosen": -36.28577423095703, "rewards/margins": 2.7268829345703125, "rewards/rejected": -39.012657165527344, "step": 2220 }, { "epoch": 0.30242374727668847, "grad_norm": 39.536873157103216, "learning_rate": 7.042889235159789e-07, "logits/chosen": 10.928627014160156, "logits/rejected": 12.547247886657715, "logps/chosen": -3.205336570739746, "logps/rejected": -3.582432746887207, "loss": 4.0738, "rewards/accuracies": 0.75, "rewards/chosen": -32.053367614746094, "rewards/margins": 3.7709622383117676, "rewards/rejected": -35.82432556152344, "step": 2221 }, { "epoch": 0.3025599128540305, "grad_norm": 38.12155178057786, "learning_rate": 7.041654733742581e-07, "logits/chosen": 11.883041381835938, "logits/rejected": 12.111228942871094, "logps/chosen": -3.2231955528259277, "logps/rejected": -3.6693966388702393, "loss": 3.8139, "rewards/accuracies": 1.0, "rewards/chosen": -32.231956481933594, "rewards/margins": 4.462009906768799, "rewards/rejected": -36.693965911865234, "step": 2222 }, { "epoch": 0.30269607843137253, "grad_norm": 40.8880430854413, "learning_rate": 7.040419545038344e-07, "logits/chosen": 11.101724624633789, "logits/rejected": 11.50102424621582, "logps/chosen": -3.354084014892578, "logps/rejected": -3.3759756088256836, "loss": 4.0802, "rewards/accuracies": 0.25, "rewards/chosen": -33.54084014892578, "rewards/margins": 0.2189159393310547, "rewards/rejected": -33.75975799560547, "step": 2223 }, { "epoch": 0.3028322440087146, "grad_norm": 45.301177851983354, "learning_rate": 7.039183669326175e-07, "logits/chosen": 12.128955841064453, "logits/rejected": 11.385156631469727, "logps/chosen": -3.3687779903411865, "logps/rejected": -3.4989047050476074, "loss": 4.7363, "rewards/accuracies": 0.5, "rewards/chosen": -33.687782287597656, "rewards/margins": 1.3012661933898926, "rewards/rejected": -34.98904800415039, "step": 2224 }, { "epoch": 0.30296840958605664, "grad_norm": 44.81388078539682, "learning_rate": 7.037947106885336e-07, "logits/chosen": 11.853976249694824, "logits/rejected": 11.707494735717773, "logps/chosen": -3.192241668701172, "logps/rejected": -3.398963689804077, "loss": 3.9243, "rewards/accuracies": 0.75, "rewards/chosen": -31.92241668701172, "rewards/margins": 2.0672202110290527, "rewards/rejected": -33.98963928222656, "step": 2225 }, { "epoch": 0.30310457516339867, "grad_norm": 49.57238444677033, "learning_rate": 7.036709857995237e-07, "logits/chosen": 10.759661674499512, "logits/rejected": 12.414984703063965, "logps/chosen": -3.2831997871398926, "logps/rejected": -3.638845920562744, "loss": 3.8914, "rewards/accuracies": 0.75, "rewards/chosen": -32.83199691772461, "rewards/margins": 3.556460380554199, "rewards/rejected": -36.388458251953125, "step": 2226 }, { "epoch": 0.30324074074074076, "grad_norm": 43.09781534170061, "learning_rate": 7.035471922935445e-07, "logits/chosen": 11.630327224731445, "logits/rejected": 11.212223052978516, "logps/chosen": -3.1867733001708984, "logps/rejected": -2.9432435035705566, "loss": 4.3084, "rewards/accuracies": 0.25, "rewards/chosen": -31.867733001708984, "rewards/margins": -2.4352970123291016, "rewards/rejected": -29.432435989379883, "step": 2227 }, { "epoch": 0.3033769063180828, "grad_norm": 39.61095552892142, "learning_rate": 7.034233301985678e-07, "logits/chosen": 11.865662574768066, "logits/rejected": 11.516003608703613, "logps/chosen": -3.3705127239227295, "logps/rejected": -3.177591562271118, "loss": 4.3047, "rewards/accuracies": 0.25, "rewards/chosen": -33.70512771606445, "rewards/margins": -1.9292116165161133, "rewards/rejected": -31.775917053222656, "step": 2228 }, { "epoch": 0.3035130718954248, "grad_norm": 44.64901218052861, "learning_rate": 7.032993995425815e-07, "logits/chosen": 10.773962020874023, "logits/rejected": 10.933972358703613, "logps/chosen": -3.246572494506836, "logps/rejected": -3.6846671104431152, "loss": 3.7385, "rewards/accuracies": 1.0, "rewards/chosen": -32.46572494506836, "rewards/margins": 4.380946159362793, "rewards/rejected": -36.84667205810547, "step": 2229 }, { "epoch": 0.3036492374727669, "grad_norm": 39.275517529709624, "learning_rate": 7.031754003535889e-07, "logits/chosen": 11.865368843078613, "logits/rejected": 13.031991958618164, "logps/chosen": -3.2244091033935547, "logps/rejected": -3.682159900665283, "loss": 3.9824, "rewards/accuracies": 1.0, "rewards/chosen": -32.24409103393555, "rewards/margins": 4.577509880065918, "rewards/rejected": -36.82160186767578, "step": 2230 }, { "epoch": 0.3037854030501089, "grad_norm": 40.62156939194781, "learning_rate": 7.030513326596085e-07, "logits/chosen": 11.323036193847656, "logits/rejected": 12.28653335571289, "logps/chosen": -3.354494094848633, "logps/rejected": -3.468097686767578, "loss": 4.0267, "rewards/accuracies": 0.5, "rewards/chosen": -33.544944763183594, "rewards/margins": 1.136035442352295, "rewards/rejected": -34.68097686767578, "step": 2231 }, { "epoch": 0.30392156862745096, "grad_norm": 45.984589465355015, "learning_rate": 7.029271964886745e-07, "logits/chosen": 11.194272994995117, "logits/rejected": 10.873678207397461, "logps/chosen": -3.1175942420959473, "logps/rejected": -3.256498336791992, "loss": 4.3669, "rewards/accuracies": 0.75, "rewards/chosen": -31.175941467285156, "rewards/margins": 1.3890385627746582, "rewards/rejected": -32.564979553222656, "step": 2232 }, { "epoch": 0.30405773420479304, "grad_norm": 44.28182450298687, "learning_rate": 7.028029918688364e-07, "logits/chosen": 11.199638366699219, "logits/rejected": 11.583171844482422, "logps/chosen": -3.506049156188965, "logps/rejected": -3.4513955116271973, "loss": 4.231, "rewards/accuracies": 0.25, "rewards/chosen": -35.06049346923828, "rewards/margins": -0.5465378761291504, "rewards/rejected": -34.513954162597656, "step": 2233 }, { "epoch": 0.30419389978213507, "grad_norm": 48.49739510891738, "learning_rate": 7.026787188281592e-07, "logits/chosen": 10.021581649780273, "logits/rejected": 11.501524925231934, "logps/chosen": -2.6483726501464844, "logps/rejected": -2.965061902999878, "loss": 4.3365, "rewards/accuracies": 0.75, "rewards/chosen": -26.483726501464844, "rewards/margins": 3.166891574859619, "rewards/rejected": -29.650619506835938, "step": 2234 }, { "epoch": 0.3043300653594771, "grad_norm": 43.620639864296045, "learning_rate": 7.025543773947235e-07, "logits/chosen": 11.048434257507324, "logits/rejected": 11.54743766784668, "logps/chosen": -2.9054596424102783, "logps/rejected": -3.0981507301330566, "loss": 4.0799, "rewards/accuracies": 0.5, "rewards/chosen": -29.054595947265625, "rewards/margins": 1.9269094467163086, "rewards/rejected": -30.98150634765625, "step": 2235 }, { "epoch": 0.3044662309368192, "grad_norm": 55.43642117379848, "learning_rate": 7.024299675966255e-07, "logits/chosen": 11.183845520019531, "logits/rejected": 11.964954376220703, "logps/chosen": -3.3482604026794434, "logps/rejected": -3.695140838623047, "loss": 4.2502, "rewards/accuracies": 1.0, "rewards/chosen": -33.48260498046875, "rewards/margins": 3.4688034057617188, "rewards/rejected": -36.95140838623047, "step": 2236 }, { "epoch": 0.3046023965141612, "grad_norm": 47.371549003026345, "learning_rate": 7.023054894619763e-07, "logits/chosen": 11.831701278686523, "logits/rejected": 11.516746520996094, "logps/chosen": -3.5534322261810303, "logps/rejected": -3.56308913230896, "loss": 4.5033, "rewards/accuracies": 0.5, "rewards/chosen": -35.534324645996094, "rewards/margins": 0.09656953811645508, "rewards/rejected": -35.630889892578125, "step": 2237 }, { "epoch": 0.3047385620915033, "grad_norm": 42.460879407392206, "learning_rate": 7.021809430189028e-07, "logits/chosen": 10.300477981567383, "logits/rejected": 10.801240921020508, "logps/chosen": -3.332038402557373, "logps/rejected": -3.5917673110961914, "loss": 3.4614, "rewards/accuracies": 1.0, "rewards/chosen": -33.32038497924805, "rewards/margins": 2.5972886085510254, "rewards/rejected": -35.91767120361328, "step": 2238 }, { "epoch": 0.3048747276688453, "grad_norm": 43.86365328762847, "learning_rate": 7.020563282955474e-07, "logits/chosen": 11.593988418579102, "logits/rejected": 11.702863693237305, "logps/chosen": -3.7029550075531006, "logps/rejected": -3.414029598236084, "loss": 4.6385, "rewards/accuracies": 0.25, "rewards/chosen": -37.02954864501953, "rewards/margins": -2.889254570007324, "rewards/rejected": -34.140296936035156, "step": 2239 }, { "epoch": 0.30501089324618735, "grad_norm": 104.94232125048309, "learning_rate": 7.019316453200678e-07, "logits/chosen": 11.649677276611328, "logits/rejected": 10.985636711120605, "logps/chosen": -3.18519926071167, "logps/rejected": -3.048186779022217, "loss": 4.0129, "rewards/accuracies": 0.5, "rewards/chosen": -31.851991653442383, "rewards/margins": -1.3701238632202148, "rewards/rejected": -30.481868743896484, "step": 2240 }, { "epoch": 0.30514705882352944, "grad_norm": 44.350998130177096, "learning_rate": 7.018068941206372e-07, "logits/chosen": 11.8989896774292, "logits/rejected": 12.915260314941406, "logps/chosen": -3.5060131549835205, "logps/rejected": -3.7458488941192627, "loss": 3.785, "rewards/accuracies": 0.75, "rewards/chosen": -35.06013107299805, "rewards/margins": 2.3983564376831055, "rewards/rejected": -37.45848846435547, "step": 2241 }, { "epoch": 0.30528322440087147, "grad_norm": 43.15047271167733, "learning_rate": 7.01682074725444e-07, "logits/chosen": 12.402572631835938, "logits/rejected": 11.528836250305176, "logps/chosen": -3.596090793609619, "logps/rejected": -3.499129295349121, "loss": 4.1054, "rewards/accuracies": 0.75, "rewards/chosen": -35.960906982421875, "rewards/margins": -0.9696154594421387, "rewards/rejected": -34.99129104614258, "step": 2242 }, { "epoch": 0.3054193899782135, "grad_norm": 41.1814225224714, "learning_rate": 7.015571871626925e-07, "logits/chosen": 10.902244567871094, "logits/rejected": 11.711423873901367, "logps/chosen": -3.4139721393585205, "logps/rejected": -3.8595170974731445, "loss": 3.4592, "rewards/accuracies": 1.0, "rewards/chosen": -34.13972091674805, "rewards/margins": 4.455451011657715, "rewards/rejected": -38.59517288208008, "step": 2243 }, { "epoch": 0.3055555555555556, "grad_norm": 49.190536141749334, "learning_rate": 7.014322314606017e-07, "logits/chosen": 11.058427810668945, "logits/rejected": 11.086101531982422, "logps/chosen": -3.351857900619507, "logps/rejected": -3.2946810722351074, "loss": 3.7278, "rewards/accuracies": 0.5, "rewards/chosen": -33.518577575683594, "rewards/margins": -0.571770191192627, "rewards/rejected": -32.946807861328125, "step": 2244 }, { "epoch": 0.3056917211328976, "grad_norm": 41.64386928299287, "learning_rate": 7.013072076474065e-07, "logits/chosen": 12.39931869506836, "logits/rejected": 12.414438247680664, "logps/chosen": -3.837937116622925, "logps/rejected": -4.01526403427124, "loss": 3.9085, "rewards/accuracies": 0.75, "rewards/chosen": -38.379371643066406, "rewards/margins": 1.7732696533203125, "rewards/rejected": -40.15264129638672, "step": 2245 }, { "epoch": 0.30582788671023964, "grad_norm": 43.285986730447924, "learning_rate": 7.011821157513572e-07, "logits/chosen": 12.015644073486328, "logits/rejected": 11.990822792053223, "logps/chosen": -3.4598920345306396, "logps/rejected": -3.5337655544281006, "loss": 4.2912, "rewards/accuracies": 0.75, "rewards/chosen": -34.59891891479492, "rewards/margins": 0.738736629486084, "rewards/rejected": -35.3376579284668, "step": 2246 }, { "epoch": 0.3059640522875817, "grad_norm": 48.340244384122734, "learning_rate": 7.010569558007193e-07, "logits/chosen": 11.026775360107422, "logits/rejected": 12.041412353515625, "logps/chosen": -3.3564205169677734, "logps/rejected": -3.5749287605285645, "loss": 4.033, "rewards/accuracies": 0.5, "rewards/chosen": -33.56420135498047, "rewards/margins": 2.1850852966308594, "rewards/rejected": -35.749290466308594, "step": 2247 }, { "epoch": 0.30610021786492375, "grad_norm": 44.22034737950231, "learning_rate": 7.009317278237735e-07, "logits/chosen": 12.244251251220703, "logits/rejected": 12.852193832397461, "logps/chosen": -3.472930908203125, "logps/rejected": -3.8460566997528076, "loss": 4.0564, "rewards/accuracies": 1.0, "rewards/chosen": -34.72930908203125, "rewards/margins": 3.731255054473877, "rewards/rejected": -38.46056365966797, "step": 2248 }, { "epoch": 0.3062363834422658, "grad_norm": 42.51240563713978, "learning_rate": 7.008064318488163e-07, "logits/chosen": 12.101119041442871, "logits/rejected": 12.425664901733398, "logps/chosen": -3.3265702724456787, "logps/rejected": -3.3440144062042236, "loss": 3.859, "rewards/accuracies": 0.75, "rewards/chosen": -33.26570129394531, "rewards/margins": 0.17444133758544922, "rewards/rejected": -33.440147399902344, "step": 2249 }, { "epoch": 0.30637254901960786, "grad_norm": 49.48976065984159, "learning_rate": 7.006810679041594e-07, "logits/chosen": 11.744640350341797, "logits/rejected": 12.527081489562988, "logps/chosen": -3.2821872234344482, "logps/rejected": -3.6801209449768066, "loss": 4.4216, "rewards/accuracies": 1.0, "rewards/chosen": -32.82187271118164, "rewards/margins": 3.979340076446533, "rewards/rejected": -36.801212310791016, "step": 2250 }, { "epoch": 0.3065087145969499, "grad_norm": 47.59264672845612, "learning_rate": 7.005556360181298e-07, "logits/chosen": 11.36599349975586, "logits/rejected": 11.620777130126953, "logps/chosen": -3.374551773071289, "logps/rejected": -3.742161273956299, "loss": 4.2248, "rewards/accuracies": 0.75, "rewards/chosen": -33.745521545410156, "rewards/margins": 3.6760945320129395, "rewards/rejected": -37.42161560058594, "step": 2251 }, { "epoch": 0.3066448801742919, "grad_norm": 42.88346776439319, "learning_rate": 7.004301362190698e-07, "logits/chosen": 11.384621620178223, "logits/rejected": 11.339380264282227, "logps/chosen": -3.5140771865844727, "logps/rejected": -3.655667781829834, "loss": 3.798, "rewards/accuracies": 0.75, "rewards/chosen": -35.14077377319336, "rewards/margins": 1.4159059524536133, "rewards/rejected": -36.556678771972656, "step": 2252 }, { "epoch": 0.306781045751634, "grad_norm": 48.208140449949184, "learning_rate": 7.00304568535337e-07, "logits/chosen": 12.7570161819458, "logits/rejected": 11.311958312988281, "logps/chosen": -3.0589261054992676, "logps/rejected": -3.6432461738586426, "loss": 3.8571, "rewards/accuracies": 0.75, "rewards/chosen": -30.589262008666992, "rewards/margins": 5.843197345733643, "rewards/rejected": -36.432456970214844, "step": 2253 }, { "epoch": 0.30691721132897604, "grad_norm": 50.59160467186985, "learning_rate": 7.001789329953048e-07, "logits/chosen": 11.385488510131836, "logits/rejected": 12.471208572387695, "logps/chosen": -3.246258497238159, "logps/rejected": -3.6204192638397217, "loss": 3.7268, "rewards/accuracies": 0.75, "rewards/chosen": -32.46258544921875, "rewards/margins": 3.741605758666992, "rewards/rejected": -36.204193115234375, "step": 2254 }, { "epoch": 0.30705337690631807, "grad_norm": 49.359796435186816, "learning_rate": 7.000532296273612e-07, "logits/chosen": 11.620318412780762, "logits/rejected": 11.700685501098633, "logps/chosen": -3.2970170974731445, "logps/rejected": -3.997267484664917, "loss": 4.3425, "rewards/accuracies": 0.75, "rewards/chosen": -32.97016906738281, "rewards/margins": 7.002503871917725, "rewards/rejected": -39.97267150878906, "step": 2255 }, { "epoch": 0.30718954248366015, "grad_norm": 45.459491166118305, "learning_rate": 6.999274584599102e-07, "logits/chosen": 10.880043029785156, "logits/rejected": 11.160972595214844, "logps/chosen": -3.2723019123077393, "logps/rejected": -3.450716972351074, "loss": 4.1922, "rewards/accuracies": 0.5, "rewards/chosen": -32.723018646240234, "rewards/margins": 1.7841510772705078, "rewards/rejected": -34.507171630859375, "step": 2256 }, { "epoch": 0.3073257080610022, "grad_norm": 52.2864752299592, "learning_rate": 6.998016195213708e-07, "logits/chosen": 11.737247467041016, "logits/rejected": 12.014705657958984, "logps/chosen": -3.5370254516601562, "logps/rejected": -3.699514865875244, "loss": 4.6005, "rewards/accuracies": 1.0, "rewards/chosen": -35.37025451660156, "rewards/margins": 1.6248970031738281, "rewards/rejected": -36.995147705078125, "step": 2257 }, { "epoch": 0.3074618736383442, "grad_norm": 55.04194648910684, "learning_rate": 6.996757128401771e-07, "logits/chosen": 11.32366943359375, "logits/rejected": 11.816610336303711, "logps/chosen": -3.3457419872283936, "logps/rejected": -3.6839070320129395, "loss": 4.6363, "rewards/accuracies": 1.0, "rewards/chosen": -33.457420349121094, "rewards/margins": 3.381648063659668, "rewards/rejected": -36.83906555175781, "step": 2258 }, { "epoch": 0.3075980392156863, "grad_norm": 50.81282227011666, "learning_rate": 6.995497384447791e-07, "logits/chosen": 12.051122665405273, "logits/rejected": 12.527908325195312, "logps/chosen": -3.6061038970947266, "logps/rejected": -3.977228879928589, "loss": 4.6712, "rewards/accuracies": 0.75, "rewards/chosen": -36.061038970947266, "rewards/margins": 3.711249351501465, "rewards/rejected": -39.77228927612305, "step": 2259 }, { "epoch": 0.3077342047930283, "grad_norm": 43.804966421346556, "learning_rate": 6.994236963636415e-07, "logits/chosen": 12.204141616821289, "logits/rejected": 11.62371826171875, "logps/chosen": -3.5885610580444336, "logps/rejected": -3.6758036613464355, "loss": 4.1334, "rewards/accuracies": 0.5, "rewards/chosen": -35.88561248779297, "rewards/margins": 0.8724246025085449, "rewards/rejected": -36.758033752441406, "step": 2260 }, { "epoch": 0.30787037037037035, "grad_norm": 47.85050320505367, "learning_rate": 6.992975866252447e-07, "logits/chosen": 11.26905345916748, "logits/rejected": 11.62661361694336, "logps/chosen": -3.3782882690429688, "logps/rejected": -3.696406364440918, "loss": 4.4794, "rewards/accuracies": 0.75, "rewards/chosen": -33.78288650512695, "rewards/margins": 3.181180000305176, "rewards/rejected": -36.96406555175781, "step": 2261 }, { "epoch": 0.30800653594771243, "grad_norm": 47.26646181455123, "learning_rate": 6.991714092580842e-07, "logits/chosen": 11.024535179138184, "logits/rejected": 11.83229923248291, "logps/chosen": -3.4112417697906494, "logps/rejected": -3.5125226974487305, "loss": 4.207, "rewards/accuracies": 0.25, "rewards/chosen": -34.11241912841797, "rewards/margins": 1.0128064155578613, "rewards/rejected": -35.12522506713867, "step": 2262 }, { "epoch": 0.30814270152505446, "grad_norm": 44.02629092277143, "learning_rate": 6.990451642906708e-07, "logits/chosen": 12.117202758789062, "logits/rejected": 11.916382789611816, "logps/chosen": -3.4501729011535645, "logps/rejected": -3.7110538482666016, "loss": 4.5191, "rewards/accuracies": 0.75, "rewards/chosen": -34.50172805786133, "rewards/margins": 2.608811378479004, "rewards/rejected": -37.110538482666016, "step": 2263 }, { "epoch": 0.3082788671023965, "grad_norm": 51.49921056157578, "learning_rate": 6.989188517515305e-07, "logits/chosen": 11.426259994506836, "logits/rejected": 11.516107559204102, "logps/chosen": -3.263117790222168, "logps/rejected": -3.3971757888793945, "loss": 4.3088, "rewards/accuracies": 0.5, "rewards/chosen": -32.63117980957031, "rewards/margins": 1.340578556060791, "rewards/rejected": -33.97175598144531, "step": 2264 }, { "epoch": 0.3084150326797386, "grad_norm": 44.437242735407146, "learning_rate": 6.987924716692049e-07, "logits/chosen": 11.71653938293457, "logits/rejected": 11.007034301757812, "logps/chosen": -3.51594877243042, "logps/rejected": -3.343689203262329, "loss": 4.134, "rewards/accuracies": 0.5, "rewards/chosen": -35.159488677978516, "rewards/margins": -1.7225966453552246, "rewards/rejected": -33.4368896484375, "step": 2265 }, { "epoch": 0.3085511982570806, "grad_norm": 46.34515007164527, "learning_rate": 6.986660240722504e-07, "logits/chosen": 11.808250427246094, "logits/rejected": 11.676664352416992, "logps/chosen": -3.4866223335266113, "logps/rejected": -3.4022393226623535, "loss": 3.9925, "rewards/accuracies": 0.5, "rewards/chosen": -34.8662223815918, "rewards/margins": -0.8438301086425781, "rewards/rejected": -34.02239227294922, "step": 2266 }, { "epoch": 0.30868736383442263, "grad_norm": 43.10952385275088, "learning_rate": 6.985395089892391e-07, "logits/chosen": 11.420286178588867, "logits/rejected": 12.002767562866211, "logps/chosen": -3.311164379119873, "logps/rejected": -3.627002239227295, "loss": 3.9071, "rewards/accuracies": 1.0, "rewards/chosen": -33.11164093017578, "rewards/margins": 3.158379554748535, "rewards/rejected": -36.270023345947266, "step": 2267 }, { "epoch": 0.3088235294117647, "grad_norm": 40.543037541342365, "learning_rate": 6.984129264487578e-07, "logits/chosen": 11.338265419006348, "logits/rejected": 11.595745086669922, "logps/chosen": -3.467900276184082, "logps/rejected": -3.660008192062378, "loss": 3.969, "rewards/accuracies": 0.75, "rewards/chosen": -34.67900085449219, "rewards/margins": 1.9210782051086426, "rewards/rejected": -36.60008239746094, "step": 2268 }, { "epoch": 0.30895969498910675, "grad_norm": 43.08586404429985, "learning_rate": 6.982862764794091e-07, "logits/chosen": 10.279266357421875, "logits/rejected": 11.638585090637207, "logps/chosen": -3.334500312805176, "logps/rejected": -3.5889289379119873, "loss": 3.8595, "rewards/accuracies": 0.75, "rewards/chosen": -33.345001220703125, "rewards/margins": 2.544283866882324, "rewards/rejected": -35.88928985595703, "step": 2269 }, { "epoch": 0.3090958605664488, "grad_norm": 43.030070087362816, "learning_rate": 6.981595591098106e-07, "logits/chosen": 11.657236099243164, "logits/rejected": 11.703163146972656, "logps/chosen": -3.582184314727783, "logps/rejected": -3.6887645721435547, "loss": 3.7013, "rewards/accuracies": 0.5, "rewards/chosen": -35.82184600830078, "rewards/margins": 1.0658016204833984, "rewards/rejected": -36.88764572143555, "step": 2270 }, { "epoch": 0.30923202614379086, "grad_norm": 40.3319343256302, "learning_rate": 6.980327743685951e-07, "logits/chosen": 9.994894027709961, "logits/rejected": 10.439297676086426, "logps/chosen": -3.326474666595459, "logps/rejected": -3.541053295135498, "loss": 3.6089, "rewards/accuracies": 0.5, "rewards/chosen": -33.264747619628906, "rewards/margins": 2.145785331726074, "rewards/rejected": -35.4105339050293, "step": 2271 }, { "epoch": 0.3093681917211329, "grad_norm": 45.21095017205206, "learning_rate": 6.979059222844107e-07, "logits/chosen": 12.59440803527832, "logits/rejected": 11.468269348144531, "logps/chosen": -3.630645275115967, "logps/rejected": -3.4633028507232666, "loss": 3.9794, "rewards/accuracies": 0.25, "rewards/chosen": -36.30644989013672, "rewards/margins": -1.6734232902526855, "rewards/rejected": -34.633026123046875, "step": 2272 }, { "epoch": 0.3095043572984749, "grad_norm": 43.7642291806904, "learning_rate": 6.977790028859206e-07, "logits/chosen": 11.850390434265137, "logits/rejected": 11.105916023254395, "logps/chosen": -3.3586409091949463, "logps/rejected": -3.553478717803955, "loss": 4.0748, "rewards/accuracies": 1.0, "rewards/chosen": -33.58641052246094, "rewards/margins": 1.948378086090088, "rewards/rejected": -35.5347900390625, "step": 2273 }, { "epoch": 0.309640522875817, "grad_norm": 42.39303249564644, "learning_rate": 6.976520162018033e-07, "logits/chosen": 11.563451766967773, "logits/rejected": 11.71790599822998, "logps/chosen": -3.1794631481170654, "logps/rejected": -3.3124375343322754, "loss": 4.2761, "rewards/accuracies": 0.25, "rewards/chosen": -31.794633865356445, "rewards/margins": 1.329744815826416, "rewards/rejected": -33.1243782043457, "step": 2274 }, { "epoch": 0.30977668845315903, "grad_norm": 47.84477944229104, "learning_rate": 6.975249622607525e-07, "logits/chosen": 10.943464279174805, "logits/rejected": 11.475309371948242, "logps/chosen": -3.4936437606811523, "logps/rejected": -3.524937629699707, "loss": 4.0344, "rewards/accuracies": 0.5, "rewards/chosen": -34.936439514160156, "rewards/margins": 0.31293630599975586, "rewards/rejected": -35.24937438964844, "step": 2275 }, { "epoch": 0.3099128540305011, "grad_norm": 47.947326239423866, "learning_rate": 6.973978410914773e-07, "logits/chosen": 9.731497764587402, "logits/rejected": 10.396883010864258, "logps/chosen": -2.880145788192749, "logps/rejected": -3.17596173286438, "loss": 4.233, "rewards/accuracies": 0.5, "rewards/chosen": -28.801456451416016, "rewards/margins": 2.9581594467163086, "rewards/rejected": -31.75961685180664, "step": 2276 }, { "epoch": 0.31004901960784315, "grad_norm": 40.94974195409213, "learning_rate": 6.972706527227015e-07, "logits/chosen": 11.786983489990234, "logits/rejected": 12.082735061645508, "logps/chosen": -3.4534668922424316, "logps/rejected": -3.636085271835327, "loss": 3.7572, "rewards/accuracies": 0.5, "rewards/chosen": -34.534671783447266, "rewards/margins": 1.8261823654174805, "rewards/rejected": -36.36085510253906, "step": 2277 }, { "epoch": 0.3101851851851852, "grad_norm": 47.03254345433116, "learning_rate": 6.971433971831644e-07, "logits/chosen": 11.60380744934082, "logits/rejected": 12.047135353088379, "logps/chosen": -3.1910035610198975, "logps/rejected": -3.6005234718322754, "loss": 3.9295, "rewards/accuracies": 0.75, "rewards/chosen": -31.910036087036133, "rewards/margins": 4.095197677612305, "rewards/rejected": -36.00523376464844, "step": 2278 }, { "epoch": 0.31032135076252726, "grad_norm": 41.04027398870881, "learning_rate": 6.970160745016205e-07, "logits/chosen": 10.936782836914062, "logits/rejected": 11.88260269165039, "logps/chosen": -3.246354341506958, "logps/rejected": -3.485724925994873, "loss": 3.8238, "rewards/accuracies": 0.75, "rewards/chosen": -32.46354293823242, "rewards/margins": 2.393707752227783, "rewards/rejected": -34.85725021362305, "step": 2279 }, { "epoch": 0.3104575163398693, "grad_norm": 45.018938902180224, "learning_rate": 6.968886847068394e-07, "logits/chosen": 10.258913040161133, "logits/rejected": 11.045269012451172, "logps/chosen": -3.0144217014312744, "logps/rejected": -3.341843605041504, "loss": 4.215, "rewards/accuracies": 0.75, "rewards/chosen": -30.144216537475586, "rewards/margins": 3.2742204666137695, "rewards/rejected": -33.41843795776367, "step": 2280 }, { "epoch": 0.3105936819172113, "grad_norm": 40.61923264261677, "learning_rate": 6.967612278276059e-07, "logits/chosen": 12.11693286895752, "logits/rejected": 11.368024826049805, "logps/chosen": -3.1296608448028564, "logps/rejected": -3.3354408740997314, "loss": 3.8121, "rewards/accuracies": 0.75, "rewards/chosen": -31.296607971191406, "rewards/margins": 2.057800769805908, "rewards/rejected": -33.354408264160156, "step": 2281 }, { "epoch": 0.3107298474945534, "grad_norm": 45.99969180618545, "learning_rate": 6.9663370389272e-07, "logits/chosen": 9.797719955444336, "logits/rejected": 11.447616577148438, "logps/chosen": -2.816845417022705, "logps/rejected": -3.324000120162964, "loss": 3.8154, "rewards/accuracies": 1.0, "rewards/chosen": -28.168453216552734, "rewards/margins": 5.07154655456543, "rewards/rejected": -33.2400016784668, "step": 2282 }, { "epoch": 0.31086601307189543, "grad_norm": 37.75239057724913, "learning_rate": 6.965061129309965e-07, "logits/chosen": 11.130104064941406, "logits/rejected": 12.435737609863281, "logps/chosen": -3.637601137161255, "logps/rejected": -4.0262041091918945, "loss": 3.6807, "rewards/accuracies": 0.75, "rewards/chosen": -36.37601089477539, "rewards/margins": 3.886031150817871, "rewards/rejected": -40.26204299926758, "step": 2283 }, { "epoch": 0.31100217864923746, "grad_norm": 48.79871821960127, "learning_rate": 6.963784549712661e-07, "logits/chosen": 11.231215476989746, "logits/rejected": 12.08094596862793, "logps/chosen": -3.3240888118743896, "logps/rejected": -3.7064647674560547, "loss": 3.7388, "rewards/accuracies": 1.0, "rewards/chosen": -33.24089050292969, "rewards/margins": 3.823758125305176, "rewards/rejected": -37.06465148925781, "step": 2284 }, { "epoch": 0.31113834422657954, "grad_norm": 41.19741346603217, "learning_rate": 6.962507300423738e-07, "logits/chosen": 12.28782844543457, "logits/rejected": 11.92813777923584, "logps/chosen": -3.59456729888916, "logps/rejected": -3.6604514122009277, "loss": 3.6265, "rewards/accuracies": 0.5, "rewards/chosen": -35.94567108154297, "rewards/margins": 0.6588420867919922, "rewards/rejected": -36.604515075683594, "step": 2285 }, { "epoch": 0.3112745098039216, "grad_norm": 49.153710413961534, "learning_rate": 6.961229381731801e-07, "logits/chosen": 11.74030876159668, "logits/rejected": 11.287466049194336, "logps/chosen": -3.28947114944458, "logps/rejected": -3.4679465293884277, "loss": 4.4845, "rewards/accuracies": 0.75, "rewards/chosen": -32.894710540771484, "rewards/margins": 1.784754753112793, "rewards/rejected": -34.679466247558594, "step": 2286 }, { "epoch": 0.3114106753812636, "grad_norm": 41.349558424220334, "learning_rate": 6.959950793925608e-07, "logits/chosen": 11.327705383300781, "logits/rejected": 11.53412914276123, "logps/chosen": -3.590003728866577, "logps/rejected": -3.7642993927001953, "loss": 3.7551, "rewards/accuracies": 0.75, "rewards/chosen": -35.90003967285156, "rewards/margins": 1.7429566383361816, "rewards/rejected": -37.64299392700195, "step": 2287 }, { "epoch": 0.3115468409586057, "grad_norm": 43.95299700695556, "learning_rate": 6.958671537294067e-07, "logits/chosen": 10.781490325927734, "logits/rejected": 12.54725456237793, "logps/chosen": -3.495741367340088, "logps/rejected": -3.9414703845977783, "loss": 3.7829, "rewards/accuracies": 0.75, "rewards/chosen": -34.95741271972656, "rewards/margins": 4.45728874206543, "rewards/rejected": -39.414703369140625, "step": 2288 }, { "epoch": 0.3116830065359477, "grad_norm": 48.76771129138826, "learning_rate": 6.957391612126235e-07, "logits/chosen": 11.98248291015625, "logits/rejected": 12.147810935974121, "logps/chosen": -4.044687271118164, "logps/rejected": -3.8703441619873047, "loss": 4.2943, "rewards/accuracies": 0.5, "rewards/chosen": -40.44687271118164, "rewards/margins": -1.743433952331543, "rewards/rejected": -38.70343780517578, "step": 2289 }, { "epoch": 0.31181917211328974, "grad_norm": 39.15056027945461, "learning_rate": 6.956111018711322e-07, "logits/chosen": 11.987833976745605, "logits/rejected": 11.97076416015625, "logps/chosen": -3.1931052207946777, "logps/rejected": -3.330878734588623, "loss": 3.6768, "rewards/accuracies": 0.5, "rewards/chosen": -31.931049346923828, "rewards/margins": 1.3777360916137695, "rewards/rejected": -33.30878448486328, "step": 2290 }, { "epoch": 0.31195533769063183, "grad_norm": 44.04422463196549, "learning_rate": 6.954829757338689e-07, "logits/chosen": 11.300156593322754, "logits/rejected": 11.572565078735352, "logps/chosen": -3.2939610481262207, "logps/rejected": -3.5086145401000977, "loss": 3.9274, "rewards/accuracies": 0.5, "rewards/chosen": -32.939613342285156, "rewards/margins": 2.1465344429016113, "rewards/rejected": -35.086143493652344, "step": 2291 }, { "epoch": 0.31209150326797386, "grad_norm": 48.66291054518986, "learning_rate": 6.953547828297847e-07, "logits/chosen": 11.488129615783691, "logits/rejected": 11.333135604858398, "logps/chosen": -3.4929022789001465, "logps/rejected": -3.6163082122802734, "loss": 4.6154, "rewards/accuracies": 0.5, "rewards/chosen": -34.92902374267578, "rewards/margins": 1.2340598106384277, "rewards/rejected": -36.1630859375, "step": 2292 }, { "epoch": 0.3122276688453159, "grad_norm": 41.74182633397399, "learning_rate": 6.95226523187846e-07, "logits/chosen": 10.68380355834961, "logits/rejected": 11.693130493164062, "logps/chosen": -3.02329158782959, "logps/rejected": -3.794851303100586, "loss": 3.801, "rewards/accuracies": 1.0, "rewards/chosen": -30.2329158782959, "rewards/margins": 7.7155961990356445, "rewards/rejected": -37.94851303100586, "step": 2293 }, { "epoch": 0.31236383442265797, "grad_norm": 47.81170595005028, "learning_rate": 6.950981968370339e-07, "logits/chosen": 11.924908638000488, "logits/rejected": 12.262738227844238, "logps/chosen": -3.7406303882598877, "logps/rejected": -4.173366546630859, "loss": 3.7444, "rewards/accuracies": 0.5, "rewards/chosen": -37.406307220458984, "rewards/margins": 4.32736349105835, "rewards/rejected": -41.73366928100586, "step": 2294 }, { "epoch": 0.3125, "grad_norm": 43.13047957651803, "learning_rate": 6.94969803806345e-07, "logits/chosen": 10.40400218963623, "logits/rejected": 11.381430625915527, "logps/chosen": -3.3518056869506836, "logps/rejected": -3.4283242225646973, "loss": 4.0107, "rewards/accuracies": 0.75, "rewards/chosen": -33.51805877685547, "rewards/margins": 0.7651853561401367, "rewards/rejected": -34.283241271972656, "step": 2295 }, { "epoch": 0.31263616557734203, "grad_norm": 47.04334456631966, "learning_rate": 6.948413441247906e-07, "logits/chosen": 12.484114646911621, "logits/rejected": 12.423619270324707, "logps/chosen": -3.3167099952697754, "logps/rejected": -3.545466184616089, "loss": 4.0846, "rewards/accuracies": 0.75, "rewards/chosen": -33.16709899902344, "rewards/margins": 2.287562847137451, "rewards/rejected": -35.45466232299805, "step": 2296 }, { "epoch": 0.3127723311546841, "grad_norm": 43.749225374332354, "learning_rate": 6.947128178213974e-07, "logits/chosen": 11.04593276977539, "logits/rejected": 11.368429183959961, "logps/chosen": -3.5153045654296875, "logps/rejected": -3.4879188537597656, "loss": 4.1347, "rewards/accuracies": 0.5, "rewards/chosen": -35.153045654296875, "rewards/margins": -0.27385568618774414, "rewards/rejected": -34.879188537597656, "step": 2297 }, { "epoch": 0.31290849673202614, "grad_norm": 42.84794252160139, "learning_rate": 6.945842249252068e-07, "logits/chosen": 11.759337425231934, "logits/rejected": 11.906394958496094, "logps/chosen": -3.5882976055145264, "logps/rejected": -3.4387874603271484, "loss": 4.22, "rewards/accuracies": 0.0, "rewards/chosen": -35.882972717285156, "rewards/margins": -1.4951019287109375, "rewards/rejected": -34.387874603271484, "step": 2298 }, { "epoch": 0.31304466230936817, "grad_norm": 48.04673818446537, "learning_rate": 6.944555654652756e-07, "logits/chosen": 10.630605697631836, "logits/rejected": 11.234710693359375, "logps/chosen": -3.618189811706543, "logps/rejected": -3.999300241470337, "loss": 3.9443, "rewards/accuracies": 0.75, "rewards/chosen": -36.18190002441406, "rewards/margins": 3.8111038208007812, "rewards/rejected": -39.99300003051758, "step": 2299 }, { "epoch": 0.31318082788671026, "grad_norm": 46.63879150836187, "learning_rate": 6.943268394706754e-07, "logits/chosen": 9.962447166442871, "logits/rejected": 10.318022727966309, "logps/chosen": -2.9994418621063232, "logps/rejected": -3.158337116241455, "loss": 4.7153, "rewards/accuracies": 0.5, "rewards/chosen": -29.99441909790039, "rewards/margins": 1.588951587677002, "rewards/rejected": -31.583370208740234, "step": 2300 }, { "epoch": 0.3133169934640523, "grad_norm": 38.09729250707865, "learning_rate": 6.941980469704928e-07, "logits/chosen": 10.586372375488281, "logits/rejected": 11.051368713378906, "logps/chosen": -3.1526994705200195, "logps/rejected": -3.5476150512695312, "loss": 3.4995, "rewards/accuracies": 0.75, "rewards/chosen": -31.526994705200195, "rewards/margins": 3.9491562843322754, "rewards/rejected": -35.47615051269531, "step": 2301 }, { "epoch": 0.3134531590413943, "grad_norm": 40.62197439745322, "learning_rate": 6.940691879938297e-07, "logits/chosen": 10.80319595336914, "logits/rejected": 11.795995712280273, "logps/chosen": -3.3533008098602295, "logps/rejected": -3.878091812133789, "loss": 3.8449, "rewards/accuracies": 1.0, "rewards/chosen": -33.53301239013672, "rewards/margins": 5.247906684875488, "rewards/rejected": -38.78091812133789, "step": 2302 }, { "epoch": 0.3135893246187364, "grad_norm": 41.36481724881585, "learning_rate": 6.939402625698027e-07, "logits/chosen": 11.600234985351562, "logits/rejected": 11.873041152954102, "logps/chosen": -3.505887746810913, "logps/rejected": -3.5692780017852783, "loss": 3.6066, "rewards/accuracies": 0.75, "rewards/chosen": -35.058876037597656, "rewards/margins": 0.6339020729064941, "rewards/rejected": -35.692779541015625, "step": 2303 }, { "epoch": 0.3137254901960784, "grad_norm": 45.314190913681, "learning_rate": 6.938112707275437e-07, "logits/chosen": 10.993867874145508, "logits/rejected": 11.098814964294434, "logps/chosen": -3.3789010047912598, "logps/rejected": -3.494157314300537, "loss": 3.5894, "rewards/accuracies": 0.5, "rewards/chosen": -33.78900909423828, "rewards/margins": 1.1525630950927734, "rewards/rejected": -34.94157409667969, "step": 2304 }, { "epoch": 0.31386165577342046, "grad_norm": 45.683598622918375, "learning_rate": 6.936822124961994e-07, "logits/chosen": 10.905484199523926, "logits/rejected": 11.224184036254883, "logps/chosen": -3.267467975616455, "logps/rejected": -3.265782356262207, "loss": 3.4993, "rewards/accuracies": 0.5, "rewards/chosen": -32.6746826171875, "rewards/margins": -0.01685476303100586, "rewards/rejected": -32.6578254699707, "step": 2305 }, { "epoch": 0.31399782135076254, "grad_norm": 38.930539586542906, "learning_rate": 6.935530879049317e-07, "logits/chosen": 10.729429244995117, "logits/rejected": 11.135977745056152, "logps/chosen": -3.313906669616699, "logps/rejected": -4.045256614685059, "loss": 3.6479, "rewards/accuracies": 1.0, "rewards/chosen": -33.13906478881836, "rewards/margins": 7.313498497009277, "rewards/rejected": -40.45256423950195, "step": 2306 }, { "epoch": 0.31413398692810457, "grad_norm": 45.99277486127395, "learning_rate": 6.93423896982917e-07, "logits/chosen": 10.319323539733887, "logits/rejected": 10.95207405090332, "logps/chosen": -3.174551248550415, "logps/rejected": -3.2582883834838867, "loss": 4.8736, "rewards/accuracies": 0.5, "rewards/chosen": -31.745513916015625, "rewards/margins": 0.8373703956604004, "rewards/rejected": -32.5828857421875, "step": 2307 }, { "epoch": 0.3142701525054466, "grad_norm": 42.87217749965504, "learning_rate": 6.932946397593475e-07, "logits/chosen": 10.314396858215332, "logits/rejected": 10.811643600463867, "logps/chosen": -3.1935040950775146, "logps/rejected": -3.447612762451172, "loss": 4.648, "rewards/accuracies": 0.75, "rewards/chosen": -31.935039520263672, "rewards/margins": 2.5410852432250977, "rewards/rejected": -34.47612762451172, "step": 2308 }, { "epoch": 0.3144063180827887, "grad_norm": 42.997578336032284, "learning_rate": 6.931653162634296e-07, "logits/chosen": 10.503021240234375, "logits/rejected": 11.122930526733398, "logps/chosen": -3.524930715560913, "logps/rejected": -3.7226850986480713, "loss": 3.9618, "rewards/accuracies": 0.75, "rewards/chosen": -35.249305725097656, "rewards/margins": 1.9775428771972656, "rewards/rejected": -37.22685241699219, "step": 2309 }, { "epoch": 0.3145424836601307, "grad_norm": 42.643871200183945, "learning_rate": 6.930359265243853e-07, "logits/chosen": 10.75815200805664, "logits/rejected": 9.776532173156738, "logps/chosen": -3.1685149669647217, "logps/rejected": -3.1511669158935547, "loss": 4.8343, "rewards/accuracies": 0.25, "rewards/chosen": -31.685148239135742, "rewards/margins": -0.17348146438598633, "rewards/rejected": -31.51166534423828, "step": 2310 }, { "epoch": 0.31467864923747274, "grad_norm": 40.76619536944663, "learning_rate": 6.929064705714511e-07, "logits/chosen": 10.051445007324219, "logits/rejected": 10.058343887329102, "logps/chosen": -3.2487685680389404, "logps/rejected": -3.5537843704223633, "loss": 4.4363, "rewards/accuracies": 0.75, "rewards/chosen": -32.48768615722656, "rewards/margins": 3.0501585006713867, "rewards/rejected": -35.537841796875, "step": 2311 }, { "epoch": 0.3148148148148148, "grad_norm": 41.09283387055032, "learning_rate": 6.927769484338787e-07, "logits/chosen": 10.727439880371094, "logits/rejected": 10.585619926452637, "logps/chosen": -2.8878252506256104, "logps/rejected": -3.3799784183502197, "loss": 3.7899, "rewards/accuracies": 1.0, "rewards/chosen": -28.878252029418945, "rewards/margins": 4.921531677246094, "rewards/rejected": -33.799781799316406, "step": 2312 }, { "epoch": 0.31495098039215685, "grad_norm": 41.31950935351189, "learning_rate": 6.926473601409346e-07, "logits/chosen": 11.617067337036133, "logits/rejected": 11.550697326660156, "logps/chosen": -3.5287039279937744, "logps/rejected": -3.5826807022094727, "loss": 4.0258, "rewards/accuracies": 0.75, "rewards/chosen": -35.28704071044922, "rewards/margins": 0.5397672653198242, "rewards/rejected": -35.82680892944336, "step": 2313 }, { "epoch": 0.3150871459694989, "grad_norm": 38.92734318595609, "learning_rate": 6.925177057219006e-07, "logits/chosen": 12.107856750488281, "logits/rejected": 11.166669845581055, "logps/chosen": -3.6646275520324707, "logps/rejected": -3.2763054370880127, "loss": 3.8411, "rewards/accuracies": 0.25, "rewards/chosen": -36.64627456665039, "rewards/margins": -3.8832221031188965, "rewards/rejected": -32.76305389404297, "step": 2314 }, { "epoch": 0.31522331154684097, "grad_norm": 39.30277334310339, "learning_rate": 6.923879852060729e-07, "logits/chosen": 11.20217514038086, "logits/rejected": 11.823407173156738, "logps/chosen": -3.192970037460327, "logps/rejected": -3.338876247406006, "loss": 4.1724, "rewards/accuracies": 0.75, "rewards/chosen": -31.92970085144043, "rewards/margins": 1.4590630531311035, "rewards/rejected": -33.388763427734375, "step": 2315 }, { "epoch": 0.315359477124183, "grad_norm": 38.42849222184308, "learning_rate": 6.92258198622763e-07, "logits/chosen": 9.805203437805176, "logits/rejected": 10.801366806030273, "logps/chosen": -3.4238932132720947, "logps/rejected": -3.638319492340088, "loss": 4.2796, "rewards/accuracies": 1.0, "rewards/chosen": -34.23893356323242, "rewards/margins": 2.14426326751709, "rewards/rejected": -36.38319396972656, "step": 2316 }, { "epoch": 0.3154956427015251, "grad_norm": 39.87151838437317, "learning_rate": 6.921283460012974e-07, "logits/chosen": 10.821053504943848, "logits/rejected": 12.328707695007324, "logps/chosen": -3.4733948707580566, "logps/rejected": -3.715944766998291, "loss": 3.7565, "rewards/accuracies": 0.75, "rewards/chosen": -34.733951568603516, "rewards/margins": 2.4254980087280273, "rewards/rejected": -37.159446716308594, "step": 2317 }, { "epoch": 0.3156318082788671, "grad_norm": 42.165814931725905, "learning_rate": 6.919984273710172e-07, "logits/chosen": 11.834783554077148, "logits/rejected": 11.32507038116455, "logps/chosen": -3.1875052452087402, "logps/rejected": -3.2665750980377197, "loss": 4.1748, "rewards/accuracies": 0.5, "rewards/chosen": -31.875051498413086, "rewards/margins": 0.7906994819641113, "rewards/rejected": -32.665748596191406, "step": 2318 }, { "epoch": 0.31576797385620914, "grad_norm": 54.038843330041324, "learning_rate": 6.918684427612787e-07, "logits/chosen": 10.553260803222656, "logits/rejected": 10.921028137207031, "logps/chosen": -3.5220272541046143, "logps/rejected": -3.571615695953369, "loss": 4.1038, "rewards/accuracies": 0.75, "rewards/chosen": -35.220272064208984, "rewards/margins": 0.49588489532470703, "rewards/rejected": -35.716156005859375, "step": 2319 }, { "epoch": 0.3159041394335512, "grad_norm": 42.62260315756899, "learning_rate": 6.917383922014527e-07, "logits/chosen": 9.972564697265625, "logits/rejected": 11.231966018676758, "logps/chosen": -3.0992610454559326, "logps/rejected": -3.4700417518615723, "loss": 4.1427, "rewards/accuracies": 0.75, "rewards/chosen": -30.992610931396484, "rewards/margins": 3.7078070640563965, "rewards/rejected": -34.700416564941406, "step": 2320 }, { "epoch": 0.31604030501089325, "grad_norm": 39.136355582317016, "learning_rate": 6.916082757209258e-07, "logits/chosen": 11.70619010925293, "logits/rejected": 11.263920783996582, "logps/chosen": -3.3930602073669434, "logps/rejected": -3.6592602729797363, "loss": 3.995, "rewards/accuracies": 0.75, "rewards/chosen": -33.93060302734375, "rewards/margins": 2.6620001792907715, "rewards/rejected": -36.59260177612305, "step": 2321 }, { "epoch": 0.3161764705882353, "grad_norm": 37.684231501190155, "learning_rate": 6.914780933490984e-07, "logits/chosen": 10.218915939331055, "logits/rejected": 9.894940376281738, "logps/chosen": -2.9924426078796387, "logps/rejected": -3.336045265197754, "loss": 3.7568, "rewards/accuracies": 0.75, "rewards/chosen": -29.924427032470703, "rewards/margins": 3.4360265731811523, "rewards/rejected": -33.360450744628906, "step": 2322 }, { "epoch": 0.31631263616557737, "grad_norm": 50.70808202592168, "learning_rate": 6.913478451153864e-07, "logits/chosen": 12.179767608642578, "logits/rejected": 12.084835052490234, "logps/chosen": -3.6003384590148926, "logps/rejected": -3.7407805919647217, "loss": 4.3968, "rewards/accuracies": 0.75, "rewards/chosen": -36.00338363647461, "rewards/margins": 1.4044227600097656, "rewards/rejected": -37.407806396484375, "step": 2323 }, { "epoch": 0.3164488017429194, "grad_norm": 49.29922029506639, "learning_rate": 6.912175310492205e-07, "logits/chosen": 10.707185745239258, "logits/rejected": 11.94497299194336, "logps/chosen": -3.406357765197754, "logps/rejected": -3.575786590576172, "loss": 4.3296, "rewards/accuracies": 0.5, "rewards/chosen": -34.063575744628906, "rewards/margins": 1.6942877769470215, "rewards/rejected": -35.75786590576172, "step": 2324 }, { "epoch": 0.3165849673202614, "grad_norm": 40.005448195921495, "learning_rate": 6.910871511800462e-07, "logits/chosen": 9.910520553588867, "logits/rejected": 10.618976593017578, "logps/chosen": -2.885197877883911, "logps/rejected": -3.164696216583252, "loss": 4.1141, "rewards/accuracies": 0.75, "rewards/chosen": -28.851978302001953, "rewards/margins": 2.794981002807617, "rewards/rejected": -31.646961212158203, "step": 2325 }, { "epoch": 0.3167211328976035, "grad_norm": 41.24460050206404, "learning_rate": 6.90956705537324e-07, "logits/chosen": 10.187028884887695, "logits/rejected": 10.17527961730957, "logps/chosen": -3.137608289718628, "logps/rejected": -3.049032211303711, "loss": 4.0601, "rewards/accuracies": 0.5, "rewards/chosen": -31.376083374023438, "rewards/margins": -0.8857593536376953, "rewards/rejected": -30.490324020385742, "step": 2326 }, { "epoch": 0.31685729847494554, "grad_norm": 42.640568892054475, "learning_rate": 6.90826194150529e-07, "logits/chosen": 10.888872146606445, "logits/rejected": 12.07917594909668, "logps/chosen": -3.403473138809204, "logps/rejected": -3.6765201091766357, "loss": 4.0797, "rewards/accuracies": 1.0, "rewards/chosen": -34.03472900390625, "rewards/margins": 2.730471611022949, "rewards/rejected": -36.765201568603516, "step": 2327 }, { "epoch": 0.31699346405228757, "grad_norm": 40.41142874159054, "learning_rate": 6.906956170491516e-07, "logits/chosen": 10.54513168334961, "logits/rejected": 11.244943618774414, "logps/chosen": -3.0858559608459473, "logps/rejected": -3.338388442993164, "loss": 3.6217, "rewards/accuracies": 1.0, "rewards/chosen": -30.858558654785156, "rewards/margins": 2.5253233909606934, "rewards/rejected": -33.383880615234375, "step": 2328 }, { "epoch": 0.31712962962962965, "grad_norm": 45.44342257846712, "learning_rate": 6.905649742626966e-07, "logits/chosen": 10.649478912353516, "logits/rejected": 10.926666259765625, "logps/chosen": -3.2894251346588135, "logps/rejected": -3.7046103477478027, "loss": 4.0642, "rewards/accuracies": 1.0, "rewards/chosen": -32.89425277709961, "rewards/margins": 4.151849269866943, "rewards/rejected": -37.04610061645508, "step": 2329 }, { "epoch": 0.3172657952069717, "grad_norm": 47.38403065281412, "learning_rate": 6.904342658206836e-07, "logits/chosen": 11.361635208129883, "logits/rejected": 11.711200714111328, "logps/chosen": -3.3114206790924072, "logps/rejected": -3.5591838359832764, "loss": 4.3283, "rewards/accuracies": 0.5, "rewards/chosen": -33.11420822143555, "rewards/margins": 2.4776296615600586, "rewards/rejected": -35.591835021972656, "step": 2330 }, { "epoch": 0.3174019607843137, "grad_norm": 40.78932112823565, "learning_rate": 6.903034917526478e-07, "logits/chosen": 11.07244873046875, "logits/rejected": 11.551156044006348, "logps/chosen": -3.2531046867370605, "logps/rejected": -3.5874712467193604, "loss": 3.9659, "rewards/accuracies": 0.75, "rewards/chosen": -32.53105163574219, "rewards/margins": 3.343663215637207, "rewards/rejected": -35.87471008300781, "step": 2331 }, { "epoch": 0.3175381263616558, "grad_norm": 38.75875204204925, "learning_rate": 6.901726520881382e-07, "logits/chosen": 10.521038055419922, "logits/rejected": 11.703425407409668, "logps/chosen": -3.2696847915649414, "logps/rejected": -3.9395039081573486, "loss": 3.6334, "rewards/accuracies": 1.0, "rewards/chosen": -32.69684600830078, "rewards/margins": 6.698193073272705, "rewards/rejected": -39.39503860473633, "step": 2332 }, { "epoch": 0.3176742919389978, "grad_norm": 40.94044053915832, "learning_rate": 6.900417468567193e-07, "logits/chosen": 10.896194458007812, "logits/rejected": 11.42146110534668, "logps/chosen": -3.1603498458862305, "logps/rejected": -3.5059781074523926, "loss": 4.0234, "rewards/accuracies": 0.75, "rewards/chosen": -31.603496551513672, "rewards/margins": 3.4562859535217285, "rewards/rejected": -35.059783935546875, "step": 2333 }, { "epoch": 0.31781045751633985, "grad_norm": 46.93552179873418, "learning_rate": 6.899107760879701e-07, "logits/chosen": 10.69473934173584, "logits/rejected": 11.595890045166016, "logps/chosen": -3.258601188659668, "logps/rejected": -3.664188861846924, "loss": 4.2749, "rewards/accuracies": 0.75, "rewards/chosen": -32.58601379394531, "rewards/margins": 4.055877685546875, "rewards/rejected": -36.64189147949219, "step": 2334 }, { "epoch": 0.31794662309368193, "grad_norm": 42.946361133779575, "learning_rate": 6.897797398114847e-07, "logits/chosen": 12.590612411499023, "logits/rejected": 12.056246757507324, "logps/chosen": -3.0719616413116455, "logps/rejected": -3.2518341541290283, "loss": 4.3503, "rewards/accuracies": 0.75, "rewards/chosen": -30.719615936279297, "rewards/margins": 1.79872465133667, "rewards/rejected": -32.518341064453125, "step": 2335 }, { "epoch": 0.31808278867102396, "grad_norm": 45.26158585476281, "learning_rate": 6.896486380568718e-07, "logits/chosen": 11.825996398925781, "logits/rejected": 11.754030227661133, "logps/chosen": -3.393876791000366, "logps/rejected": -3.4793825149536133, "loss": 4.1894, "rewards/accuracies": 0.5, "rewards/chosen": -33.93876647949219, "rewards/margins": 0.8550591468811035, "rewards/rejected": -34.7938232421875, "step": 2336 }, { "epoch": 0.318218954248366, "grad_norm": 43.32185152482134, "learning_rate": 6.895174708537548e-07, "logits/chosen": 11.349723815917969, "logits/rejected": 12.022860527038574, "logps/chosen": -3.255666971206665, "logps/rejected": -3.5577917098999023, "loss": 4.3106, "rewards/accuracies": 0.75, "rewards/chosen": -32.556671142578125, "rewards/margins": 3.0212478637695312, "rewards/rejected": -35.577919006347656, "step": 2337 }, { "epoch": 0.3183551198257081, "grad_norm": 39.85006562705887, "learning_rate": 6.893862382317721e-07, "logits/chosen": 11.872261047363281, "logits/rejected": 12.48193359375, "logps/chosen": -3.8092963695526123, "logps/rejected": -3.937371253967285, "loss": 3.6559, "rewards/accuracies": 0.5, "rewards/chosen": -38.09296417236328, "rewards/margins": 1.2807493209838867, "rewards/rejected": -39.373714447021484, "step": 2338 }, { "epoch": 0.3184912854030501, "grad_norm": 41.451747370701234, "learning_rate": 6.892549402205767e-07, "logits/chosen": 10.36070442199707, "logits/rejected": 9.64610481262207, "logps/chosen": -3.186577796936035, "logps/rejected": -3.149805784225464, "loss": 3.9735, "rewards/accuracies": 0.5, "rewards/chosen": -31.865779876708984, "rewards/margins": -0.3677215576171875, "rewards/rejected": -31.498058319091797, "step": 2339 }, { "epoch": 0.31862745098039214, "grad_norm": 45.31183662270688, "learning_rate": 6.891235768498367e-07, "logits/chosen": 12.287771224975586, "logits/rejected": 12.28370475769043, "logps/chosen": -3.9325592517852783, "logps/rejected": -3.900399923324585, "loss": 3.6862, "rewards/accuracies": 0.5, "rewards/chosen": -39.325592041015625, "rewards/margins": -0.3215923309326172, "rewards/rejected": -39.003997802734375, "step": 2340 }, { "epoch": 0.3187636165577342, "grad_norm": 43.82056959855168, "learning_rate": 6.889921481492346e-07, "logits/chosen": 10.632078170776367, "logits/rejected": 11.18873405456543, "logps/chosen": -3.453744649887085, "logps/rejected": -3.691084146499634, "loss": 4.1587, "rewards/accuracies": 0.75, "rewards/chosen": -34.537445068359375, "rewards/margins": 2.373394012451172, "rewards/rejected": -36.91084289550781, "step": 2341 }, { "epoch": 0.31889978213507625, "grad_norm": 40.399282839123124, "learning_rate": 6.888606541484677e-07, "logits/chosen": 11.128192901611328, "logits/rejected": 11.161646842956543, "logps/chosen": -3.395515203475952, "logps/rejected": -3.5057806968688965, "loss": 3.6752, "rewards/accuracies": 0.5, "rewards/chosen": -33.95515441894531, "rewards/margins": 1.1026530265808105, "rewards/rejected": -35.05780792236328, "step": 2342 }, { "epoch": 0.3190359477124183, "grad_norm": 43.9471214616044, "learning_rate": 6.887290948772482e-07, "logits/chosen": 11.047972679138184, "logits/rejected": 11.549041748046875, "logps/chosen": -3.118478298187256, "logps/rejected": -3.5807886123657227, "loss": 3.614, "rewards/accuracies": 1.0, "rewards/chosen": -31.184782028198242, "rewards/margins": 4.623104095458984, "rewards/rejected": -35.807884216308594, "step": 2343 }, { "epoch": 0.31917211328976036, "grad_norm": 41.91444391971926, "learning_rate": 6.885974703653032e-07, "logits/chosen": 10.654979705810547, "logits/rejected": 11.212791442871094, "logps/chosen": -3.527153491973877, "logps/rejected": -3.4962894916534424, "loss": 3.9749, "rewards/accuracies": 0.5, "rewards/chosen": -35.27153396606445, "rewards/margins": -0.3086404800415039, "rewards/rejected": -34.962894439697266, "step": 2344 }, { "epoch": 0.3193082788671024, "grad_norm": 47.34271894170932, "learning_rate": 6.88465780642374e-07, "logits/chosen": 11.449405670166016, "logits/rejected": 12.82888412475586, "logps/chosen": -3.395735502243042, "logps/rejected": -4.126121520996094, "loss": 4.3969, "rewards/accuracies": 1.0, "rewards/chosen": -33.95735549926758, "rewards/margins": 7.303862571716309, "rewards/rejected": -41.2612190246582, "step": 2345 }, { "epoch": 0.3194444444444444, "grad_norm": 42.549574211250324, "learning_rate": 6.883340257382174e-07, "logits/chosen": 11.992057800292969, "logits/rejected": 11.892217636108398, "logps/chosen": -3.4259731769561768, "logps/rejected": -3.58089017868042, "loss": 4.1842, "rewards/accuracies": 0.5, "rewards/chosen": -34.259735107421875, "rewards/margins": 1.549170970916748, "rewards/rejected": -35.808902740478516, "step": 2346 }, { "epoch": 0.3195806100217865, "grad_norm": 40.03448908725747, "learning_rate": 6.882022056826041e-07, "logits/chosen": 10.952400207519531, "logits/rejected": 11.436800956726074, "logps/chosen": -3.6255435943603516, "logps/rejected": -3.6541733741760254, "loss": 4.1602, "rewards/accuracies": 0.5, "rewards/chosen": -36.255435943603516, "rewards/margins": 0.2862977981567383, "rewards/rejected": -36.54173278808594, "step": 2347 }, { "epoch": 0.31971677559912853, "grad_norm": 39.41807819982724, "learning_rate": 6.880703205053203e-07, "logits/chosen": 11.670367240905762, "logits/rejected": 11.482526779174805, "logps/chosen": -3.4882113933563232, "logps/rejected": -3.6381723880767822, "loss": 4.0065, "rewards/accuracies": 0.5, "rewards/chosen": -34.882110595703125, "rewards/margins": 1.4996109008789062, "rewards/rejected": -36.3817253112793, "step": 2348 }, { "epoch": 0.31985294117647056, "grad_norm": 40.14867579327548, "learning_rate": 6.879383702361663e-07, "logits/chosen": 11.554887771606445, "logits/rejected": 11.411606788635254, "logps/chosen": -3.133960247039795, "logps/rejected": -3.310816764831543, "loss": 3.7316, "rewards/accuracies": 0.75, "rewards/chosen": -31.339603424072266, "rewards/margins": 1.7685651779174805, "rewards/rejected": -33.10816955566406, "step": 2349 }, { "epoch": 0.31998910675381265, "grad_norm": 50.88885640761666, "learning_rate": 6.878063549049573e-07, "logits/chosen": 12.369078636169434, "logits/rejected": 12.127238273620605, "logps/chosen": -3.579987049102783, "logps/rejected": -3.4192094802856445, "loss": 4.3847, "rewards/accuracies": 0.25, "rewards/chosen": -35.79987335205078, "rewards/margins": -1.6077756881713867, "rewards/rejected": -34.19209671020508, "step": 2350 }, { "epoch": 0.3201252723311547, "grad_norm": 45.28805815415206, "learning_rate": 6.876742745415235e-07, "logits/chosen": 11.68813705444336, "logits/rejected": 12.292601585388184, "logps/chosen": -3.7241644859313965, "logps/rejected": -3.7857253551483154, "loss": 3.861, "rewards/accuracies": 0.25, "rewards/chosen": -37.24164581298828, "rewards/margins": 0.6156101226806641, "rewards/rejected": -37.85725402832031, "step": 2351 }, { "epoch": 0.3202614379084967, "grad_norm": 43.60947263454372, "learning_rate": 6.875421291757094e-07, "logits/chosen": 11.602420806884766, "logits/rejected": 12.500064849853516, "logps/chosen": -3.6990177631378174, "logps/rejected": -3.792402744293213, "loss": 4.1057, "rewards/accuracies": 0.5, "rewards/chosen": -36.990177154541016, "rewards/margins": 0.9338474273681641, "rewards/rejected": -37.92402267456055, "step": 2352 }, { "epoch": 0.3203976034858388, "grad_norm": 45.71447932614288, "learning_rate": 6.874099188373743e-07, "logits/chosen": 11.718986511230469, "logits/rejected": 11.834516525268555, "logps/chosen": -3.4111502170562744, "logps/rejected": -3.5499911308288574, "loss": 3.804, "rewards/accuracies": 0.5, "rewards/chosen": -34.11150360107422, "rewards/margins": 1.3884105682373047, "rewards/rejected": -35.49991226196289, "step": 2353 }, { "epoch": 0.3205337690631808, "grad_norm": 49.07448760401416, "learning_rate": 6.872776435563924e-07, "logits/chosen": 11.310858726501465, "logits/rejected": 12.210070610046387, "logps/chosen": -3.3508987426757812, "logps/rejected": -3.463587760925293, "loss": 4.9683, "rewards/accuracies": 0.5, "rewards/chosen": -33.50898742675781, "rewards/margins": 1.12689208984375, "rewards/rejected": -34.63587951660156, "step": 2354 }, { "epoch": 0.3206699346405229, "grad_norm": 42.89824395232435, "learning_rate": 6.871453033626522e-07, "logits/chosen": 12.440805435180664, "logits/rejected": 11.988123893737793, "logps/chosen": -3.870303153991699, "logps/rejected": -3.81527042388916, "loss": 3.8387, "rewards/accuracies": 0.5, "rewards/chosen": -38.703033447265625, "rewards/margins": -0.5503273010253906, "rewards/rejected": -38.152706146240234, "step": 2355 }, { "epoch": 0.32080610021786493, "grad_norm": 41.924580715566904, "learning_rate": 6.870128982860573e-07, "logits/chosen": 10.990203857421875, "logits/rejected": 12.133970260620117, "logps/chosen": -3.121777057647705, "logps/rejected": -3.6249020099639893, "loss": 4.1671, "rewards/accuracies": 1.0, "rewards/chosen": -31.217771530151367, "rewards/margins": 5.031249523162842, "rewards/rejected": -36.249019622802734, "step": 2356 }, { "epoch": 0.32094226579520696, "grad_norm": 43.61058897157565, "learning_rate": 6.868804283565254e-07, "logits/chosen": 11.100814819335938, "logits/rejected": 11.312705039978027, "logps/chosen": -3.520352840423584, "logps/rejected": -3.5408427715301514, "loss": 4.5467, "rewards/accuracies": 0.5, "rewards/chosen": -35.203529357910156, "rewards/margins": 0.20489931106567383, "rewards/rejected": -35.40842819213867, "step": 2357 }, { "epoch": 0.32107843137254904, "grad_norm": 48.52836864123281, "learning_rate": 6.867478936039892e-07, "logits/chosen": 11.961849212646484, "logits/rejected": 11.723630905151367, "logps/chosen": -3.6982574462890625, "logps/rejected": -3.4207561016082764, "loss": 4.2359, "rewards/accuracies": 0.5, "rewards/chosen": -36.982574462890625, "rewards/margins": -2.7750139236450195, "rewards/rejected": -34.207557678222656, "step": 2358 }, { "epoch": 0.3212145969498911, "grad_norm": 39.583724985641325, "learning_rate": 6.866152940583964e-07, "logits/chosen": 11.367137908935547, "logits/rejected": 11.859065055847168, "logps/chosen": -3.6227238178253174, "logps/rejected": -3.6879630088806152, "loss": 3.7188, "rewards/accuracies": 0.5, "rewards/chosen": -36.22723388671875, "rewards/margins": 0.6523933410644531, "rewards/rejected": -36.87963104248047, "step": 2359 }, { "epoch": 0.3213507625272331, "grad_norm": 40.66002610477757, "learning_rate": 6.864826297497086e-07, "logits/chosen": 11.216262817382812, "logits/rejected": 11.643423080444336, "logps/chosen": -3.647845506668091, "logps/rejected": -3.7760794162750244, "loss": 4.001, "rewards/accuracies": 0.75, "rewards/chosen": -36.47845458984375, "rewards/margins": 1.2823381423950195, "rewards/rejected": -37.76079559326172, "step": 2360 }, { "epoch": 0.3214869281045752, "grad_norm": 44.82084796334831, "learning_rate": 6.863499007079026e-07, "logits/chosen": 10.757440567016602, "logits/rejected": 11.327428817749023, "logps/chosen": -3.1431946754455566, "logps/rejected": -3.5259323120117188, "loss": 3.7281, "rewards/accuracies": 1.0, "rewards/chosen": -31.43194580078125, "rewards/margins": 3.8273754119873047, "rewards/rejected": -35.25932312011719, "step": 2361 }, { "epoch": 0.3216230936819172, "grad_norm": 39.83322392204664, "learning_rate": 6.862171069629695e-07, "logits/chosen": 12.69648551940918, "logits/rejected": 11.804719924926758, "logps/chosen": -3.7121334075927734, "logps/rejected": -3.590579032897949, "loss": 4.249, "rewards/accuracies": 0.25, "rewards/chosen": -37.121334075927734, "rewards/margins": -1.2155447006225586, "rewards/rejected": -35.905792236328125, "step": 2362 }, { "epoch": 0.32175925925925924, "grad_norm": 39.96832829110954, "learning_rate": 6.860842485449153e-07, "logits/chosen": 11.753660202026367, "logits/rejected": 11.278791427612305, "logps/chosen": -3.567354202270508, "logps/rejected": -3.74301815032959, "loss": 4.3236, "rewards/accuracies": 0.75, "rewards/chosen": -35.67354202270508, "rewards/margins": 1.756638526916504, "rewards/rejected": -37.430179595947266, "step": 2363 }, { "epoch": 0.32189542483660133, "grad_norm": 38.41593332961221, "learning_rate": 6.859513254837601e-07, "logits/chosen": 11.883048057556152, "logits/rejected": 11.441818237304688, "logps/chosen": -3.4363627433776855, "logps/rejected": -3.394636631011963, "loss": 3.7282, "rewards/accuracies": 0.5, "rewards/chosen": -34.36362838745117, "rewards/margins": -0.41726207733154297, "rewards/rejected": -33.94636535644531, "step": 2364 }, { "epoch": 0.32203159041394336, "grad_norm": 41.34015401048053, "learning_rate": 6.858183378095394e-07, "logits/chosen": 10.886594772338867, "logits/rejected": 11.424335479736328, "logps/chosen": -3.6766910552978516, "logps/rejected": -3.610278844833374, "loss": 3.8471, "rewards/accuracies": 0.5, "rewards/chosen": -36.766910552978516, "rewards/margins": -0.6641197204589844, "rewards/rejected": -36.10279083251953, "step": 2365 }, { "epoch": 0.3221677559912854, "grad_norm": 42.08457046275275, "learning_rate": 6.856852855523026e-07, "logits/chosen": 11.352394104003906, "logits/rejected": 11.656889915466309, "logps/chosen": -3.5922691822052, "logps/rejected": -3.521047353744507, "loss": 4.0749, "rewards/accuracies": 0.5, "rewards/chosen": -35.922691345214844, "rewards/margins": -0.71221923828125, "rewards/rejected": -35.210472106933594, "step": 2366 }, { "epoch": 0.32230392156862747, "grad_norm": 41.61593623654829, "learning_rate": 6.855521687421141e-07, "logits/chosen": 11.271347045898438, "logits/rejected": 11.65969181060791, "logps/chosen": -3.3294267654418945, "logps/rejected": -3.4774153232574463, "loss": 4.6095, "rewards/accuracies": 0.5, "rewards/chosen": -33.29426574707031, "rewards/margins": 1.4798851013183594, "rewards/rejected": -34.77415466308594, "step": 2367 }, { "epoch": 0.3224400871459695, "grad_norm": 39.08293030881015, "learning_rate": 6.854189874090525e-07, "logits/chosen": 10.96731185913086, "logits/rejected": 11.2371244430542, "logps/chosen": -3.3647570610046387, "logps/rejected": -3.594632625579834, "loss": 3.7562, "rewards/accuracies": 1.0, "rewards/chosen": -33.6475715637207, "rewards/margins": 2.298758029937744, "rewards/rejected": -35.946327209472656, "step": 2368 }, { "epoch": 0.32257625272331153, "grad_norm": 63.29984774919996, "learning_rate": 6.852857415832117e-07, "logits/chosen": 11.975896835327148, "logits/rejected": 11.637129783630371, "logps/chosen": -3.5368738174438477, "logps/rejected": -3.6704835891723633, "loss": 3.8119, "rewards/accuracies": 0.25, "rewards/chosen": -35.368736267089844, "rewards/margins": 1.3360986709594727, "rewards/rejected": -36.704837799072266, "step": 2369 }, { "epoch": 0.3227124183006536, "grad_norm": 42.70764881491124, "learning_rate": 6.851524312946992e-07, "logits/chosen": 11.216039657592773, "logits/rejected": 10.946897506713867, "logps/chosen": -3.2443742752075195, "logps/rejected": -3.1987698078155518, "loss": 3.938, "rewards/accuracies": 0.5, "rewards/chosen": -32.44374084472656, "rewards/margins": -0.4560427665710449, "rewards/rejected": -31.98769760131836, "step": 2370 }, { "epoch": 0.32284858387799564, "grad_norm": 42.04969459325826, "learning_rate": 6.850190565736378e-07, "logits/chosen": 12.350677490234375, "logits/rejected": 11.447179794311523, "logps/chosen": -3.9431891441345215, "logps/rejected": -3.565371036529541, "loss": 4.0651, "rewards/accuracies": 0.0, "rewards/chosen": -39.43189239501953, "rewards/margins": -3.778183937072754, "rewards/rejected": -35.653709411621094, "step": 2371 }, { "epoch": 0.32298474945533767, "grad_norm": 41.69973592964161, "learning_rate": 6.848856174501645e-07, "logits/chosen": 10.49169921875, "logits/rejected": 11.355247497558594, "logps/chosen": -3.50498366355896, "logps/rejected": -3.7129578590393066, "loss": 4.2613, "rewards/accuracies": 1.0, "rewards/chosen": -35.049835205078125, "rewards/margins": 2.0797410011291504, "rewards/rejected": -37.12957763671875, "step": 2372 }, { "epoch": 0.32312091503267976, "grad_norm": 40.9809683481764, "learning_rate": 6.84752113954431e-07, "logits/chosen": 10.95818042755127, "logits/rejected": 11.153136253356934, "logps/chosen": -3.848612070083618, "logps/rejected": -4.170214653015137, "loss": 3.3844, "rewards/accuracies": 0.75, "rewards/chosen": -38.486122131347656, "rewards/margins": 3.216024398803711, "rewards/rejected": -41.702144622802734, "step": 2373 }, { "epoch": 0.3232570806100218, "grad_norm": 45.57704082000626, "learning_rate": 6.846185461166036e-07, "logits/chosen": 11.28590202331543, "logits/rejected": 11.868470191955566, "logps/chosen": -3.34877872467041, "logps/rejected": -3.5082530975341797, "loss": 4.475, "rewards/accuracies": 0.5, "rewards/chosen": -33.487789154052734, "rewards/margins": 1.5947442054748535, "rewards/rejected": -35.08253479003906, "step": 2374 }, { "epoch": 0.3233932461873638, "grad_norm": 42.75433486525921, "learning_rate": 6.844849139668632e-07, "logits/chosen": 13.061320304870605, "logits/rejected": 12.015010833740234, "logps/chosen": -3.6196608543395996, "logps/rejected": -3.8034753799438477, "loss": 4.0369, "rewards/accuracies": 0.75, "rewards/chosen": -36.19660568237305, "rewards/margins": 1.8381500244140625, "rewards/rejected": -38.034751892089844, "step": 2375 }, { "epoch": 0.3235294117647059, "grad_norm": 42.873175623397074, "learning_rate": 6.843512175354048e-07, "logits/chosen": 11.235340118408203, "logits/rejected": 11.647529602050781, "logps/chosen": -3.5607855319976807, "logps/rejected": -3.747772216796875, "loss": 4.7295, "rewards/accuracies": 0.5, "rewards/chosen": -35.60785675048828, "rewards/margins": 1.8698644638061523, "rewards/rejected": -37.47772216796875, "step": 2376 }, { "epoch": 0.3236655773420479, "grad_norm": 45.10930336841628, "learning_rate": 6.842174568524382e-07, "logits/chosen": 11.308271408081055, "logits/rejected": 11.507012367248535, "logps/chosen": -3.4800338745117188, "logps/rejected": -3.6763601303100586, "loss": 4.5916, "rewards/accuracies": 0.75, "rewards/chosen": -34.80033874511719, "rewards/margins": 1.9632644653320312, "rewards/rejected": -36.76360321044922, "step": 2377 }, { "epoch": 0.32380174291938996, "grad_norm": 40.94429325691772, "learning_rate": 6.840836319481882e-07, "logits/chosen": 10.592866897583008, "logits/rejected": 10.915935516357422, "logps/chosen": -3.5267322063446045, "logps/rejected": -3.6489453315734863, "loss": 4.1689, "rewards/accuracies": 0.75, "rewards/chosen": -35.2673225402832, "rewards/margins": 1.2221317291259766, "rewards/rejected": -36.48945617675781, "step": 2378 }, { "epoch": 0.32393790849673204, "grad_norm": 45.38570770734257, "learning_rate": 6.839497428528931e-07, "logits/chosen": 11.521834373474121, "logits/rejected": 11.242134094238281, "logps/chosen": -3.4607503414154053, "logps/rejected": -3.429048538208008, "loss": 4.1354, "rewards/accuracies": 0.25, "rewards/chosen": -34.607505798339844, "rewards/margins": -0.3170166015625, "rewards/rejected": -34.29048538208008, "step": 2379 }, { "epoch": 0.32407407407407407, "grad_norm": 39.99147769825564, "learning_rate": 6.838157895968064e-07, "logits/chosen": 10.701888084411621, "logits/rejected": 10.956001281738281, "logps/chosen": -3.205796480178833, "logps/rejected": -3.586251735687256, "loss": 4.1297, "rewards/accuracies": 0.75, "rewards/chosen": -32.05796813964844, "rewards/margins": 3.804551601409912, "rewards/rejected": -35.862518310546875, "step": 2380 }, { "epoch": 0.3242102396514161, "grad_norm": 46.401752314608714, "learning_rate": 6.836817722101961e-07, "logits/chosen": 11.368806838989258, "logits/rejected": 10.508634567260742, "logps/chosen": -3.6340384483337402, "logps/rejected": -3.1914591789245605, "loss": 4.4032, "rewards/accuracies": 0.25, "rewards/chosen": -36.34038543701172, "rewards/margins": -4.425791263580322, "rewards/rejected": -31.914592742919922, "step": 2381 }, { "epoch": 0.3243464052287582, "grad_norm": 40.85220453505148, "learning_rate": 6.835476907233443e-07, "logits/chosen": 11.310043334960938, "logits/rejected": 11.494424819946289, "logps/chosen": -3.348733425140381, "logps/rejected": -3.7455813884735107, "loss": 4.0979, "rewards/accuracies": 1.0, "rewards/chosen": -33.48733139038086, "rewards/margins": 3.9684815406799316, "rewards/rejected": -37.455814361572266, "step": 2382 }, { "epoch": 0.3244825708061002, "grad_norm": 39.72405284090897, "learning_rate": 6.83413545166548e-07, "logits/chosen": 10.545862197875977, "logits/rejected": 10.778457641601562, "logps/chosen": -3.437074661254883, "logps/rejected": -3.4052658081054688, "loss": 3.6611, "rewards/accuracies": 0.5, "rewards/chosen": -34.370750427246094, "rewards/margins": -0.3180885314941406, "rewards/rejected": -34.05266189575195, "step": 2383 }, { "epoch": 0.32461873638344224, "grad_norm": 38.28162155289642, "learning_rate": 6.832793355701184e-07, "logits/chosen": 11.8369722366333, "logits/rejected": 11.7865571975708, "logps/chosen": -3.2653799057006836, "logps/rejected": -3.6207008361816406, "loss": 4.046, "rewards/accuracies": 0.75, "rewards/chosen": -32.65380096435547, "rewards/margins": 3.5532102584838867, "rewards/rejected": -36.207008361816406, "step": 2384 }, { "epoch": 0.3247549019607843, "grad_norm": 40.21499979851678, "learning_rate": 6.831450619643815e-07, "logits/chosen": 11.411944389343262, "logits/rejected": 11.053791046142578, "logps/chosen": -3.3959271907806396, "logps/rejected": -3.3378183841705322, "loss": 4.3649, "rewards/accuracies": 0.25, "rewards/chosen": -33.95927047729492, "rewards/margins": -0.5810880661010742, "rewards/rejected": -33.37818145751953, "step": 2385 }, { "epoch": 0.32489106753812635, "grad_norm": 37.767471823089814, "learning_rate": 6.830107243796771e-07, "logits/chosen": 11.978546142578125, "logits/rejected": 12.447868347167969, "logps/chosen": -3.4679505825042725, "logps/rejected": -3.5036754608154297, "loss": 4.0355, "rewards/accuracies": 0.5, "rewards/chosen": -34.67950439453125, "rewards/margins": 0.3572506904602051, "rewards/rejected": -35.0367546081543, "step": 2386 }, { "epoch": 0.3250272331154684, "grad_norm": 37.73527699595528, "learning_rate": 6.828763228463603e-07, "logits/chosen": 10.73065185546875, "logits/rejected": 11.567756652832031, "logps/chosen": -3.520336627960205, "logps/rejected": -3.6035780906677246, "loss": 3.8455, "rewards/accuracies": 0.5, "rewards/chosen": -35.203369140625, "rewards/margins": 0.8324146270751953, "rewards/rejected": -36.03578186035156, "step": 2387 }, { "epoch": 0.32516339869281047, "grad_norm": 44.60717352961967, "learning_rate": 6.827418573948001e-07, "logits/chosen": 11.83858871459961, "logits/rejected": 11.728196144104004, "logps/chosen": -3.4519190788269043, "logps/rejected": -3.7400383949279785, "loss": 3.9877, "rewards/accuracies": 1.0, "rewards/chosen": -34.51919174194336, "rewards/margins": 2.8811917304992676, "rewards/rejected": -37.40038299560547, "step": 2388 }, { "epoch": 0.3252995642701525, "grad_norm": 38.87358794432173, "learning_rate": 6.826073280553799e-07, "logits/chosen": 11.097030639648438, "logits/rejected": 12.086885452270508, "logps/chosen": -3.1467807292938232, "logps/rejected": -3.6258645057678223, "loss": 4.4561, "rewards/accuracies": 1.0, "rewards/chosen": -31.46780776977539, "rewards/margins": 4.790838718414307, "rewards/rejected": -36.258644104003906, "step": 2389 }, { "epoch": 0.3254357298474945, "grad_norm": 37.953836579499594, "learning_rate": 6.824727348584981e-07, "logits/chosen": 11.261378288269043, "logits/rejected": 10.594104766845703, "logps/chosen": -3.4009792804718018, "logps/rejected": -3.426232099533081, "loss": 4.3484, "rewards/accuracies": 0.5, "rewards/chosen": -34.00979232788086, "rewards/margins": 0.25252819061279297, "rewards/rejected": -34.26232147216797, "step": 2390 }, { "epoch": 0.3255718954248366, "grad_norm": 64.59872508428289, "learning_rate": 6.823380778345667e-07, "logits/chosen": 11.84058952331543, "logits/rejected": 11.056253433227539, "logps/chosen": -3.4424660205841064, "logps/rejected": -3.2614493370056152, "loss": 3.9981, "rewards/accuracies": 0.0, "rewards/chosen": -34.424659729003906, "rewards/margins": -1.810166358947754, "rewards/rejected": -32.61449432373047, "step": 2391 }, { "epoch": 0.32570806100217864, "grad_norm": 46.62123744204558, "learning_rate": 6.822033570140129e-07, "logits/chosen": 11.162405967712402, "logits/rejected": 11.993002891540527, "logps/chosen": -3.4113364219665527, "logps/rejected": -4.052535057067871, "loss": 4.293, "rewards/accuracies": 0.75, "rewards/chosen": -34.113365173339844, "rewards/margins": 6.411988258361816, "rewards/rejected": -40.525352478027344, "step": 2392 }, { "epoch": 0.3258442265795207, "grad_norm": 40.630672089595755, "learning_rate": 6.820685724272779e-07, "logits/chosen": 11.110198974609375, "logits/rejected": 11.820098876953125, "logps/chosen": -3.4847793579101562, "logps/rejected": -3.7138354778289795, "loss": 3.8123, "rewards/accuracies": 0.75, "rewards/chosen": -34.84779357910156, "rewards/margins": 2.2905588150024414, "rewards/rejected": -37.13835144042969, "step": 2393 }, { "epoch": 0.32598039215686275, "grad_norm": 38.5625715128437, "learning_rate": 6.819337241048172e-07, "logits/chosen": 10.552696228027344, "logits/rejected": 11.371414184570312, "logps/chosen": -3.2290894985198975, "logps/rejected": -3.8210785388946533, "loss": 3.7423, "rewards/accuracies": 1.0, "rewards/chosen": -32.2908935546875, "rewards/margins": 5.9198899269104, "rewards/rejected": -38.210784912109375, "step": 2394 }, { "epoch": 0.3261165577342048, "grad_norm": 41.77997288772512, "learning_rate": 6.817988120771012e-07, "logits/chosen": 11.324529647827148, "logits/rejected": 11.745902061462402, "logps/chosen": -3.8672873973846436, "logps/rejected": -3.7255818843841553, "loss": 4.0349, "rewards/accuracies": 0.5, "rewards/chosen": -38.672874450683594, "rewards/margins": -1.4170527458190918, "rewards/rejected": -37.255821228027344, "step": 2395 }, { "epoch": 0.32625272331154687, "grad_norm": 43.463137563599986, "learning_rate": 6.816638363746142e-07, "logits/chosen": 11.745138168334961, "logits/rejected": 11.544002532958984, "logps/chosen": -3.673074960708618, "logps/rejected": -3.859935760498047, "loss": 4.164, "rewards/accuracies": 0.5, "rewards/chosen": -36.730751037597656, "rewards/margins": 1.8686046600341797, "rewards/rejected": -38.59935760498047, "step": 2396 }, { "epoch": 0.3263888888888889, "grad_norm": 44.332913893107424, "learning_rate": 6.81528797027855e-07, "logits/chosen": 12.157913208007812, "logits/rejected": 12.607178688049316, "logps/chosen": -3.5875630378723145, "logps/rejected": -3.775629997253418, "loss": 3.4464, "rewards/accuracies": 0.75, "rewards/chosen": -35.87562942504883, "rewards/margins": 1.880671501159668, "rewards/rejected": -37.75630187988281, "step": 2397 }, { "epoch": 0.3265250544662309, "grad_norm": 43.153540856525595, "learning_rate": 6.81393694067337e-07, "logits/chosen": 11.903388977050781, "logits/rejected": 12.63357162475586, "logps/chosen": -3.671924352645874, "logps/rejected": -4.0439558029174805, "loss": 4.2869, "rewards/accuracies": 0.75, "rewards/chosen": -36.719242095947266, "rewards/margins": 3.7203140258789062, "rewards/rejected": -40.43955612182617, "step": 2398 }, { "epoch": 0.326661220043573, "grad_norm": 55.26632655993975, "learning_rate": 6.81258527523588e-07, "logits/chosen": 11.983844757080078, "logits/rejected": 12.046805381774902, "logps/chosen": -3.7293829917907715, "logps/rejected": -3.779658794403076, "loss": 4.2193, "rewards/accuracies": 0.25, "rewards/chosen": -37.29383087158203, "rewards/margins": 0.5027570724487305, "rewards/rejected": -37.79658508300781, "step": 2399 }, { "epoch": 0.32679738562091504, "grad_norm": 45.03206978869794, "learning_rate": 6.811232974271496e-07, "logits/chosen": 11.520050048828125, "logits/rejected": 12.442506790161133, "logps/chosen": -3.418046236038208, "logps/rejected": -3.715430974960327, "loss": 4.3358, "rewards/accuracies": 0.75, "rewards/chosen": -34.18046188354492, "rewards/margins": 2.973846435546875, "rewards/rejected": -37.15431213378906, "step": 2400 }, { "epoch": 0.32693355119825707, "grad_norm": 43.23815259712753, "learning_rate": 6.809880038085784e-07, "logits/chosen": 11.258978843688965, "logits/rejected": 11.684212684631348, "logps/chosen": -3.3491878509521484, "logps/rejected": -3.771397829055786, "loss": 3.6336, "rewards/accuracies": 0.75, "rewards/chosen": -33.49187469482422, "rewards/margins": 4.222101211547852, "rewards/rejected": -37.71398162841797, "step": 2401 }, { "epoch": 0.32706971677559915, "grad_norm": 41.07978578678318, "learning_rate": 6.808526466984451e-07, "logits/chosen": 11.602285385131836, "logits/rejected": 11.346294403076172, "logps/chosen": -3.295441150665283, "logps/rejected": -3.221951723098755, "loss": 4.3409, "rewards/accuracies": 0.5, "rewards/chosen": -32.954410552978516, "rewards/margins": -0.734893798828125, "rewards/rejected": -32.21951675415039, "step": 2402 }, { "epoch": 0.3272058823529412, "grad_norm": 41.63275019023505, "learning_rate": 6.807172261273347e-07, "logits/chosen": 11.439815521240234, "logits/rejected": 11.521604537963867, "logps/chosen": -3.6866326332092285, "logps/rejected": -3.6949994564056396, "loss": 4.0673, "rewards/accuracies": 0.75, "rewards/chosen": -36.866329193115234, "rewards/margins": 0.0836648941040039, "rewards/rejected": -36.94999313354492, "step": 2403 }, { "epoch": 0.3273420479302832, "grad_norm": 38.25143018472794, "learning_rate": 6.805817421258467e-07, "logits/chosen": 10.747917175292969, "logits/rejected": 11.649066925048828, "logps/chosen": -3.4921774864196777, "logps/rejected": -3.7670178413391113, "loss": 3.6354, "rewards/accuracies": 1.0, "rewards/chosen": -34.921775817871094, "rewards/margins": 2.7484049797058105, "rewards/rejected": -37.67018127441406, "step": 2404 }, { "epoch": 0.3274782135076253, "grad_norm": 57.49280180972426, "learning_rate": 6.804461947245947e-07, "logits/chosen": 11.117101669311523, "logits/rejected": 11.852375030517578, "logps/chosen": -3.7015318870544434, "logps/rejected": -3.89644718170166, "loss": 3.3035, "rewards/accuracies": 1.0, "rewards/chosen": -37.01531982421875, "rewards/margins": 1.9491539001464844, "rewards/rejected": -38.964473724365234, "step": 2405 }, { "epoch": 0.3276143790849673, "grad_norm": 47.02687315465602, "learning_rate": 6.803105839542068e-07, "logits/chosen": 12.089225769042969, "logits/rejected": 11.587814331054688, "logps/chosen": -3.508103370666504, "logps/rejected": -3.63144588470459, "loss": 4.1504, "rewards/accuracies": 0.5, "rewards/chosen": -35.081031799316406, "rewards/margins": 1.2334232330322266, "rewards/rejected": -36.314456939697266, "step": 2406 }, { "epoch": 0.32775054466230935, "grad_norm": 44.90433578828498, "learning_rate": 6.801749098453253e-07, "logits/chosen": 11.900941848754883, "logits/rejected": 12.809800148010254, "logps/chosen": -3.28296160697937, "logps/rejected": -4.0270609855651855, "loss": 3.6455, "rewards/accuracies": 1.0, "rewards/chosen": -32.829612731933594, "rewards/margins": 7.440994739532471, "rewards/rejected": -40.27061080932617, "step": 2407 }, { "epoch": 0.32788671023965144, "grad_norm": 40.55168129276713, "learning_rate": 6.800391724286072e-07, "logits/chosen": 11.241668701171875, "logits/rejected": 10.254262924194336, "logps/chosen": -3.7762985229492188, "logps/rejected": -3.47590970993042, "loss": 4.0975, "rewards/accuracies": 0.25, "rewards/chosen": -37.76298522949219, "rewards/margins": -3.0038881301879883, "rewards/rejected": -34.759098052978516, "step": 2408 }, { "epoch": 0.32802287581699346, "grad_norm": 37.49750433902986, "learning_rate": 6.799033717347229e-07, "logits/chosen": 10.811540603637695, "logits/rejected": 11.418122291564941, "logps/chosen": -3.4599616527557373, "logps/rejected": -3.3245387077331543, "loss": 4.4151, "rewards/accuracies": 0.0, "rewards/chosen": -34.59961700439453, "rewards/margins": -1.354231834411621, "rewards/rejected": -33.245384216308594, "step": 2409 }, { "epoch": 0.3281590413943355, "grad_norm": 43.08449594541141, "learning_rate": 6.797675077943583e-07, "logits/chosen": 11.56033706665039, "logits/rejected": 11.837369918823242, "logps/chosen": -3.509448528289795, "logps/rejected": -3.7949135303497314, "loss": 3.9311, "rewards/accuracies": 0.75, "rewards/chosen": -35.094482421875, "rewards/margins": 2.854654312133789, "rewards/rejected": -37.949134826660156, "step": 2410 }, { "epoch": 0.3282952069716776, "grad_norm": 39.85080473921211, "learning_rate": 6.796315806382129e-07, "logits/chosen": 10.997244834899902, "logits/rejected": 11.95372486114502, "logps/chosen": -3.44819712638855, "logps/rejected": -3.818420886993408, "loss": 4.102, "rewards/accuracies": 1.0, "rewards/chosen": -34.481971740722656, "rewards/margins": 3.702238082885742, "rewards/rejected": -38.18421173095703, "step": 2411 }, { "epoch": 0.3284313725490196, "grad_norm": 47.58447585076322, "learning_rate": 6.794955902970001e-07, "logits/chosen": 11.943979263305664, "logits/rejected": 12.240642547607422, "logps/chosen": -3.4453482627868652, "logps/rejected": -3.9206199645996094, "loss": 3.6952, "rewards/accuracies": 0.75, "rewards/chosen": -34.4534797668457, "rewards/margins": 4.752718448638916, "rewards/rejected": -39.206199645996094, "step": 2412 }, { "epoch": 0.32856753812636164, "grad_norm": 43.77824575058046, "learning_rate": 6.793595368014485e-07, "logits/chosen": 11.70999813079834, "logits/rejected": 10.886022567749023, "logps/chosen": -3.6893882751464844, "logps/rejected": -3.472493886947632, "loss": 3.6434, "rewards/accuracies": 0.25, "rewards/chosen": -36.893882751464844, "rewards/margins": -2.1689438819885254, "rewards/rejected": -34.724937438964844, "step": 2413 }, { "epoch": 0.3287037037037037, "grad_norm": 40.926436030234534, "learning_rate": 6.792234201823003e-07, "logits/chosen": 11.082849502563477, "logits/rejected": 12.511518478393555, "logps/chosen": -3.0513594150543213, "logps/rejected": -3.5327706336975098, "loss": 4.2073, "rewards/accuracies": 1.0, "rewards/chosen": -30.513593673706055, "rewards/margins": 4.81411075592041, "rewards/rejected": -35.32770538330078, "step": 2414 }, { "epoch": 0.32883986928104575, "grad_norm": 39.91537869945157, "learning_rate": 6.790872404703122e-07, "logits/chosen": 11.241777420043945, "logits/rejected": 12.642398834228516, "logps/chosen": -3.4735913276672363, "logps/rejected": -3.78196382522583, "loss": 3.8015, "rewards/accuracies": 0.75, "rewards/chosen": -34.73591613769531, "rewards/margins": 3.0837230682373047, "rewards/rejected": -37.819637298583984, "step": 2415 }, { "epoch": 0.3289760348583878, "grad_norm": 39.17519116060922, "learning_rate": 6.789509976962553e-07, "logits/chosen": 11.302196502685547, "logits/rejected": 11.67795181274414, "logps/chosen": -3.1527514457702637, "logps/rejected": -3.5050530433654785, "loss": 3.8293, "rewards/accuracies": 0.75, "rewards/chosen": -31.52751350402832, "rewards/margins": 3.5230178833007812, "rewards/rejected": -35.05052947998047, "step": 2416 }, { "epoch": 0.32911220043572986, "grad_norm": 45.677959705024556, "learning_rate": 6.788146918909144e-07, "logits/chosen": 10.972814559936523, "logits/rejected": 11.802850723266602, "logps/chosen": -3.230449914932251, "logps/rejected": -3.469957113265991, "loss": 4.9208, "rewards/accuracies": 0.75, "rewards/chosen": -32.304500579833984, "rewards/margins": 2.3950719833374023, "rewards/rejected": -34.69956970214844, "step": 2417 }, { "epoch": 0.3292483660130719, "grad_norm": 39.58030033388627, "learning_rate": 6.786783230850892e-07, "logits/chosen": 11.509589195251465, "logits/rejected": 11.473699569702148, "logps/chosen": -3.529569625854492, "logps/rejected": -3.6584980487823486, "loss": 4.1973, "rewards/accuracies": 0.5, "rewards/chosen": -35.29570007324219, "rewards/margins": 1.2892827987670898, "rewards/rejected": -36.58498001098633, "step": 2418 }, { "epoch": 0.3293845315904139, "grad_norm": 37.404697444218016, "learning_rate": 6.785418913095935e-07, "logits/chosen": 11.741466522216797, "logits/rejected": 11.615167617797852, "logps/chosen": -3.0871806144714355, "logps/rejected": -3.265549898147583, "loss": 3.9317, "rewards/accuracies": 0.75, "rewards/chosen": -30.87180519104004, "rewards/margins": 1.7836933135986328, "rewards/rejected": -32.65549850463867, "step": 2419 }, { "epoch": 0.329520697167756, "grad_norm": 47.202701370166395, "learning_rate": 6.784053965952549e-07, "logits/chosen": 11.672205924987793, "logits/rejected": 12.032756805419922, "logps/chosen": -3.1980907917022705, "logps/rejected": -3.355001926422119, "loss": 4.3851, "rewards/accuracies": 0.5, "rewards/chosen": -31.98090934753418, "rewards/margins": 1.5691094398498535, "rewards/rejected": -33.550018310546875, "step": 2420 }, { "epoch": 0.32965686274509803, "grad_norm": 41.97793237658859, "learning_rate": 6.782688389729156e-07, "logits/chosen": 11.593660354614258, "logits/rejected": 11.163202285766602, "logps/chosen": -3.4641852378845215, "logps/rejected": -3.0550971031188965, "loss": 3.9348, "rewards/accuracies": 0.5, "rewards/chosen": -34.64185333251953, "rewards/margins": -4.090880393981934, "rewards/rejected": -30.55097198486328, "step": 2421 }, { "epoch": 0.32979302832244006, "grad_norm": 41.91508055685674, "learning_rate": 6.781322184734319e-07, "logits/chosen": 10.912026405334473, "logits/rejected": 12.099712371826172, "logps/chosen": -2.9746856689453125, "logps/rejected": -3.3740122318267822, "loss": 4.0697, "rewards/accuracies": 1.0, "rewards/chosen": -29.746856689453125, "rewards/margins": 3.9932632446289062, "rewards/rejected": -33.74011993408203, "step": 2422 }, { "epoch": 0.32992919389978215, "grad_norm": 38.20151944268891, "learning_rate": 6.779955351276746e-07, "logits/chosen": 12.106138229370117, "logits/rejected": 11.72607421875, "logps/chosen": -3.740673542022705, "logps/rejected": -3.853999614715576, "loss": 4.2237, "rewards/accuracies": 0.75, "rewards/chosen": -37.406734466552734, "rewards/margins": 1.1332635879516602, "rewards/rejected": -38.540000915527344, "step": 2423 }, { "epoch": 0.3300653594771242, "grad_norm": 37.64788363457126, "learning_rate": 6.77858788966528e-07, "logits/chosen": 11.724428176879883, "logits/rejected": 11.731882095336914, "logps/chosen": -3.325678825378418, "logps/rejected": -3.4700100421905518, "loss": 4.3366, "rewards/accuracies": 0.5, "rewards/chosen": -33.25679016113281, "rewards/margins": 1.4433112144470215, "rewards/rejected": -34.70009994506836, "step": 2424 }, { "epoch": 0.3302015250544662, "grad_norm": 37.281548759571905, "learning_rate": 6.777219800208913e-07, "logits/chosen": 12.087224960327148, "logits/rejected": 11.855720520019531, "logps/chosen": -3.194155693054199, "logps/rejected": -3.4517624378204346, "loss": 3.8669, "rewards/accuracies": 0.75, "rewards/chosen": -31.941558837890625, "rewards/margins": 2.5760674476623535, "rewards/rejected": -34.51762390136719, "step": 2425 }, { "epoch": 0.3303376906318083, "grad_norm": 40.43096481600504, "learning_rate": 6.775851083216773e-07, "logits/chosen": 12.157466888427734, "logits/rejected": 12.603361129760742, "logps/chosen": -3.19268798828125, "logps/rejected": -3.3647067546844482, "loss": 4.0866, "rewards/accuracies": 0.75, "rewards/chosen": -31.926877975463867, "rewards/margins": 1.7201886177062988, "rewards/rejected": -33.64706802368164, "step": 2426 }, { "epoch": 0.3304738562091503, "grad_norm": 42.786819458221565, "learning_rate": 6.774481738998138e-07, "logits/chosen": 11.054354667663574, "logits/rejected": 12.4892578125, "logps/chosen": -3.343379497528076, "logps/rejected": -3.8127059936523438, "loss": 4.0246, "rewards/accuracies": 0.75, "rewards/chosen": -33.43379211425781, "rewards/margins": 4.693265914916992, "rewards/rejected": -38.12705993652344, "step": 2427 }, { "epoch": 0.33061002178649235, "grad_norm": 42.77965287196165, "learning_rate": 6.77311176786242e-07, "logits/chosen": 11.209321975708008, "logits/rejected": 12.406015396118164, "logps/chosen": -3.5089850425720215, "logps/rejected": -3.405940532684326, "loss": 4.7234, "rewards/accuracies": 0.5, "rewards/chosen": -35.08985137939453, "rewards/margins": -1.0304465293884277, "rewards/rejected": -34.05940246582031, "step": 2428 }, { "epoch": 0.33074618736383443, "grad_norm": 45.038073343906625, "learning_rate": 6.771741170119174e-07, "logits/chosen": 11.859498977661133, "logits/rejected": 11.806922912597656, "logps/chosen": -3.2933831214904785, "logps/rejected": -3.783339262008667, "loss": 2.9494, "rewards/accuracies": 1.0, "rewards/chosen": -32.93383026123047, "rewards/margins": 4.899560928344727, "rewards/rejected": -37.83338928222656, "step": 2429 }, { "epoch": 0.33088235294117646, "grad_norm": 40.56200155508728, "learning_rate": 6.7703699460781e-07, "logits/chosen": 12.136358261108398, "logits/rejected": 12.746565818786621, "logps/chosen": -3.4439730644226074, "logps/rejected": -3.584362506866455, "loss": 4.0915, "rewards/accuracies": 0.75, "rewards/chosen": -34.43973159790039, "rewards/margins": 1.4038944244384766, "rewards/rejected": -35.8436279296875, "step": 2430 }, { "epoch": 0.33101851851851855, "grad_norm": 42.5742018545562, "learning_rate": 6.768998096049037e-07, "logits/chosen": 11.893013000488281, "logits/rejected": 11.661799430847168, "logps/chosen": -3.7605156898498535, "logps/rejected": -3.61976957321167, "loss": 4.1109, "rewards/accuracies": 0.25, "rewards/chosen": -37.60515594482422, "rewards/margins": -1.4074630737304688, "rewards/rejected": -36.19769287109375, "step": 2431 }, { "epoch": 0.3311546840958606, "grad_norm": 52.548887218913144, "learning_rate": 6.767625620341965e-07, "logits/chosen": 11.612926483154297, "logits/rejected": 12.118395805358887, "logps/chosen": -3.391037940979004, "logps/rejected": -3.939286947250366, "loss": 4.3058, "rewards/accuracies": 0.5, "rewards/chosen": -33.910377502441406, "rewards/margins": 5.482490539550781, "rewards/rejected": -39.39287185668945, "step": 2432 }, { "epoch": 0.3312908496732026, "grad_norm": 45.5496741068278, "learning_rate": 6.766252519267005e-07, "logits/chosen": 12.311389923095703, "logits/rejected": 12.004947662353516, "logps/chosen": -3.3747639656066895, "logps/rejected": -3.397646427154541, "loss": 4.7942, "rewards/accuracies": 0.75, "rewards/chosen": -33.747642517089844, "rewards/margins": 0.22882366180419922, "rewards/rejected": -33.976463317871094, "step": 2433 }, { "epoch": 0.3314270152505447, "grad_norm": 39.40154497627938, "learning_rate": 6.764878793134425e-07, "logits/chosen": 11.352785110473633, "logits/rejected": 11.980003356933594, "logps/chosen": -3.4761595726013184, "logps/rejected": -3.7755205631256104, "loss": 4.3272, "rewards/accuracies": 0.75, "rewards/chosen": -34.7615966796875, "rewards/margins": 2.9936094284057617, "rewards/rejected": -37.75520324707031, "step": 2434 }, { "epoch": 0.3315631808278867, "grad_norm": 38.060457132436, "learning_rate": 6.763504442254626e-07, "logits/chosen": 12.011712074279785, "logits/rejected": 11.833690643310547, "logps/chosen": -3.31001353263855, "logps/rejected": -3.5208120346069336, "loss": 4.0942, "rewards/accuracies": 0.5, "rewards/chosen": -33.100135803222656, "rewards/margins": 2.1079821586608887, "rewards/rejected": -35.2081184387207, "step": 2435 }, { "epoch": 0.33169934640522875, "grad_norm": 45.40050463346795, "learning_rate": 6.762129466938153e-07, "logits/chosen": 12.637439727783203, "logits/rejected": 12.34609317779541, "logps/chosen": -3.813282012939453, "logps/rejected": -3.6654164791107178, "loss": 4.0147, "rewards/accuracies": 0.25, "rewards/chosen": -38.132816314697266, "rewards/margins": -1.4786529541015625, "rewards/rejected": -36.65416717529297, "step": 2436 }, { "epoch": 0.33183551198257083, "grad_norm": 44.3353757156028, "learning_rate": 6.760753867495698e-07, "logits/chosen": 12.493244171142578, "logits/rejected": 12.464550018310547, "logps/chosen": -3.811966896057129, "logps/rejected": -3.693105459213257, "loss": 4.5252, "rewards/accuracies": 0.25, "rewards/chosen": -38.119667053222656, "rewards/margins": -1.188614845275879, "rewards/rejected": -36.931053161621094, "step": 2437 }, { "epoch": 0.33197167755991286, "grad_norm": 50.20570244630478, "learning_rate": 6.759377644238083e-07, "logits/chosen": 12.233664512634277, "logits/rejected": 12.481069564819336, "logps/chosen": -3.5560803413391113, "logps/rejected": -3.7495577335357666, "loss": 4.0087, "rewards/accuracies": 0.75, "rewards/chosen": -35.5608024597168, "rewards/margins": 1.934774398803711, "rewards/rejected": -37.49557876586914, "step": 2438 }, { "epoch": 0.3321078431372549, "grad_norm": 40.60510949812885, "learning_rate": 6.758000797476283e-07, "logits/chosen": 11.581462860107422, "logits/rejected": 12.104772567749023, "logps/chosen": -3.3034119606018066, "logps/rejected": -3.3877546787261963, "loss": 3.922, "rewards/accuracies": 0.75, "rewards/chosen": -33.03411865234375, "rewards/margins": 0.8434271812438965, "rewards/rejected": -33.87754821777344, "step": 2439 }, { "epoch": 0.332244008714597, "grad_norm": 45.66013198139627, "learning_rate": 6.756623327521403e-07, "logits/chosen": 11.879379272460938, "logits/rejected": 11.601716995239258, "logps/chosen": -3.6093804836273193, "logps/rejected": -3.5780372619628906, "loss": 3.7267, "rewards/accuracies": 0.25, "rewards/chosen": -36.09380340576172, "rewards/margins": -0.3134307861328125, "rewards/rejected": -35.780372619628906, "step": 2440 }, { "epoch": 0.332380174291939, "grad_norm": 43.75693263908143, "learning_rate": 6.755245234684696e-07, "logits/chosen": 10.921028137207031, "logits/rejected": 12.021997451782227, "logps/chosen": -3.235410690307617, "logps/rejected": -3.2900233268737793, "loss": 4.2613, "rewards/accuracies": 0.5, "rewards/chosen": -32.35410690307617, "rewards/margins": 0.5461273193359375, "rewards/rejected": -32.900230407714844, "step": 2441 }, { "epoch": 0.33251633986928103, "grad_norm": 39.7987189372936, "learning_rate": 6.753866519277554e-07, "logits/chosen": 11.588959693908691, "logits/rejected": 11.101085662841797, "logps/chosen": -3.3112058639526367, "logps/rejected": -3.4697115421295166, "loss": 4.2124, "rewards/accuracies": 0.5, "rewards/chosen": -33.112060546875, "rewards/margins": 1.5850582122802734, "rewards/rejected": -34.69711685180664, "step": 2442 }, { "epoch": 0.3326525054466231, "grad_norm": 45.24774549726107, "learning_rate": 6.752487181611507e-07, "logits/chosen": 12.13147258758545, "logits/rejected": 12.85853099822998, "logps/chosen": -3.827666759490967, "logps/rejected": -4.038562774658203, "loss": 3.4305, "rewards/accuracies": 0.75, "rewards/chosen": -38.276668548583984, "rewards/margins": 2.108959197998047, "rewards/rejected": -40.38562774658203, "step": 2443 }, { "epoch": 0.33278867102396514, "grad_norm": 44.23243968979231, "learning_rate": 6.751107221998231e-07, "logits/chosen": 11.7281494140625, "logits/rejected": 11.79110050201416, "logps/chosen": -3.3775277137756348, "logps/rejected": -3.488170623779297, "loss": 3.4874, "rewards/accuracies": 0.25, "rewards/chosen": -33.77527618408203, "rewards/margins": 1.1064329147338867, "rewards/rejected": -34.88170623779297, "step": 2444 }, { "epoch": 0.3329248366013072, "grad_norm": 42.95063676309898, "learning_rate": 6.749726640749534e-07, "logits/chosen": 11.743609428405762, "logits/rejected": 11.92520523071289, "logps/chosen": -3.451101779937744, "logps/rejected": -3.640537738800049, "loss": 3.5166, "rewards/accuracies": 0.75, "rewards/chosen": -34.511016845703125, "rewards/margins": 1.8943595886230469, "rewards/rejected": -36.40538024902344, "step": 2445 }, { "epoch": 0.33306100217864926, "grad_norm": 41.78269075541528, "learning_rate": 6.748345438177375e-07, "logits/chosen": 11.312923431396484, "logits/rejected": 12.316545486450195, "logps/chosen": -3.5966968536376953, "logps/rejected": -4.1485915184021, "loss": 4.3787, "rewards/accuracies": 0.75, "rewards/chosen": -35.96696853637695, "rewards/margins": 5.518948554992676, "rewards/rejected": -41.48591613769531, "step": 2446 }, { "epoch": 0.3331971677559913, "grad_norm": 52.082063900237195, "learning_rate": 6.746963614593846e-07, "logits/chosen": 11.659326553344727, "logits/rejected": 11.899608612060547, "logps/chosen": -3.2018914222717285, "logps/rejected": -3.5215601921081543, "loss": 4.5716, "rewards/accuracies": 0.75, "rewards/chosen": -32.01891326904297, "rewards/margins": 3.196686267852783, "rewards/rejected": -35.215599060058594, "step": 2447 }, { "epoch": 0.3333333333333333, "grad_norm": 43.72501601090993, "learning_rate": 6.745581170311183e-07, "logits/chosen": 11.694375038146973, "logits/rejected": 11.666387557983398, "logps/chosen": -3.756084680557251, "logps/rejected": -3.5870466232299805, "loss": 4.4468, "rewards/accuracies": 0.5, "rewards/chosen": -37.56084442138672, "rewards/margins": -1.6903772354125977, "rewards/rejected": -35.87046813964844, "step": 2448 }, { "epoch": 0.3334694989106754, "grad_norm": 43.028294050881506, "learning_rate": 6.744198105641758e-07, "logits/chosen": 11.021462440490723, "logits/rejected": 11.037555694580078, "logps/chosen": -3.1735610961914062, "logps/rejected": -3.467644691467285, "loss": 3.3063, "rewards/accuracies": 0.75, "rewards/chosen": -31.735610961914062, "rewards/margins": 2.9408369064331055, "rewards/rejected": -34.676448822021484, "step": 2449 }, { "epoch": 0.33360566448801743, "grad_norm": 43.16878877138598, "learning_rate": 6.742814420898086e-07, "logits/chosen": 11.918214797973633, "logits/rejected": 11.81121826171875, "logps/chosen": -3.763641119003296, "logps/rejected": -3.466982364654541, "loss": 4.0373, "rewards/accuracies": 0.25, "rewards/chosen": -37.63641357421875, "rewards/margins": -2.9665870666503906, "rewards/rejected": -34.66982650756836, "step": 2450 }, { "epoch": 0.33374183006535946, "grad_norm": 42.43639262983898, "learning_rate": 6.741430116392826e-07, "logits/chosen": 10.686620712280273, "logits/rejected": 11.771421432495117, "logps/chosen": -3.426605224609375, "logps/rejected": -3.8222219944000244, "loss": 4.0371, "rewards/accuracies": 0.75, "rewards/chosen": -34.266048431396484, "rewards/margins": 3.9561710357666016, "rewards/rejected": -38.22222137451172, "step": 2451 }, { "epoch": 0.33387799564270154, "grad_norm": 39.71781840703503, "learning_rate": 6.740045192438769e-07, "logits/chosen": 11.572561264038086, "logits/rejected": 12.684046745300293, "logps/chosen": -3.0176877975463867, "logps/rejected": -3.7554805278778076, "loss": 4.0464, "rewards/accuracies": 0.75, "rewards/chosen": -30.176876068115234, "rewards/margins": 7.3779296875, "rewards/rejected": -37.554805755615234, "step": 2452 }, { "epoch": 0.33401416122004357, "grad_norm": 41.84387816832453, "learning_rate": 6.738659649348852e-07, "logits/chosen": 11.400740623474121, "logits/rejected": 11.912750244140625, "logps/chosen": -3.4821674823760986, "logps/rejected": -3.4475932121276855, "loss": 4.2816, "rewards/accuracies": 0.25, "rewards/chosen": -34.821678161621094, "rewards/margins": -0.34574365615844727, "rewards/rejected": -34.47593307495117, "step": 2453 }, { "epoch": 0.3341503267973856, "grad_norm": 44.04071417001093, "learning_rate": 6.737273487436148e-07, "logits/chosen": 11.947965621948242, "logits/rejected": 12.5916166305542, "logps/chosen": -3.5110933780670166, "logps/rejected": -3.874044179916382, "loss": 4.2896, "rewards/accuracies": 0.5, "rewards/chosen": -35.110931396484375, "rewards/margins": 3.6295080184936523, "rewards/rejected": -38.740440368652344, "step": 2454 }, { "epoch": 0.3342864923747277, "grad_norm": 41.3572940785701, "learning_rate": 6.735886707013874e-07, "logits/chosen": 11.115447998046875, "logits/rejected": 11.358612060546875, "logps/chosen": -3.301258087158203, "logps/rejected": -3.440152883529663, "loss": 3.5979, "rewards/accuracies": 0.5, "rewards/chosen": -33.01258087158203, "rewards/margins": 1.388946533203125, "rewards/rejected": -34.401527404785156, "step": 2455 }, { "epoch": 0.3344226579520697, "grad_norm": 55.56266828919581, "learning_rate": 6.734499308395382e-07, "logits/chosen": 12.001422882080078, "logits/rejected": 10.954757690429688, "logps/chosen": -3.89115047454834, "logps/rejected": -3.516939163208008, "loss": 4.6594, "rewards/accuracies": 0.0, "rewards/chosen": -38.91150665283203, "rewards/margins": -3.742110252380371, "rewards/rejected": -35.169395446777344, "step": 2456 }, { "epoch": 0.33455882352941174, "grad_norm": 40.94492151532506, "learning_rate": 6.733111291894168e-07, "logits/chosen": 11.696664810180664, "logits/rejected": 11.65711498260498, "logps/chosen": -3.391357898712158, "logps/rejected": -3.409146785736084, "loss": 4.0521, "rewards/accuracies": 0.5, "rewards/chosen": -33.913578033447266, "rewards/margins": 0.17788982391357422, "rewards/rejected": -34.091468811035156, "step": 2457 }, { "epoch": 0.3346949891067538, "grad_norm": 41.18338694520441, "learning_rate": 6.731722657823867e-07, "logits/chosen": 10.986408233642578, "logits/rejected": 10.544018745422363, "logps/chosen": -3.548051357269287, "logps/rejected": -3.2970409393310547, "loss": 3.9573, "rewards/accuracies": 0.25, "rewards/chosen": -35.48051452636719, "rewards/margins": -2.510104179382324, "rewards/rejected": -32.97040557861328, "step": 2458 }, { "epoch": 0.33483115468409586, "grad_norm": 43.59425606131944, "learning_rate": 6.73033340649825e-07, "logits/chosen": 11.287712097167969, "logits/rejected": 11.481281280517578, "logps/chosen": -3.1574697494506836, "logps/rejected": -3.574089527130127, "loss": 3.6975, "rewards/accuracies": 1.0, "rewards/chosen": -31.574697494506836, "rewards/margins": 4.166198253631592, "rewards/rejected": -35.74089813232422, "step": 2459 }, { "epoch": 0.3349673202614379, "grad_norm": 40.01362731147373, "learning_rate": 6.728943538231231e-07, "logits/chosen": 11.234825134277344, "logits/rejected": 11.699925422668457, "logps/chosen": -3.362699031829834, "logps/rejected": -3.518678903579712, "loss": 3.8075, "rewards/accuracies": 0.5, "rewards/chosen": -33.626991271972656, "rewards/margins": 1.5597987174987793, "rewards/rejected": -35.186790466308594, "step": 2460 }, { "epoch": 0.33510348583877997, "grad_norm": 45.605772929298226, "learning_rate": 6.727553053336861e-07, "logits/chosen": 11.600381851196289, "logits/rejected": 11.759008407592773, "logps/chosen": -3.554311752319336, "logps/rejected": -3.5319981575012207, "loss": 4.2032, "rewards/accuracies": 0.5, "rewards/chosen": -35.543121337890625, "rewards/margins": -0.22313642501831055, "rewards/rejected": -35.319984436035156, "step": 2461 }, { "epoch": 0.335239651416122, "grad_norm": 43.606887541437466, "learning_rate": 6.726161952129334e-07, "logits/chosen": 11.812738418579102, "logits/rejected": 11.602254867553711, "logps/chosen": -3.548898220062256, "logps/rejected": -3.3582448959350586, "loss": 4.2328, "rewards/accuracies": 0.5, "rewards/chosen": -35.488983154296875, "rewards/margins": -1.9065346717834473, "rewards/rejected": -33.58245086669922, "step": 2462 }, { "epoch": 0.335375816993464, "grad_norm": 43.07015720993682, "learning_rate": 6.724770234922977e-07, "logits/chosen": 12.22115421295166, "logits/rejected": 10.949051856994629, "logps/chosen": -3.6576132774353027, "logps/rejected": -3.4684062004089355, "loss": 4.168, "rewards/accuracies": 0.25, "rewards/chosen": -36.576133728027344, "rewards/margins": -1.8920707702636719, "rewards/rejected": -34.684059143066406, "step": 2463 }, { "epoch": 0.3355119825708061, "grad_norm": 47.79626352307404, "learning_rate": 6.723377902032264e-07, "logits/chosen": 10.840622901916504, "logits/rejected": 11.512266159057617, "logps/chosen": -3.50600528717041, "logps/rejected": -3.7114100456237793, "loss": 3.9479, "rewards/accuracies": 0.75, "rewards/chosen": -35.06005096435547, "rewards/margins": 2.0540504455566406, "rewards/rejected": -37.114105224609375, "step": 2464 }, { "epoch": 0.33564814814814814, "grad_norm": 41.82324541916573, "learning_rate": 6.721984953771802e-07, "logits/chosen": 11.729450225830078, "logits/rejected": 12.200170516967773, "logps/chosen": -3.3966739177703857, "logps/rejected": -3.6986773014068604, "loss": 3.9487, "rewards/accuracies": 0.5, "rewards/chosen": -33.96673583984375, "rewards/margins": 3.0200352668762207, "rewards/rejected": -36.98677444458008, "step": 2465 }, { "epoch": 0.33578431372549017, "grad_norm": 45.19048370579841, "learning_rate": 6.720591390456339e-07, "logits/chosen": 11.673892974853516, "logits/rejected": 11.938238143920898, "logps/chosen": -3.1656816005706787, "logps/rejected": -3.444052219390869, "loss": 4.2554, "rewards/accuracies": 0.75, "rewards/chosen": -31.656814575195312, "rewards/margins": 2.7837047576904297, "rewards/rejected": -34.440521240234375, "step": 2466 }, { "epoch": 0.33592047930283225, "grad_norm": 43.33927339603663, "learning_rate": 6.719197212400763e-07, "logits/chosen": 11.589195251464844, "logits/rejected": 11.263818740844727, "logps/chosen": -2.9993834495544434, "logps/rejected": -3.162828207015991, "loss": 4.422, "rewards/accuracies": 0.5, "rewards/chosen": -29.99383544921875, "rewards/margins": 1.6344470977783203, "rewards/rejected": -31.628280639648438, "step": 2467 }, { "epoch": 0.3360566448801743, "grad_norm": 52.77062529961387, "learning_rate": 6.717802419920099e-07, "logits/chosen": 11.659570693969727, "logits/rejected": 11.316818237304688, "logps/chosen": -3.6953511238098145, "logps/rejected": -3.7267465591430664, "loss": 4.468, "rewards/accuracies": 0.75, "rewards/chosen": -36.953514099121094, "rewards/margins": 0.3139533996582031, "rewards/rejected": -37.2674674987793, "step": 2468 }, { "epoch": 0.33619281045751637, "grad_norm": 41.557784742852355, "learning_rate": 6.716407013329514e-07, "logits/chosen": 11.91105842590332, "logits/rejected": 11.041091918945312, "logps/chosen": -3.4584474563598633, "logps/rejected": -3.453678846359253, "loss": 3.3427, "rewards/accuracies": 0.5, "rewards/chosen": -34.58447265625, "rewards/margins": -0.047686100006103516, "rewards/rejected": -34.53678894042969, "step": 2469 }, { "epoch": 0.3363289760348584, "grad_norm": 43.102572529560774, "learning_rate": 6.715010992944309e-07, "logits/chosen": 11.957721710205078, "logits/rejected": 11.183431625366211, "logps/chosen": -3.7528183460235596, "logps/rejected": -3.5947751998901367, "loss": 3.9382, "rewards/accuracies": 0.5, "rewards/chosen": -37.5281867980957, "rewards/margins": -1.580432415008545, "rewards/rejected": -35.94775390625, "step": 2470 }, { "epoch": 0.3364651416122004, "grad_norm": 41.424455882206416, "learning_rate": 6.713614359079929e-07, "logits/chosen": 10.953402519226074, "logits/rejected": 11.410961151123047, "logps/chosen": -3.288576602935791, "logps/rejected": -3.4002528190612793, "loss": 3.9803, "rewards/accuracies": 0.75, "rewards/chosen": -32.885765075683594, "rewards/margins": 1.1167621612548828, "rewards/rejected": -34.002525329589844, "step": 2471 }, { "epoch": 0.3366013071895425, "grad_norm": 55.951310058882044, "learning_rate": 6.712217112051952e-07, "logits/chosen": 12.001434326171875, "logits/rejected": 12.271414756774902, "logps/chosen": -3.5940604209899902, "logps/rejected": -3.747136116027832, "loss": 3.8307, "rewards/accuracies": 0.5, "rewards/chosen": -35.94060516357422, "rewards/margins": 1.5307579040527344, "rewards/rejected": -37.47135925292969, "step": 2472 }, { "epoch": 0.33673747276688454, "grad_norm": 46.94257062644545, "learning_rate": 6.710819252176101e-07, "logits/chosen": 11.445562362670898, "logits/rejected": 11.771568298339844, "logps/chosen": -3.386146306991577, "logps/rejected": -3.776463031768799, "loss": 4.6244, "rewards/accuracies": 0.75, "rewards/chosen": -33.86146545410156, "rewards/margins": 3.903168201446533, "rewards/rejected": -37.76462936401367, "step": 2473 }, { "epoch": 0.33687363834422657, "grad_norm": 47.518159864585066, "learning_rate": 6.70942077976823e-07, "logits/chosen": 11.554364204406738, "logits/rejected": 11.444323539733887, "logps/chosen": -3.0780415534973145, "logps/rejected": -3.3084285259246826, "loss": 4.0085, "rewards/accuracies": 0.75, "rewards/chosen": -30.78041648864746, "rewards/margins": 2.303868293762207, "rewards/rejected": -33.084285736083984, "step": 2474 }, { "epoch": 0.33700980392156865, "grad_norm": 47.046323137423535, "learning_rate": 6.708021695144338e-07, "logits/chosen": 10.915879249572754, "logits/rejected": 11.26589584350586, "logps/chosen": -3.3319380283355713, "logps/rejected": -3.4878602027893066, "loss": 3.8782, "rewards/accuracies": 0.5, "rewards/chosen": -33.31938171386719, "rewards/margins": 1.559220790863037, "rewards/rejected": -34.87860107421875, "step": 2475 }, { "epoch": 0.3371459694989107, "grad_norm": 38.92696374634324, "learning_rate": 6.70662199862056e-07, "logits/chosen": 11.332355499267578, "logits/rejected": 12.01156997680664, "logps/chosen": -3.634277105331421, "logps/rejected": -3.9671239852905273, "loss": 4.0478, "rewards/accuracies": 0.75, "rewards/chosen": -36.3427734375, "rewards/margins": 3.3284711837768555, "rewards/rejected": -39.671241760253906, "step": 2476 }, { "epoch": 0.3372821350762527, "grad_norm": 46.979376031489, "learning_rate": 6.70522169051317e-07, "logits/chosen": 11.846692085266113, "logits/rejected": 12.14649486541748, "logps/chosen": -3.5689239501953125, "logps/rejected": -3.678560972213745, "loss": 4.0618, "rewards/accuracies": 0.5, "rewards/chosen": -35.689239501953125, "rewards/margins": 1.0963706970214844, "rewards/rejected": -36.78561019897461, "step": 2477 }, { "epoch": 0.3374183006535948, "grad_norm": 43.2891562415045, "learning_rate": 6.703820771138575e-07, "logits/chosen": 11.82946491241455, "logits/rejected": 10.814894676208496, "logps/chosen": -3.4135186672210693, "logps/rejected": -3.3230745792388916, "loss": 3.9316, "rewards/accuracies": 0.5, "rewards/chosen": -34.13518524169922, "rewards/margins": -0.9044394493103027, "rewards/rejected": -33.23074722290039, "step": 2478 }, { "epoch": 0.3375544662309368, "grad_norm": 40.18834520067468, "learning_rate": 6.702419240813327e-07, "logits/chosen": 11.605823516845703, "logits/rejected": 11.865913391113281, "logps/chosen": -3.1742324829101562, "logps/rejected": -3.496727466583252, "loss": 3.6911, "rewards/accuracies": 1.0, "rewards/chosen": -31.742324829101562, "rewards/margins": 3.224946975708008, "rewards/rejected": -34.9672737121582, "step": 2479 }, { "epoch": 0.33769063180827885, "grad_norm": 42.10189562572878, "learning_rate": 6.701017099854115e-07, "logits/chosen": 11.169065475463867, "logits/rejected": 11.175239562988281, "logps/chosen": -3.273888349533081, "logps/rejected": -3.41054368019104, "loss": 3.9418, "rewards/accuracies": 0.75, "rewards/chosen": -32.73888397216797, "rewards/margins": 1.3665523529052734, "rewards/rejected": -34.105438232421875, "step": 2480 }, { "epoch": 0.33782679738562094, "grad_norm": 46.674259595934366, "learning_rate": 6.699614348577759e-07, "logits/chosen": 9.921272277832031, "logits/rejected": 10.814643859863281, "logps/chosen": -2.8412559032440186, "logps/rejected": -3.0820133686065674, "loss": 4.2933, "rewards/accuracies": 0.75, "rewards/chosen": -28.412559509277344, "rewards/margins": 2.407573699951172, "rewards/rejected": -30.820133209228516, "step": 2481 }, { "epoch": 0.33796296296296297, "grad_norm": 41.2894226922841, "learning_rate": 6.698210987301228e-07, "logits/chosen": 10.967042922973633, "logits/rejected": 11.3145751953125, "logps/chosen": -3.3936526775360107, "logps/rejected": -3.667114496231079, "loss": 4.2731, "rewards/accuracies": 0.75, "rewards/chosen": -33.9365234375, "rewards/margins": 2.7346181869506836, "rewards/rejected": -36.671142578125, "step": 2482 }, { "epoch": 0.338099128540305, "grad_norm": 41.065089833063915, "learning_rate": 6.696807016341621e-07, "logits/chosen": 10.222528457641602, "logits/rejected": 11.874078750610352, "logps/chosen": -3.183394432067871, "logps/rejected": -3.5514559745788574, "loss": 4.3484, "rewards/accuracies": 0.75, "rewards/chosen": -31.833942413330078, "rewards/margins": 3.6806182861328125, "rewards/rejected": -35.51456069946289, "step": 2483 }, { "epoch": 0.3382352941176471, "grad_norm": 40.50124468131863, "learning_rate": 6.695402436016175e-07, "logits/chosen": 11.23752212524414, "logits/rejected": 10.50002384185791, "logps/chosen": -3.2996692657470703, "logps/rejected": -3.143862247467041, "loss": 4.0342, "rewards/accuracies": 0.5, "rewards/chosen": -32.9966926574707, "rewards/margins": -1.5580706596374512, "rewards/rejected": -31.438621520996094, "step": 2484 }, { "epoch": 0.3383714596949891, "grad_norm": 41.113772267186675, "learning_rate": 6.69399724664227e-07, "logits/chosen": 11.538949966430664, "logits/rejected": 11.854990005493164, "logps/chosen": -3.3892264366149902, "logps/rejected": -3.4016642570495605, "loss": 3.7595, "rewards/accuracies": 0.5, "rewards/chosen": -33.89226531982422, "rewards/margins": 0.12437629699707031, "rewards/rejected": -34.016639709472656, "step": 2485 }, { "epoch": 0.33850762527233114, "grad_norm": 44.322897735518815, "learning_rate": 6.692591448537417e-07, "logits/chosen": 10.81088638305664, "logits/rejected": 11.69736099243164, "logps/chosen": -2.9952456951141357, "logps/rejected": -3.453028678894043, "loss": 3.9172, "rewards/accuracies": 1.0, "rewards/chosen": -29.952457427978516, "rewards/margins": 4.577830791473389, "rewards/rejected": -34.53028869628906, "step": 2486 }, { "epoch": 0.3386437908496732, "grad_norm": 66.2237201829371, "learning_rate": 6.691185042019269e-07, "logits/chosen": 10.33817195892334, "logits/rejected": 11.902077674865723, "logps/chosen": -3.1956787109375, "logps/rejected": -3.7750115394592285, "loss": 4.0082, "rewards/accuracies": 1.0, "rewards/chosen": -31.956785202026367, "rewards/margins": 5.793331146240234, "rewards/rejected": -37.750118255615234, "step": 2487 }, { "epoch": 0.33877995642701525, "grad_norm": 43.846392416104635, "learning_rate": 6.689778027405616e-07, "logits/chosen": 11.19481372833252, "logits/rejected": 11.547121047973633, "logps/chosen": -3.3928771018981934, "logps/rejected": -3.2562620639801025, "loss": 4.3732, "rewards/accuracies": 0.5, "rewards/chosen": -33.92877197265625, "rewards/margins": -1.3661503791809082, "rewards/rejected": -32.5626220703125, "step": 2488 }, { "epoch": 0.3389161220043573, "grad_norm": 42.74488565079737, "learning_rate": 6.688370405014384e-07, "logits/chosen": 11.335508346557617, "logits/rejected": 11.800089836120605, "logps/chosen": -3.288742780685425, "logps/rejected": -4.048727035522461, "loss": 3.8032, "rewards/accuracies": 1.0, "rewards/chosen": -32.887428283691406, "rewards/margins": 7.59984016418457, "rewards/rejected": -40.487266540527344, "step": 2489 }, { "epoch": 0.33905228758169936, "grad_norm": 48.905399478138285, "learning_rate": 6.686962175163636e-07, "logits/chosen": 11.601303100585938, "logits/rejected": 11.829262733459473, "logps/chosen": -3.617185592651367, "logps/rejected": -3.4024577140808105, "loss": 4.3125, "rewards/accuracies": 0.0, "rewards/chosen": -36.171852111816406, "rewards/margins": -2.1472764015197754, "rewards/rejected": -34.024574279785156, "step": 2490 }, { "epoch": 0.3391884531590414, "grad_norm": 40.31397667197723, "learning_rate": 6.685553338171574e-07, "logits/chosen": 11.34276008605957, "logits/rejected": 11.321805953979492, "logps/chosen": -3.3301241397857666, "logps/rejected": -3.4164016246795654, "loss": 4.1681, "rewards/accuracies": 0.5, "rewards/chosen": -33.30124282836914, "rewards/margins": 0.8627738952636719, "rewards/rejected": -34.16401672363281, "step": 2491 }, { "epoch": 0.3393246187363834, "grad_norm": 43.20899215661258, "learning_rate": 6.684143894356535e-07, "logits/chosen": 11.154900550842285, "logits/rejected": 11.738643646240234, "logps/chosen": -3.422806978225708, "logps/rejected": -3.663309097290039, "loss": 3.8583, "rewards/accuracies": 0.75, "rewards/chosen": -34.22806930541992, "rewards/margins": 2.405019760131836, "rewards/rejected": -36.633087158203125, "step": 2492 }, { "epoch": 0.3394607843137255, "grad_norm": 38.51599196488775, "learning_rate": 6.682733844036997e-07, "logits/chosen": 10.675567626953125, "logits/rejected": 10.176399230957031, "logps/chosen": -3.0393333435058594, "logps/rejected": -2.962796211242676, "loss": 3.221, "rewards/accuracies": 0.25, "rewards/chosen": -30.39333152770996, "rewards/margins": -0.7653698921203613, "rewards/rejected": -29.627962112426758, "step": 2493 }, { "epoch": 0.33959694989106753, "grad_norm": 43.08915467108261, "learning_rate": 6.681323187531572e-07, "logits/chosen": 11.03188705444336, "logits/rejected": 12.359827041625977, "logps/chosen": -3.2056822776794434, "logps/rejected": -3.7101333141326904, "loss": 4.2044, "rewards/accuracies": 1.0, "rewards/chosen": -32.05682373046875, "rewards/margins": 5.0445075035095215, "rewards/rejected": -37.10133361816406, "step": 2494 }, { "epoch": 0.33973311546840956, "grad_norm": 68.47224777234285, "learning_rate": 6.679911925159008e-07, "logits/chosen": 11.69625186920166, "logits/rejected": 12.591323852539062, "logps/chosen": -3.263467311859131, "logps/rejected": -3.6089797019958496, "loss": 4.0342, "rewards/accuracies": 0.75, "rewards/chosen": -32.634674072265625, "rewards/margins": 3.4551219940185547, "rewards/rejected": -36.08979415893555, "step": 2495 }, { "epoch": 0.33986928104575165, "grad_norm": 40.85803593583964, "learning_rate": 6.678500057238192e-07, "logits/chosen": 10.931045532226562, "logits/rejected": 11.310478210449219, "logps/chosen": -3.3261966705322266, "logps/rejected": -3.5068557262420654, "loss": 4.4201, "rewards/accuracies": 0.5, "rewards/chosen": -33.26197052001953, "rewards/margins": 1.806589126586914, "rewards/rejected": -35.06855773925781, "step": 2496 }, { "epoch": 0.3400054466230937, "grad_norm": 44.24776627872156, "learning_rate": 6.677087584088147e-07, "logits/chosen": 11.335391998291016, "logits/rejected": 11.898959159851074, "logps/chosen": -3.365178108215332, "logps/rejected": -3.5379889011383057, "loss": 4.2376, "rewards/accuracies": 0.5, "rewards/chosen": -33.65177917480469, "rewards/margins": 1.7281103134155273, "rewards/rejected": -35.37989044189453, "step": 2497 }, { "epoch": 0.3401416122004357, "grad_norm": 39.03053574026227, "learning_rate": 6.675674506028034e-07, "logits/chosen": 10.973552703857422, "logits/rejected": 12.001540184020996, "logps/chosen": -3.1575474739074707, "logps/rejected": -3.8501174449920654, "loss": 3.7796, "rewards/accuracies": 1.0, "rewards/chosen": -31.575475692749023, "rewards/margins": 6.9257001876831055, "rewards/rejected": -38.50117492675781, "step": 2498 }, { "epoch": 0.3402777777777778, "grad_norm": 40.26072961874904, "learning_rate": 6.674260823377149e-07, "logits/chosen": 11.118050575256348, "logits/rejected": 10.899953842163086, "logps/chosen": -3.120659112930298, "logps/rejected": -3.2916836738586426, "loss": 3.9245, "rewards/accuracies": 0.5, "rewards/chosen": -31.20659065246582, "rewards/margins": 1.71024751663208, "rewards/rejected": -32.916839599609375, "step": 2499 }, { "epoch": 0.3404139433551198, "grad_norm": 37.70056401220344, "learning_rate": 6.672846536454924e-07, "logits/chosen": 11.62136459350586, "logits/rejected": 11.461823463439941, "logps/chosen": -3.132591724395752, "logps/rejected": -3.2454192638397217, "loss": 4.0595, "rewards/accuracies": 0.5, "rewards/chosen": -31.325916290283203, "rewards/margins": 1.1282758712768555, "rewards/rejected": -32.454193115234375, "step": 2500 }, { "epoch": 0.34055010893246185, "grad_norm": 41.623001932590604, "learning_rate": 6.671431645580933e-07, "logits/chosen": 11.69265079498291, "logits/rejected": 12.105104446411133, "logps/chosen": -3.767477035522461, "logps/rejected": -3.7760138511657715, "loss": 4.3833, "rewards/accuracies": 0.25, "rewards/chosen": -37.67477035522461, "rewards/margins": 0.08536529541015625, "rewards/rejected": -37.760135650634766, "step": 2501 }, { "epoch": 0.34068627450980393, "grad_norm": 48.775351107381866, "learning_rate": 6.670016151074877e-07, "logits/chosen": 11.775753021240234, "logits/rejected": 11.781365394592285, "logps/chosen": -4.0106635093688965, "logps/rejected": -3.76497220993042, "loss": 4.4114, "rewards/accuracies": 0.25, "rewards/chosen": -40.10663604736328, "rewards/margins": -2.456913948059082, "rewards/rejected": -37.64971923828125, "step": 2502 }, { "epoch": 0.34082244008714596, "grad_norm": 45.532972334686605, "learning_rate": 6.668600053256601e-07, "logits/chosen": 11.391473770141602, "logits/rejected": 11.269233703613281, "logps/chosen": -3.5561466217041016, "logps/rejected": -3.9319801330566406, "loss": 3.991, "rewards/accuracies": 0.75, "rewards/chosen": -35.561466217041016, "rewards/margins": 3.7583370208740234, "rewards/rejected": -39.319801330566406, "step": 2503 }, { "epoch": 0.340958605664488, "grad_norm": 40.70116989902216, "learning_rate": 6.667183352446085e-07, "logits/chosen": 11.415878295898438, "logits/rejected": 11.946819305419922, "logps/chosen": -3.5619945526123047, "logps/rejected": -3.9148693084716797, "loss": 3.9266, "rewards/accuracies": 1.0, "rewards/chosen": -35.61994552612305, "rewards/margins": 3.528744697570801, "rewards/rejected": -39.14868927001953, "step": 2504 }, { "epoch": 0.3410947712418301, "grad_norm": 41.280387515925526, "learning_rate": 6.665766048963443e-07, "logits/chosen": 11.612607955932617, "logits/rejected": 12.590666770935059, "logps/chosen": -3.1893341541290283, "logps/rejected": -3.5459375381469727, "loss": 4.4744, "rewards/accuracies": 0.75, "rewards/chosen": -31.893341064453125, "rewards/margins": 3.5660324096679688, "rewards/rejected": -35.459373474121094, "step": 2505 }, { "epoch": 0.3412309368191721, "grad_norm": 40.33618134967813, "learning_rate": 6.664348143128928e-07, "logits/chosen": 11.461926460266113, "logits/rejected": 11.949698448181152, "logps/chosen": -3.739910125732422, "logps/rejected": -3.915235996246338, "loss": 3.2703, "rewards/accuracies": 0.75, "rewards/chosen": -37.39910125732422, "rewards/margins": 1.753255844116211, "rewards/rejected": -39.15235900878906, "step": 2506 }, { "epoch": 0.3413671023965142, "grad_norm": 45.309780834356, "learning_rate": 6.662929635262925e-07, "logits/chosen": 11.611645698547363, "logits/rejected": 11.409690856933594, "logps/chosen": -3.6248133182525635, "logps/rejected": -3.9312775135040283, "loss": 4.0684, "rewards/accuracies": 1.0, "rewards/chosen": -36.248130798339844, "rewards/margins": 3.0646400451660156, "rewards/rejected": -39.312774658203125, "step": 2507 }, { "epoch": 0.3415032679738562, "grad_norm": 45.532383495029364, "learning_rate": 6.661510525685958e-07, "logits/chosen": 11.412866592407227, "logits/rejected": 11.858386993408203, "logps/chosen": -3.671442985534668, "logps/rejected": -3.773738384246826, "loss": 3.9976, "rewards/accuracies": 0.5, "rewards/chosen": -36.71442794799805, "rewards/margins": 1.0229549407958984, "rewards/rejected": -37.73738479614258, "step": 2508 }, { "epoch": 0.34163943355119825, "grad_norm": 47.10612635928193, "learning_rate": 6.660090814718689e-07, "logits/chosen": 10.692560195922852, "logits/rejected": 12.007186889648438, "logps/chosen": -3.758702039718628, "logps/rejected": -3.9001200199127197, "loss": 4.0533, "rewards/accuracies": 0.75, "rewards/chosen": -37.58702087402344, "rewards/margins": 1.4141807556152344, "rewards/rejected": -39.00120162963867, "step": 2509 }, { "epoch": 0.34177559912854033, "grad_norm": 40.04084745513193, "learning_rate": 6.658670502681911e-07, "logits/chosen": 10.864585876464844, "logits/rejected": 12.303369522094727, "logps/chosen": -3.089061737060547, "logps/rejected": -3.396237850189209, "loss": 4.0168, "rewards/accuracies": 0.75, "rewards/chosen": -30.89061737060547, "rewards/margins": 3.0717620849609375, "rewards/rejected": -33.962379455566406, "step": 2510 }, { "epoch": 0.34191176470588236, "grad_norm": 38.84091614232183, "learning_rate": 6.657249589896557e-07, "logits/chosen": 11.557123184204102, "logits/rejected": 12.607427597045898, "logps/chosen": -3.503972053527832, "logps/rejected": -3.930795669555664, "loss": 4.0432, "rewards/accuracies": 0.75, "rewards/chosen": -35.03972244262695, "rewards/margins": 4.26823616027832, "rewards/rejected": -39.30795669555664, "step": 2511 }, { "epoch": 0.3420479302832244, "grad_norm": 37.24902568405038, "learning_rate": 6.655828076683693e-07, "logits/chosen": 11.495597839355469, "logits/rejected": 12.21319580078125, "logps/chosen": -3.5628645420074463, "logps/rejected": -3.785278081893921, "loss": 4.1397, "rewards/accuracies": 0.5, "rewards/chosen": -35.62864685058594, "rewards/margins": 2.2241344451904297, "rewards/rejected": -37.852779388427734, "step": 2512 }, { "epoch": 0.3421840958605665, "grad_norm": 45.34433032088984, "learning_rate": 6.654405963364521e-07, "logits/chosen": 11.233808517456055, "logits/rejected": 11.366591453552246, "logps/chosen": -3.346418857574463, "logps/rejected": -3.356942653656006, "loss": 3.816, "rewards/accuracies": 0.5, "rewards/chosen": -33.46419143676758, "rewards/margins": 0.10523605346679688, "rewards/rejected": -33.569427490234375, "step": 2513 }, { "epoch": 0.3423202614379085, "grad_norm": 37.88643901145027, "learning_rate": 6.65298325026038e-07, "logits/chosen": 11.094226837158203, "logits/rejected": 11.822820663452148, "logps/chosen": -3.3888347148895264, "logps/rejected": -3.8025712966918945, "loss": 4.2737, "rewards/accuracies": 0.75, "rewards/chosen": -33.88834762573242, "rewards/margins": 4.137364387512207, "rewards/rejected": -38.02571105957031, "step": 2514 }, { "epoch": 0.34245642701525053, "grad_norm": 39.42684237046121, "learning_rate": 6.651559937692745e-07, "logits/chosen": 10.699432373046875, "logits/rejected": 12.020776748657227, "logps/chosen": -3.176158905029297, "logps/rejected": -3.7588913440704346, "loss": 3.9434, "rewards/accuracies": 1.0, "rewards/chosen": -31.76158905029297, "rewards/margins": 5.8273234367370605, "rewards/rejected": -37.58891296386719, "step": 2515 }, { "epoch": 0.3425925925925926, "grad_norm": 37.77626749583478, "learning_rate": 6.650136025983224e-07, "logits/chosen": 11.769184112548828, "logits/rejected": 11.710049629211426, "logps/chosen": -3.5137362480163574, "logps/rejected": -3.9583077430725098, "loss": 4.1692, "rewards/accuracies": 0.75, "rewards/chosen": -35.13736343383789, "rewards/margins": 4.445713996887207, "rewards/rejected": -39.58307647705078, "step": 2516 }, { "epoch": 0.34272875816993464, "grad_norm": 34.15832972459186, "learning_rate": 6.648711515453561e-07, "logits/chosen": 12.342903137207031, "logits/rejected": 11.509855270385742, "logps/chosen": -3.362031936645508, "logps/rejected": -3.4323129653930664, "loss": 3.4281, "rewards/accuracies": 0.5, "rewards/chosen": -33.620323181152344, "rewards/margins": 0.7028079032897949, "rewards/rejected": -34.32312774658203, "step": 2517 }, { "epoch": 0.3428649237472767, "grad_norm": 39.42908037160686, "learning_rate": 6.647286406425636e-07, "logits/chosen": 11.006406784057617, "logits/rejected": 11.53940200805664, "logps/chosen": -3.286653995513916, "logps/rejected": -3.3573384284973145, "loss": 3.6021, "rewards/accuracies": 0.75, "rewards/chosen": -32.866539001464844, "rewards/margins": 0.7068443298339844, "rewards/rejected": -33.57337951660156, "step": 2518 }, { "epoch": 0.34300108932461876, "grad_norm": 42.708111397928896, "learning_rate": 6.645860699221466e-07, "logits/chosen": 12.419851303100586, "logits/rejected": 11.6704740524292, "logps/chosen": -3.9271621704101562, "logps/rejected": -3.616349220275879, "loss": 5.0079, "rewards/accuracies": 0.0, "rewards/chosen": -39.27162170410156, "rewards/margins": -3.108126640319824, "rewards/rejected": -36.16349411010742, "step": 2519 }, { "epoch": 0.3431372549019608, "grad_norm": 35.868327011605174, "learning_rate": 6.644434394163199e-07, "logits/chosen": 11.702963829040527, "logits/rejected": 11.892507553100586, "logps/chosen": -3.5792605876922607, "logps/rejected": -3.5562632083892822, "loss": 4.0566, "rewards/accuracies": 0.75, "rewards/chosen": -35.7926025390625, "rewards/margins": -0.22997379302978516, "rewards/rejected": -35.56262969970703, "step": 2520 }, { "epoch": 0.3432734204793028, "grad_norm": 44.48315099296539, "learning_rate": 6.643007491573122e-07, "logits/chosen": 12.099470138549805, "logits/rejected": 10.971247673034668, "logps/chosen": -3.305283546447754, "logps/rejected": -3.2010116577148438, "loss": 3.9978, "rewards/accuracies": 0.25, "rewards/chosen": -33.052833557128906, "rewards/margins": -1.0427165031433105, "rewards/rejected": -32.01011657714844, "step": 2521 }, { "epoch": 0.3434095860566449, "grad_norm": 41.27262005154093, "learning_rate": 6.641579991773655e-07, "logits/chosen": 11.029983520507812, "logits/rejected": 12.018104553222656, "logps/chosen": -3.382099151611328, "logps/rejected": -3.6645760536193848, "loss": 4.3812, "rewards/accuracies": 0.5, "rewards/chosen": -33.82099151611328, "rewards/margins": 2.8247694969177246, "rewards/rejected": -36.64575958251953, "step": 2522 }, { "epoch": 0.34354575163398693, "grad_norm": 40.96036974557911, "learning_rate": 6.640151895087354e-07, "logits/chosen": 11.819562911987305, "logits/rejected": 12.707585334777832, "logps/chosen": -3.3372178077697754, "logps/rejected": -3.724698543548584, "loss": 3.867, "rewards/accuracies": 0.75, "rewards/chosen": -33.37217712402344, "rewards/margins": 3.874810218811035, "rewards/rejected": -37.246986389160156, "step": 2523 }, { "epoch": 0.34368191721132896, "grad_norm": 44.54570188872481, "learning_rate": 6.638723201836908e-07, "logits/chosen": 12.662757873535156, "logits/rejected": 12.703391075134277, "logps/chosen": -3.496384620666504, "logps/rejected": -3.7107527256011963, "loss": 4.8551, "rewards/accuracies": 0.75, "rewards/chosen": -34.963844299316406, "rewards/margins": 2.143681526184082, "rewards/rejected": -37.10752868652344, "step": 2524 }, { "epoch": 0.34381808278867104, "grad_norm": 42.31895914472118, "learning_rate": 6.637293912345143e-07, "logits/chosen": 10.749346733093262, "logits/rejected": 12.056770324707031, "logps/chosen": -3.2837026119232178, "logps/rejected": -3.6516261100769043, "loss": 3.4592, "rewards/accuracies": 1.0, "rewards/chosen": -32.83702850341797, "rewards/margins": 3.6792354583740234, "rewards/rejected": -36.51626205444336, "step": 2525 }, { "epoch": 0.34395424836601307, "grad_norm": 40.94290587463018, "learning_rate": 6.635864026935018e-07, "logits/chosen": 12.174617767333984, "logits/rejected": 11.76508903503418, "logps/chosen": -3.426147222518921, "logps/rejected": -3.616638660430908, "loss": 3.9666, "rewards/accuracies": 0.75, "rewards/chosen": -34.261474609375, "rewards/margins": 1.9049139022827148, "rewards/rejected": -36.166385650634766, "step": 2526 }, { "epoch": 0.3440904139433551, "grad_norm": 36.35666798052956, "learning_rate": 6.634433545929628e-07, "logits/chosen": 11.877945899963379, "logits/rejected": 12.404240608215332, "logps/chosen": -3.077910900115967, "logps/rejected": -3.573556423187256, "loss": 3.6853, "rewards/accuracies": 0.75, "rewards/chosen": -30.77910614013672, "rewards/margins": 4.95645809173584, "rewards/rejected": -35.735565185546875, "step": 2527 }, { "epoch": 0.3442265795206972, "grad_norm": 35.84728851658023, "learning_rate": 6.633002469652201e-07, "logits/chosen": 10.093708992004395, "logits/rejected": 11.339865684509277, "logps/chosen": -3.2198219299316406, "logps/rejected": -3.463843822479248, "loss": 3.9339, "rewards/accuracies": 0.5, "rewards/chosen": -32.198219299316406, "rewards/margins": 2.4402213096618652, "rewards/rejected": -34.6384391784668, "step": 2528 }, { "epoch": 0.3443627450980392, "grad_norm": 36.4245510095929, "learning_rate": 6.631570798426102e-07, "logits/chosen": 10.534212112426758, "logits/rejected": 11.300010681152344, "logps/chosen": -2.996299982070923, "logps/rejected": -3.3433282375335693, "loss": 4.0855, "rewards/accuracies": 1.0, "rewards/chosen": -29.96299934387207, "rewards/margins": 3.4702835083007812, "rewards/rejected": -33.433284759521484, "step": 2529 }, { "epoch": 0.34449891067538124, "grad_norm": 34.89894058034787, "learning_rate": 6.630138532574829e-07, "logits/chosen": 10.805932998657227, "logits/rejected": 11.447964668273926, "logps/chosen": -3.171111583709717, "logps/rejected": -3.345719337463379, "loss": 3.7232, "rewards/accuracies": 0.5, "rewards/chosen": -31.711116790771484, "rewards/margins": 1.7460756301879883, "rewards/rejected": -33.457191467285156, "step": 2530 }, { "epoch": 0.3446350762527233, "grad_norm": 35.25348023242401, "learning_rate": 6.628705672422013e-07, "logits/chosen": 12.00146484375, "logits/rejected": 12.25462532043457, "logps/chosen": -3.335310935974121, "logps/rejected": -3.5854222774505615, "loss": 4.5127, "rewards/accuracies": 0.5, "rewards/chosen": -33.353111267089844, "rewards/margins": 2.5011115074157715, "rewards/rejected": -35.854225158691406, "step": 2531 }, { "epoch": 0.34477124183006536, "grad_norm": 42.12263451046632, "learning_rate": 6.627272218291421e-07, "logits/chosen": 12.43122386932373, "logits/rejected": 12.733039855957031, "logps/chosen": -3.8772757053375244, "logps/rejected": -4.228456497192383, "loss": 4.5321, "rewards/accuracies": 0.5, "rewards/chosen": -38.77275848388672, "rewards/margins": 3.511807918548584, "rewards/rejected": -42.28456115722656, "step": 2532 }, { "epoch": 0.3449074074074074, "grad_norm": 48.52236710917983, "learning_rate": 6.625838170506954e-07, "logits/chosen": 11.078720092773438, "logits/rejected": 11.98587417602539, "logps/chosen": -3.4744620323181152, "logps/rejected": -3.8431620597839355, "loss": 3.8467, "rewards/accuracies": 0.75, "rewards/chosen": -34.74462127685547, "rewards/margins": 3.686997413635254, "rewards/rejected": -38.431617736816406, "step": 2533 }, { "epoch": 0.34504357298474947, "grad_norm": 34.21371723230805, "learning_rate": 6.624403529392647e-07, "logits/chosen": 12.996309280395508, "logits/rejected": 12.005560874938965, "logps/chosen": -3.8176746368408203, "logps/rejected": -3.571990728378296, "loss": 4.0568, "rewards/accuracies": 0.5, "rewards/chosen": -38.17675018310547, "rewards/margins": -2.4568395614624023, "rewards/rejected": -35.71990966796875, "step": 2534 }, { "epoch": 0.3451797385620915, "grad_norm": 37.2824428364611, "learning_rate": 6.622968295272669e-07, "logits/chosen": 11.38296127319336, "logits/rejected": 11.729469299316406, "logps/chosen": -3.623363971710205, "logps/rejected": -3.8121895790100098, "loss": 4.4391, "rewards/accuracies": 0.5, "rewards/chosen": -36.233642578125, "rewards/margins": 1.888254165649414, "rewards/rejected": -38.12189483642578, "step": 2535 }, { "epoch": 0.3453159041394335, "grad_norm": 44.844583336328874, "learning_rate": 6.621532468471324e-07, "logits/chosen": 11.371646881103516, "logits/rejected": 11.504877090454102, "logps/chosen": -3.285616397857666, "logps/rejected": -3.554842948913574, "loss": 4.2534, "rewards/accuracies": 0.75, "rewards/chosen": -32.856163024902344, "rewards/margins": 2.6922664642333984, "rewards/rejected": -35.548431396484375, "step": 2536 }, { "epoch": 0.3454520697167756, "grad_norm": 39.467220101343486, "learning_rate": 6.620096049313048e-07, "logits/chosen": 12.357016563415527, "logits/rejected": 11.930883407592773, "logps/chosen": -3.474438190460205, "logps/rejected": -3.6752915382385254, "loss": 4.1474, "rewards/accuracies": 0.75, "rewards/chosen": -34.744380950927734, "rewards/margins": 2.0085344314575195, "rewards/rejected": -36.75291442871094, "step": 2537 }, { "epoch": 0.34558823529411764, "grad_norm": 36.39726844618571, "learning_rate": 6.61865903812241e-07, "logits/chosen": 10.342455863952637, "logits/rejected": 12.330848693847656, "logps/chosen": -3.3993964195251465, "logps/rejected": -3.5995914936065674, "loss": 3.8078, "rewards/accuracies": 0.5, "rewards/chosen": -33.99396514892578, "rewards/margins": 2.0019493103027344, "rewards/rejected": -35.995914459228516, "step": 2538 }, { "epoch": 0.34572440087145967, "grad_norm": 38.86673841193445, "learning_rate": 6.617221435224117e-07, "logits/chosen": 11.391060829162598, "logits/rejected": 12.180686950683594, "logps/chosen": -3.354523181915283, "logps/rejected": -3.4879815578460693, "loss": 3.716, "rewards/accuracies": 0.75, "rewards/chosen": -33.54523468017578, "rewards/margins": 1.3345837593078613, "rewards/rejected": -34.87981414794922, "step": 2539 }, { "epoch": 0.34586056644880175, "grad_norm": 39.069547292625664, "learning_rate": 6.615783240943007e-07, "logits/chosen": 11.619474411010742, "logits/rejected": 11.751874923706055, "logps/chosen": -3.4366421699523926, "logps/rejected": -3.5266799926757812, "loss": 4.0121, "rewards/accuracies": 0.5, "rewards/chosen": -34.366424560546875, "rewards/margins": 0.9003767967224121, "rewards/rejected": -35.26679992675781, "step": 2540 }, { "epoch": 0.3459967320261438, "grad_norm": 43.745256035750764, "learning_rate": 6.614344455604051e-07, "logits/chosen": 11.865446090698242, "logits/rejected": 11.276363372802734, "logps/chosen": -3.193633794784546, "logps/rejected": -3.0977659225463867, "loss": 4.1758, "rewards/accuracies": 0.25, "rewards/chosen": -31.936336517333984, "rewards/margins": -0.9586801528930664, "rewards/rejected": -30.977657318115234, "step": 2541 }, { "epoch": 0.3461328976034858, "grad_norm": 40.61579873214187, "learning_rate": 6.612905079532355e-07, "logits/chosen": 12.085630416870117, "logits/rejected": 12.196565628051758, "logps/chosen": -3.553267240524292, "logps/rejected": -3.5418596267700195, "loss": 4.264, "rewards/accuracies": 0.5, "rewards/chosen": -35.53266906738281, "rewards/margins": -0.11407613754272461, "rewards/rejected": -35.41859436035156, "step": 2542 }, { "epoch": 0.3462690631808279, "grad_norm": 40.79337713851142, "learning_rate": 6.611465113053158e-07, "logits/chosen": 11.452869415283203, "logits/rejected": 12.184226989746094, "logps/chosen": -3.532874345779419, "logps/rejected": -3.67809796333313, "loss": 3.8442, "rewards/accuracies": 0.75, "rewards/chosen": -35.32874298095703, "rewards/margins": 1.452235221862793, "rewards/rejected": -36.78097915649414, "step": 2543 }, { "epoch": 0.3464052287581699, "grad_norm": 40.07804165106794, "learning_rate": 6.610024556491831e-07, "logits/chosen": 11.136717796325684, "logits/rejected": 11.782230377197266, "logps/chosen": -3.298891544342041, "logps/rejected": -3.6817233562469482, "loss": 4.2166, "rewards/accuracies": 1.0, "rewards/chosen": -32.988914489746094, "rewards/margins": 3.8283185958862305, "rewards/rejected": -36.81723403930664, "step": 2544 }, { "epoch": 0.346541394335512, "grad_norm": 42.94208938515479, "learning_rate": 6.608583410173883e-07, "logits/chosen": 11.256556510925293, "logits/rejected": 11.877058029174805, "logps/chosen": -3.359611749649048, "logps/rejected": -3.604177474975586, "loss": 3.8826, "rewards/accuracies": 0.75, "rewards/chosen": -33.59611892700195, "rewards/margins": 2.4456562995910645, "rewards/rejected": -36.04177474975586, "step": 2545 }, { "epoch": 0.34667755991285404, "grad_norm": 43.987273404911974, "learning_rate": 6.60714167442495e-07, "logits/chosen": 11.112686157226562, "logits/rejected": 12.260255813598633, "logps/chosen": -3.3939108848571777, "logps/rejected": -3.8880887031555176, "loss": 3.9968, "rewards/accuracies": 1.0, "rewards/chosen": -33.939109802246094, "rewards/margins": 4.941778182983398, "rewards/rejected": -38.880889892578125, "step": 2546 }, { "epoch": 0.34681372549019607, "grad_norm": 39.32981612820138, "learning_rate": 6.605699349570804e-07, "logits/chosen": 10.436291694641113, "logits/rejected": 12.008352279663086, "logps/chosen": -3.4286258220672607, "logps/rejected": -3.6536524295806885, "loss": 4.5009, "rewards/accuracies": 0.5, "rewards/chosen": -34.286258697509766, "rewards/margins": 2.2502641677856445, "rewards/rejected": -36.536521911621094, "step": 2547 }, { "epoch": 0.34694989106753815, "grad_norm": 48.96087183831269, "learning_rate": 6.604256435937351e-07, "logits/chosen": 11.849140167236328, "logits/rejected": 12.118309020996094, "logps/chosen": -3.7295563220977783, "logps/rejected": -3.908236026763916, "loss": 4.0146, "rewards/accuracies": 0.5, "rewards/chosen": -37.295562744140625, "rewards/margins": 1.786794662475586, "rewards/rejected": -39.082359313964844, "step": 2548 }, { "epoch": 0.3470860566448802, "grad_norm": 43.25792617290438, "learning_rate": 6.602812933850628e-07, "logits/chosen": 12.623109817504883, "logits/rejected": 12.434269905090332, "logps/chosen": -3.8364999294281006, "logps/rejected": -3.8691282272338867, "loss": 4.1858, "rewards/accuracies": 0.25, "rewards/chosen": -38.36499786376953, "rewards/margins": 0.32628345489501953, "rewards/rejected": -38.6912841796875, "step": 2549 }, { "epoch": 0.3472222222222222, "grad_norm": 57.01661638975882, "learning_rate": 6.60136884363681e-07, "logits/chosen": 11.835349082946777, "logits/rejected": 11.731793403625488, "logps/chosen": -3.3851780891418457, "logps/rejected": -3.782181978225708, "loss": 4.2019, "rewards/accuracies": 1.0, "rewards/chosen": -33.85177993774414, "rewards/margins": 3.9700403213500977, "rewards/rejected": -37.82182312011719, "step": 2550 }, { "epoch": 0.3473583877995643, "grad_norm": 45.34304883958244, "learning_rate": 6.599924165622198e-07, "logits/chosen": 11.189300537109375, "logits/rejected": 11.934057235717773, "logps/chosen": -3.649966239929199, "logps/rejected": -3.4696428775787354, "loss": 4.1386, "rewards/accuracies": 0.25, "rewards/chosen": -36.499664306640625, "rewards/margins": -1.8032331466674805, "rewards/rejected": -34.69643020629883, "step": 2551 }, { "epoch": 0.3474945533769063, "grad_norm": 43.335299564011464, "learning_rate": 6.598478900133229e-07, "logits/chosen": 12.31883430480957, "logits/rejected": 11.797359466552734, "logps/chosen": -3.9522805213928223, "logps/rejected": -3.8628933429718018, "loss": 4.1602, "rewards/accuracies": 0.25, "rewards/chosen": -39.522804260253906, "rewards/margins": -0.8938713073730469, "rewards/rejected": -38.628936767578125, "step": 2552 }, { "epoch": 0.34763071895424835, "grad_norm": 42.564254878798984, "learning_rate": 6.597033047496474e-07, "logits/chosen": 11.393133163452148, "logits/rejected": 12.82463264465332, "logps/chosen": -3.5488197803497314, "logps/rejected": -3.80674409866333, "loss": 4.1876, "rewards/accuracies": 0.75, "rewards/chosen": -35.488197326660156, "rewards/margins": 2.5792431831359863, "rewards/rejected": -38.067440032958984, "step": 2553 }, { "epoch": 0.34776688453159044, "grad_norm": 42.56942623160268, "learning_rate": 6.595586608038634e-07, "logits/chosen": 10.308210372924805, "logits/rejected": 11.506863594055176, "logps/chosen": -3.6840298175811768, "logps/rejected": -3.8615972995758057, "loss": 3.5092, "rewards/accuracies": 0.75, "rewards/chosen": -36.84029769897461, "rewards/margins": 1.7756757736206055, "rewards/rejected": -38.615970611572266, "step": 2554 }, { "epoch": 0.34790305010893247, "grad_norm": 46.187178839370915, "learning_rate": 6.594139582086544e-07, "logits/chosen": 10.603506088256836, "logits/rejected": 11.188882827758789, "logps/chosen": -3.31494402885437, "logps/rejected": -3.6980507373809814, "loss": 4.0341, "rewards/accuracies": 0.75, "rewards/chosen": -33.14944076538086, "rewards/margins": 3.8310680389404297, "rewards/rejected": -36.980506896972656, "step": 2555 }, { "epoch": 0.3480392156862745, "grad_norm": 43.70128668204633, "learning_rate": 6.592691969967174e-07, "logits/chosen": 10.933348655700684, "logits/rejected": 10.888267517089844, "logps/chosen": -3.5055384635925293, "logps/rejected": -3.4497554302215576, "loss": 4.1897, "rewards/accuracies": 0.75, "rewards/chosen": -35.055381774902344, "rewards/margins": -0.5578303337097168, "rewards/rejected": -34.497554779052734, "step": 2556 }, { "epoch": 0.3481753812636166, "grad_norm": 45.55347792821039, "learning_rate": 6.59124377200762e-07, "logits/chosen": 11.693927764892578, "logits/rejected": 11.62842082977295, "logps/chosen": -3.4884674549102783, "logps/rejected": -3.8936421871185303, "loss": 3.5073, "rewards/accuracies": 0.75, "rewards/chosen": -34.884674072265625, "rewards/margins": 4.051748275756836, "rewards/rejected": -38.936424255371094, "step": 2557 }, { "epoch": 0.3483115468409586, "grad_norm": 40.79074573187382, "learning_rate": 6.589794988535118e-07, "logits/chosen": 11.275604248046875, "logits/rejected": 11.526727676391602, "logps/chosen": -3.623404026031494, "logps/rejected": -3.6563501358032227, "loss": 3.8859, "rewards/accuracies": 0.5, "rewards/chosen": -36.234039306640625, "rewards/margins": 0.32946157455444336, "rewards/rejected": -36.563499450683594, "step": 2558 }, { "epoch": 0.34844771241830064, "grad_norm": 48.15875610872262, "learning_rate": 6.588345619877028e-07, "logits/chosen": 12.677347183227539, "logits/rejected": 13.101119995117188, "logps/chosen": -3.896697759628296, "logps/rejected": -3.877906322479248, "loss": 4.4672, "rewards/accuracies": 0.5, "rewards/chosen": -38.966976165771484, "rewards/margins": -0.1879119873046875, "rewards/rejected": -38.7790641784668, "step": 2559 }, { "epoch": 0.3485838779956427, "grad_norm": 40.31217400775544, "learning_rate": 6.586895666360852e-07, "logits/chosen": 11.400934219360352, "logits/rejected": 11.817258834838867, "logps/chosen": -3.7461695671081543, "logps/rejected": -3.9880964756011963, "loss": 4.0995, "rewards/accuracies": 0.5, "rewards/chosen": -37.461692810058594, "rewards/margins": 2.4192705154418945, "rewards/rejected": -39.88096618652344, "step": 2560 }, { "epoch": 0.34872004357298475, "grad_norm": 43.28728702142906, "learning_rate": 6.585445128314217e-07, "logits/chosen": 11.24105453491211, "logits/rejected": 12.982133865356445, "logps/chosen": -3.2048285007476807, "logps/rejected": -4.020632266998291, "loss": 4.3259, "rewards/accuracies": 1.0, "rewards/chosen": -32.04828643798828, "rewards/margins": 8.158038139343262, "rewards/rejected": -40.206321716308594, "step": 2561 }, { "epoch": 0.3488562091503268, "grad_norm": 44.03028220308409, "learning_rate": 6.583994006064883e-07, "logits/chosen": 11.201071739196777, "logits/rejected": 13.079742431640625, "logps/chosen": -3.358851909637451, "logps/rejected": -3.8885085582733154, "loss": 3.5738, "rewards/accuracies": 0.75, "rewards/chosen": -33.58852005004883, "rewards/margins": 5.296565532684326, "rewards/rejected": -38.88508605957031, "step": 2562 }, { "epoch": 0.34899237472766886, "grad_norm": 42.015008773951266, "learning_rate": 6.582542299940744e-07, "logits/chosen": 12.279214859008789, "logits/rejected": 12.30146598815918, "logps/chosen": -3.885042667388916, "logps/rejected": -3.913219690322876, "loss": 3.6924, "rewards/accuracies": 0.5, "rewards/chosen": -38.850425720214844, "rewards/margins": 0.2817678451538086, "rewards/rejected": -39.13219451904297, "step": 2563 }, { "epoch": 0.3491285403050109, "grad_norm": 61.99402390843577, "learning_rate": 6.581090010269825e-07, "logits/chosen": 12.524377822875977, "logits/rejected": 12.277482986450195, "logps/chosen": -3.8443427085876465, "logps/rejected": -3.6664249897003174, "loss": 4.0961, "rewards/accuracies": 0.5, "rewards/chosen": -38.44342803955078, "rewards/margins": -1.7791762351989746, "rewards/rejected": -36.664249420166016, "step": 2564 }, { "epoch": 0.3492647058823529, "grad_norm": 55.36363183648078, "learning_rate": 6.579637137380282e-07, "logits/chosen": 12.78604507446289, "logits/rejected": 11.957111358642578, "logps/chosen": -3.7083730697631836, "logps/rejected": -3.4844918251037598, "loss": 4.1058, "rewards/accuracies": 0.25, "rewards/chosen": -37.08373260498047, "rewards/margins": -2.2388153076171875, "rewards/rejected": -34.84491729736328, "step": 2565 }, { "epoch": 0.349400871459695, "grad_norm": 53.49847559041715, "learning_rate": 6.578183681600405e-07, "logits/chosen": 11.990530014038086, "logits/rejected": 11.91379165649414, "logps/chosen": -3.5693631172180176, "logps/rejected": -3.946732997894287, "loss": 3.9837, "rewards/accuracies": 0.75, "rewards/chosen": -35.693634033203125, "rewards/margins": 3.773700714111328, "rewards/rejected": -39.46733093261719, "step": 2566 }, { "epoch": 0.34953703703703703, "grad_norm": 43.01423347760638, "learning_rate": 6.576729643258613e-07, "logits/chosen": 11.646292686462402, "logits/rejected": 11.824193000793457, "logps/chosen": -3.151665687561035, "logps/rejected": -3.5832650661468506, "loss": 3.5573, "rewards/accuracies": 0.75, "rewards/chosen": -31.516658782958984, "rewards/margins": 4.3159918785095215, "rewards/rejected": -35.83264923095703, "step": 2567 }, { "epoch": 0.34967320261437906, "grad_norm": 41.00631445478747, "learning_rate": 6.575275022683459e-07, "logits/chosen": 10.946769714355469, "logits/rejected": 11.11091423034668, "logps/chosen": -3.53214430809021, "logps/rejected": -3.7452683448791504, "loss": 4.1366, "rewards/accuracies": 1.0, "rewards/chosen": -35.321441650390625, "rewards/margins": 2.131242275238037, "rewards/rejected": -37.45268249511719, "step": 2568 }, { "epoch": 0.34980936819172115, "grad_norm": 39.08195116680131, "learning_rate": 6.573819820203627e-07, "logits/chosen": 11.37159538269043, "logits/rejected": 12.210821151733398, "logps/chosen": -3.443699598312378, "logps/rejected": -3.612506151199341, "loss": 3.4513, "rewards/accuracies": 0.5, "rewards/chosen": -34.43699645996094, "rewards/margins": 1.688066005706787, "rewards/rejected": -36.12506103515625, "step": 2569 }, { "epoch": 0.3499455337690632, "grad_norm": 45.41772208424539, "learning_rate": 6.572364036147931e-07, "logits/chosen": 11.54025650024414, "logits/rejected": 11.895417213439941, "logps/chosen": -3.3931822776794434, "logps/rejected": -3.792344093322754, "loss": 4.129, "rewards/accuracies": 0.75, "rewards/chosen": -33.93182373046875, "rewards/margins": 3.9916186332702637, "rewards/rejected": -37.92344284057617, "step": 2570 }, { "epoch": 0.3500816993464052, "grad_norm": 102.17255235633448, "learning_rate": 6.570907670845316e-07, "logits/chosen": 11.1500883102417, "logits/rejected": 11.151004791259766, "logps/chosen": -2.9956037998199463, "logps/rejected": -3.4191720485687256, "loss": 3.5321, "rewards/accuracies": 1.0, "rewards/chosen": -29.956037521362305, "rewards/margins": 4.235683441162109, "rewards/rejected": -34.19171905517578, "step": 2571 }, { "epoch": 0.3502178649237473, "grad_norm": 42.93832018559587, "learning_rate": 6.569450724624863e-07, "logits/chosen": 11.373788833618164, "logits/rejected": 10.888626098632812, "logps/chosen": -3.38980770111084, "logps/rejected": -3.45170521736145, "loss": 3.8293, "rewards/accuracies": 0.5, "rewards/chosen": -33.89807891845703, "rewards/margins": 0.6189732551574707, "rewards/rejected": -34.517051696777344, "step": 2572 }, { "epoch": 0.3503540305010893, "grad_norm": 40.93112860381067, "learning_rate": 6.567993197815779e-07, "logits/chosen": 10.657174110412598, "logits/rejected": 11.414558410644531, "logps/chosen": -3.6214425563812256, "logps/rejected": -3.8462347984313965, "loss": 3.6616, "rewards/accuracies": 0.75, "rewards/chosen": -36.21442413330078, "rewards/margins": 2.2479209899902344, "rewards/rejected": -38.46234893798828, "step": 2573 }, { "epoch": 0.35049019607843135, "grad_norm": 41.598742748627615, "learning_rate": 6.566535090747404e-07, "logits/chosen": 11.091222763061523, "logits/rejected": 10.992286682128906, "logps/chosen": -3.597278356552124, "logps/rejected": -3.793285846710205, "loss": 3.5873, "rewards/accuracies": 1.0, "rewards/chosen": -35.972782135009766, "rewards/margins": 1.960073471069336, "rewards/rejected": -37.932857513427734, "step": 2574 }, { "epoch": 0.35062636165577343, "grad_norm": 40.81907885645978, "learning_rate": 6.565076403749211e-07, "logits/chosen": 11.011137008666992, "logits/rejected": 11.277616500854492, "logps/chosen": -3.627279043197632, "logps/rejected": -3.6391284465789795, "loss": 4.0221, "rewards/accuracies": 0.5, "rewards/chosen": -36.272789001464844, "rewards/margins": 0.11849260330200195, "rewards/rejected": -36.39128112792969, "step": 2575 }, { "epoch": 0.35076252723311546, "grad_norm": 67.25237822768364, "learning_rate": 6.563617137150801e-07, "logits/chosen": 11.241204261779785, "logits/rejected": 11.649354934692383, "logps/chosen": -3.4609811305999756, "logps/rejected": -3.7761809825897217, "loss": 3.8511, "rewards/accuracies": 0.5, "rewards/chosen": -34.60980987548828, "rewards/margins": 3.1519975662231445, "rewards/rejected": -37.761810302734375, "step": 2576 }, { "epoch": 0.3508986928104575, "grad_norm": 42.27715374593242, "learning_rate": 6.562157291281908e-07, "logits/chosen": 11.591221809387207, "logits/rejected": 11.739568710327148, "logps/chosen": -3.261122465133667, "logps/rejected": -3.7108805179595947, "loss": 3.8773, "rewards/accuracies": 0.75, "rewards/chosen": -32.61122512817383, "rewards/margins": 4.4975810050964355, "rewards/rejected": -37.108802795410156, "step": 2577 }, { "epoch": 0.3510348583877996, "grad_norm": 43.08855607268002, "learning_rate": 6.560696866472396e-07, "logits/chosen": 11.924055099487305, "logits/rejected": 12.579017639160156, "logps/chosen": -3.6462764739990234, "logps/rejected": -3.6164002418518066, "loss": 4.2467, "rewards/accuracies": 0.5, "rewards/chosen": -36.462764739990234, "rewards/margins": -0.29875946044921875, "rewards/rejected": -36.164005279541016, "step": 2578 }, { "epoch": 0.3511710239651416, "grad_norm": 39.54141179582385, "learning_rate": 6.559235863052259e-07, "logits/chosen": 10.656370162963867, "logits/rejected": 11.617051124572754, "logps/chosen": -3.572014331817627, "logps/rejected": -4.065199851989746, "loss": 3.8733, "rewards/accuracies": 1.0, "rewards/chosen": -35.72014617919922, "rewards/margins": 4.931856155395508, "rewards/rejected": -40.652000427246094, "step": 2579 }, { "epoch": 0.35130718954248363, "grad_norm": 44.29649342850882, "learning_rate": 6.557774281351626e-07, "logits/chosen": 10.574026107788086, "logits/rejected": 11.736332893371582, "logps/chosen": -3.1793885231018066, "logps/rejected": -3.788789749145508, "loss": 4.0805, "rewards/accuracies": 1.0, "rewards/chosen": -31.793886184692383, "rewards/margins": 6.0940117835998535, "rewards/rejected": -37.88789749145508, "step": 2580 }, { "epoch": 0.3514433551198257, "grad_norm": 43.752386029394, "learning_rate": 6.556312121700751e-07, "logits/chosen": 10.359790802001953, "logits/rejected": 10.974140167236328, "logps/chosen": -3.0666441917419434, "logps/rejected": -3.4412174224853516, "loss": 3.8629, "rewards/accuracies": 1.0, "rewards/chosen": -30.66644287109375, "rewards/margins": 3.745730400085449, "rewards/rejected": -34.41217041015625, "step": 2581 }, { "epoch": 0.35157952069716775, "grad_norm": 47.39979321427958, "learning_rate": 6.55484938443002e-07, "logits/chosen": 10.609119415283203, "logits/rejected": 12.039791107177734, "logps/chosen": -3.4476137161254883, "logps/rejected": -3.73882794380188, "loss": 4.2199, "rewards/accuracies": 0.5, "rewards/chosen": -34.47613525390625, "rewards/margins": 2.912144184112549, "rewards/rejected": -37.388282775878906, "step": 2582 }, { "epoch": 0.35171568627450983, "grad_norm": 42.49463107812876, "learning_rate": 6.553386069869953e-07, "logits/chosen": 10.575658798217773, "logits/rejected": 11.443717956542969, "logps/chosen": -3.343003034591675, "logps/rejected": -3.568453788757324, "loss": 3.9939, "rewards/accuracies": 0.75, "rewards/chosen": -33.430030822753906, "rewards/margins": 2.2545084953308105, "rewards/rejected": -35.684539794921875, "step": 2583 }, { "epoch": 0.35185185185185186, "grad_norm": 42.1807469253974, "learning_rate": 6.551922178351196e-07, "logits/chosen": 10.71114730834961, "logits/rejected": 11.309904098510742, "logps/chosen": -3.5147669315338135, "logps/rejected": -3.6225507259368896, "loss": 4.2398, "rewards/accuracies": 0.75, "rewards/chosen": -35.147666931152344, "rewards/margins": 1.0778417587280273, "rewards/rejected": -36.22550964355469, "step": 2584 }, { "epoch": 0.3519880174291939, "grad_norm": 40.64364420097958, "learning_rate": 6.55045771020453e-07, "logits/chosen": 11.430438995361328, "logits/rejected": 12.723455429077148, "logps/chosen": -3.128708839416504, "logps/rejected": -3.783456325531006, "loss": 3.8794, "rewards/accuracies": 1.0, "rewards/chosen": -31.287084579467773, "rewards/margins": 6.547476291656494, "rewards/rejected": -37.834564208984375, "step": 2585 }, { "epoch": 0.352124183006536, "grad_norm": 38.79470316056383, "learning_rate": 6.548992665760861e-07, "logits/chosen": 11.96345043182373, "logits/rejected": 11.076458930969238, "logps/chosen": -3.3933210372924805, "logps/rejected": -3.4097814559936523, "loss": 4.1007, "rewards/accuracies": 0.75, "rewards/chosen": -33.93320846557617, "rewards/margins": 0.16460514068603516, "rewards/rejected": -34.097816467285156, "step": 2586 }, { "epoch": 0.352260348583878, "grad_norm": 39.53675098252076, "learning_rate": 6.547527045351228e-07, "logits/chosen": 11.815998077392578, "logits/rejected": 11.485574722290039, "logps/chosen": -3.4546005725860596, "logps/rejected": -3.7015137672424316, "loss": 3.6916, "rewards/accuracies": 0.75, "rewards/chosen": -34.54600524902344, "rewards/margins": 2.4691295623779297, "rewards/rejected": -37.01513671875, "step": 2587 }, { "epoch": 0.35239651416122003, "grad_norm": 39.13996929054848, "learning_rate": 6.546060849306803e-07, "logits/chosen": 11.128807067871094, "logits/rejected": 10.26046371459961, "logps/chosen": -3.2911665439605713, "logps/rejected": -3.3670196533203125, "loss": 3.8199, "rewards/accuracies": 0.5, "rewards/chosen": -32.91166687011719, "rewards/margins": 0.7585325241088867, "rewards/rejected": -33.670196533203125, "step": 2588 }, { "epoch": 0.3525326797385621, "grad_norm": 46.18994337859116, "learning_rate": 6.544594077958882e-07, "logits/chosen": 11.14627742767334, "logits/rejected": 11.618677139282227, "logps/chosen": -3.5170469284057617, "logps/rejected": -3.757122755050659, "loss": 4.427, "rewards/accuracies": 0.75, "rewards/chosen": -35.17047119140625, "rewards/margins": 2.400759696960449, "rewards/rejected": -37.57122802734375, "step": 2589 }, { "epoch": 0.35266884531590414, "grad_norm": 43.05491110558363, "learning_rate": 6.543126731638896e-07, "logits/chosen": 11.246915817260742, "logits/rejected": 12.06301212310791, "logps/chosen": -3.4319324493408203, "logps/rejected": -3.6777353286743164, "loss": 4.4627, "rewards/accuracies": 0.75, "rewards/chosen": -34.3193244934082, "rewards/margins": 2.4580259323120117, "rewards/rejected": -36.77735137939453, "step": 2590 }, { "epoch": 0.3528050108932462, "grad_norm": 44.04423162488068, "learning_rate": 6.541658810678404e-07, "logits/chosen": 11.084810256958008, "logits/rejected": 12.339859008789062, "logps/chosen": -3.6977102756500244, "logps/rejected": -3.6313414573669434, "loss": 4.1988, "rewards/accuracies": 0.25, "rewards/chosen": -36.97710418701172, "rewards/margins": -0.6636896133422852, "rewards/rejected": -36.31341552734375, "step": 2591 }, { "epoch": 0.35294117647058826, "grad_norm": 43.471675122172016, "learning_rate": 6.540190315409092e-07, "logits/chosen": 10.150737762451172, "logits/rejected": 11.636528015136719, "logps/chosen": -3.2052998542785645, "logps/rejected": -3.793689727783203, "loss": 4.2104, "rewards/accuracies": 0.75, "rewards/chosen": -32.053001403808594, "rewards/margins": 5.883896827697754, "rewards/rejected": -37.93689727783203, "step": 2592 }, { "epoch": 0.3530773420479303, "grad_norm": 41.12116205020581, "learning_rate": 6.538721246162783e-07, "logits/chosen": 12.030378341674805, "logits/rejected": 11.144851684570312, "logps/chosen": -3.8396711349487305, "logps/rejected": -3.7294490337371826, "loss": 4.4537, "rewards/accuracies": 0.25, "rewards/chosen": -38.396705627441406, "rewards/margins": -1.1022167205810547, "rewards/rejected": -37.29449462890625, "step": 2593 }, { "epoch": 0.3532135076252723, "grad_norm": 45.968010051476824, "learning_rate": 6.537251603271421e-07, "logits/chosen": 11.868425369262695, "logits/rejected": 11.650043487548828, "logps/chosen": -3.533792734146118, "logps/rejected": -3.564429998397827, "loss": 4.5551, "rewards/accuracies": 0.5, "rewards/chosen": -35.33792495727539, "rewards/margins": 0.30637311935424805, "rewards/rejected": -35.6442985534668, "step": 2594 }, { "epoch": 0.3533496732026144, "grad_norm": 40.1567469824107, "learning_rate": 6.535781387067088e-07, "logits/chosen": 12.300009727478027, "logits/rejected": 11.583503723144531, "logps/chosen": -3.888287305831909, "logps/rejected": -3.8591322898864746, "loss": 3.7393, "rewards/accuracies": 0.5, "rewards/chosen": -38.88287353515625, "rewards/margins": -0.2915506362915039, "rewards/rejected": -38.59132385253906, "step": 2595 }, { "epoch": 0.35348583877995643, "grad_norm": 43.02045298730788, "learning_rate": 6.534310597881989e-07, "logits/chosen": 11.002863883972168, "logits/rejected": 11.514780044555664, "logps/chosen": -3.773017406463623, "logps/rejected": -3.9177000522613525, "loss": 4.4198, "rewards/accuracies": 0.5, "rewards/chosen": -37.73017120361328, "rewards/margins": 1.4468259811401367, "rewards/rejected": -39.177001953125, "step": 2596 }, { "epoch": 0.35362200435729846, "grad_norm": 40.92201020377758, "learning_rate": 6.532839236048461e-07, "logits/chosen": 12.10345458984375, "logits/rejected": 11.585659980773926, "logps/chosen": -3.9011974334716797, "logps/rejected": -3.726172924041748, "loss": 3.9476, "rewards/accuracies": 0.25, "rewards/chosen": -39.01197052001953, "rewards/margins": -1.7502460479736328, "rewards/rejected": -37.26172637939453, "step": 2597 }, { "epoch": 0.35375816993464054, "grad_norm": 41.85619627865507, "learning_rate": 6.53136730189897e-07, "logits/chosen": 10.472309112548828, "logits/rejected": 10.976887702941895, "logps/chosen": -3.657724380493164, "logps/rejected": -3.879316568374634, "loss": 3.8473, "rewards/accuracies": 0.75, "rewards/chosen": -36.57724380493164, "rewards/margins": 2.2159223556518555, "rewards/rejected": -38.79316711425781, "step": 2598 }, { "epoch": 0.35389433551198257, "grad_norm": 44.81668191555194, "learning_rate": 6.529894795766114e-07, "logits/chosen": 11.352250099182129, "logits/rejected": 11.607877731323242, "logps/chosen": -3.7181296348571777, "logps/rejected": -3.630864381790161, "loss": 4.433, "rewards/accuracies": 0.5, "rewards/chosen": -37.181297302246094, "rewards/margins": -0.8726520538330078, "rewards/rejected": -36.30864334106445, "step": 2599 }, { "epoch": 0.3540305010893246, "grad_norm": 38.733157594814344, "learning_rate": 6.528421717982616e-07, "logits/chosen": 12.177261352539062, "logits/rejected": 11.756616592407227, "logps/chosen": -4.088240623474121, "logps/rejected": -4.01024055480957, "loss": 4.1046, "rewards/accuracies": 0.5, "rewards/chosen": -40.882408142089844, "rewards/margins": -0.7800016403198242, "rewards/rejected": -40.10240173339844, "step": 2600 }, { "epoch": 0.3541666666666667, "grad_norm": 42.038767873202545, "learning_rate": 6.526948068881332e-07, "logits/chosen": 10.610631942749023, "logits/rejected": 11.16264820098877, "logps/chosen": -3.560912847518921, "logps/rejected": -3.7736237049102783, "loss": 3.9171, "rewards/accuracies": 0.75, "rewards/chosen": -35.609127044677734, "rewards/margins": 2.127108573913574, "rewards/rejected": -37.736236572265625, "step": 2601 }, { "epoch": 0.3543028322440087, "grad_norm": 38.57729227973795, "learning_rate": 6.525473848795243e-07, "logits/chosen": 11.127174377441406, "logits/rejected": 11.239131927490234, "logps/chosen": -3.6406619548797607, "logps/rejected": -3.594369649887085, "loss": 4.2974, "rewards/accuracies": 0.5, "rewards/chosen": -36.4066162109375, "rewards/margins": -0.4629220962524414, "rewards/rejected": -35.94369888305664, "step": 2602 }, { "epoch": 0.35443899782135074, "grad_norm": 48.28797501724675, "learning_rate": 6.523999058057462e-07, "logits/chosen": 12.249300003051758, "logits/rejected": 11.390368461608887, "logps/chosen": -3.6890389919281006, "logps/rejected": -3.519951820373535, "loss": 4.4612, "rewards/accuracies": 0.5, "rewards/chosen": -36.89038848876953, "rewards/margins": -1.690873146057129, "rewards/rejected": -35.19951629638672, "step": 2603 }, { "epoch": 0.3545751633986928, "grad_norm": 42.03815640647769, "learning_rate": 6.522523697001231e-07, "logits/chosen": 11.096243858337402, "logits/rejected": 10.28961181640625, "logps/chosen": -3.569169282913208, "logps/rejected": -3.3023462295532227, "loss": 3.6462, "rewards/accuracies": 0.25, "rewards/chosen": -35.69169616699219, "rewards/margins": -2.668231964111328, "rewards/rejected": -33.023460388183594, "step": 2604 }, { "epoch": 0.35471132897603486, "grad_norm": 43.297106144255416, "learning_rate": 6.521047765959919e-07, "logits/chosen": 10.454547882080078, "logits/rejected": 10.94416332244873, "logps/chosen": -3.392085075378418, "logps/rejected": -3.5737454891204834, "loss": 4.6087, "rewards/accuracies": 0.75, "rewards/chosen": -33.92084884643555, "rewards/margins": 1.8166046142578125, "rewards/rejected": -35.737457275390625, "step": 2605 }, { "epoch": 0.3548474945533769, "grad_norm": 40.878618097413664, "learning_rate": 6.519571265267025e-07, "logits/chosen": 11.450384140014648, "logits/rejected": 10.424787521362305, "logps/chosen": -3.434378147125244, "logps/rejected": -3.1043481826782227, "loss": 4.4179, "rewards/accuracies": 0.25, "rewards/chosen": -34.343780517578125, "rewards/margins": -3.3002991676330566, "rewards/rejected": -31.043481826782227, "step": 2606 }, { "epoch": 0.35498366013071897, "grad_norm": 44.28220662228631, "learning_rate": 6.518094195256175e-07, "logits/chosen": 10.73868465423584, "logits/rejected": 11.024032592773438, "logps/chosen": -3.2488741874694824, "logps/rejected": -3.446974754333496, "loss": 4.127, "rewards/accuracies": 0.5, "rewards/chosen": -32.488739013671875, "rewards/margins": 1.981006145477295, "rewards/rejected": -34.469749450683594, "step": 2607 }, { "epoch": 0.355119825708061, "grad_norm": 42.338396010289124, "learning_rate": 6.516616556261129e-07, "logits/chosen": 11.176610946655273, "logits/rejected": 11.054027557373047, "logps/chosen": -4.171586036682129, "logps/rejected": -4.072329521179199, "loss": 4.4676, "rewards/accuracies": 0.25, "rewards/chosen": -41.71585464477539, "rewards/margins": -0.9925565719604492, "rewards/rejected": -40.723297119140625, "step": 2608 }, { "epoch": 0.355255991285403, "grad_norm": 46.429767065228816, "learning_rate": 6.51513834861577e-07, "logits/chosen": 10.594476699829102, "logits/rejected": 11.546236991882324, "logps/chosen": -3.524754524230957, "logps/rejected": -3.584743022918701, "loss": 3.984, "rewards/accuracies": 0.5, "rewards/chosen": -35.2475471496582, "rewards/margins": 0.5998826026916504, "rewards/rejected": -35.84742736816406, "step": 2609 }, { "epoch": 0.3553921568627451, "grad_norm": 42.54844355722791, "learning_rate": 6.513659572654108e-07, "logits/chosen": 11.34726333618164, "logits/rejected": 12.012723922729492, "logps/chosen": -3.3803558349609375, "logps/rejected": -3.4570281505584717, "loss": 4.1347, "rewards/accuracies": 0.5, "rewards/chosen": -33.803558349609375, "rewards/margins": 0.7667255401611328, "rewards/rejected": -34.570281982421875, "step": 2610 }, { "epoch": 0.35552832244008714, "grad_norm": 39.658585141616, "learning_rate": 6.512180228710288e-07, "logits/chosen": 10.342016220092773, "logits/rejected": 11.395637512207031, "logps/chosen": -3.0693328380584717, "logps/rejected": -3.194404125213623, "loss": 4.0771, "rewards/accuracies": 0.5, "rewards/chosen": -30.693328857421875, "rewards/margins": 1.2507143020629883, "rewards/rejected": -31.944042205810547, "step": 2611 }, { "epoch": 0.35566448801742917, "grad_norm": 45.82806811860188, "learning_rate": 6.510700317118582e-07, "logits/chosen": 11.140646934509277, "logits/rejected": 12.575982093811035, "logps/chosen": -3.215439558029175, "logps/rejected": -3.7129688262939453, "loss": 3.8874, "rewards/accuracies": 0.75, "rewards/chosen": -32.154396057128906, "rewards/margins": 4.9752936363220215, "rewards/rejected": -37.12969207763672, "step": 2612 }, { "epoch": 0.35580065359477125, "grad_norm": 38.83921946149471, "learning_rate": 6.509219838213383e-07, "logits/chosen": 9.544013977050781, "logits/rejected": 10.870515823364258, "logps/chosen": -2.8439626693725586, "logps/rejected": -3.2872440814971924, "loss": 4.3915, "rewards/accuracies": 0.75, "rewards/chosen": -28.43962860107422, "rewards/margins": 4.432812690734863, "rewards/rejected": -32.872440338134766, "step": 2613 }, { "epoch": 0.3559368191721133, "grad_norm": 46.216585787587825, "learning_rate": 6.507738792329222e-07, "logits/chosen": 11.67833423614502, "logits/rejected": 11.445091247558594, "logps/chosen": -3.4564356803894043, "logps/rejected": -3.7083256244659424, "loss": 4.1704, "rewards/accuracies": 0.75, "rewards/chosen": -34.56435775756836, "rewards/margins": 2.5188980102539062, "rewards/rejected": -37.083255767822266, "step": 2614 }, { "epoch": 0.3560729847494553, "grad_norm": 48.0669987800748, "learning_rate": 6.506257179800751e-07, "logits/chosen": 10.952882766723633, "logits/rejected": 10.766263961791992, "logps/chosen": -3.744580030441284, "logps/rejected": -3.5451924800872803, "loss": 4.2454, "rewards/accuracies": 0.25, "rewards/chosen": -37.44580078125, "rewards/margins": -1.993875503540039, "rewards/rejected": -35.451927185058594, "step": 2615 }, { "epoch": 0.3562091503267974, "grad_norm": 36.87155369003312, "learning_rate": 6.504775000962752e-07, "logits/chosen": 11.56466293334961, "logits/rejected": 11.932082176208496, "logps/chosen": -3.3790876865386963, "logps/rejected": -3.6458818912506104, "loss": 3.5306, "rewards/accuracies": 0.75, "rewards/chosen": -33.79087829589844, "rewards/margins": 2.6679415702819824, "rewards/rejected": -36.45882034301758, "step": 2616 }, { "epoch": 0.3563453159041394, "grad_norm": 40.28022449203723, "learning_rate": 6.503292256150139e-07, "logits/chosen": 9.588294982910156, "logits/rejected": 10.92655086517334, "logps/chosen": -3.3199820518493652, "logps/rejected": -3.696788787841797, "loss": 4.1535, "rewards/accuracies": 0.5, "rewards/chosen": -33.19982147216797, "rewards/margins": 3.7680673599243164, "rewards/rejected": -36.96788787841797, "step": 2617 }, { "epoch": 0.35648148148148145, "grad_norm": 45.20182232225749, "learning_rate": 6.501808945697947e-07, "logits/chosen": 10.507904052734375, "logits/rejected": 11.856830596923828, "logps/chosen": -3.1097285747528076, "logps/rejected": -3.3532543182373047, "loss": 4.284, "rewards/accuracies": 0.75, "rewards/chosen": -31.097286224365234, "rewards/margins": 2.4352574348449707, "rewards/rejected": -33.53253936767578, "step": 2618 }, { "epoch": 0.35661764705882354, "grad_norm": 55.76060657426872, "learning_rate": 6.500325069941343e-07, "logits/chosen": 11.29423713684082, "logits/rejected": 10.824705123901367, "logps/chosen": -3.5009846687316895, "logps/rejected": -3.5521907806396484, "loss": 3.9425, "rewards/accuracies": 0.75, "rewards/chosen": -35.009849548339844, "rewards/margins": 0.512061595916748, "rewards/rejected": -35.52191162109375, "step": 2619 }, { "epoch": 0.35675381263616557, "grad_norm": 40.03905306356141, "learning_rate": 6.498840629215623e-07, "logits/chosen": 11.309476852416992, "logits/rejected": 11.927608489990234, "logps/chosen": -3.466367244720459, "logps/rejected": -3.8584794998168945, "loss": 4.0362, "rewards/accuracies": 1.0, "rewards/chosen": -34.663673400878906, "rewards/margins": 3.92112398147583, "rewards/rejected": -38.58479309082031, "step": 2620 }, { "epoch": 0.35688997821350765, "grad_norm": 37.79795662120833, "learning_rate": 6.497355623856207e-07, "logits/chosen": 12.198726654052734, "logits/rejected": 11.500066757202148, "logps/chosen": -3.7886288166046143, "logps/rejected": -3.8073320388793945, "loss": 3.7613, "rewards/accuracies": 0.25, "rewards/chosen": -37.886287689208984, "rewards/margins": 0.18703460693359375, "rewards/rejected": -38.07332229614258, "step": 2621 }, { "epoch": 0.3570261437908497, "grad_norm": 42.82384342883751, "learning_rate": 6.495870054198644e-07, "logits/chosen": 11.355724334716797, "logits/rejected": 10.889074325561523, "logps/chosen": -3.237267017364502, "logps/rejected": -3.25051212310791, "loss": 4.3606, "rewards/accuracies": 0.5, "rewards/chosen": -32.37266540527344, "rewards/margins": 0.13245344161987305, "rewards/rejected": -32.50511932373047, "step": 2622 }, { "epoch": 0.3571623093681917, "grad_norm": 46.3462034859674, "learning_rate": 6.494383920578612e-07, "logits/chosen": 10.376220703125, "logits/rejected": 11.388486862182617, "logps/chosen": -2.9256882667541504, "logps/rejected": -3.1498236656188965, "loss": 4.2879, "rewards/accuracies": 0.75, "rewards/chosen": -29.25688362121582, "rewards/margins": 2.241352081298828, "rewards/rejected": -31.498233795166016, "step": 2623 }, { "epoch": 0.3572984749455338, "grad_norm": 45.47558234259226, "learning_rate": 6.492897223331913e-07, "logits/chosen": 10.316516876220703, "logits/rejected": 10.892084121704102, "logps/chosen": -3.2349774837493896, "logps/rejected": -3.8777523040771484, "loss": 4.3307, "rewards/accuracies": 1.0, "rewards/chosen": -32.34977340698242, "rewards/margins": 6.427750587463379, "rewards/rejected": -38.77752685546875, "step": 2624 }, { "epoch": 0.3574346405228758, "grad_norm": 40.7951633400019, "learning_rate": 6.49140996279448e-07, "logits/chosen": 10.178427696228027, "logits/rejected": 10.57737922668457, "logps/chosen": -2.7780821323394775, "logps/rejected": -3.175370931625366, "loss": 3.937, "rewards/accuracies": 0.75, "rewards/chosen": -27.78082275390625, "rewards/margins": 3.9728875160217285, "rewards/rejected": -31.753707885742188, "step": 2625 }, { "epoch": 0.35757080610021785, "grad_norm": 37.17627751860655, "learning_rate": 6.489922139302372e-07, "logits/chosen": 10.543209075927734, "logits/rejected": 10.859092712402344, "logps/chosen": -3.384570598602295, "logps/rejected": -3.6779747009277344, "loss": 3.6817, "rewards/accuracies": 0.75, "rewards/chosen": -33.845706939697266, "rewards/margins": 2.9340391159057617, "rewards/rejected": -36.779747009277344, "step": 2626 }, { "epoch": 0.35770697167755994, "grad_norm": 44.51509124810079, "learning_rate": 6.488433753191776e-07, "logits/chosen": 12.716350555419922, "logits/rejected": 12.38016128540039, "logps/chosen": -3.9467074871063232, "logps/rejected": -4.240185737609863, "loss": 4.085, "rewards/accuracies": 0.75, "rewards/chosen": -39.46707534790039, "rewards/margins": 2.9347848892211914, "rewards/rejected": -42.40186309814453, "step": 2627 }, { "epoch": 0.35784313725490197, "grad_norm": 41.53394499394941, "learning_rate": 6.486944804799002e-07, "logits/chosen": 10.608704566955566, "logits/rejected": 10.943899154663086, "logps/chosen": -2.9313604831695557, "logps/rejected": -3.288069248199463, "loss": 4.1529, "rewards/accuracies": 1.0, "rewards/chosen": -29.31360626220703, "rewards/margins": 3.567087173461914, "rewards/rejected": -32.88069152832031, "step": 2628 }, { "epoch": 0.357979302832244, "grad_norm": 43.1009550404914, "learning_rate": 6.485455294460494e-07, "logits/chosen": 11.032003402709961, "logits/rejected": 11.878429412841797, "logps/chosen": -3.377997398376465, "logps/rejected": -3.6082558631896973, "loss": 4.1828, "rewards/accuracies": 0.75, "rewards/chosen": -33.77997589111328, "rewards/margins": 2.3025832176208496, "rewards/rejected": -36.082557678222656, "step": 2629 }, { "epoch": 0.3581154684095861, "grad_norm": 53.193465050164505, "learning_rate": 6.483965222512815e-07, "logits/chosen": 11.491470336914062, "logits/rejected": 11.661163330078125, "logps/chosen": -3.6740684509277344, "logps/rejected": -3.894362449645996, "loss": 3.5753, "rewards/accuracies": 0.5, "rewards/chosen": -36.740684509277344, "rewards/margins": 2.2029409408569336, "rewards/rejected": -38.943626403808594, "step": 2630 }, { "epoch": 0.3582516339869281, "grad_norm": 43.209131440829005, "learning_rate": 6.482474589292662e-07, "logits/chosen": 11.264540672302246, "logits/rejected": 11.293130874633789, "logps/chosen": -3.5553579330444336, "logps/rejected": -3.6651363372802734, "loss": 4.3901, "rewards/accuracies": 0.75, "rewards/chosen": -35.55358123779297, "rewards/margins": 1.0977821350097656, "rewards/rejected": -36.651363372802734, "step": 2631 }, { "epoch": 0.35838779956427014, "grad_norm": 48.881849212740256, "learning_rate": 6.480983395136857e-07, "logits/chosen": 11.671323776245117, "logits/rejected": 12.175559997558594, "logps/chosen": -3.3376288414001465, "logps/rejected": -3.726926803588867, "loss": 4.2717, "rewards/accuracies": 1.0, "rewards/chosen": -33.37628936767578, "rewards/margins": 3.89298152923584, "rewards/rejected": -37.26927185058594, "step": 2632 }, { "epoch": 0.3585239651416122, "grad_norm": 52.874081215018705, "learning_rate": 6.479491640382343e-07, "logits/chosen": 12.080207824707031, "logits/rejected": 12.13383674621582, "logps/chosen": -3.683074712753296, "logps/rejected": -3.4662137031555176, "loss": 4.0008, "rewards/accuracies": 0.25, "rewards/chosen": -36.830745697021484, "rewards/margins": -2.1686081886291504, "rewards/rejected": -34.662139892578125, "step": 2633 }, { "epoch": 0.35866013071895425, "grad_norm": 44.739104862194516, "learning_rate": 6.477999325366199e-07, "logits/chosen": 10.727039337158203, "logits/rejected": 11.598974227905273, "logps/chosen": -3.525254726409912, "logps/rejected": -3.6960790157318115, "loss": 4.257, "rewards/accuracies": 0.75, "rewards/chosen": -35.25254440307617, "rewards/margins": 1.7082433700561523, "rewards/rejected": -36.96078872680664, "step": 2634 }, { "epoch": 0.3587962962962963, "grad_norm": 48.414601262386775, "learning_rate": 6.476506450425624e-07, "logits/chosen": 9.225778579711914, "logits/rejected": 11.239339828491211, "logps/chosen": -3.0298027992248535, "logps/rejected": -3.743079900741577, "loss": 4.2341, "rewards/accuracies": 1.0, "rewards/chosen": -30.29802703857422, "rewards/margins": 7.132771968841553, "rewards/rejected": -37.43080139160156, "step": 2635 }, { "epoch": 0.35893246187363836, "grad_norm": 38.65977609541052, "learning_rate": 6.475013015897945e-07, "logits/chosen": 11.078211784362793, "logits/rejected": 12.42742919921875, "logps/chosen": -3.1347298622131348, "logps/rejected": -3.444066047668457, "loss": 3.7602, "rewards/accuracies": 0.75, "rewards/chosen": -31.347299575805664, "rewards/margins": 3.0933609008789062, "rewards/rejected": -34.44065856933594, "step": 2636 }, { "epoch": 0.3590686274509804, "grad_norm": 47.389202206799716, "learning_rate": 6.473519022120616e-07, "logits/chosen": 10.301345825195312, "logits/rejected": 10.84693717956543, "logps/chosen": -3.453606367111206, "logps/rejected": -3.6210460662841797, "loss": 4.1906, "rewards/accuracies": 1.0, "rewards/chosen": -34.53606414794922, "rewards/margins": 1.6743965148925781, "rewards/rejected": -36.2104606628418, "step": 2637 }, { "epoch": 0.3592047930283224, "grad_norm": 45.18873262484925, "learning_rate": 6.47202446943122e-07, "logits/chosen": 11.002768516540527, "logits/rejected": 11.718994140625, "logps/chosen": -3.3983778953552246, "logps/rejected": -3.823000907897949, "loss": 3.7242, "rewards/accuracies": 0.75, "rewards/chosen": -33.98377990722656, "rewards/margins": 4.246228218078613, "rewards/rejected": -38.23000717163086, "step": 2638 }, { "epoch": 0.3593409586056645, "grad_norm": 41.22644676285371, "learning_rate": 6.470529358167459e-07, "logits/chosen": 10.624198913574219, "logits/rejected": 11.295787811279297, "logps/chosen": -3.415045976638794, "logps/rejected": -3.7751784324645996, "loss": 3.7838, "rewards/accuracies": 0.75, "rewards/chosen": -34.15045928955078, "rewards/margins": 3.601325035095215, "rewards/rejected": -37.75178527832031, "step": 2639 }, { "epoch": 0.35947712418300654, "grad_norm": 54.60693937416916, "learning_rate": 6.469033688667167e-07, "logits/chosen": 10.652270317077637, "logits/rejected": 11.514679908752441, "logps/chosen": -3.2773945331573486, "logps/rejected": -3.8953099250793457, "loss": 4.0957, "rewards/accuracies": 0.75, "rewards/chosen": -32.77394485473633, "rewards/margins": 6.179156303405762, "rewards/rejected": -38.953102111816406, "step": 2640 }, { "epoch": 0.35961328976034856, "grad_norm": 40.06950992853826, "learning_rate": 6.467537461268306e-07, "logits/chosen": 11.29292106628418, "logits/rejected": 10.559613227844238, "logps/chosen": -3.267690658569336, "logps/rejected": -3.3234589099884033, "loss": 3.8626, "rewards/accuracies": 0.5, "rewards/chosen": -32.676902770996094, "rewards/margins": 0.5576834678649902, "rewards/rejected": -33.234588623046875, "step": 2641 }, { "epoch": 0.35974945533769065, "grad_norm": 40.14330453945665, "learning_rate": 6.466040676308959e-07, "logits/chosen": 10.632506370544434, "logits/rejected": 10.908416748046875, "logps/chosen": -3.2998385429382324, "logps/rejected": -3.670485258102417, "loss": 3.9682, "rewards/accuracies": 0.75, "rewards/chosen": -32.99838638305664, "rewards/margins": 3.7064666748046875, "rewards/rejected": -36.70485305786133, "step": 2642 }, { "epoch": 0.3598856209150327, "grad_norm": 40.56903420691004, "learning_rate": 6.464543334127334e-07, "logits/chosen": 10.702508926391602, "logits/rejected": 11.018850326538086, "logps/chosen": -3.407212972640991, "logps/rejected": -3.6258630752563477, "loss": 3.9353, "rewards/accuracies": 0.75, "rewards/chosen": -34.07212829589844, "rewards/margins": 2.18649959564209, "rewards/rejected": -36.258628845214844, "step": 2643 }, { "epoch": 0.3600217864923747, "grad_norm": 42.202017604890436, "learning_rate": 6.463045435061772e-07, "logits/chosen": 11.259727478027344, "logits/rejected": 11.88086986541748, "logps/chosen": -3.3524959087371826, "logps/rejected": -3.8413097858428955, "loss": 4.0238, "rewards/accuracies": 1.0, "rewards/chosen": -33.524959564208984, "rewards/margins": 4.888136863708496, "rewards/rejected": -38.4130973815918, "step": 2644 }, { "epoch": 0.3601579520697168, "grad_norm": 45.92648904392825, "learning_rate": 6.461546979450736e-07, "logits/chosen": 10.851724624633789, "logits/rejected": 11.323446273803711, "logps/chosen": -3.333097457885742, "logps/rejected": -3.694981813430786, "loss": 4.2799, "rewards/accuracies": 0.75, "rewards/chosen": -33.33097457885742, "rewards/margins": 3.6188430786132812, "rewards/rejected": -36.9498176574707, "step": 2645 }, { "epoch": 0.3602941176470588, "grad_norm": 42.526085550072594, "learning_rate": 6.46004796763281e-07, "logits/chosen": 11.769119262695312, "logits/rejected": 11.811270713806152, "logps/chosen": -3.49613094329834, "logps/rejected": -3.552145481109619, "loss": 3.8271, "rewards/accuracies": 0.5, "rewards/chosen": -34.96131134033203, "rewards/margins": 0.5601439476013184, "rewards/rejected": -35.521453857421875, "step": 2646 }, { "epoch": 0.36043028322440085, "grad_norm": 51.72351479680696, "learning_rate": 6.458548399946712e-07, "logits/chosen": 10.33036994934082, "logits/rejected": 11.607728004455566, "logps/chosen": -3.1137783527374268, "logps/rejected": -3.6484577655792236, "loss": 4.1019, "rewards/accuracies": 1.0, "rewards/chosen": -31.13778305053711, "rewards/margins": 5.346794128417969, "rewards/rejected": -36.484580993652344, "step": 2647 }, { "epoch": 0.36056644880174293, "grad_norm": 42.34962065623622, "learning_rate": 6.457048276731279e-07, "logits/chosen": 11.0657958984375, "logits/rejected": 11.399418830871582, "logps/chosen": -3.4563639163970947, "logps/rejected": -3.7602334022521973, "loss": 3.9273, "rewards/accuracies": 1.0, "rewards/chosen": -34.56364059448242, "rewards/margins": 3.038694381713867, "rewards/rejected": -37.602333068847656, "step": 2648 }, { "epoch": 0.36070261437908496, "grad_norm": 42.54606836711879, "learning_rate": 6.45554759832548e-07, "logits/chosen": 11.034862518310547, "logits/rejected": 11.51284408569336, "logps/chosen": -3.2527942657470703, "logps/rejected": -3.5300941467285156, "loss": 3.9049, "rewards/accuracies": 0.75, "rewards/chosen": -32.5279426574707, "rewards/margins": 2.772998809814453, "rewards/rejected": -35.300941467285156, "step": 2649 }, { "epoch": 0.360838779956427, "grad_norm": 61.03260453337756, "learning_rate": 6.454046365068401e-07, "logits/chosen": 11.277281761169434, "logits/rejected": 11.872382164001465, "logps/chosen": -3.2281651496887207, "logps/rejected": -3.487508773803711, "loss": 4.2046, "rewards/accuracies": 0.75, "rewards/chosen": -32.281654357910156, "rewards/margins": 2.5934324264526367, "rewards/rejected": -34.875083923339844, "step": 2650 }, { "epoch": 0.3609749455337691, "grad_norm": 42.587151245978745, "learning_rate": 6.452544577299263e-07, "logits/chosen": 11.138938903808594, "logits/rejected": 11.631904602050781, "logps/chosen": -3.4146807193756104, "logps/rejected": -3.2705092430114746, "loss": 3.8754, "rewards/accuracies": 0.5, "rewards/chosen": -34.14680862426758, "rewards/margins": -1.4417128562927246, "rewards/rejected": -32.70509338378906, "step": 2651 }, { "epoch": 0.3611111111111111, "grad_norm": 59.83915306778188, "learning_rate": 6.451042235357403e-07, "logits/chosen": 11.463459014892578, "logits/rejected": 12.169736862182617, "logps/chosen": -3.289487361907959, "logps/rejected": -3.807978630065918, "loss": 3.3865, "rewards/accuracies": 1.0, "rewards/chosen": -32.894874572753906, "rewards/margins": 5.184909820556641, "rewards/rejected": -38.07978439331055, "step": 2652 }, { "epoch": 0.36124727668845313, "grad_norm": 40.639460965556296, "learning_rate": 6.44953933958229e-07, "logits/chosen": 11.780488967895508, "logits/rejected": 12.7582368850708, "logps/chosen": -3.6415791511535645, "logps/rejected": -3.9005987644195557, "loss": 3.4798, "rewards/accuracies": 0.75, "rewards/chosen": -36.41578674316406, "rewards/margins": 2.5901975631713867, "rewards/rejected": -39.00598907470703, "step": 2653 }, { "epoch": 0.3613834422657952, "grad_norm": 43.03313644693034, "learning_rate": 6.448035890313516e-07, "logits/chosen": 10.471817016601562, "logits/rejected": 10.691823959350586, "logps/chosen": -2.706122398376465, "logps/rejected": -3.352914810180664, "loss": 3.8267, "rewards/accuracies": 1.0, "rewards/chosen": -27.06122398376465, "rewards/margins": 6.467923164367676, "rewards/rejected": -33.52914810180664, "step": 2654 }, { "epoch": 0.36151960784313725, "grad_norm": 41.61689419536678, "learning_rate": 6.446531887890796e-07, "logits/chosen": 10.692537307739258, "logits/rejected": 10.692587852478027, "logps/chosen": -3.0706567764282227, "logps/rejected": -2.904985189437866, "loss": 3.6197, "rewards/accuracies": 0.25, "rewards/chosen": -30.70656967163086, "rewards/margins": -1.6567158699035645, "rewards/rejected": -29.04985237121582, "step": 2655 }, { "epoch": 0.3616557734204793, "grad_norm": 42.438153663033866, "learning_rate": 6.445027332653971e-07, "logits/chosen": 11.759653091430664, "logits/rejected": 10.208401679992676, "logps/chosen": -3.2801833152770996, "logps/rejected": -3.243856191635132, "loss": 3.7477, "rewards/accuracies": 0.5, "rewards/chosen": -32.80183410644531, "rewards/margins": -0.36327123641967773, "rewards/rejected": -32.438560485839844, "step": 2656 }, { "epoch": 0.36179193899782136, "grad_norm": 44.06003625713189, "learning_rate": 6.443522224943013e-07, "logits/chosen": 10.386967658996582, "logits/rejected": 10.949930191040039, "logps/chosen": -3.2671823501586914, "logps/rejected": -3.4343223571777344, "loss": 4.0492, "rewards/accuracies": 0.75, "rewards/chosen": -32.67182159423828, "rewards/margins": 1.671398639678955, "rewards/rejected": -34.343223571777344, "step": 2657 }, { "epoch": 0.3619281045751634, "grad_norm": 43.33767229722976, "learning_rate": 6.442016565098006e-07, "logits/chosen": 10.769584655761719, "logits/rejected": 11.397102355957031, "logps/chosen": -3.0914924144744873, "logps/rejected": -3.750598192214966, "loss": 4.2021, "rewards/accuracies": 1.0, "rewards/chosen": -30.9149227142334, "rewards/margins": 6.591059684753418, "rewards/rejected": -37.5059814453125, "step": 2658 }, { "epoch": 0.3620642701525055, "grad_norm": 46.993204604904854, "learning_rate": 6.440510353459173e-07, "logits/chosen": 11.24166488647461, "logits/rejected": 10.824546813964844, "logps/chosen": -3.5570688247680664, "logps/rejected": -3.5933923721313477, "loss": 4.378, "rewards/accuracies": 0.5, "rewards/chosen": -35.57068634033203, "rewards/margins": 0.3632364273071289, "rewards/rejected": -35.933921813964844, "step": 2659 }, { "epoch": 0.3622004357298475, "grad_norm": 46.819059365593766, "learning_rate": 6.439003590366851e-07, "logits/chosen": 11.345541954040527, "logits/rejected": 13.203524589538574, "logps/chosen": -3.0330119132995605, "logps/rejected": -3.732182502746582, "loss": 4.6272, "rewards/accuracies": 1.0, "rewards/chosen": -30.330116271972656, "rewards/margins": 6.991709232330322, "rewards/rejected": -37.32182693481445, "step": 2660 }, { "epoch": 0.36233660130718953, "grad_norm": 45.02119332199846, "learning_rate": 6.437496276161507e-07, "logits/chosen": 11.946484565734863, "logits/rejected": 11.671862602233887, "logps/chosen": -3.63372802734375, "logps/rejected": -3.543050527572632, "loss": 3.7006, "rewards/accuracies": 0.5, "rewards/chosen": -36.3372802734375, "rewards/margins": -0.9067754745483398, "rewards/rejected": -35.430503845214844, "step": 2661 }, { "epoch": 0.3624727668845316, "grad_norm": 61.11927421725192, "learning_rate": 6.435988411183732e-07, "logits/chosen": 10.425798416137695, "logits/rejected": 11.0389404296875, "logps/chosen": -3.4510762691497803, "logps/rejected": -3.5993475914001465, "loss": 4.3302, "rewards/accuracies": 0.5, "rewards/chosen": -34.510765075683594, "rewards/margins": 1.4827136993408203, "rewards/rejected": -35.99347686767578, "step": 2662 }, { "epoch": 0.36260893246187365, "grad_norm": 43.29075217667378, "learning_rate": 6.434479995774238e-07, "logits/chosen": 12.303718566894531, "logits/rejected": 12.09620475769043, "logps/chosen": -3.610384941101074, "logps/rejected": -3.342287540435791, "loss": 3.9348, "rewards/accuracies": 0.25, "rewards/chosen": -36.103851318359375, "rewards/margins": -2.680975914001465, "rewards/rejected": -33.422874450683594, "step": 2663 }, { "epoch": 0.3627450980392157, "grad_norm": 42.19783534139054, "learning_rate": 6.432971030273865e-07, "logits/chosen": 11.33353042602539, "logits/rejected": 11.480524063110352, "logps/chosen": -2.9862823486328125, "logps/rejected": -3.4419281482696533, "loss": 4.0404, "rewards/accuracies": 0.75, "rewards/chosen": -29.862825393676758, "rewards/margins": 4.556457042694092, "rewards/rejected": -34.419281005859375, "step": 2664 }, { "epoch": 0.36288126361655776, "grad_norm": 42.75369277285799, "learning_rate": 6.431461515023578e-07, "logits/chosen": 10.859834671020508, "logits/rejected": 11.83675765991211, "logps/chosen": -3.346219062805176, "logps/rejected": -3.6381144523620605, "loss": 3.6409, "rewards/accuracies": 0.75, "rewards/chosen": -33.46219253540039, "rewards/margins": 2.9189534187316895, "rewards/rejected": -36.38114547729492, "step": 2665 }, { "epoch": 0.3630174291938998, "grad_norm": 43.34551744056847, "learning_rate": 6.429951450364462e-07, "logits/chosen": 10.205556869506836, "logits/rejected": 12.273122787475586, "logps/chosen": -3.4338035583496094, "logps/rejected": -3.8303980827331543, "loss": 3.98, "rewards/accuracies": 1.0, "rewards/chosen": -34.338035583496094, "rewards/margins": 3.965939998626709, "rewards/rejected": -38.303977966308594, "step": 2666 }, { "epoch": 0.3631535947712418, "grad_norm": 46.741339692329866, "learning_rate": 6.42844083663773e-07, "logits/chosen": 10.91129207611084, "logits/rejected": 11.166844367980957, "logps/chosen": -3.4995243549346924, "logps/rejected": -3.4795522689819336, "loss": 4.0606, "rewards/accuracies": 0.5, "rewards/chosen": -34.995243072509766, "rewards/margins": -0.19971990585327148, "rewards/rejected": -34.79552459716797, "step": 2667 }, { "epoch": 0.3632897603485839, "grad_norm": 48.975662642170235, "learning_rate": 6.426929674184718e-07, "logits/chosen": 10.46721076965332, "logits/rejected": 11.483352661132812, "logps/chosen": -3.28305721282959, "logps/rejected": -3.5051980018615723, "loss": 3.9726, "rewards/accuracies": 0.5, "rewards/chosen": -32.830570220947266, "rewards/margins": 2.221409320831299, "rewards/rejected": -35.051979064941406, "step": 2668 }, { "epoch": 0.36342592592592593, "grad_norm": 44.20402777437829, "learning_rate": 6.425417963346884e-07, "logits/chosen": 10.863370895385742, "logits/rejected": 10.891109466552734, "logps/chosen": -3.2961535453796387, "logps/rejected": -3.202793598175049, "loss": 3.9224, "rewards/accuracies": 0.25, "rewards/chosen": -32.96153259277344, "rewards/margins": -0.933598518371582, "rewards/rejected": -32.02793502807617, "step": 2669 }, { "epoch": 0.36356209150326796, "grad_norm": 42.896787100672, "learning_rate": 6.423905704465812e-07, "logits/chosen": 11.21375846862793, "logits/rejected": 11.442065238952637, "logps/chosen": -3.375577926635742, "logps/rejected": -3.666165590286255, "loss": 3.6567, "rewards/accuracies": 0.75, "rewards/chosen": -33.75577926635742, "rewards/margins": 2.9058752059936523, "rewards/rejected": -36.66165542602539, "step": 2670 }, { "epoch": 0.36369825708061004, "grad_norm": 48.32886090755379, "learning_rate": 6.42239289788321e-07, "logits/chosen": 11.421425819396973, "logits/rejected": 11.161214828491211, "logps/chosen": -3.6119561195373535, "logps/rejected": -3.577117919921875, "loss": 4.4183, "rewards/accuracies": 0.5, "rewards/chosen": -36.11956024169922, "rewards/margins": -0.34838199615478516, "rewards/rejected": -35.77117919921875, "step": 2671 }, { "epoch": 0.3638344226579521, "grad_norm": 47.27239197728528, "learning_rate": 6.42087954394091e-07, "logits/chosen": 11.514394760131836, "logits/rejected": 11.451356887817383, "logps/chosen": -3.566253185272217, "logps/rejected": -3.394256114959717, "loss": 4.7592, "rewards/accuracies": 0.5, "rewards/chosen": -35.66252899169922, "rewards/margins": -1.7199687957763672, "rewards/rejected": -33.942562103271484, "step": 2672 }, { "epoch": 0.3639705882352941, "grad_norm": 46.41101842986703, "learning_rate": 6.419365642980866e-07, "logits/chosen": 11.248710632324219, "logits/rejected": 12.276403427124023, "logps/chosen": -3.5568740367889404, "logps/rejected": -3.7663819789886475, "loss": 3.5262, "rewards/accuracies": 0.75, "rewards/chosen": -35.56874084472656, "rewards/margins": 2.0950798988342285, "rewards/rejected": -37.663818359375, "step": 2673 }, { "epoch": 0.3641067538126362, "grad_norm": 43.5390678754739, "learning_rate": 6.417851195345155e-07, "logits/chosen": 12.728721618652344, "logits/rejected": 12.66238784790039, "logps/chosen": -3.4062163829803467, "logps/rejected": -3.958649158477783, "loss": 3.9439, "rewards/accuracies": 1.0, "rewards/chosen": -34.062164306640625, "rewards/margins": 5.524326801300049, "rewards/rejected": -39.586490631103516, "step": 2674 }, { "epoch": 0.3642429193899782, "grad_norm": 44.222637079553834, "learning_rate": 6.416336201375981e-07, "logits/chosen": 10.681756973266602, "logits/rejected": 11.382279396057129, "logps/chosen": -3.0628998279571533, "logps/rejected": -3.491321086883545, "loss": 3.847, "rewards/accuracies": 1.0, "rewards/chosen": -30.628997802734375, "rewards/margins": 4.284212112426758, "rewards/rejected": -34.9132080078125, "step": 2675 }, { "epoch": 0.36437908496732024, "grad_norm": 42.654088509808666, "learning_rate": 6.414820661415667e-07, "logits/chosen": 11.895491600036621, "logits/rejected": 12.09811782836914, "logps/chosen": -3.2857930660247803, "logps/rejected": -3.569197177886963, "loss": 4.3068, "rewards/accuracies": 0.75, "rewards/chosen": -32.857933044433594, "rewards/margins": 2.8340392112731934, "rewards/rejected": -35.69197082519531, "step": 2676 }, { "epoch": 0.36451525054466233, "grad_norm": 41.297268831950184, "learning_rate": 6.413304575806667e-07, "logits/chosen": 11.431703567504883, "logits/rejected": 12.00529670715332, "logps/chosen": -3.2798397541046143, "logps/rejected": -3.4633684158325195, "loss": 3.5865, "rewards/accuracies": 0.5, "rewards/chosen": -32.798397064208984, "rewards/margins": 1.8352875709533691, "rewards/rejected": -34.63368225097656, "step": 2677 }, { "epoch": 0.36465141612200436, "grad_norm": 45.71771318574622, "learning_rate": 6.411787944891547e-07, "logits/chosen": 11.57142162322998, "logits/rejected": 11.459756851196289, "logps/chosen": -3.3802223205566406, "logps/rejected": -3.564713478088379, "loss": 3.8784, "rewards/accuracies": 0.5, "rewards/chosen": -33.802223205566406, "rewards/margins": 1.8449110984802246, "rewards/rejected": -35.647132873535156, "step": 2678 }, { "epoch": 0.3647875816993464, "grad_norm": 43.10730812762127, "learning_rate": 6.410270769013005e-07, "logits/chosen": 11.73312759399414, "logits/rejected": 11.232614517211914, "logps/chosen": -3.5891528129577637, "logps/rejected": -3.5720107555389404, "loss": 4.3829, "rewards/accuracies": 0.5, "rewards/chosen": -35.89152526855469, "rewards/margins": -0.17142009735107422, "rewards/rejected": -35.72010803222656, "step": 2679 }, { "epoch": 0.36492374727668847, "grad_norm": 45.60795974616573, "learning_rate": 6.408753048513859e-07, "logits/chosen": 11.36712646484375, "logits/rejected": 11.443647384643555, "logps/chosen": -3.173764228820801, "logps/rejected": -3.170642614364624, "loss": 3.8724, "rewards/accuracies": 0.5, "rewards/chosen": -31.737640380859375, "rewards/margins": -0.03121471405029297, "rewards/rejected": -31.706424713134766, "step": 2680 }, { "epoch": 0.3650599128540305, "grad_norm": 43.0463471150564, "learning_rate": 6.407234783737052e-07, "logits/chosen": 10.718158721923828, "logits/rejected": 11.169020652770996, "logps/chosen": -3.0696661472320557, "logps/rejected": -3.234903573989868, "loss": 3.8327, "rewards/accuracies": 0.75, "rewards/chosen": -30.6966609954834, "rewards/margins": 1.6523737907409668, "rewards/rejected": -32.349037170410156, "step": 2681 }, { "epoch": 0.36519607843137253, "grad_norm": 39.16233763615144, "learning_rate": 6.405715975025646e-07, "logits/chosen": 10.86434555053711, "logits/rejected": 11.095491409301758, "logps/chosen": -3.356818675994873, "logps/rejected": -3.367577314376831, "loss": 3.7855, "rewards/accuracies": 0.5, "rewards/chosen": -33.56818389892578, "rewards/margins": 0.10758733749389648, "rewards/rejected": -33.67577362060547, "step": 2682 }, { "epoch": 0.3653322440087146, "grad_norm": 43.86076442069307, "learning_rate": 6.40419662272283e-07, "logits/chosen": 11.155731201171875, "logits/rejected": 11.317070007324219, "logps/chosen": -3.618407726287842, "logps/rejected": -3.7664549350738525, "loss": 4.1629, "rewards/accuracies": 0.75, "rewards/chosen": -36.18408203125, "rewards/margins": 1.4804706573486328, "rewards/rejected": -37.66455078125, "step": 2683 }, { "epoch": 0.36546840958605664, "grad_norm": 47.16340911854295, "learning_rate": 6.402676727171913e-07, "logits/chosen": 10.973455429077148, "logits/rejected": 12.4993314743042, "logps/chosen": -3.7148046493530273, "logps/rejected": -3.9673373699188232, "loss": 4.1627, "rewards/accuracies": 0.75, "rewards/chosen": -37.14804458618164, "rewards/margins": 2.52532958984375, "rewards/rejected": -39.67337417602539, "step": 2684 }, { "epoch": 0.36560457516339867, "grad_norm": 41.10346176908589, "learning_rate": 6.401156288716331e-07, "logits/chosen": 11.371047973632812, "logits/rejected": 11.06523323059082, "logps/chosen": -3.6385104656219482, "logps/rejected": -3.7608652114868164, "loss": 3.8653, "rewards/accuracies": 0.75, "rewards/chosen": -36.38510513305664, "rewards/margins": 1.2235469818115234, "rewards/rejected": -37.60865020751953, "step": 2685 }, { "epoch": 0.36574074074074076, "grad_norm": 45.42903048208456, "learning_rate": 6.399635307699636e-07, "logits/chosen": 11.620347023010254, "logits/rejected": 11.460819244384766, "logps/chosen": -3.406559467315674, "logps/rejected": -3.4128258228302, "loss": 4.2536, "rewards/accuracies": 0.5, "rewards/chosen": -34.06559753417969, "rewards/margins": 0.06266307830810547, "rewards/rejected": -34.128257751464844, "step": 2686 }, { "epoch": 0.3658769063180828, "grad_norm": 54.50594325167314, "learning_rate": 6.398113784465508e-07, "logits/chosen": 10.83556842803955, "logits/rejected": 11.653266906738281, "logps/chosen": -3.6700565814971924, "logps/rejected": -3.675786018371582, "loss": 4.7183, "rewards/accuracies": 0.5, "rewards/chosen": -36.700565338134766, "rewards/margins": 0.05729389190673828, "rewards/rejected": -36.75785827636719, "step": 2687 }, { "epoch": 0.3660130718954248, "grad_norm": 46.410968365197604, "learning_rate": 6.396591719357746e-07, "logits/chosen": 11.363399505615234, "logits/rejected": 10.846871376037598, "logps/chosen": -3.6332478523254395, "logps/rejected": -3.521671772003174, "loss": 4.4433, "rewards/accuracies": 0.5, "rewards/chosen": -36.33247756958008, "rewards/margins": -1.1157584190368652, "rewards/rejected": -35.21672058105469, "step": 2688 }, { "epoch": 0.3661492374727669, "grad_norm": 46.80835785361184, "learning_rate": 6.395069112720275e-07, "logits/chosen": 10.77724838256836, "logits/rejected": 11.789939880371094, "logps/chosen": -3.355219841003418, "logps/rejected": -3.6463265419006348, "loss": 3.6474, "rewards/accuracies": 0.75, "rewards/chosen": -33.55220031738281, "rewards/margins": 2.911065101623535, "rewards/rejected": -36.46326446533203, "step": 2689 }, { "epoch": 0.3662854030501089, "grad_norm": 48.14525781250209, "learning_rate": 6.393545964897142e-07, "logits/chosen": 10.371885299682617, "logits/rejected": 11.591352462768555, "logps/chosen": -3.567405939102173, "logps/rejected": -3.883312702178955, "loss": 4.312, "rewards/accuracies": 0.5, "rewards/chosen": -35.6740608215332, "rewards/margins": 3.159066677093506, "rewards/rejected": -38.833126068115234, "step": 2690 }, { "epoch": 0.36642156862745096, "grad_norm": 49.28269220608225, "learning_rate": 6.392022276232511e-07, "logits/chosen": 10.139422416687012, "logits/rejected": 9.946897506713867, "logps/chosen": -3.1409549713134766, "logps/rejected": -2.9437174797058105, "loss": 4.187, "rewards/accuracies": 0.25, "rewards/chosen": -31.409549713134766, "rewards/margins": -1.9723753929138184, "rewards/rejected": -29.437175750732422, "step": 2691 }, { "epoch": 0.36655773420479304, "grad_norm": 43.173956948175345, "learning_rate": 6.390498047070675e-07, "logits/chosen": 12.109466552734375, "logits/rejected": 11.717482566833496, "logps/chosen": -3.79264235496521, "logps/rejected": -3.605038642883301, "loss": 4.2391, "rewards/accuracies": 0.5, "rewards/chosen": -37.926422119140625, "rewards/margins": -1.8760390281677246, "rewards/rejected": -36.050384521484375, "step": 2692 }, { "epoch": 0.36669389978213507, "grad_norm": 64.40382554629265, "learning_rate": 6.388973277756045e-07, "logits/chosen": 10.801944732666016, "logits/rejected": 11.09950065612793, "logps/chosen": -3.4278080463409424, "logps/rejected": -3.5940301418304443, "loss": 4.6616, "rewards/accuracies": 0.75, "rewards/chosen": -34.27808380126953, "rewards/margins": 1.662219524383545, "rewards/rejected": -35.94029998779297, "step": 2693 }, { "epoch": 0.3668300653594771, "grad_norm": 39.301849522763376, "learning_rate": 6.387447968633156e-07, "logits/chosen": 11.796619415283203, "logits/rejected": 11.12901496887207, "logps/chosen": -3.786673069000244, "logps/rejected": -3.7136099338531494, "loss": 3.9295, "rewards/accuracies": 0.5, "rewards/chosen": -37.866729736328125, "rewards/margins": -0.7306327819824219, "rewards/rejected": -37.13610076904297, "step": 2694 }, { "epoch": 0.3669662309368192, "grad_norm": 54.17936214172359, "learning_rate": 6.385922120046663e-07, "logits/chosen": 11.581796646118164, "logits/rejected": 12.088930130004883, "logps/chosen": -3.6931843757629395, "logps/rejected": -4.164005279541016, "loss": 4.5111, "rewards/accuracies": 1.0, "rewards/chosen": -36.931846618652344, "rewards/margins": 4.708207130432129, "rewards/rejected": -41.640052795410156, "step": 2695 }, { "epoch": 0.3671023965141612, "grad_norm": 52.16534337617099, "learning_rate": 6.384395732341344e-07, "logits/chosen": 11.170753479003906, "logits/rejected": 11.380916595458984, "logps/chosen": -3.6517505645751953, "logps/rejected": -3.5666472911834717, "loss": 3.9301, "rewards/accuracies": 0.5, "rewards/chosen": -36.51750183105469, "rewards/margins": -0.8510298728942871, "rewards/rejected": -35.666473388671875, "step": 2696 }, { "epoch": 0.3672385620915033, "grad_norm": 41.009208104606465, "learning_rate": 6.382868805862101e-07, "logits/chosen": 10.632621765136719, "logits/rejected": 11.679276466369629, "logps/chosen": -3.4217286109924316, "logps/rejected": -3.718845844268799, "loss": 3.4728, "rewards/accuracies": 0.75, "rewards/chosen": -34.21728515625, "rewards/margins": 2.971170425415039, "rewards/rejected": -37.18845748901367, "step": 2697 }, { "epoch": 0.3673747276688453, "grad_norm": 40.6422532033195, "learning_rate": 6.381341340953953e-07, "logits/chosen": 12.020143508911133, "logits/rejected": 10.711738586425781, "logps/chosen": -3.6309566497802734, "logps/rejected": -3.642415761947632, "loss": 4.1321, "rewards/accuracies": 0.5, "rewards/chosen": -36.309566497802734, "rewards/margins": 0.11458873748779297, "rewards/rejected": -36.424156188964844, "step": 2698 }, { "epoch": 0.36751089324618735, "grad_norm": 46.379522459629534, "learning_rate": 6.379813337962046e-07, "logits/chosen": 11.455388069152832, "logits/rejected": 12.17018985748291, "logps/chosen": -3.7016634941101074, "logps/rejected": -3.8663992881774902, "loss": 4.1758, "rewards/accuracies": 0.5, "rewards/chosen": -37.016632080078125, "rewards/margins": 1.64735746383667, "rewards/rejected": -38.66399383544922, "step": 2699 }, { "epoch": 0.36764705882352944, "grad_norm": 37.47936099830467, "learning_rate": 6.378284797231643e-07, "logits/chosen": 10.555381774902344, "logits/rejected": 10.864216804504395, "logps/chosen": -3.342146158218384, "logps/rejected": -3.5076119899749756, "loss": 3.6146, "rewards/accuracies": 0.75, "rewards/chosen": -33.42146301269531, "rewards/margins": 1.654658317565918, "rewards/rejected": -35.07611846923828, "step": 2700 }, { "epoch": 0.36778322440087147, "grad_norm": 41.935930683779986, "learning_rate": 6.376755719108131e-07, "logits/chosen": 11.50777530670166, "logits/rejected": 11.609142303466797, "logps/chosen": -3.564499855041504, "logps/rejected": -3.8121769428253174, "loss": 4.084, "rewards/accuracies": 0.75, "rewards/chosen": -35.644996643066406, "rewards/margins": 2.4767708778381348, "rewards/rejected": -38.12176513671875, "step": 2701 }, { "epoch": 0.3679193899782135, "grad_norm": 45.45060030312762, "learning_rate": 6.375226103937019e-07, "logits/chosen": 10.828603744506836, "logits/rejected": 12.118974685668945, "logps/chosen": -3.3155784606933594, "logps/rejected": -3.690581798553467, "loss": 4.3589, "rewards/accuracies": 1.0, "rewards/chosen": -33.155784606933594, "rewards/margins": 3.750033378601074, "rewards/rejected": -36.90581512451172, "step": 2702 }, { "epoch": 0.3680555555555556, "grad_norm": 47.56405251698668, "learning_rate": 6.373695952063933e-07, "logits/chosen": 11.729063034057617, "logits/rejected": 12.063882827758789, "logps/chosen": -3.6798605918884277, "logps/rejected": -3.9363796710968018, "loss": 3.6839, "rewards/accuracies": 1.0, "rewards/chosen": -36.798606872558594, "rewards/margins": 2.5651931762695312, "rewards/rejected": -39.363800048828125, "step": 2703 }, { "epoch": 0.3681917211328976, "grad_norm": 48.582582506168215, "learning_rate": 6.372165263834625e-07, "logits/chosen": 10.941337585449219, "logits/rejected": 10.837116241455078, "logps/chosen": -3.528041124343872, "logps/rejected": -3.5846686363220215, "loss": 3.6384, "rewards/accuracies": 0.5, "rewards/chosen": -35.28041076660156, "rewards/margins": 0.5662746429443359, "rewards/rejected": -35.84668731689453, "step": 2704 }, { "epoch": 0.36832788671023964, "grad_norm": 47.331732863413876, "learning_rate": 6.370634039594969e-07, "logits/chosen": 12.182120323181152, "logits/rejected": 12.498109817504883, "logps/chosen": -3.598001003265381, "logps/rejected": -4.003213405609131, "loss": 4.0954, "rewards/accuracies": 0.75, "rewards/chosen": -35.980010986328125, "rewards/margins": 4.0521240234375, "rewards/rejected": -40.032135009765625, "step": 2705 }, { "epoch": 0.3684640522875817, "grad_norm": 42.39134542168465, "learning_rate": 6.369102279690955e-07, "logits/chosen": 10.93471908569336, "logits/rejected": 12.236797332763672, "logps/chosen": -3.45633864402771, "logps/rejected": -3.9642417430877686, "loss": 4.0459, "rewards/accuracies": 1.0, "rewards/chosen": -34.56338882446289, "rewards/margins": 5.079030990600586, "rewards/rejected": -39.642417907714844, "step": 2706 }, { "epoch": 0.36860021786492375, "grad_norm": 43.04098145609605, "learning_rate": 6.367569984468698e-07, "logits/chosen": 12.25131607055664, "logits/rejected": 11.799490928649902, "logps/chosen": -3.651301860809326, "logps/rejected": -3.5886430740356445, "loss": 4.9416, "rewards/accuracies": 0.25, "rewards/chosen": -36.51301956176758, "rewards/margins": -0.6265859603881836, "rewards/rejected": -35.88643264770508, "step": 2707 }, { "epoch": 0.3687363834422658, "grad_norm": 44.34603799014056, "learning_rate": 6.366037154274433e-07, "logits/chosen": 11.356627464294434, "logits/rejected": 11.874004364013672, "logps/chosen": -3.4543285369873047, "logps/rejected": -3.696552276611328, "loss": 3.8393, "rewards/accuracies": 0.75, "rewards/chosen": -34.54328536987305, "rewards/margins": 2.422234535217285, "rewards/rejected": -36.96552276611328, "step": 2708 }, { "epoch": 0.36887254901960786, "grad_norm": 39.93061183983786, "learning_rate": 6.364503789454514e-07, "logits/chosen": 10.815351486206055, "logits/rejected": 11.262598037719727, "logps/chosen": -3.215402841567993, "logps/rejected": -3.44288969039917, "loss": 3.8646, "rewards/accuracies": 1.0, "rewards/chosen": -32.154029846191406, "rewards/margins": 2.2748661041259766, "rewards/rejected": -34.42889404296875, "step": 2709 }, { "epoch": 0.3690087145969499, "grad_norm": 40.75614615365355, "learning_rate": 6.362969890355419e-07, "logits/chosen": 11.514955520629883, "logits/rejected": 11.547842025756836, "logps/chosen": -3.5679895877838135, "logps/rejected": -3.731170892715454, "loss": 3.9593, "rewards/accuracies": 0.5, "rewards/chosen": -35.679893493652344, "rewards/margins": 1.6318130493164062, "rewards/rejected": -37.31170654296875, "step": 2710 }, { "epoch": 0.3691448801742919, "grad_norm": 45.48740221209288, "learning_rate": 6.361435457323745e-07, "logits/chosen": 10.576634407043457, "logits/rejected": 11.469390869140625, "logps/chosen": -3.2107439041137695, "logps/rejected": -3.344771385192871, "loss": 4.2337, "rewards/accuracies": 0.5, "rewards/chosen": -32.10744094848633, "rewards/margins": 1.3402738571166992, "rewards/rejected": -33.447715759277344, "step": 2711 }, { "epoch": 0.369281045751634, "grad_norm": 44.257585033546995, "learning_rate": 6.359900490706209e-07, "logits/chosen": 11.362579345703125, "logits/rejected": 11.444511413574219, "logps/chosen": -3.6433467864990234, "logps/rejected": -3.630311965942383, "loss": 3.8886, "rewards/accuracies": 0.5, "rewards/chosen": -36.4334716796875, "rewards/margins": -0.13035011291503906, "rewards/rejected": -36.30311584472656, "step": 2712 }, { "epoch": 0.36941721132897604, "grad_norm": 50.87466590144444, "learning_rate": 6.358364990849651e-07, "logits/chosen": 11.038656234741211, "logits/rejected": 12.24427604675293, "logps/chosen": -3.291024684906006, "logps/rejected": -3.9117579460144043, "loss": 4.3537, "rewards/accuracies": 0.75, "rewards/chosen": -32.910247802734375, "rewards/margins": 6.207334041595459, "rewards/rejected": -39.11758041381836, "step": 2713 }, { "epoch": 0.36955337690631807, "grad_norm": 39.33873879730353, "learning_rate": 6.35682895810103e-07, "logits/chosen": 10.526723861694336, "logits/rejected": 10.685286521911621, "logps/chosen": -3.235200881958008, "logps/rejected": -3.3762643337249756, "loss": 3.8338, "rewards/accuracies": 0.5, "rewards/chosen": -32.35200881958008, "rewards/margins": 1.410634994506836, "rewards/rejected": -33.76264190673828, "step": 2714 }, { "epoch": 0.36968954248366015, "grad_norm": 39.662073057125625, "learning_rate": 6.35529239280742e-07, "logits/chosen": 11.63768482208252, "logits/rejected": 12.284013748168945, "logps/chosen": -3.8579554557800293, "logps/rejected": -3.965426445007324, "loss": 3.6715, "rewards/accuracies": 0.5, "rewards/chosen": -38.579559326171875, "rewards/margins": 1.0747089385986328, "rewards/rejected": -39.654266357421875, "step": 2715 }, { "epoch": 0.3698257080610022, "grad_norm": 39.007243872867726, "learning_rate": 6.353755295316029e-07, "logits/chosen": 11.727375984191895, "logits/rejected": 11.493568420410156, "logps/chosen": -3.364579200744629, "logps/rejected": -3.590970516204834, "loss": 3.7968, "rewards/accuracies": 0.5, "rewards/chosen": -33.64579391479492, "rewards/margins": 2.2639098167419434, "rewards/rejected": -35.90970230102539, "step": 2716 }, { "epoch": 0.3699618736383442, "grad_norm": 49.777744436909444, "learning_rate": 6.352217665974171e-07, "logits/chosen": 11.773547172546387, "logits/rejected": 12.351907730102539, "logps/chosen": -3.695061683654785, "logps/rejected": -3.807502031326294, "loss": 4.1301, "rewards/accuracies": 0.5, "rewards/chosen": -36.95061492919922, "rewards/margins": 1.1244010925292969, "rewards/rejected": -38.07501983642578, "step": 2717 }, { "epoch": 0.3700980392156863, "grad_norm": 41.75850129081158, "learning_rate": 6.350679505129287e-07, "logits/chosen": 10.097529411315918, "logits/rejected": 10.80672550201416, "logps/chosen": -3.661790132522583, "logps/rejected": -3.7336788177490234, "loss": 3.5087, "rewards/accuracies": 0.5, "rewards/chosen": -36.61790466308594, "rewards/margins": 0.7188844680786133, "rewards/rejected": -37.33678436279297, "step": 2718 }, { "epoch": 0.3702342047930283, "grad_norm": 41.20831553772287, "learning_rate": 6.34914081312894e-07, "logits/chosen": 10.564626693725586, "logits/rejected": 10.334057807922363, "logps/chosen": -3.3733367919921875, "logps/rejected": -3.4601891040802, "loss": 4.0992, "rewards/accuracies": 0.5, "rewards/chosen": -33.733367919921875, "rewards/margins": 0.8685221672058105, "rewards/rejected": -34.601890563964844, "step": 2719 }, { "epoch": 0.37037037037037035, "grad_norm": 50.5155953629199, "learning_rate": 6.347601590320806e-07, "logits/chosen": 12.222662925720215, "logits/rejected": 11.620445251464844, "logps/chosen": -3.4681549072265625, "logps/rejected": -3.7444541454315186, "loss": 3.5694, "rewards/accuracies": 0.75, "rewards/chosen": -34.681549072265625, "rewards/margins": 2.7629923820495605, "rewards/rejected": -37.444541931152344, "step": 2720 }, { "epoch": 0.37050653594771243, "grad_norm": 40.96975454640391, "learning_rate": 6.346061837052687e-07, "logits/chosen": 10.975858688354492, "logits/rejected": 10.816673278808594, "logps/chosen": -3.5522146224975586, "logps/rejected": -3.421367645263672, "loss": 3.638, "rewards/accuracies": 0.25, "rewards/chosen": -35.52214813232422, "rewards/margins": -1.308469295501709, "rewards/rejected": -34.21367645263672, "step": 2721 }, { "epoch": 0.37064270152505446, "grad_norm": 46.641252347049324, "learning_rate": 6.344521553672505e-07, "logits/chosen": 11.176499366760254, "logits/rejected": 12.057551383972168, "logps/chosen": -3.368297815322876, "logps/rejected": -3.4576802253723145, "loss": 4.3213, "rewards/accuracies": 0.75, "rewards/chosen": -33.682979583740234, "rewards/margins": 0.8938231468200684, "rewards/rejected": -34.57680130004883, "step": 2722 }, { "epoch": 0.3707788671023965, "grad_norm": 44.480274588643674, "learning_rate": 6.342980740528297e-07, "logits/chosen": 11.052461624145508, "logits/rejected": 12.159314155578613, "logps/chosen": -3.52414608001709, "logps/rejected": -3.8699898719787598, "loss": 4.0838, "rewards/accuracies": 1.0, "rewards/chosen": -35.24146270751953, "rewards/margins": 3.4584360122680664, "rewards/rejected": -38.69989776611328, "step": 2723 }, { "epoch": 0.3709150326797386, "grad_norm": 46.8030800982408, "learning_rate": 6.341439397968222e-07, "logits/chosen": 11.774730682373047, "logits/rejected": 11.863794326782227, "logps/chosen": -3.5839366912841797, "logps/rejected": -3.7915077209472656, "loss": 3.7489, "rewards/accuracies": 1.0, "rewards/chosen": -35.8393669128418, "rewards/margins": 2.075711250305176, "rewards/rejected": -37.915077209472656, "step": 2724 }, { "epoch": 0.3710511982570806, "grad_norm": 39.876361538653654, "learning_rate": 6.339897526340562e-07, "logits/chosen": 11.654170989990234, "logits/rejected": 12.650754928588867, "logps/chosen": -3.664738416671753, "logps/rejected": -3.8698220252990723, "loss": 3.7498, "rewards/accuracies": 0.75, "rewards/chosen": -36.64738464355469, "rewards/margins": 2.050835609436035, "rewards/rejected": -38.698219299316406, "step": 2725 }, { "epoch": 0.37118736383442263, "grad_norm": 42.078583610615574, "learning_rate": 6.338355125993715e-07, "logits/chosen": 12.394325256347656, "logits/rejected": 11.617227554321289, "logps/chosen": -3.627359628677368, "logps/rejected": -3.6255552768707275, "loss": 3.959, "rewards/accuracies": 0.5, "rewards/chosen": -36.273597717285156, "rewards/margins": -0.018041133880615234, "rewards/rejected": -36.25555419921875, "step": 2726 }, { "epoch": 0.3713235294117647, "grad_norm": 49.15407758031643, "learning_rate": 6.336812197276197e-07, "logits/chosen": 11.29084587097168, "logits/rejected": 10.980659484863281, "logps/chosen": -3.0420260429382324, "logps/rejected": -3.5704424381256104, "loss": 3.8469, "rewards/accuracies": 1.0, "rewards/chosen": -30.42026138305664, "rewards/margins": 5.284162998199463, "rewards/rejected": -35.70442199707031, "step": 2727 }, { "epoch": 0.37145969498910675, "grad_norm": 41.63659289996592, "learning_rate": 6.335268740536648e-07, "logits/chosen": 10.808577537536621, "logits/rejected": 11.218650817871094, "logps/chosen": -3.466191291809082, "logps/rejected": -3.653409242630005, "loss": 4.4361, "rewards/accuracies": 0.5, "rewards/chosen": -34.66191101074219, "rewards/margins": 1.8721799850463867, "rewards/rejected": -36.534095764160156, "step": 2728 }, { "epoch": 0.3715958605664488, "grad_norm": 47.90299858801503, "learning_rate": 6.333724756123823e-07, "logits/chosen": 11.020812034606934, "logits/rejected": 11.207321166992188, "logps/chosen": -3.178276538848877, "logps/rejected": -3.6232099533081055, "loss": 4.3434, "rewards/accuracies": 0.5, "rewards/chosen": -31.782766342163086, "rewards/margins": 4.449332237243652, "rewards/rejected": -36.23209762573242, "step": 2729 }, { "epoch": 0.37173202614379086, "grad_norm": 44.408124228302164, "learning_rate": 6.332180244386597e-07, "logits/chosen": 11.064615249633789, "logits/rejected": 12.01151180267334, "logps/chosen": -3.814269781112671, "logps/rejected": -3.9055817127227783, "loss": 4.6338, "rewards/accuracies": 0.75, "rewards/chosen": -38.142696380615234, "rewards/margins": 0.913121223449707, "rewards/rejected": -39.055816650390625, "step": 2730 }, { "epoch": 0.3718681917211329, "grad_norm": 42.76485056382217, "learning_rate": 6.330635205673968e-07, "logits/chosen": 10.975259780883789, "logits/rejected": 11.40451431274414, "logps/chosen": -3.388939619064331, "logps/rejected": -3.5078608989715576, "loss": 3.8937, "rewards/accuracies": 0.5, "rewards/chosen": -33.88939666748047, "rewards/margins": 1.1892123222351074, "rewards/rejected": -35.07860565185547, "step": 2731 }, { "epoch": 0.3720043572984749, "grad_norm": 40.20336139697088, "learning_rate": 6.32908964033505e-07, "logits/chosen": 10.375091552734375, "logits/rejected": 11.155023574829102, "logps/chosen": -3.1897292137145996, "logps/rejected": -3.627622365951538, "loss": 3.7679, "rewards/accuracies": 1.0, "rewards/chosen": -31.897293090820312, "rewards/margins": 4.378931999206543, "rewards/rejected": -36.27622604370117, "step": 2732 }, { "epoch": 0.372140522875817, "grad_norm": 53.81492813557735, "learning_rate": 6.327543548719074e-07, "logits/chosen": 11.107555389404297, "logits/rejected": 11.487293243408203, "logps/chosen": -3.7368295192718506, "logps/rejected": -3.893603801727295, "loss": 3.769, "rewards/accuracies": 0.75, "rewards/chosen": -37.36829376220703, "rewards/margins": 1.5677461624145508, "rewards/rejected": -38.93604278564453, "step": 2733 }, { "epoch": 0.37227668845315903, "grad_norm": 49.48077257809028, "learning_rate": 6.325996931175393e-07, "logits/chosen": 11.659889221191406, "logits/rejected": 11.968894958496094, "logps/chosen": -3.836402416229248, "logps/rejected": -3.802299737930298, "loss": 4.0844, "rewards/accuracies": 0.5, "rewards/chosen": -38.36402130126953, "rewards/margins": -0.34102535247802734, "rewards/rejected": -38.02299499511719, "step": 2734 }, { "epoch": 0.3724128540305011, "grad_norm": 41.23275376779466, "learning_rate": 6.32444978805348e-07, "logits/chosen": 11.2789945602417, "logits/rejected": 11.89200210571289, "logps/chosen": -3.3821911811828613, "logps/rejected": -3.7022252082824707, "loss": 3.4568, "rewards/accuracies": 0.75, "rewards/chosen": -33.8219108581543, "rewards/margins": 3.2003426551818848, "rewards/rejected": -37.02225112915039, "step": 2735 }, { "epoch": 0.37254901960784315, "grad_norm": 49.70473580471073, "learning_rate": 6.322902119702922e-07, "logits/chosen": 11.16348648071289, "logits/rejected": 11.702898025512695, "logps/chosen": -3.609520673751831, "logps/rejected": -3.864107608795166, "loss": 4.9432, "rewards/accuracies": 0.75, "rewards/chosen": -36.09520721435547, "rewards/margins": 2.545870304107666, "rewards/rejected": -38.641075134277344, "step": 2736 }, { "epoch": 0.3726851851851852, "grad_norm": 45.47088867687017, "learning_rate": 6.321353926473429e-07, "logits/chosen": 10.727415084838867, "logits/rejected": 11.22966194152832, "logps/chosen": -3.5836174488067627, "logps/rejected": -3.839048147201538, "loss": 4.0294, "rewards/accuracies": 0.75, "rewards/chosen": -35.83617401123047, "rewards/margins": 2.554306983947754, "rewards/rejected": -38.390480041503906, "step": 2737 }, { "epoch": 0.37282135076252726, "grad_norm": 40.00528133089753, "learning_rate": 6.319805208714829e-07, "logits/chosen": 11.792290687561035, "logits/rejected": 11.858288764953613, "logps/chosen": -3.558154821395874, "logps/rejected": -3.7023584842681885, "loss": 4.2758, "rewards/accuracies": 0.75, "rewards/chosen": -35.58155059814453, "rewards/margins": 1.44203519821167, "rewards/rejected": -37.023582458496094, "step": 2738 }, { "epoch": 0.3729575163398693, "grad_norm": 44.21408451638505, "learning_rate": 6.318255966777065e-07, "logits/chosen": 11.905374526977539, "logits/rejected": 11.95777702331543, "logps/chosen": -4.017569541931152, "logps/rejected": -3.9861526489257812, "loss": 4.2442, "rewards/accuracies": 0.5, "rewards/chosen": -40.175689697265625, "rewards/margins": -0.3141660690307617, "rewards/rejected": -39.86152648925781, "step": 2739 }, { "epoch": 0.3730936819172113, "grad_norm": 59.24476495164542, "learning_rate": 6.316706201010204e-07, "logits/chosen": 11.801520347595215, "logits/rejected": 11.570098876953125, "logps/chosen": -3.738380193710327, "logps/rejected": -3.644136428833008, "loss": 4.2548, "rewards/accuracies": 0.25, "rewards/chosen": -37.3838005065918, "rewards/margins": -0.9424381256103516, "rewards/rejected": -36.44136428833008, "step": 2740 }, { "epoch": 0.3732298474945534, "grad_norm": 49.46995361190695, "learning_rate": 6.315155911764426e-07, "logits/chosen": 11.690733909606934, "logits/rejected": 11.467840194702148, "logps/chosen": -3.37471342086792, "logps/rejected": -3.6453959941864014, "loss": 4.6411, "rewards/accuracies": 0.75, "rewards/chosen": -33.74713134765625, "rewards/margins": 2.706827163696289, "rewards/rejected": -36.45396041870117, "step": 2741 }, { "epoch": 0.37336601307189543, "grad_norm": 39.46572414821558, "learning_rate": 6.313605099390032e-07, "logits/chosen": 11.961862564086914, "logits/rejected": 11.415977478027344, "logps/chosen": -3.385596752166748, "logps/rejected": -3.384570598602295, "loss": 3.4012, "rewards/accuracies": 0.5, "rewards/chosen": -33.85596466064453, "rewards/margins": -0.010262489318847656, "rewards/rejected": -33.845703125, "step": 2742 }, { "epoch": 0.37350217864923746, "grad_norm": 40.33315023639042, "learning_rate": 6.312053764237441e-07, "logits/chosen": 11.251315116882324, "logits/rejected": 11.746360778808594, "logps/chosen": -3.4654250144958496, "logps/rejected": -3.6221046447753906, "loss": 3.8464, "rewards/accuracies": 0.75, "rewards/chosen": -34.65425109863281, "rewards/margins": 1.5667991638183594, "rewards/rejected": -36.221046447753906, "step": 2743 }, { "epoch": 0.37363834422657954, "grad_norm": 51.43511631735108, "learning_rate": 6.310501906657192e-07, "logits/chosen": 11.812644958496094, "logits/rejected": 11.541780471801758, "logps/chosen": -3.849452257156372, "logps/rejected": -3.6504733562469482, "loss": 3.4956, "rewards/accuracies": 0.5, "rewards/chosen": -38.49452209472656, "rewards/margins": -1.9897880554199219, "rewards/rejected": -36.50473403930664, "step": 2744 }, { "epoch": 0.3737745098039216, "grad_norm": 48.18158628824663, "learning_rate": 6.308949526999937e-07, "logits/chosen": 12.11217975616455, "logits/rejected": 12.452856063842773, "logps/chosen": -4.009637355804443, "logps/rejected": -4.030241966247559, "loss": 4.2087, "rewards/accuracies": 0.5, "rewards/chosen": -40.09637451171875, "rewards/margins": 0.20604705810546875, "rewards/rejected": -40.30242156982422, "step": 2745 }, { "epoch": 0.3739106753812636, "grad_norm": 47.12103223052479, "learning_rate": 6.30739662561645e-07, "logits/chosen": 12.488147735595703, "logits/rejected": 12.750503540039062, "logps/chosen": -3.605409622192383, "logps/rejected": -3.7901883125305176, "loss": 3.6052, "rewards/accuracies": 0.75, "rewards/chosen": -36.054100036621094, "rewards/margins": 1.8477859497070312, "rewards/rejected": -37.901885986328125, "step": 2746 }, { "epoch": 0.3740468409586057, "grad_norm": 49.884289399086335, "learning_rate": 6.305843202857624e-07, "logits/chosen": 10.995759963989258, "logits/rejected": 11.941810607910156, "logps/chosen": -3.6543846130371094, "logps/rejected": -3.880166530609131, "loss": 4.5324, "rewards/accuracies": 0.75, "rewards/chosen": -36.543846130371094, "rewards/margins": 2.2578206062316895, "rewards/rejected": -38.801666259765625, "step": 2747 }, { "epoch": 0.3741830065359477, "grad_norm": 43.05194704636671, "learning_rate": 6.304289259074464e-07, "logits/chosen": 11.260876655578613, "logits/rejected": 11.76688289642334, "logps/chosen": -3.2400031089782715, "logps/rejected": -3.771872043609619, "loss": 3.7939, "rewards/accuracies": 1.0, "rewards/chosen": -32.40003204345703, "rewards/margins": 5.318690299987793, "rewards/rejected": -37.718719482421875, "step": 2748 }, { "epoch": 0.37431917211328974, "grad_norm": 42.65797227895321, "learning_rate": 6.302734794618099e-07, "logits/chosen": 10.776159286499023, "logits/rejected": 12.349279403686523, "logps/chosen": -3.2338638305664062, "logps/rejected": -3.7455337047576904, "loss": 3.9204, "rewards/accuracies": 0.75, "rewards/chosen": -32.33863830566406, "rewards/margins": 5.116700172424316, "rewards/rejected": -37.45533752441406, "step": 2749 }, { "epoch": 0.37445533769063183, "grad_norm": 42.94173326849783, "learning_rate": 6.301179809839774e-07, "logits/chosen": 12.009078979492188, "logits/rejected": 12.393319129943848, "logps/chosen": -4.080067157745361, "logps/rejected": -3.6156020164489746, "loss": 3.9155, "rewards/accuracies": 0.0, "rewards/chosen": -40.80067443847656, "rewards/margins": -4.644654750823975, "rewards/rejected": -36.1560173034668, "step": 2750 }, { "epoch": 0.37459150326797386, "grad_norm": 47.012183631580854, "learning_rate": 6.299624305090848e-07, "logits/chosen": 11.488153457641602, "logits/rejected": 11.963846206665039, "logps/chosen": -3.21775484085083, "logps/rejected": -3.6580305099487305, "loss": 4.4682, "rewards/accuracies": 0.75, "rewards/chosen": -32.177547454833984, "rewards/margins": 4.402756214141846, "rewards/rejected": -36.58030319213867, "step": 2751 }, { "epoch": 0.3747276688453159, "grad_norm": 45.07388984757453, "learning_rate": 6.298068280722802e-07, "logits/chosen": 12.48680305480957, "logits/rejected": 12.95982551574707, "logps/chosen": -3.7654452323913574, "logps/rejected": -4.148500442504883, "loss": 3.8466, "rewards/accuracies": 0.75, "rewards/chosen": -37.654449462890625, "rewards/margins": 3.830552101135254, "rewards/rejected": -41.48500061035156, "step": 2752 }, { "epoch": 0.37486383442265797, "grad_norm": 45.974315378123606, "learning_rate": 6.296511737087232e-07, "logits/chosen": 11.901230812072754, "logits/rejected": 12.176278114318848, "logps/chosen": -3.66640043258667, "logps/rejected": -3.5260353088378906, "loss": 4.2976, "rewards/accuracies": 0.25, "rewards/chosen": -36.664005279541016, "rewards/margins": -1.403653621673584, "rewards/rejected": -35.26034927368164, "step": 2753 }, { "epoch": 0.375, "grad_norm": 44.15103816457848, "learning_rate": 6.29495467453585e-07, "logits/chosen": 11.1484375, "logits/rejected": 11.518775939941406, "logps/chosen": -3.265146017074585, "logps/rejected": -3.6047894954681396, "loss": 4.0656, "rewards/accuracies": 1.0, "rewards/chosen": -32.651458740234375, "rewards/margins": 3.3964357376098633, "rewards/rejected": -36.04789352416992, "step": 2754 }, { "epoch": 0.37513616557734203, "grad_norm": 72.11441379806384, "learning_rate": 6.293397093420492e-07, "logits/chosen": 10.517721176147461, "logits/rejected": 11.030210494995117, "logps/chosen": -3.4715819358825684, "logps/rejected": -3.6795337200164795, "loss": 3.741, "rewards/accuracies": 0.75, "rewards/chosen": -34.7158203125, "rewards/margins": 2.079519271850586, "rewards/rejected": -36.79533386230469, "step": 2755 }, { "epoch": 0.3752723311546841, "grad_norm": 44.18699372059251, "learning_rate": 6.291838994093101e-07, "logits/chosen": 12.078755378723145, "logits/rejected": 12.48444652557373, "logps/chosen": -4.087776184082031, "logps/rejected": -4.2895188331604, "loss": 3.8579, "rewards/accuracies": 0.75, "rewards/chosen": -40.87776565551758, "rewards/margins": 2.017423629760742, "rewards/rejected": -42.89518737792969, "step": 2756 }, { "epoch": 0.37540849673202614, "grad_norm": 43.76586525085918, "learning_rate": 6.290280376905745e-07, "logits/chosen": 12.074493408203125, "logits/rejected": 12.740080833435059, "logps/chosen": -3.6093549728393555, "logps/rejected": -3.740997076034546, "loss": 3.9701, "rewards/accuracies": 0.75, "rewards/chosen": -36.09355163574219, "rewards/margins": 1.3164215087890625, "rewards/rejected": -37.40997314453125, "step": 2757 }, { "epoch": 0.37554466230936817, "grad_norm": 42.907625927370674, "learning_rate": 6.288721242210608e-07, "logits/chosen": 12.918174743652344, "logits/rejected": 11.41592788696289, "logps/chosen": -3.9834280014038086, "logps/rejected": -3.4675869941711426, "loss": 4.0114, "rewards/accuracies": 0.0, "rewards/chosen": -39.83428192138672, "rewards/margins": -5.15841007232666, "rewards/rejected": -34.67586898803711, "step": 2758 }, { "epoch": 0.37568082788671026, "grad_norm": 46.85513216330182, "learning_rate": 6.287161590359986e-07, "logits/chosen": 11.387945175170898, "logits/rejected": 11.9725923538208, "logps/chosen": -3.895159959793091, "logps/rejected": -3.7861223220825195, "loss": 4.2117, "rewards/accuracies": 0.25, "rewards/chosen": -38.95159912109375, "rewards/margins": -1.090378761291504, "rewards/rejected": -37.86122131347656, "step": 2759 }, { "epoch": 0.3758169934640523, "grad_norm": 46.038026245598346, "learning_rate": 6.285601421706296e-07, "logits/chosen": 12.800125122070312, "logits/rejected": 12.893842697143555, "logps/chosen": -4.295011520385742, "logps/rejected": -4.268163204193115, "loss": 4.4171, "rewards/accuracies": 0.5, "rewards/chosen": -42.950111389160156, "rewards/margins": -0.2684812545776367, "rewards/rejected": -42.68163299560547, "step": 2760 }, { "epoch": 0.3759531590413943, "grad_norm": 77.72444878788109, "learning_rate": 6.284040736602074e-07, "logits/chosen": 12.128637313842773, "logits/rejected": 12.672855377197266, "logps/chosen": -3.553825855255127, "logps/rejected": -3.5036017894744873, "loss": 4.606, "rewards/accuracies": 0.5, "rewards/chosen": -35.53826141357422, "rewards/margins": -0.5022411346435547, "rewards/rejected": -35.03601837158203, "step": 2761 }, { "epoch": 0.3760893246187364, "grad_norm": 45.45612446344535, "learning_rate": 6.282479535399966e-07, "logits/chosen": 11.84296989440918, "logits/rejected": 12.487789154052734, "logps/chosen": -3.801633358001709, "logps/rejected": -3.918013572692871, "loss": 4.3084, "rewards/accuracies": 0.75, "rewards/chosen": -38.016334533691406, "rewards/margins": 1.1638050079345703, "rewards/rejected": -39.180137634277344, "step": 2762 }, { "epoch": 0.3762254901960784, "grad_norm": 43.54516124418192, "learning_rate": 6.280917818452741e-07, "logits/chosen": 12.608770370483398, "logits/rejected": 12.450242042541504, "logps/chosen": -3.7802467346191406, "logps/rejected": -4.119485378265381, "loss": 3.5209, "rewards/accuracies": 0.75, "rewards/chosen": -37.802467346191406, "rewards/margins": 3.392385482788086, "rewards/rejected": -41.194854736328125, "step": 2763 }, { "epoch": 0.37636165577342046, "grad_norm": 127.78746855103819, "learning_rate": 6.279355586113279e-07, "logits/chosen": 11.986597061157227, "logits/rejected": 12.267204284667969, "logps/chosen": -3.7350547313690186, "logps/rejected": -4.074811935424805, "loss": 4.1609, "rewards/accuracies": 0.75, "rewards/chosen": -37.350547790527344, "rewards/margins": 3.3975744247436523, "rewards/rejected": -40.74811935424805, "step": 2764 }, { "epoch": 0.37649782135076254, "grad_norm": 42.68397387962879, "learning_rate": 6.277792838734582e-07, "logits/chosen": 11.278337478637695, "logits/rejected": 12.191071510314941, "logps/chosen": -3.7397220134735107, "logps/rejected": -3.865010976791382, "loss": 4.0894, "rewards/accuracies": 0.75, "rewards/chosen": -37.397216796875, "rewards/margins": 1.252889633178711, "rewards/rejected": -38.650108337402344, "step": 2765 }, { "epoch": 0.37663398692810457, "grad_norm": 46.01784777949642, "learning_rate": 6.276229576669765e-07, "logits/chosen": 12.077957153320312, "logits/rejected": 11.990208625793457, "logps/chosen": -3.615938901901245, "logps/rejected": -4.000640392303467, "loss": 4.0869, "rewards/accuracies": 1.0, "rewards/chosen": -36.15938949584961, "rewards/margins": 3.8470144271850586, "rewards/rejected": -40.006404876708984, "step": 2766 }, { "epoch": 0.3767701525054466, "grad_norm": 44.12741791556, "learning_rate": 6.274665800272059e-07, "logits/chosen": 11.228543281555176, "logits/rejected": 12.321063995361328, "logps/chosen": -3.4875478744506836, "logps/rejected": -3.745178699493408, "loss": 4.1034, "rewards/accuracies": 0.75, "rewards/chosen": -34.87548065185547, "rewards/margins": 2.576305866241455, "rewards/rejected": -37.451786041259766, "step": 2767 }, { "epoch": 0.3769063180827887, "grad_norm": 42.6339121131561, "learning_rate": 6.273101509894813e-07, "logits/chosen": 12.053167343139648, "logits/rejected": 11.583436965942383, "logps/chosen": -3.7218356132507324, "logps/rejected": -3.501006603240967, "loss": 4.1206, "rewards/accuracies": 0.0, "rewards/chosen": -37.21835708618164, "rewards/margins": -2.2082910537719727, "rewards/rejected": -35.010066986083984, "step": 2768 }, { "epoch": 0.3770424836601307, "grad_norm": 54.434402829862115, "learning_rate": 6.27153670589149e-07, "logits/chosen": 11.798348426818848, "logits/rejected": 11.58967113494873, "logps/chosen": -3.4681129455566406, "logps/rejected": -3.8601574897766113, "loss": 4.1142, "rewards/accuracies": 0.75, "rewards/chosen": -34.681129455566406, "rewards/margins": 3.9204440116882324, "rewards/rejected": -38.6015739440918, "step": 2769 }, { "epoch": 0.37717864923747274, "grad_norm": 48.08232117951559, "learning_rate": 6.269971388615674e-07, "logits/chosen": 11.962201118469238, "logits/rejected": 11.542739868164062, "logps/chosen": -3.620840072631836, "logps/rejected": -3.5589261054992676, "loss": 3.7597, "rewards/accuracies": 0.5, "rewards/chosen": -36.20840072631836, "rewards/margins": -0.619138240814209, "rewards/rejected": -35.589263916015625, "step": 2770 }, { "epoch": 0.3773148148148148, "grad_norm": 51.60280013837759, "learning_rate": 6.268405558421057e-07, "logits/chosen": 11.322751998901367, "logits/rejected": 11.62639045715332, "logps/chosen": -3.3153624534606934, "logps/rejected": -3.4946811199188232, "loss": 3.5665, "rewards/accuracies": 1.0, "rewards/chosen": -33.15362548828125, "rewards/margins": 1.7931890487670898, "rewards/rejected": -34.946815490722656, "step": 2771 }, { "epoch": 0.37745098039215685, "grad_norm": 45.24762567479292, "learning_rate": 6.266839215661454e-07, "logits/chosen": 12.135523796081543, "logits/rejected": 12.529634475708008, "logps/chosen": -3.6818532943725586, "logps/rejected": -4.06436014175415, "loss": 3.9118, "rewards/accuracies": 1.0, "rewards/chosen": -36.81853103637695, "rewards/margins": 3.8250694274902344, "rewards/rejected": -40.64360046386719, "step": 2772 }, { "epoch": 0.3775871459694989, "grad_norm": 48.04458935563377, "learning_rate": 6.265272360690793e-07, "logits/chosen": 12.730703353881836, "logits/rejected": 13.540203094482422, "logps/chosen": -4.016548156738281, "logps/rejected": -4.214688777923584, "loss": 4.6771, "rewards/accuracies": 0.75, "rewards/chosen": -40.16548156738281, "rewards/margins": 1.9814062118530273, "rewards/rejected": -42.146888732910156, "step": 2773 }, { "epoch": 0.37772331154684097, "grad_norm": 55.942449266696585, "learning_rate": 6.263704993863116e-07, "logits/chosen": 11.92463493347168, "logits/rejected": 12.42447280883789, "logps/chosen": -3.9207763671875, "logps/rejected": -4.051253318786621, "loss": 4.2383, "rewards/accuracies": 0.75, "rewards/chosen": -39.207763671875, "rewards/margins": 1.3047704696655273, "rewards/rejected": -40.512535095214844, "step": 2774 }, { "epoch": 0.377859477124183, "grad_norm": 48.123963139399656, "learning_rate": 6.262137115532584e-07, "logits/chosen": 12.085551261901855, "logits/rejected": 12.530660629272461, "logps/chosen": -3.7515017986297607, "logps/rejected": -4.304830551147461, "loss": 3.9828, "rewards/accuracies": 1.0, "rewards/chosen": -37.515018463134766, "rewards/margins": 5.533291816711426, "rewards/rejected": -43.048309326171875, "step": 2775 }, { "epoch": 0.3779956427015251, "grad_norm": 51.94510660704565, "learning_rate": 6.260568726053472e-07, "logits/chosen": 11.17904281616211, "logits/rejected": 11.825590133666992, "logps/chosen": -3.4619879722595215, "logps/rejected": -3.662095785140991, "loss": 4.4453, "rewards/accuracies": 1.0, "rewards/chosen": -34.619876861572266, "rewards/margins": 2.0010814666748047, "rewards/rejected": -36.6209602355957, "step": 2776 }, { "epoch": 0.3781318082788671, "grad_norm": 49.97583199526154, "learning_rate": 6.25899982578017e-07, "logits/chosen": 12.385002136230469, "logits/rejected": 12.597164154052734, "logps/chosen": -4.157184600830078, "logps/rejected": -3.9249157905578613, "loss": 4.3473, "rewards/accuracies": 0.25, "rewards/chosen": -41.57184982299805, "rewards/margins": -2.3226919174194336, "rewards/rejected": -39.2491569519043, "step": 2777 }, { "epoch": 0.37826797385620914, "grad_norm": 47.74525413883555, "learning_rate": 6.257430415067185e-07, "logits/chosen": 11.217037200927734, "logits/rejected": 11.373261451721191, "logps/chosen": -3.665278434753418, "logps/rejected": -3.512361526489258, "loss": 4.0931, "rewards/accuracies": 0.25, "rewards/chosen": -36.65278625488281, "rewards/margins": -1.5291671752929688, "rewards/rejected": -35.123619079589844, "step": 2778 }, { "epoch": 0.3784041394335512, "grad_norm": 44.25079650475654, "learning_rate": 6.255860494269137e-07, "logits/chosen": 10.626145362854004, "logits/rejected": 11.86949634552002, "logps/chosen": -3.629739284515381, "logps/rejected": -3.8871073722839355, "loss": 3.8148, "rewards/accuracies": 0.75, "rewards/chosen": -36.297393798828125, "rewards/margins": 2.5736799240112305, "rewards/rejected": -38.87107467651367, "step": 2779 }, { "epoch": 0.37854030501089325, "grad_norm": 75.49666368960733, "learning_rate": 6.254290063740763e-07, "logits/chosen": 11.536552429199219, "logits/rejected": 11.92672348022461, "logps/chosen": -3.7593092918395996, "logps/rejected": -3.889357089996338, "loss": 4.3829, "rewards/accuracies": 0.5, "rewards/chosen": -37.59309387207031, "rewards/margins": 1.30047607421875, "rewards/rejected": -38.89356994628906, "step": 2780 }, { "epoch": 0.3786764705882353, "grad_norm": 45.050372151367206, "learning_rate": 6.252719123836915e-07, "logits/chosen": 11.383672714233398, "logits/rejected": 11.404402732849121, "logps/chosen": -3.394082546234131, "logps/rejected": -3.6645472049713135, "loss": 4.2876, "rewards/accuracies": 0.5, "rewards/chosen": -33.940826416015625, "rewards/margins": 2.7046456336975098, "rewards/rejected": -36.64547348022461, "step": 2781 }, { "epoch": 0.37881263616557737, "grad_norm": 46.022978608113895, "learning_rate": 6.251147674912561e-07, "logits/chosen": 11.343515396118164, "logits/rejected": 12.483527183532715, "logps/chosen": -3.7384047508239746, "logps/rejected": -3.8163137435913086, "loss": 3.7042, "rewards/accuracies": 0.75, "rewards/chosen": -37.38404846191406, "rewards/margins": 0.7790927886962891, "rewards/rejected": -38.16313934326172, "step": 2782 }, { "epoch": 0.3789488017429194, "grad_norm": 42.96405439826541, "learning_rate": 6.249575717322779e-07, "logits/chosen": 11.478645324707031, "logits/rejected": 11.811188697814941, "logps/chosen": -3.8019583225250244, "logps/rejected": -3.694291830062866, "loss": 4.2078, "rewards/accuracies": 0.25, "rewards/chosen": -38.01958465576172, "rewards/margins": -1.0766654014587402, "rewards/rejected": -36.94291687011719, "step": 2783 }, { "epoch": 0.3790849673202614, "grad_norm": 45.45799846414211, "learning_rate": 6.248003251422771e-07, "logits/chosen": 11.104750633239746, "logits/rejected": 11.442846298217773, "logps/chosen": -3.452380418777466, "logps/rejected": -3.640092611312866, "loss": 3.8891, "rewards/accuracies": 0.75, "rewards/chosen": -34.5238037109375, "rewards/margins": 1.8771238327026367, "rewards/rejected": -36.40092849731445, "step": 2784 }, { "epoch": 0.3792211328976035, "grad_norm": 44.45839275116951, "learning_rate": 6.246430277567846e-07, "logits/chosen": 12.168364524841309, "logits/rejected": 11.92992115020752, "logps/chosen": -3.7074785232543945, "logps/rejected": -3.936994791030884, "loss": 4.2249, "rewards/accuracies": 0.5, "rewards/chosen": -37.07478332519531, "rewards/margins": 2.2951622009277344, "rewards/rejected": -39.36994934082031, "step": 2785 }, { "epoch": 0.37935729847494554, "grad_norm": 42.3833975404901, "learning_rate": 6.244856796113429e-07, "logits/chosen": 11.995870590209961, "logits/rejected": 11.704553604125977, "logps/chosen": -3.7814581394195557, "logps/rejected": -3.9759199619293213, "loss": 4.0057, "rewards/accuracies": 0.75, "rewards/chosen": -37.81458282470703, "rewards/margins": 1.9446182250976562, "rewards/rejected": -39.75920104980469, "step": 2786 }, { "epoch": 0.37949346405228757, "grad_norm": 40.72320654284707, "learning_rate": 6.243282807415063e-07, "logits/chosen": 11.671073913574219, "logits/rejected": 11.79914665222168, "logps/chosen": -3.4574615955352783, "logps/rejected": -3.580122947692871, "loss": 4.1974, "rewards/accuracies": 0.5, "rewards/chosen": -34.574615478515625, "rewards/margins": 1.2266106605529785, "rewards/rejected": -35.80122375488281, "step": 2787 }, { "epoch": 0.37962962962962965, "grad_norm": 44.16426797439462, "learning_rate": 6.241708311828406e-07, "logits/chosen": 11.162839889526367, "logits/rejected": 11.446640014648438, "logps/chosen": -3.4802405834198, "logps/rejected": -3.672039031982422, "loss": 4.158, "rewards/accuracies": 0.5, "rewards/chosen": -34.802406311035156, "rewards/margins": 1.9179844856262207, "rewards/rejected": -36.72039031982422, "step": 2788 }, { "epoch": 0.3797657952069717, "grad_norm": 48.002660910625885, "learning_rate": 6.240133309709223e-07, "logits/chosen": 10.835848808288574, "logits/rejected": 12.217239379882812, "logps/chosen": -3.233675241470337, "logps/rejected": -3.7383506298065186, "loss": 4.4102, "rewards/accuracies": 1.0, "rewards/chosen": -32.336753845214844, "rewards/margins": 5.046753406524658, "rewards/rejected": -37.383506774902344, "step": 2789 }, { "epoch": 0.3799019607843137, "grad_norm": 39.49046105537438, "learning_rate": 6.238557801413402e-07, "logits/chosen": 11.213544845581055, "logits/rejected": 11.606523513793945, "logps/chosen": -3.066601276397705, "logps/rejected": -3.5313544273376465, "loss": 3.8484, "rewards/accuracies": 0.75, "rewards/chosen": -30.666011810302734, "rewards/margins": 4.647531986236572, "rewards/rejected": -35.31354522705078, "step": 2790 }, { "epoch": 0.3800381263616558, "grad_norm": 46.34293947881397, "learning_rate": 6.236981787296942e-07, "logits/chosen": 11.121315002441406, "logits/rejected": 10.45273208618164, "logps/chosen": -3.1858136653900146, "logps/rejected": -3.4342806339263916, "loss": 4.0438, "rewards/accuracies": 1.0, "rewards/chosen": -31.858137130737305, "rewards/margins": 2.4846696853637695, "rewards/rejected": -34.342803955078125, "step": 2791 }, { "epoch": 0.3801742919389978, "grad_norm": 126.73814646227945, "learning_rate": 6.235405267715955e-07, "logits/chosen": 12.154685974121094, "logits/rejected": 11.841038703918457, "logps/chosen": -3.7241454124450684, "logps/rejected": -4.132770538330078, "loss": 4.1967, "rewards/accuracies": 1.0, "rewards/chosen": -37.241455078125, "rewards/margins": 4.08625602722168, "rewards/rejected": -41.32770919799805, "step": 2792 }, { "epoch": 0.38031045751633985, "grad_norm": 41.6362119452298, "learning_rate": 6.233828243026673e-07, "logits/chosen": 10.989065170288086, "logits/rejected": 11.45551872253418, "logps/chosen": -3.5486345291137695, "logps/rejected": -3.662630796432495, "loss": 4.1124, "rewards/accuracies": 0.75, "rewards/chosen": -35.48634338378906, "rewards/margins": 1.1399650573730469, "rewards/rejected": -36.62630844116211, "step": 2793 }, { "epoch": 0.38044662309368193, "grad_norm": 41.21327428314335, "learning_rate": 6.232250713585432e-07, "logits/chosen": 10.87070083618164, "logits/rejected": 11.079814910888672, "logps/chosen": -3.3927628993988037, "logps/rejected": -3.42482590675354, "loss": 3.5069, "rewards/accuracies": 0.25, "rewards/chosen": -33.92762756347656, "rewards/margins": 0.3206315040588379, "rewards/rejected": -34.248260498046875, "step": 2794 }, { "epoch": 0.38058278867102396, "grad_norm": 56.74884142900814, "learning_rate": 6.230672679748691e-07, "logits/chosen": 10.318489074707031, "logits/rejected": 11.034103393554688, "logps/chosen": -3.156355857849121, "logps/rejected": -3.329578399658203, "loss": 4.0893, "rewards/accuracies": 0.5, "rewards/chosen": -31.563556671142578, "rewards/margins": 1.7322254180908203, "rewards/rejected": -33.29578399658203, "step": 2795 }, { "epoch": 0.380718954248366, "grad_norm": 47.21756453580767, "learning_rate": 6.229094141873019e-07, "logits/chosen": 10.73375415802002, "logits/rejected": 11.28353500366211, "logps/chosen": -3.625615119934082, "logps/rejected": -3.5627076625823975, "loss": 4.176, "rewards/accuracies": 0.5, "rewards/chosen": -36.25615310668945, "rewards/margins": -0.6290750503540039, "rewards/rejected": -35.627079010009766, "step": 2796 }, { "epoch": 0.3808551198257081, "grad_norm": 43.17421441423761, "learning_rate": 6.227515100315099e-07, "logits/chosen": 10.472299575805664, "logits/rejected": 10.93300724029541, "logps/chosen": -3.2509372234344482, "logps/rejected": -3.3951950073242188, "loss": 3.8456, "rewards/accuracies": 0.5, "rewards/chosen": -32.50937271118164, "rewards/margins": 1.4425787925720215, "rewards/rejected": -33.95195007324219, "step": 2797 }, { "epoch": 0.3809912854030501, "grad_norm": 59.31057291575199, "learning_rate": 6.22593555543173e-07, "logits/chosen": 10.95348072052002, "logits/rejected": 11.065495491027832, "logps/chosen": -3.795651435852051, "logps/rejected": -3.993913173675537, "loss": 3.4898, "rewards/accuracies": 0.75, "rewards/chosen": -37.956512451171875, "rewards/margins": 1.9826183319091797, "rewards/rejected": -39.93913269042969, "step": 2798 }, { "epoch": 0.38112745098039214, "grad_norm": 45.14545418186428, "learning_rate": 6.224355507579822e-07, "logits/chosen": 11.328987121582031, "logits/rejected": 11.387325286865234, "logps/chosen": -3.558603525161743, "logps/rejected": -3.5689425468444824, "loss": 3.594, "rewards/accuracies": 0.5, "rewards/chosen": -35.58603286743164, "rewards/margins": 0.10338973999023438, "rewards/rejected": -35.689422607421875, "step": 2799 }, { "epoch": 0.3812636165577342, "grad_norm": 44.40189308668213, "learning_rate": 6.222774957116401e-07, "logits/chosen": 10.767433166503906, "logits/rejected": 10.291611671447754, "logps/chosen": -3.4178857803344727, "logps/rejected": -3.580833911895752, "loss": 3.7692, "rewards/accuracies": 0.75, "rewards/chosen": -34.178855895996094, "rewards/margins": 1.6294822692871094, "rewards/rejected": -35.80834197998047, "step": 2800 }, { "epoch": 0.38139978213507625, "grad_norm": 64.8001799341774, "learning_rate": 6.221193904398604e-07, "logits/chosen": 10.800989151000977, "logits/rejected": 11.442585945129395, "logps/chosen": -3.5001535415649414, "logps/rejected": -3.6817095279693604, "loss": 4.0301, "rewards/accuracies": 0.75, "rewards/chosen": -35.00153350830078, "rewards/margins": 1.815561294555664, "rewards/rejected": -36.81709671020508, "step": 2801 }, { "epoch": 0.3815359477124183, "grad_norm": 44.725506181976336, "learning_rate": 6.219612349783684e-07, "logits/chosen": 10.998517990112305, "logits/rejected": 11.280645370483398, "logps/chosen": -3.366692066192627, "logps/rejected": -3.5185093879699707, "loss": 4.0078, "rewards/accuracies": 0.5, "rewards/chosen": -33.66692352294922, "rewards/margins": 1.5181736946105957, "rewards/rejected": -35.185096740722656, "step": 2802 }, { "epoch": 0.38167211328976036, "grad_norm": 49.74509208689452, "learning_rate": 6.218030293629007e-07, "logits/chosen": 11.301301956176758, "logits/rejected": 11.588970184326172, "logps/chosen": -3.5524747371673584, "logps/rejected": -3.682767152786255, "loss": 4.7351, "rewards/accuracies": 0.75, "rewards/chosen": -35.52474594116211, "rewards/margins": 1.3029227256774902, "rewards/rejected": -36.82767105102539, "step": 2803 }, { "epoch": 0.3818082788671024, "grad_norm": 44.06235297347529, "learning_rate": 6.21644773629205e-07, "logits/chosen": 10.58653736114502, "logits/rejected": 10.622222900390625, "logps/chosen": -3.413421154022217, "logps/rejected": -3.3755879402160645, "loss": 3.7984, "rewards/accuracies": 0.25, "rewards/chosen": -34.134212493896484, "rewards/margins": -0.37833213806152344, "rewards/rejected": -33.75587844848633, "step": 2804 }, { "epoch": 0.3819444444444444, "grad_norm": 40.137709832099034, "learning_rate": 6.214864678130405e-07, "logits/chosen": 10.756660461425781, "logits/rejected": 11.601381301879883, "logps/chosen": -3.0875418186187744, "logps/rejected": -3.736827850341797, "loss": 3.9814, "rewards/accuracies": 1.0, "rewards/chosen": -30.87541961669922, "rewards/margins": 6.492859363555908, "rewards/rejected": -37.36827850341797, "step": 2805 }, { "epoch": 0.3820806100217865, "grad_norm": 44.614190780010176, "learning_rate": 6.213281119501779e-07, "logits/chosen": 10.273151397705078, "logits/rejected": 10.238717079162598, "logps/chosen": -3.099468231201172, "logps/rejected": -2.9637372493743896, "loss": 4.3107, "rewards/accuracies": 0.25, "rewards/chosen": -30.99468231201172, "rewards/margins": -1.3573088645935059, "rewards/rejected": -29.637372970581055, "step": 2806 }, { "epoch": 0.38221677559912853, "grad_norm": 51.128466115595465, "learning_rate": 6.211697060763989e-07, "logits/chosen": 10.20438289642334, "logits/rejected": 9.319863319396973, "logps/chosen": -2.952451229095459, "logps/rejected": -3.2015492916107178, "loss": 4.0299, "rewards/accuracies": 0.5, "rewards/chosen": -29.524513244628906, "rewards/margins": 2.4909801483154297, "rewards/rejected": -32.01549530029297, "step": 2807 }, { "epoch": 0.38235294117647056, "grad_norm": 55.409458307023186, "learning_rate": 6.210112502274964e-07, "logits/chosen": 11.241992950439453, "logits/rejected": 11.178723335266113, "logps/chosen": -3.0640857219696045, "logps/rejected": -3.560236930847168, "loss": 4.2313, "rewards/accuracies": 1.0, "rewards/chosen": -30.64085578918457, "rewards/margins": 4.961512088775635, "rewards/rejected": -35.60237121582031, "step": 2808 }, { "epoch": 0.38248910675381265, "grad_norm": 50.544672380024174, "learning_rate": 6.208527444392752e-07, "logits/chosen": 8.69536304473877, "logits/rejected": 10.50152587890625, "logps/chosen": -3.3945257663726807, "logps/rejected": -3.7868659496307373, "loss": 4.751, "rewards/accuracies": 0.75, "rewards/chosen": -33.94525909423828, "rewards/margins": 3.9233999252319336, "rewards/rejected": -37.868656158447266, "step": 2809 }, { "epoch": 0.3826252723311547, "grad_norm": 62.03827253979223, "learning_rate": 6.206941887475507e-07, "logits/chosen": 10.843400001525879, "logits/rejected": 10.80426025390625, "logps/chosen": -3.5054750442504883, "logps/rejected": -3.4036669731140137, "loss": 4.5606, "rewards/accuracies": 0.5, "rewards/chosen": -35.05474853515625, "rewards/margins": -1.018080234527588, "rewards/rejected": -34.03666687011719, "step": 2810 }, { "epoch": 0.3827614379084967, "grad_norm": 49.36570101909371, "learning_rate": 6.2053558318815e-07, "logits/chosen": 10.395944595336914, "logits/rejected": 10.741154670715332, "logps/chosen": -3.0811195373535156, "logps/rejected": -3.75083589553833, "loss": 4.3944, "rewards/accuracies": 0.75, "rewards/chosen": -30.811195373535156, "rewards/margins": 6.697162628173828, "rewards/rejected": -37.508358001708984, "step": 2811 }, { "epoch": 0.3828976034858388, "grad_norm": 45.185829983736234, "learning_rate": 6.203769277969113e-07, "logits/chosen": 10.743242263793945, "logits/rejected": 11.091299057006836, "logps/chosen": -3.6277503967285156, "logps/rejected": -3.7837278842926025, "loss": 4.3548, "rewards/accuracies": 0.75, "rewards/chosen": -36.277503967285156, "rewards/margins": 1.5597739219665527, "rewards/rejected": -37.8372802734375, "step": 2812 }, { "epoch": 0.3830337690631808, "grad_norm": 62.408052147445716, "learning_rate": 6.202182226096842e-07, "logits/chosen": 12.017087936401367, "logits/rejected": 11.803936004638672, "logps/chosen": -3.643220901489258, "logps/rejected": -3.6549136638641357, "loss": 3.9968, "rewards/accuracies": 0.75, "rewards/chosen": -36.43220901489258, "rewards/margins": 0.1169285774230957, "rewards/rejected": -36.549137115478516, "step": 2813 }, { "epoch": 0.3831699346405229, "grad_norm": 44.327138806852396, "learning_rate": 6.200594676623293e-07, "logits/chosen": 10.807022094726562, "logits/rejected": 11.203904151916504, "logps/chosen": -3.695528984069824, "logps/rejected": -3.7730178833007812, "loss": 3.7492, "rewards/accuracies": 0.75, "rewards/chosen": -36.95528793334961, "rewards/margins": 0.7748880386352539, "rewards/rejected": -37.73017883300781, "step": 2814 }, { "epoch": 0.38330610021786493, "grad_norm": 57.19351425882514, "learning_rate": 6.199006629907186e-07, "logits/chosen": 12.441108703613281, "logits/rejected": 11.908470153808594, "logps/chosen": -3.766646146774292, "logps/rejected": -3.7156660556793213, "loss": 3.7006, "rewards/accuracies": 0.25, "rewards/chosen": -37.66646194458008, "rewards/margins": -0.5097999572753906, "rewards/rejected": -37.15666198730469, "step": 2815 }, { "epoch": 0.38344226579520696, "grad_norm": 42.11709081624755, "learning_rate": 6.197418086307355e-07, "logits/chosen": 9.860145568847656, "logits/rejected": 11.550271987915039, "logps/chosen": -3.6174397468566895, "logps/rejected": -3.8098909854888916, "loss": 4.1631, "rewards/accuracies": 0.5, "rewards/chosen": -36.174400329589844, "rewards/margins": 1.9245109558105469, "rewards/rejected": -38.098907470703125, "step": 2816 }, { "epoch": 0.38357843137254904, "grad_norm": 43.89876361617811, "learning_rate": 6.195829046182742e-07, "logits/chosen": 10.119730949401855, "logits/rejected": 11.503471374511719, "logps/chosen": -3.4763920307159424, "logps/rejected": -3.7176706790924072, "loss": 4.1821, "rewards/accuracies": 0.75, "rewards/chosen": -34.763919830322266, "rewards/margins": 2.4127869606018066, "rewards/rejected": -37.17670822143555, "step": 2817 }, { "epoch": 0.3837145969498911, "grad_norm": 45.89093944672534, "learning_rate": 6.194239509892407e-07, "logits/chosen": 10.975715637207031, "logits/rejected": 12.124345779418945, "logps/chosen": -3.343597412109375, "logps/rejected": -3.8237130641937256, "loss": 4.1542, "rewards/accuracies": 1.0, "rewards/chosen": -33.43597412109375, "rewards/margins": 4.801157474517822, "rewards/rejected": -38.23712921142578, "step": 2818 }, { "epoch": 0.3838507625272331, "grad_norm": 53.73181745277534, "learning_rate": 6.192649477795515e-07, "logits/chosen": 11.318973541259766, "logits/rejected": 11.365346908569336, "logps/chosen": -3.4134082794189453, "logps/rejected": -3.5375444889068604, "loss": 4.7102, "rewards/accuracies": 0.5, "rewards/chosen": -34.13407897949219, "rewards/margins": 1.2413640022277832, "rewards/rejected": -35.37544250488281, "step": 2819 }, { "epoch": 0.3839869281045752, "grad_norm": 37.849743085954806, "learning_rate": 6.19105895025135e-07, "logits/chosen": 11.20650863647461, "logits/rejected": 11.147748947143555, "logps/chosen": -3.6541314125061035, "logps/rejected": -3.8913559913635254, "loss": 3.4602, "rewards/accuracies": 0.75, "rewards/chosen": -36.54131317138672, "rewards/margins": 2.372248649597168, "rewards/rejected": -38.91355895996094, "step": 2820 }, { "epoch": 0.3841230936819172, "grad_norm": 100.39464397764557, "learning_rate": 6.189467927619304e-07, "logits/chosen": 11.596519470214844, "logits/rejected": 11.787124633789062, "logps/chosen": -3.1274895668029785, "logps/rejected": -3.2302117347717285, "loss": 3.9845, "rewards/accuracies": 0.75, "rewards/chosen": -31.27489471435547, "rewards/margins": 1.027219295501709, "rewards/rejected": -32.30211639404297, "step": 2821 }, { "epoch": 0.38425925925925924, "grad_norm": 45.9517584744854, "learning_rate": 6.18787641025888e-07, "logits/chosen": 11.796655654907227, "logits/rejected": 10.116600036621094, "logps/chosen": -3.3578319549560547, "logps/rejected": -3.146864414215088, "loss": 4.566, "rewards/accuracies": 0.25, "rewards/chosen": -33.57831573486328, "rewards/margins": -2.1096725463867188, "rewards/rejected": -31.468647003173828, "step": 2822 }, { "epoch": 0.38439542483660133, "grad_norm": 40.04125263160398, "learning_rate": 6.186284398529696e-07, "logits/chosen": 10.130800247192383, "logits/rejected": 10.416142463684082, "logps/chosen": -3.1067652702331543, "logps/rejected": -3.432384967803955, "loss": 3.8985, "rewards/accuracies": 0.75, "rewards/chosen": -31.06765365600586, "rewards/margins": 3.2561964988708496, "rewards/rejected": -34.323848724365234, "step": 2823 }, { "epoch": 0.38453159041394336, "grad_norm": 46.15182455531094, "learning_rate": 6.184691892791482e-07, "logits/chosen": 10.628684043884277, "logits/rejected": 10.590629577636719, "logps/chosen": -3.4617555141448975, "logps/rejected": -3.3593521118164062, "loss": 4.3769, "rewards/accuracies": 0.25, "rewards/chosen": -34.6175537109375, "rewards/margins": -1.024034023284912, "rewards/rejected": -33.59352111816406, "step": 2824 }, { "epoch": 0.3846677559912854, "grad_norm": 47.08979165175097, "learning_rate": 6.183098893404075e-07, "logits/chosen": 10.871685028076172, "logits/rejected": 11.268251419067383, "logps/chosen": -3.366227626800537, "logps/rejected": -3.438847541809082, "loss": 3.7402, "rewards/accuracies": 0.5, "rewards/chosen": -33.66227722167969, "rewards/margins": 0.7261991500854492, "rewards/rejected": -34.38847732543945, "step": 2825 }, { "epoch": 0.38480392156862747, "grad_norm": 52.91839120819857, "learning_rate": 6.181505400727428e-07, "logits/chosen": 10.757440567016602, "logits/rejected": 11.870939254760742, "logps/chosen": -3.4303770065307617, "logps/rejected": -3.6180200576782227, "loss": 3.8634, "rewards/accuracies": 0.75, "rewards/chosen": -34.30377197265625, "rewards/margins": 1.8764305114746094, "rewards/rejected": -36.180198669433594, "step": 2826 }, { "epoch": 0.3849400871459695, "grad_norm": 41.87822650865816, "learning_rate": 6.179911415121602e-07, "logits/chosen": 11.012075424194336, "logits/rejected": 12.504048347473145, "logps/chosen": -3.2732534408569336, "logps/rejected": -3.7695882320404053, "loss": 3.7313, "rewards/accuracies": 1.0, "rewards/chosen": -32.7325325012207, "rewards/margins": 4.963348865509033, "rewards/rejected": -37.695884704589844, "step": 2827 }, { "epoch": 0.38507625272331153, "grad_norm": 53.87124334559555, "learning_rate": 6.178316936946772e-07, "logits/chosen": 11.559771537780762, "logits/rejected": 13.093982696533203, "logps/chosen": -3.4326720237731934, "logps/rejected": -3.815939426422119, "loss": 3.9118, "rewards/accuracies": 0.75, "rewards/chosen": -34.32672119140625, "rewards/margins": 3.832674980163574, "rewards/rejected": -38.159393310546875, "step": 2828 }, { "epoch": 0.3852124183006536, "grad_norm": 44.48094452637199, "learning_rate": 6.176721966563224e-07, "logits/chosen": 11.598226547241211, "logits/rejected": 11.91992473602295, "logps/chosen": -3.512956142425537, "logps/rejected": -3.7174723148345947, "loss": 4.2112, "rewards/accuracies": 0.75, "rewards/chosen": -35.12956237792969, "rewards/margins": 2.045163631439209, "rewards/rejected": -37.174720764160156, "step": 2829 }, { "epoch": 0.38534858387799564, "grad_norm": 46.603180970813916, "learning_rate": 6.175126504331357e-07, "logits/chosen": 11.437049865722656, "logits/rejected": 11.041790962219238, "logps/chosen": -3.362187623977661, "logps/rejected": -3.5067124366760254, "loss": 3.6028, "rewards/accuracies": 0.75, "rewards/chosen": -33.62187957763672, "rewards/margins": 1.4452471733093262, "rewards/rejected": -35.06712341308594, "step": 2830 }, { "epoch": 0.38548474945533767, "grad_norm": 46.20582980098318, "learning_rate": 6.173530550611675e-07, "logits/chosen": 11.061661720275879, "logits/rejected": 11.219640731811523, "logps/chosen": -3.8907318115234375, "logps/rejected": -3.765957832336426, "loss": 4.4324, "rewards/accuracies": 0.5, "rewards/chosen": -38.907318115234375, "rewards/margins": -1.2477383613586426, "rewards/rejected": -37.65958023071289, "step": 2831 }, { "epoch": 0.38562091503267976, "grad_norm": 43.90412214056246, "learning_rate": 6.171934105764797e-07, "logits/chosen": 8.780752182006836, "logits/rejected": 10.94894027709961, "logps/chosen": -3.23514461517334, "logps/rejected": -3.6040337085723877, "loss": 4.1376, "rewards/accuracies": 0.75, "rewards/chosen": -32.35144805908203, "rewards/margins": 3.6888928413391113, "rewards/rejected": -36.04033660888672, "step": 2832 }, { "epoch": 0.3857570806100218, "grad_norm": 44.64235089752929, "learning_rate": 6.170337170151457e-07, "logits/chosen": 10.310336112976074, "logits/rejected": 10.761093139648438, "logps/chosen": -3.2158284187316895, "logps/rejected": -3.4794294834136963, "loss": 4.0543, "rewards/accuracies": 0.75, "rewards/chosen": -32.158287048339844, "rewards/margins": 2.636009693145752, "rewards/rejected": -34.79429626464844, "step": 2833 }, { "epoch": 0.3858932461873638, "grad_norm": 43.456531799048356, "learning_rate": 6.168739744132492e-07, "logits/chosen": 12.329473495483398, "logits/rejected": 12.011805534362793, "logps/chosen": -3.523838996887207, "logps/rejected": -3.6773972511291504, "loss": 3.8387, "rewards/accuracies": 0.5, "rewards/chosen": -35.23838806152344, "rewards/margins": 1.535581111907959, "rewards/rejected": -36.77397155761719, "step": 2834 }, { "epoch": 0.3860294117647059, "grad_norm": 41.42162595490188, "learning_rate": 6.167141828068855e-07, "logits/chosen": 10.57697868347168, "logits/rejected": 12.509831428527832, "logps/chosen": -3.491795301437378, "logps/rejected": -3.8707473278045654, "loss": 2.9408, "rewards/accuracies": 0.75, "rewards/chosen": -34.91795349121094, "rewards/margins": 3.7895188331604004, "rewards/rejected": -38.70747375488281, "step": 2835 }, { "epoch": 0.3861655773420479, "grad_norm": 45.57187896088255, "learning_rate": 6.165543422321609e-07, "logits/chosen": 12.264101028442383, "logits/rejected": 12.196405410766602, "logps/chosen": -3.817605972290039, "logps/rejected": -3.9328346252441406, "loss": 4.3247, "rewards/accuracies": 0.5, "rewards/chosen": -38.176063537597656, "rewards/margins": 1.1522846221923828, "rewards/rejected": -39.328346252441406, "step": 2836 }, { "epoch": 0.38630174291938996, "grad_norm": 47.21934710634155, "learning_rate": 6.163944527251925e-07, "logits/chosen": 12.385473251342773, "logits/rejected": 13.016156196594238, "logps/chosen": -4.046324729919434, "logps/rejected": -4.040454864501953, "loss": 3.4624, "rewards/accuracies": 0.5, "rewards/chosen": -40.46324920654297, "rewards/margins": -0.058696746826171875, "rewards/rejected": -40.4045524597168, "step": 2837 }, { "epoch": 0.38643790849673204, "grad_norm": 46.15866593670856, "learning_rate": 6.162345143221088e-07, "logits/chosen": 12.333047866821289, "logits/rejected": 11.960527420043945, "logps/chosen": -3.7038512229919434, "logps/rejected": -3.7537784576416016, "loss": 4.3591, "rewards/accuracies": 0.5, "rewards/chosen": -37.03851318359375, "rewards/margins": 0.49927234649658203, "rewards/rejected": -37.537784576416016, "step": 2838 }, { "epoch": 0.38657407407407407, "grad_norm": 47.24420875590826, "learning_rate": 6.160745270590493e-07, "logits/chosen": 10.715831756591797, "logits/rejected": 12.642660140991211, "logps/chosen": -3.3900256156921387, "logps/rejected": -3.6901304721832275, "loss": 3.85, "rewards/accuracies": 0.5, "rewards/chosen": -33.9002571105957, "rewards/margins": 3.0010461807250977, "rewards/rejected": -36.90130615234375, "step": 2839 }, { "epoch": 0.3867102396514161, "grad_norm": 50.73912372450881, "learning_rate": 6.15914490972164e-07, "logits/chosen": 11.782064437866211, "logits/rejected": 12.069002151489258, "logps/chosen": -3.5459184646606445, "logps/rejected": -3.7851619720458984, "loss": 4.1756, "rewards/accuracies": 0.5, "rewards/chosen": -35.45918273925781, "rewards/margins": 2.3924331665039062, "rewards/rejected": -37.85161590576172, "step": 2840 }, { "epoch": 0.3868464052287582, "grad_norm": 47.49362798420218, "learning_rate": 6.15754406097615e-07, "logits/chosen": 11.532906532287598, "logits/rejected": 13.379650115966797, "logps/chosen": -3.378380060195923, "logps/rejected": -4.106446266174316, "loss": 4.6183, "rewards/accuracies": 0.75, "rewards/chosen": -33.7838020324707, "rewards/margins": 7.280657768249512, "rewards/rejected": -41.06446075439453, "step": 2841 }, { "epoch": 0.3869825708061002, "grad_norm": 44.16540949071974, "learning_rate": 6.155942724715744e-07, "logits/chosen": 12.370004653930664, "logits/rejected": 12.830453872680664, "logps/chosen": -3.7139196395874023, "logps/rejected": -4.176851272583008, "loss": 3.951, "rewards/accuracies": 0.75, "rewards/chosen": -37.139198303222656, "rewards/margins": 4.629315376281738, "rewards/rejected": -41.768516540527344, "step": 2842 }, { "epoch": 0.38711873638344224, "grad_norm": 46.27532153104491, "learning_rate": 6.154340901302257e-07, "logits/chosen": 12.593149185180664, "logits/rejected": 12.467138290405273, "logps/chosen": -4.130403518676758, "logps/rejected": -4.097543716430664, "loss": 3.8657, "rewards/accuracies": 0.5, "rewards/chosen": -41.30403518676758, "rewards/margins": -0.3285989761352539, "rewards/rejected": -40.97543716430664, "step": 2843 }, { "epoch": 0.3872549019607843, "grad_norm": 42.43216684643948, "learning_rate": 6.152738591097637e-07, "logits/chosen": 12.797147750854492, "logits/rejected": 14.007179260253906, "logps/chosen": -3.488629102706909, "logps/rejected": -3.978243350982666, "loss": 3.726, "rewards/accuracies": 0.75, "rewards/chosen": -34.88629150390625, "rewards/margins": 4.89614200592041, "rewards/rejected": -39.782432556152344, "step": 2844 }, { "epoch": 0.38739106753812635, "grad_norm": 43.35721235640409, "learning_rate": 6.151135794463937e-07, "logits/chosen": 12.52506160736084, "logits/rejected": 12.863710403442383, "logps/chosen": -3.476308584213257, "logps/rejected": -3.752279043197632, "loss": 4.3267, "rewards/accuracies": 1.0, "rewards/chosen": -34.763084411621094, "rewards/margins": 2.7597055435180664, "rewards/rejected": -37.522789001464844, "step": 2845 }, { "epoch": 0.3875272331154684, "grad_norm": 51.08521314596891, "learning_rate": 6.149532511763323e-07, "logits/chosen": 11.98825454711914, "logits/rejected": 12.40420913696289, "logps/chosen": -3.945094585418701, "logps/rejected": -4.125500679016113, "loss": 4.2665, "rewards/accuracies": 0.75, "rewards/chosen": -39.45094680786133, "rewards/margins": 1.8040623664855957, "rewards/rejected": -41.255008697509766, "step": 2846 }, { "epoch": 0.38766339869281047, "grad_norm": 61.508570552025965, "learning_rate": 6.147928743358071e-07, "logits/chosen": 12.094865798950195, "logits/rejected": 12.849119186401367, "logps/chosen": -3.770641326904297, "logps/rejected": -3.9459526538848877, "loss": 3.782, "rewards/accuracies": 0.5, "rewards/chosen": -37.70641326904297, "rewards/margins": 1.75311279296875, "rewards/rejected": -39.45952606201172, "step": 2847 }, { "epoch": 0.3877995642701525, "grad_norm": 45.06960987454, "learning_rate": 6.146324489610563e-07, "logits/chosen": 12.198564529418945, "logits/rejected": 12.529041290283203, "logps/chosen": -3.6670639514923096, "logps/rejected": -3.8822436332702637, "loss": 4.2516, "rewards/accuracies": 0.75, "rewards/chosen": -36.67063903808594, "rewards/margins": 2.151797294616699, "rewards/rejected": -38.82243728637695, "step": 2848 }, { "epoch": 0.3879357298474945, "grad_norm": 45.42485781522036, "learning_rate": 6.144719750883294e-07, "logits/chosen": 12.198269844055176, "logits/rejected": 12.837480545043945, "logps/chosen": -4.008089542388916, "logps/rejected": -3.8757543563842773, "loss": 3.9263, "rewards/accuracies": 0.25, "rewards/chosen": -40.08089828491211, "rewards/margins": -1.3233532905578613, "rewards/rejected": -38.757545471191406, "step": 2849 }, { "epoch": 0.3880718954248366, "grad_norm": 39.36494432029135, "learning_rate": 6.14311452753887e-07, "logits/chosen": 12.351449012756348, "logits/rejected": 11.93532943725586, "logps/chosen": -3.7272043228149414, "logps/rejected": -3.6948046684265137, "loss": 3.6295, "rewards/accuracies": 0.25, "rewards/chosen": -37.27204132080078, "rewards/margins": -0.3239936828613281, "rewards/rejected": -36.94804763793945, "step": 2850 }, { "epoch": 0.38820806100217864, "grad_norm": 58.17769124219415, "learning_rate": 6.141508819940004e-07, "logits/chosen": 12.505670547485352, "logits/rejected": 12.64723014831543, "logps/chosen": -3.7026405334472656, "logps/rejected": -3.982046365737915, "loss": 4.3746, "rewards/accuracies": 1.0, "rewards/chosen": -37.026405334472656, "rewards/margins": 2.794057846069336, "rewards/rejected": -39.820465087890625, "step": 2851 }, { "epoch": 0.3883442265795207, "grad_norm": 54.82319778984107, "learning_rate": 6.139902628449517e-07, "logits/chosen": 12.821056365966797, "logits/rejected": 12.809596061706543, "logps/chosen": -3.6137242317199707, "logps/rejected": -3.7154793739318848, "loss": 4.0023, "rewards/accuracies": 0.75, "rewards/chosen": -36.137245178222656, "rewards/margins": 1.0175490379333496, "rewards/rejected": -37.15479278564453, "step": 2852 }, { "epoch": 0.38848039215686275, "grad_norm": 43.38348958687503, "learning_rate": 6.138295953430343e-07, "logits/chosen": 12.200931549072266, "logits/rejected": 13.27690315246582, "logps/chosen": -3.7020702362060547, "logps/rejected": -4.213587760925293, "loss": 4.1171, "rewards/accuracies": 1.0, "rewards/chosen": -37.02070236206055, "rewards/margins": 5.115169525146484, "rewards/rejected": -42.13587188720703, "step": 2853 }, { "epoch": 0.3886165577342048, "grad_norm": 49.98737265421326, "learning_rate": 6.136688795245523e-07, "logits/chosen": 12.838203430175781, "logits/rejected": 12.818229675292969, "logps/chosen": -3.752894163131714, "logps/rejected": -4.047417640686035, "loss": 4.6446, "rewards/accuracies": 0.75, "rewards/chosen": -37.5289421081543, "rewards/margins": 2.9452342987060547, "rewards/rejected": -40.47417449951172, "step": 2854 }, { "epoch": 0.38875272331154687, "grad_norm": 81.12914510089551, "learning_rate": 6.135081154258208e-07, "logits/chosen": 12.630468368530273, "logits/rejected": 11.600433349609375, "logps/chosen": -3.5755615234375, "logps/rejected": -3.4835433959960938, "loss": 4.3431, "rewards/accuracies": 0.25, "rewards/chosen": -35.755615234375, "rewards/margins": -0.9201827049255371, "rewards/rejected": -34.83543395996094, "step": 2855 }, { "epoch": 0.3888888888888889, "grad_norm": 44.84539823196435, "learning_rate": 6.133473030831657e-07, "logits/chosen": 13.109668731689453, "logits/rejected": 13.25069808959961, "logps/chosen": -3.69757080078125, "logps/rejected": -3.786100387573242, "loss": 3.9745, "rewards/accuracies": 0.5, "rewards/chosen": -36.975704193115234, "rewards/margins": 0.8852977752685547, "rewards/rejected": -37.86100387573242, "step": 2856 }, { "epoch": 0.3890250544662309, "grad_norm": 41.82382678122138, "learning_rate": 6.131864425329239e-07, "logits/chosen": 10.958650588989258, "logits/rejected": 12.067155838012695, "logps/chosen": -3.4099478721618652, "logps/rejected": -3.599090576171875, "loss": 4.242, "rewards/accuracies": 0.75, "rewards/chosen": -34.09947967529297, "rewards/margins": 1.8914251327514648, "rewards/rejected": -35.99090576171875, "step": 2857 }, { "epoch": 0.389161220043573, "grad_norm": 46.25457059908314, "learning_rate": 6.130255338114432e-07, "logits/chosen": 12.631190299987793, "logits/rejected": 13.667806625366211, "logps/chosen": -4.002368450164795, "logps/rejected": -3.990152597427368, "loss": 4.5037, "rewards/accuracies": 0.25, "rewards/chosen": -40.023685455322266, "rewards/margins": -0.12215900421142578, "rewards/rejected": -39.901527404785156, "step": 2858 }, { "epoch": 0.38929738562091504, "grad_norm": 41.4303479859947, "learning_rate": 6.128645769550823e-07, "logits/chosen": 13.204343795776367, "logits/rejected": 13.422449111938477, "logps/chosen": -3.544473886489868, "logps/rejected": -3.8877665996551514, "loss": 4.095, "rewards/accuracies": 1.0, "rewards/chosen": -35.444740295410156, "rewards/margins": 3.4329280853271484, "rewards/rejected": -38.877662658691406, "step": 2859 }, { "epoch": 0.38943355119825707, "grad_norm": 45.491948888126295, "learning_rate": 6.127035720002107e-07, "logits/chosen": 13.190629959106445, "logits/rejected": 13.082534790039062, "logps/chosen": -3.7305283546447754, "logps/rejected": -4.102214813232422, "loss": 4.8148, "rewards/accuracies": 0.75, "rewards/chosen": -37.30528259277344, "rewards/margins": 3.716862678527832, "rewards/rejected": -41.02214431762695, "step": 2860 }, { "epoch": 0.38956971677559915, "grad_norm": 46.12831275548735, "learning_rate": 6.125425189832086e-07, "logits/chosen": 12.012709617614746, "logits/rejected": 12.969148635864258, "logps/chosen": -3.5325357913970947, "logps/rejected": -3.803035259246826, "loss": 4.5329, "rewards/accuracies": 0.75, "rewards/chosen": -35.325355529785156, "rewards/margins": 2.70499324798584, "rewards/rejected": -38.03034973144531, "step": 2861 }, { "epoch": 0.3897058823529412, "grad_norm": 44.92256549528014, "learning_rate": 6.123814179404677e-07, "logits/chosen": 12.20188045501709, "logits/rejected": 12.67626953125, "logps/chosen": -3.4532952308654785, "logps/rejected": -3.8443570137023926, "loss": 3.9805, "rewards/accuracies": 0.75, "rewards/chosen": -34.53295135498047, "rewards/margins": 3.910621166229248, "rewards/rejected": -38.443572998046875, "step": 2862 }, { "epoch": 0.3898420479302832, "grad_norm": 47.922201208068394, "learning_rate": 6.122202689083896e-07, "logits/chosen": 12.50979232788086, "logits/rejected": 13.136800765991211, "logps/chosen": -3.6323819160461426, "logps/rejected": -3.9127306938171387, "loss": 3.8823, "rewards/accuracies": 0.5, "rewards/chosen": -36.32381820678711, "rewards/margins": 2.803487777709961, "rewards/rejected": -39.12730407714844, "step": 2863 }, { "epoch": 0.3899782135076253, "grad_norm": 46.077356381357156, "learning_rate": 6.120590719233876e-07, "logits/chosen": 11.523953437805176, "logits/rejected": 11.825536727905273, "logps/chosen": -3.6421008110046387, "logps/rejected": -4.055383205413818, "loss": 3.8666, "rewards/accuracies": 1.0, "rewards/chosen": -36.4210090637207, "rewards/margins": 4.132823944091797, "rewards/rejected": -40.5538330078125, "step": 2864 }, { "epoch": 0.3901143790849673, "grad_norm": 41.596135880884084, "learning_rate": 6.118978270218854e-07, "logits/chosen": 12.41227912902832, "logits/rejected": 13.082921981811523, "logps/chosen": -3.7475528717041016, "logps/rejected": -3.832094192504883, "loss": 3.9979, "rewards/accuracies": 0.5, "rewards/chosen": -37.475528717041016, "rewards/margins": 0.8454122543334961, "rewards/rejected": -38.32093811035156, "step": 2865 }, { "epoch": 0.39025054466230935, "grad_norm": 39.82702051974022, "learning_rate": 6.117365342403177e-07, "logits/chosen": 11.992345809936523, "logits/rejected": 11.514781951904297, "logps/chosen": -3.7886507511138916, "logps/rejected": -3.6539535522460938, "loss": 3.8356, "rewards/accuracies": 0.25, "rewards/chosen": -37.886505126953125, "rewards/margins": -1.346972942352295, "rewards/rejected": -36.53953170776367, "step": 2866 }, { "epoch": 0.39038671023965144, "grad_norm": 43.12064150584288, "learning_rate": 6.115751936151298e-07, "logits/chosen": 11.778524398803711, "logits/rejected": 12.354340553283691, "logps/chosen": -3.5063090324401855, "logps/rejected": -3.823730707168579, "loss": 4.1539, "rewards/accuracies": 0.75, "rewards/chosen": -35.06309127807617, "rewards/margins": 3.174215793609619, "rewards/rejected": -38.2373046875, "step": 2867 }, { "epoch": 0.39052287581699346, "grad_norm": 44.8196806191984, "learning_rate": 6.114138051827779e-07, "logits/chosen": 11.652795791625977, "logits/rejected": 12.539457321166992, "logps/chosen": -3.701807975769043, "logps/rejected": -3.861470937728882, "loss": 3.6467, "rewards/accuracies": 0.75, "rewards/chosen": -37.01808166503906, "rewards/margins": 1.596628189086914, "rewards/rejected": -38.614707946777344, "step": 2868 }, { "epoch": 0.3906590413943355, "grad_norm": 45.33206891302908, "learning_rate": 6.112523689797294e-07, "logits/chosen": 12.701568603515625, "logits/rejected": 12.125707626342773, "logps/chosen": -3.8996832370758057, "logps/rejected": -3.651733160018921, "loss": 4.1543, "rewards/accuracies": 0.25, "rewards/chosen": -38.99683380126953, "rewards/margins": -2.4795007705688477, "rewards/rejected": -36.517333984375, "step": 2869 }, { "epoch": 0.3907952069716776, "grad_norm": 50.02962460645468, "learning_rate": 6.110908850424617e-07, "logits/chosen": 12.11337661743164, "logits/rejected": 13.292598724365234, "logps/chosen": -3.7671549320220947, "logps/rejected": -4.164152145385742, "loss": 3.1699, "rewards/accuracies": 1.0, "rewards/chosen": -37.671546936035156, "rewards/margins": 3.9699716567993164, "rewards/rejected": -41.64152145385742, "step": 2870 }, { "epoch": 0.3909313725490196, "grad_norm": 50.405800817440266, "learning_rate": 6.109293534074637e-07, "logits/chosen": 13.460432052612305, "logits/rejected": 12.986045837402344, "logps/chosen": -3.6852145195007324, "logps/rejected": -3.6034364700317383, "loss": 4.2322, "rewards/accuracies": 0.25, "rewards/chosen": -36.852142333984375, "rewards/margins": -0.8177824020385742, "rewards/rejected": -36.03436279296875, "step": 2871 }, { "epoch": 0.39106753812636164, "grad_norm": 71.76671849606997, "learning_rate": 6.107677741112348e-07, "logits/chosen": 12.45839786529541, "logits/rejected": 11.59920883178711, "logps/chosen": -3.7165870666503906, "logps/rejected": -3.8739981651306152, "loss": 4.2073, "rewards/accuracies": 0.75, "rewards/chosen": -37.165870666503906, "rewards/margins": 1.5741100311279297, "rewards/rejected": -38.73998260498047, "step": 2872 }, { "epoch": 0.3912037037037037, "grad_norm": 46.072100083053094, "learning_rate": 6.10606147190285e-07, "logits/chosen": 11.666393280029297, "logits/rejected": 12.439275741577148, "logps/chosen": -3.69155216217041, "logps/rejected": -3.8024752140045166, "loss": 4.2657, "rewards/accuracies": 0.5, "rewards/chosen": -36.91551971435547, "rewards/margins": 1.1092276573181152, "rewards/rejected": -38.024749755859375, "step": 2873 }, { "epoch": 0.39133986928104575, "grad_norm": 48.59430068105013, "learning_rate": 6.104444726811355e-07, "logits/chosen": 11.005014419555664, "logits/rejected": 11.756637573242188, "logps/chosen": -3.55889630317688, "logps/rejected": -3.8586065769195557, "loss": 3.6917, "rewards/accuracies": 1.0, "rewards/chosen": -35.588966369628906, "rewards/margins": 2.9971017837524414, "rewards/rejected": -38.58606719970703, "step": 2874 }, { "epoch": 0.3914760348583878, "grad_norm": 42.0620050246941, "learning_rate": 6.102827506203176e-07, "logits/chosen": 11.357099533081055, "logits/rejected": 12.290742874145508, "logps/chosen": -3.558110237121582, "logps/rejected": -3.567999839782715, "loss": 3.708, "rewards/accuracies": 0.25, "rewards/chosen": -35.58110427856445, "rewards/margins": 0.09889411926269531, "rewards/rejected": -35.68000030517578, "step": 2875 }, { "epoch": 0.39161220043572986, "grad_norm": 50.440893556000326, "learning_rate": 6.101209810443742e-07, "logits/chosen": 11.189029693603516, "logits/rejected": 12.685884475708008, "logps/chosen": -3.323329448699951, "logps/rejected": -3.745544910430908, "loss": 4.2399, "rewards/accuracies": 0.75, "rewards/chosen": -33.23329544067383, "rewards/margins": 4.222153663635254, "rewards/rejected": -37.455448150634766, "step": 2876 }, { "epoch": 0.3917483660130719, "grad_norm": 46.55479637109761, "learning_rate": 6.099591639898582e-07, "logits/chosen": 12.993374824523926, "logits/rejected": 12.343912124633789, "logps/chosen": -3.897716522216797, "logps/rejected": -3.859699010848999, "loss": 4.1837, "rewards/accuracies": 0.25, "rewards/chosen": -38.97716522216797, "rewards/margins": -0.3801736831665039, "rewards/rejected": -38.59699249267578, "step": 2877 }, { "epoch": 0.3918845315904139, "grad_norm": 43.40204090058753, "learning_rate": 6.097972994933336e-07, "logits/chosen": 13.186246871948242, "logits/rejected": 13.019485473632812, "logps/chosen": -4.0787200927734375, "logps/rejected": -3.609999179840088, "loss": 4.2966, "rewards/accuracies": 0.25, "rewards/chosen": -40.78720474243164, "rewards/margins": -4.687211990356445, "rewards/rejected": -36.09999084472656, "step": 2878 }, { "epoch": 0.392020697167756, "grad_norm": 52.57086177110517, "learning_rate": 6.096353875913749e-07, "logits/chosen": 11.506000518798828, "logits/rejected": 12.076894760131836, "logps/chosen": -3.526655673980713, "logps/rejected": -3.976651191711426, "loss": 4.0598, "rewards/accuracies": 1.0, "rewards/chosen": -35.26655578613281, "rewards/margins": 4.499954700469971, "rewards/rejected": -39.766510009765625, "step": 2879 }, { "epoch": 0.39215686274509803, "grad_norm": 40.14955891690512, "learning_rate": 6.094734283205675e-07, "logits/chosen": 13.169719696044922, "logits/rejected": 12.322919845581055, "logps/chosen": -4.178403854370117, "logps/rejected": -3.8377602100372314, "loss": 4.0912, "rewards/accuracies": 0.0, "rewards/chosen": -41.784034729003906, "rewards/margins": -3.4064369201660156, "rewards/rejected": -38.377601623535156, "step": 2880 }, { "epoch": 0.39229302832244006, "grad_norm": 53.775586872278716, "learning_rate": 6.093114217175075e-07, "logits/chosen": 10.994346618652344, "logits/rejected": 12.571054458618164, "logps/chosen": -3.2379024028778076, "logps/rejected": -3.3819549083709717, "loss": 4.147, "rewards/accuracies": 0.5, "rewards/chosen": -32.379024505615234, "rewards/margins": 1.440526008605957, "rewards/rejected": -33.819549560546875, "step": 2881 }, { "epoch": 0.39242919389978215, "grad_norm": 47.60636287695563, "learning_rate": 6.091493678188015e-07, "logits/chosen": 12.52105712890625, "logits/rejected": 12.612756729125977, "logps/chosen": -4.056390285491943, "logps/rejected": -4.171770095825195, "loss": 3.6545, "rewards/accuracies": 0.5, "rewards/chosen": -40.563899993896484, "rewards/margins": 1.1538009643554688, "rewards/rejected": -41.71770477294922, "step": 2882 }, { "epoch": 0.3925653594771242, "grad_norm": 49.04968111550923, "learning_rate": 6.089872666610671e-07, "logits/chosen": 11.867500305175781, "logits/rejected": 11.99818229675293, "logps/chosen": -3.975041627883911, "logps/rejected": -3.844611406326294, "loss": 4.0371, "rewards/accuracies": 0.75, "rewards/chosen": -39.75041580200195, "rewards/margins": -1.304300308227539, "rewards/rejected": -38.44611358642578, "step": 2883 }, { "epoch": 0.3927015250544662, "grad_norm": 41.8230914298235, "learning_rate": 6.088251182809323e-07, "logits/chosen": 11.747772216796875, "logits/rejected": 12.655454635620117, "logps/chosen": -3.5528078079223633, "logps/rejected": -4.12678337097168, "loss": 3.807, "rewards/accuracies": 1.0, "rewards/chosen": -35.528076171875, "rewards/margins": 5.739758014678955, "rewards/rejected": -41.26783752441406, "step": 2884 }, { "epoch": 0.3928376906318083, "grad_norm": 49.36044396383706, "learning_rate": 6.086629227150357e-07, "logits/chosen": 12.422969818115234, "logits/rejected": 12.82458209991455, "logps/chosen": -3.4580211639404297, "logps/rejected": -3.8869519233703613, "loss": 4.4679, "rewards/accuracies": 0.75, "rewards/chosen": -34.5802116394043, "rewards/margins": 4.289305686950684, "rewards/rejected": -38.86951446533203, "step": 2885 }, { "epoch": 0.3929738562091503, "grad_norm": 49.16033211758054, "learning_rate": 6.08500680000027e-07, "logits/chosen": 12.102947235107422, "logits/rejected": 12.598655700683594, "logps/chosen": -3.8762869834899902, "logps/rejected": -4.269051551818848, "loss": 4.0707, "rewards/accuracies": 0.75, "rewards/chosen": -38.76287078857422, "rewards/margins": 3.9276466369628906, "rewards/rejected": -42.690513610839844, "step": 2886 }, { "epoch": 0.39311002178649235, "grad_norm": 43.570686065734826, "learning_rate": 6.083383901725662e-07, "logits/chosen": 12.214482307434082, "logits/rejected": 12.780670166015625, "logps/chosen": -3.6662445068359375, "logps/rejected": -3.7706260681152344, "loss": 4.4504, "rewards/accuracies": 0.5, "rewards/chosen": -36.662445068359375, "rewards/margins": 1.0438141822814941, "rewards/rejected": -37.706260681152344, "step": 2887 }, { "epoch": 0.39324618736383443, "grad_norm": 47.6980293465081, "learning_rate": 6.08176053269324e-07, "logits/chosen": 12.638290405273438, "logits/rejected": 12.891663551330566, "logps/chosen": -3.373560667037964, "logps/rejected": -3.4292893409729004, "loss": 4.4546, "rewards/accuracies": 0.5, "rewards/chosen": -33.7356071472168, "rewards/margins": 0.557286262512207, "rewards/rejected": -34.29289245605469, "step": 2888 }, { "epoch": 0.39338235294117646, "grad_norm": 41.244332263982976, "learning_rate": 6.080136693269816e-07, "logits/chosen": 12.815635681152344, "logits/rejected": 12.571544647216797, "logps/chosen": -3.873957872390747, "logps/rejected": -4.182886123657227, "loss": 3.9606, "rewards/accuracies": 1.0, "rewards/chosen": -38.73957824707031, "rewards/margins": 3.089284896850586, "rewards/rejected": -41.828861236572266, "step": 2889 }, { "epoch": 0.39351851851851855, "grad_norm": 41.20632606099989, "learning_rate": 6.078512383822314e-07, "logits/chosen": 13.177125930786133, "logits/rejected": 12.485382080078125, "logps/chosen": -3.6730523109436035, "logps/rejected": -3.77105712890625, "loss": 4.4244, "rewards/accuracies": 0.5, "rewards/chosen": -36.730525970458984, "rewards/margins": 0.980046272277832, "rewards/rejected": -37.7105712890625, "step": 2890 }, { "epoch": 0.3936546840958606, "grad_norm": 42.39320077334299, "learning_rate": 6.076887604717756e-07, "logits/chosen": 12.362204551696777, "logits/rejected": 12.430273056030273, "logps/chosen": -3.7017807960510254, "logps/rejected": -4.002545356750488, "loss": 4.3235, "rewards/accuracies": 0.75, "rewards/chosen": -37.01780700683594, "rewards/margins": 3.0076475143432617, "rewards/rejected": -40.025455474853516, "step": 2891 }, { "epoch": 0.3937908496732026, "grad_norm": 57.26945984047823, "learning_rate": 6.075262356323277e-07, "logits/chosen": 11.197128295898438, "logits/rejected": 13.138956069946289, "logps/chosen": -3.6128222942352295, "logps/rejected": -4.300230026245117, "loss": 4.0542, "rewards/accuracies": 1.0, "rewards/chosen": -36.12822341918945, "rewards/margins": 6.8740763664245605, "rewards/rejected": -43.002296447753906, "step": 2892 }, { "epoch": 0.3939270152505447, "grad_norm": 41.36455060648566, "learning_rate": 6.073636639006113e-07, "logits/chosen": 11.887516975402832, "logits/rejected": 11.640012741088867, "logps/chosen": -3.7577826976776123, "logps/rejected": -3.8118326663970947, "loss": 3.6535, "rewards/accuracies": 0.5, "rewards/chosen": -37.57782745361328, "rewards/margins": 0.5404996871948242, "rewards/rejected": -38.118324279785156, "step": 2893 }, { "epoch": 0.3940631808278867, "grad_norm": 42.7580736793804, "learning_rate": 6.07201045313361e-07, "logits/chosen": 11.151970863342285, "logits/rejected": 12.60100269317627, "logps/chosen": -3.1803512573242188, "logps/rejected": -3.589524269104004, "loss": 3.9592, "rewards/accuracies": 1.0, "rewards/chosen": -31.803512573242188, "rewards/margins": 4.091728687286377, "rewards/rejected": -35.895240783691406, "step": 2894 }, { "epoch": 0.39419934640522875, "grad_norm": 43.59589129010802, "learning_rate": 6.070383799073219e-07, "logits/chosen": 13.114645004272461, "logits/rejected": 13.031364440917969, "logps/chosen": -3.8499608039855957, "logps/rejected": -4.069730758666992, "loss": 4.7778, "rewards/accuracies": 0.75, "rewards/chosen": -38.499610900878906, "rewards/margins": 2.1976966857910156, "rewards/rejected": -40.697303771972656, "step": 2895 }, { "epoch": 0.39433551198257083, "grad_norm": 83.81821274696618, "learning_rate": 6.068756677192493e-07, "logits/chosen": 12.210941314697266, "logits/rejected": 12.456779479980469, "logps/chosen": -3.7608704566955566, "logps/rejected": -3.8921518325805664, "loss": 4.3247, "rewards/accuracies": 0.5, "rewards/chosen": -37.60870361328125, "rewards/margins": 1.3128128051757812, "rewards/rejected": -38.92151641845703, "step": 2896 }, { "epoch": 0.39447167755991286, "grad_norm": 43.48262542883702, "learning_rate": 6.067129087859095e-07, "logits/chosen": 12.966644287109375, "logits/rejected": 11.87048053741455, "logps/chosen": -4.074362754821777, "logps/rejected": -3.8664560317993164, "loss": 4.1159, "rewards/accuracies": 0.25, "rewards/chosen": -40.743629455566406, "rewards/margins": -2.0790700912475586, "rewards/rejected": -38.66455841064453, "step": 2897 }, { "epoch": 0.3946078431372549, "grad_norm": 40.70577179063789, "learning_rate": 6.065501031440793e-07, "logits/chosen": 12.11171817779541, "logits/rejected": 12.219131469726562, "logps/chosen": -3.659473419189453, "logps/rejected": -3.871389389038086, "loss": 3.8241, "rewards/accuracies": 0.75, "rewards/chosen": -36.59473419189453, "rewards/margins": 2.119161605834961, "rewards/rejected": -38.71389389038086, "step": 2898 }, { "epoch": 0.394744008714597, "grad_norm": 46.88356691936091, "learning_rate": 6.063872508305461e-07, "logits/chosen": 13.289777755737305, "logits/rejected": 13.569365501403809, "logps/chosen": -4.219836235046387, "logps/rejected": -4.518156051635742, "loss": 4.183, "rewards/accuracies": 0.75, "rewards/chosen": -42.1983642578125, "rewards/margins": 2.9831953048706055, "rewards/rejected": -45.181556701660156, "step": 2899 }, { "epoch": 0.394880174291939, "grad_norm": 63.832595349145784, "learning_rate": 6.062243518821075e-07, "logits/chosen": 12.474374771118164, "logits/rejected": 12.681130409240723, "logps/chosen": -3.8601789474487305, "logps/rejected": -3.7389121055603027, "loss": 4.1034, "rewards/accuracies": 0.0, "rewards/chosen": -38.60179138183594, "rewards/margins": -1.2126703262329102, "rewards/rejected": -37.389122009277344, "step": 2900 }, { "epoch": 0.39501633986928103, "grad_norm": 43.84729655863516, "learning_rate": 6.060614063355718e-07, "logits/chosen": 12.08194637298584, "logits/rejected": 12.639640808105469, "logps/chosen": -3.8393077850341797, "logps/rejected": -4.207420349121094, "loss": 3.9638, "rewards/accuracies": 0.75, "rewards/chosen": -38.3930778503418, "rewards/margins": 3.6811304092407227, "rewards/rejected": -42.0742073059082, "step": 2901 }, { "epoch": 0.3951525054466231, "grad_norm": 45.999292610951194, "learning_rate": 6.058984142277582e-07, "logits/chosen": 12.97354793548584, "logits/rejected": 12.822211265563965, "logps/chosen": -3.8201966285705566, "logps/rejected": -4.094185829162598, "loss": 3.7187, "rewards/accuracies": 0.75, "rewards/chosen": -38.20196533203125, "rewards/margins": 2.7398900985717773, "rewards/rejected": -40.941856384277344, "step": 2902 }, { "epoch": 0.39528867102396514, "grad_norm": 39.25956889714026, "learning_rate": 6.057353755954957e-07, "logits/chosen": 11.524324417114258, "logits/rejected": 13.025121688842773, "logps/chosen": -3.7820944786071777, "logps/rejected": -4.050940990447998, "loss": 4.3221, "rewards/accuracies": 0.75, "rewards/chosen": -37.820945739746094, "rewards/margins": 2.6884632110595703, "rewards/rejected": -40.50940704345703, "step": 2903 }, { "epoch": 0.3954248366013072, "grad_norm": 74.07366115110224, "learning_rate": 6.055722904756246e-07, "logits/chosen": 12.087108612060547, "logits/rejected": 13.014776229858398, "logps/chosen": -3.5777153968811035, "logps/rejected": -3.9299142360687256, "loss": 3.736, "rewards/accuracies": 0.75, "rewards/chosen": -35.77715301513672, "rewards/margins": 3.521986961364746, "rewards/rejected": -39.29914093017578, "step": 2904 }, { "epoch": 0.39556100217864926, "grad_norm": 39.919464694460785, "learning_rate": 6.054091589049951e-07, "logits/chosen": 11.730573654174805, "logits/rejected": 12.240531921386719, "logps/chosen": -3.69711971282959, "logps/rejected": -3.9777674674987793, "loss": 3.388, "rewards/accuracies": 1.0, "rewards/chosen": -36.97119903564453, "rewards/margins": 2.8064780235290527, "rewards/rejected": -39.777671813964844, "step": 2905 }, { "epoch": 0.3956971677559913, "grad_norm": 41.851317721918946, "learning_rate": 6.052459809204683e-07, "logits/chosen": 12.096549987792969, "logits/rejected": 12.263365745544434, "logps/chosen": -3.608086585998535, "logps/rejected": -3.6797518730163574, "loss": 4.3515, "rewards/accuracies": 0.75, "rewards/chosen": -36.08086395263672, "rewards/margins": 0.7166519165039062, "rewards/rejected": -36.797515869140625, "step": 2906 }, { "epoch": 0.3958333333333333, "grad_norm": 45.05060954436737, "learning_rate": 6.050827565589156e-07, "logits/chosen": 13.127142906188965, "logits/rejected": 12.563626289367676, "logps/chosen": -3.883829116821289, "logps/rejected": -3.6043357849121094, "loss": 4.1204, "rewards/accuracies": 0.25, "rewards/chosen": -38.838294982910156, "rewards/margins": -2.7949342727661133, "rewards/rejected": -36.043357849121094, "step": 2907 }, { "epoch": 0.3959694989106754, "grad_norm": 41.8377597018319, "learning_rate": 6.049194858572187e-07, "logits/chosen": 12.346551895141602, "logits/rejected": 13.389310836791992, "logps/chosen": -3.635059356689453, "logps/rejected": -3.6640496253967285, "loss": 4.1023, "rewards/accuracies": 0.5, "rewards/chosen": -36.35059356689453, "rewards/margins": 0.2899022102355957, "rewards/rejected": -36.64049530029297, "step": 2908 }, { "epoch": 0.39610566448801743, "grad_norm": 46.709309085871034, "learning_rate": 6.047561688522701e-07, "logits/chosen": 13.127798080444336, "logits/rejected": 12.266282081604004, "logps/chosen": -3.812077283859253, "logps/rejected": -3.755582332611084, "loss": 4.2893, "rewards/accuracies": 0.5, "rewards/chosen": -38.12077331542969, "rewards/margins": -0.5649499893188477, "rewards/rejected": -37.555824279785156, "step": 2909 }, { "epoch": 0.39624183006535946, "grad_norm": 42.126418733238744, "learning_rate": 6.045928055809726e-07, "logits/chosen": 11.958965301513672, "logits/rejected": 12.665481567382812, "logps/chosen": -3.8430633544921875, "logps/rejected": -3.729055881500244, "loss": 4.3482, "rewards/accuracies": 0.25, "rewards/chosen": -38.430633544921875, "rewards/margins": -1.1400737762451172, "rewards/rejected": -37.290557861328125, "step": 2910 }, { "epoch": 0.39637799564270154, "grad_norm": 41.8338246272585, "learning_rate": 6.044293960802395e-07, "logits/chosen": 13.366264343261719, "logits/rejected": 12.437326431274414, "logps/chosen": -3.8622965812683105, "logps/rejected": -3.8334147930145264, "loss": 3.8343, "rewards/accuracies": 0.5, "rewards/chosen": -38.62296676635742, "rewards/margins": -0.288818359375, "rewards/rejected": -38.33414840698242, "step": 2911 }, { "epoch": 0.39651416122004357, "grad_norm": 42.12915434952878, "learning_rate": 6.042659403869945e-07, "logits/chosen": 11.88100814819336, "logits/rejected": 12.748910903930664, "logps/chosen": -3.707306385040283, "logps/rejected": -3.802818536758423, "loss": 3.9262, "rewards/accuracies": 0.5, "rewards/chosen": -37.073062896728516, "rewards/margins": 0.9551205635070801, "rewards/rejected": -38.02818298339844, "step": 2912 }, { "epoch": 0.3966503267973856, "grad_norm": 46.49888731721659, "learning_rate": 6.04102438538172e-07, "logits/chosen": 13.17122745513916, "logits/rejected": 13.28236198425293, "logps/chosen": -3.9029111862182617, "logps/rejected": -3.841689109802246, "loss": 4.4621, "rewards/accuracies": 0.5, "rewards/chosen": -39.02911376953125, "rewards/margins": -0.6122207641601562, "rewards/rejected": -38.416893005371094, "step": 2913 }, { "epoch": 0.3967864923747277, "grad_norm": 40.347748111172706, "learning_rate": 6.039388905707162e-07, "logits/chosen": 12.318925857543945, "logits/rejected": 12.937459945678711, "logps/chosen": -3.5855116844177246, "logps/rejected": -4.00325345993042, "loss": 4.2975, "rewards/accuracies": 0.75, "rewards/chosen": -35.85511779785156, "rewards/margins": 4.177417755126953, "rewards/rejected": -40.032535552978516, "step": 2914 }, { "epoch": 0.3969226579520697, "grad_norm": 46.94150278359444, "learning_rate": 6.037752965215824e-07, "logits/chosen": 12.000274658203125, "logits/rejected": 12.649683952331543, "logps/chosen": -3.613525629043579, "logps/rejected": -3.9948887825012207, "loss": 4.4305, "rewards/accuracies": 0.75, "rewards/chosen": -36.13525390625, "rewards/margins": 3.8136348724365234, "rewards/rejected": -39.948890686035156, "step": 2915 }, { "epoch": 0.39705882352941174, "grad_norm": 43.86934801714759, "learning_rate": 6.036116564277358e-07, "logits/chosen": 13.4493408203125, "logits/rejected": 12.722841262817383, "logps/chosen": -3.8847556114196777, "logps/rejected": -3.8669304847717285, "loss": 4.0328, "rewards/accuracies": 0.5, "rewards/chosen": -38.847557067871094, "rewards/margins": -0.17825031280517578, "rewards/rejected": -38.66930389404297, "step": 2916 }, { "epoch": 0.3971949891067538, "grad_norm": 41.10434426843349, "learning_rate": 6.034479703261524e-07, "logits/chosen": 13.075662612915039, "logits/rejected": 12.428384780883789, "logps/chosen": -3.8608388900756836, "logps/rejected": -4.053544044494629, "loss": 3.6503, "rewards/accuracies": 0.75, "rewards/chosen": -38.60839080810547, "rewards/margins": 1.9270524978637695, "rewards/rejected": -40.53544235229492, "step": 2917 }, { "epoch": 0.39733115468409586, "grad_norm": 57.32016937066339, "learning_rate": 6.032842382538184e-07, "logits/chosen": 12.412641525268555, "logits/rejected": 12.659130096435547, "logps/chosen": -3.4594264030456543, "logps/rejected": -3.7421679496765137, "loss": 3.5258, "rewards/accuracies": 0.75, "rewards/chosen": -34.594261169433594, "rewards/margins": 2.82741641998291, "rewards/rejected": -37.42168045043945, "step": 2918 }, { "epoch": 0.3974673202614379, "grad_norm": 41.68927442332175, "learning_rate": 6.031204602477304e-07, "logits/chosen": 13.026437759399414, "logits/rejected": 13.275638580322266, "logps/chosen": -4.1163482666015625, "logps/rejected": -4.152878761291504, "loss": 3.9847, "rewards/accuracies": 0.75, "rewards/chosen": -41.163482666015625, "rewards/margins": 0.36530399322509766, "rewards/rejected": -41.528785705566406, "step": 2919 }, { "epoch": 0.39760348583877997, "grad_norm": 55.06227701476813, "learning_rate": 6.029566363448954e-07, "logits/chosen": 12.564070701599121, "logits/rejected": 12.354482650756836, "logps/chosen": -3.771235942840576, "logps/rejected": -3.9954276084899902, "loss": 4.1564, "rewards/accuracies": 0.5, "rewards/chosen": -37.71236038208008, "rewards/margins": 2.241912841796875, "rewards/rejected": -39.95427322387695, "step": 2920 }, { "epoch": 0.397739651416122, "grad_norm": 63.601806360852414, "learning_rate": 6.027927665823307e-07, "logits/chosen": 12.389890670776367, "logits/rejected": 12.896594047546387, "logps/chosen": -3.6330654621124268, "logps/rejected": -4.031676769256592, "loss": 3.4105, "rewards/accuracies": 1.0, "rewards/chosen": -36.330657958984375, "rewards/margins": 3.986111640930176, "rewards/rejected": -40.316768646240234, "step": 2921 }, { "epoch": 0.397875816993464, "grad_norm": 68.54906221107814, "learning_rate": 6.026288509970643e-07, "logits/chosen": 12.964056015014648, "logits/rejected": 12.979090690612793, "logps/chosen": -4.124354362487793, "logps/rejected": -4.156888008117676, "loss": 4.1466, "rewards/accuracies": 0.5, "rewards/chosen": -41.24354553222656, "rewards/margins": 0.3253355026245117, "rewards/rejected": -41.568878173828125, "step": 2922 }, { "epoch": 0.3980119825708061, "grad_norm": 53.982931552066255, "learning_rate": 6.024648896261339e-07, "logits/chosen": 12.163753509521484, "logits/rejected": 12.822423934936523, "logps/chosen": -3.8047592639923096, "logps/rejected": -4.000910758972168, "loss": 4.3635, "rewards/accuracies": 0.5, "rewards/chosen": -38.04759216308594, "rewards/margins": 1.9615163803100586, "rewards/rejected": -40.00910949707031, "step": 2923 }, { "epoch": 0.39814814814814814, "grad_norm": 41.39566535116492, "learning_rate": 6.023008825065881e-07, "logits/chosen": 11.99767017364502, "logits/rejected": 13.12646484375, "logps/chosen": -3.890477418899536, "logps/rejected": -4.22892951965332, "loss": 3.988, "rewards/accuracies": 1.0, "rewards/chosen": -38.9047737121582, "rewards/margins": 3.3845224380493164, "rewards/rejected": -42.28929901123047, "step": 2924 }, { "epoch": 0.39828431372549017, "grad_norm": 41.49467818509993, "learning_rate": 6.021368296754857e-07, "logits/chosen": 12.390813827514648, "logits/rejected": 12.908407211303711, "logps/chosen": -3.8956401348114014, "logps/rejected": -4.3525614738464355, "loss": 3.5825, "rewards/accuracies": 1.0, "rewards/chosen": -38.956398010253906, "rewards/margins": 4.569212913513184, "rewards/rejected": -43.525611877441406, "step": 2925 }, { "epoch": 0.39842047930283225, "grad_norm": 48.9226187396948, "learning_rate": 6.019727311698957e-07, "logits/chosen": 12.013571739196777, "logits/rejected": 12.073966979980469, "logps/chosen": -3.5997462272644043, "logps/rejected": -3.5609798431396484, "loss": 4.4451, "rewards/accuracies": 0.5, "rewards/chosen": -35.997459411621094, "rewards/margins": -0.3876628875732422, "rewards/rejected": -35.609798431396484, "step": 2926 }, { "epoch": 0.3985566448801743, "grad_norm": 50.25953236986422, "learning_rate": 6.018085870268976e-07, "logits/chosen": 12.428363800048828, "logits/rejected": 12.884559631347656, "logps/chosen": -3.713763952255249, "logps/rejected": -3.963611125946045, "loss": 4.9882, "rewards/accuracies": 0.5, "rewards/chosen": -37.13764190673828, "rewards/margins": 2.498471260070801, "rewards/rejected": -39.6361083984375, "step": 2927 }, { "epoch": 0.39869281045751637, "grad_norm": 49.649686206027276, "learning_rate": 6.016443972835811e-07, "logits/chosen": 11.53575611114502, "logits/rejected": 11.96934700012207, "logps/chosen": -3.5232694149017334, "logps/rejected": -3.6766273975372314, "loss": 3.5676, "rewards/accuracies": 0.5, "rewards/chosen": -35.23269271850586, "rewards/margins": 1.5335793495178223, "rewards/rejected": -36.766273498535156, "step": 2928 }, { "epoch": 0.3988289760348584, "grad_norm": 43.38596139344621, "learning_rate": 6.014801619770463e-07, "logits/chosen": 11.197193145751953, "logits/rejected": 12.05837345123291, "logps/chosen": -3.612015962600708, "logps/rejected": -3.8841540813446045, "loss": 4.0443, "rewards/accuracies": 1.0, "rewards/chosen": -36.12015914916992, "rewards/margins": 2.721379280090332, "rewards/rejected": -38.84153747558594, "step": 2929 }, { "epoch": 0.3989651416122004, "grad_norm": 44.44754050058025, "learning_rate": 6.013158811444033e-07, "logits/chosen": 12.436200141906738, "logits/rejected": 13.247215270996094, "logps/chosen": -3.633119821548462, "logps/rejected": -4.001314163208008, "loss": 4.2947, "rewards/accuracies": 0.75, "rewards/chosen": -36.331199645996094, "rewards/margins": 3.681941509246826, "rewards/rejected": -40.01314163208008, "step": 2930 }, { "epoch": 0.3991013071895425, "grad_norm": 45.223721426640196, "learning_rate": 6.01151554822773e-07, "logits/chosen": 12.486698150634766, "logits/rejected": 12.149191856384277, "logps/chosen": -3.6901793479919434, "logps/rejected": -3.7078967094421387, "loss": 4.215, "rewards/accuracies": 0.75, "rewards/chosen": -36.901790618896484, "rewards/margins": 0.17717361450195312, "rewards/rejected": -37.0789680480957, "step": 2931 }, { "epoch": 0.39923747276688454, "grad_norm": 41.15012085286419, "learning_rate": 6.00987183049286e-07, "logits/chosen": 11.129826545715332, "logits/rejected": 11.794570922851562, "logps/chosen": -3.559617042541504, "logps/rejected": -3.762160539627075, "loss": 4.1406, "rewards/accuracies": 0.75, "rewards/chosen": -35.59617233276367, "rewards/margins": 2.0254335403442383, "rewards/rejected": -37.621604919433594, "step": 2932 }, { "epoch": 0.39937363834422657, "grad_norm": 53.59843747030609, "learning_rate": 6.008227658610838e-07, "logits/chosen": 11.505468368530273, "logits/rejected": 13.553070068359375, "logps/chosen": -3.4741296768188477, "logps/rejected": -3.8332467079162598, "loss": 3.7782, "rewards/accuracies": 0.5, "rewards/chosen": -34.74129867553711, "rewards/margins": 3.5911688804626465, "rewards/rejected": -38.33246612548828, "step": 2933 }, { "epoch": 0.39950980392156865, "grad_norm": 40.54672252751634, "learning_rate": 6.006583032953175e-07, "logits/chosen": 11.799886703491211, "logits/rejected": 12.578926086425781, "logps/chosen": -3.2523193359375, "logps/rejected": -3.5359115600585938, "loss": 3.753, "rewards/accuracies": 0.75, "rewards/chosen": -32.523193359375, "rewards/margins": 2.8359198570251465, "rewards/rejected": -35.35911560058594, "step": 2934 }, { "epoch": 0.3996459694989107, "grad_norm": 50.484441701427656, "learning_rate": 6.00493795389149e-07, "logits/chosen": 11.762057304382324, "logits/rejected": 12.458888053894043, "logps/chosen": -3.5416035652160645, "logps/rejected": -3.5045018196105957, "loss": 4.8845, "rewards/accuracies": 0.25, "rewards/chosen": -35.41603469848633, "rewards/margins": -0.3710155487060547, "rewards/rejected": -35.045021057128906, "step": 2935 }, { "epoch": 0.3997821350762527, "grad_norm": 39.70632182148921, "learning_rate": 6.0032924217975e-07, "logits/chosen": 11.879522323608398, "logits/rejected": 13.144359588623047, "logps/chosen": -3.473092555999756, "logps/rejected": -3.897202491760254, "loss": 4.0035, "rewards/accuracies": 1.0, "rewards/chosen": -34.730926513671875, "rewards/margins": 4.241098880767822, "rewards/rejected": -38.972023010253906, "step": 2936 }, { "epoch": 0.3999183006535948, "grad_norm": 47.73277040740743, "learning_rate": 6.00164643704303e-07, "logits/chosen": 11.771039962768555, "logits/rejected": 12.096915245056152, "logps/chosen": -3.2815632820129395, "logps/rejected": -3.5061683654785156, "loss": 4.1094, "rewards/accuracies": 0.75, "rewards/chosen": -32.81563186645508, "rewards/margins": 2.2460498809814453, "rewards/rejected": -35.061683654785156, "step": 2937 }, { "epoch": 0.4000544662309368, "grad_norm": 47.18484405595583, "learning_rate": 6e-07, "logits/chosen": 12.472356796264648, "logits/rejected": 12.558465957641602, "logps/chosen": -3.872570276260376, "logps/rejected": -3.630467653274536, "loss": 4.4103, "rewards/accuracies": 0.25, "rewards/chosen": -38.725704193115234, "rewards/margins": -2.4210262298583984, "rewards/rejected": -36.3046760559082, "step": 2938 }, { "epoch": 0.40019063180827885, "grad_norm": 40.545767862295016, "learning_rate": 5.998353111040437e-07, "logits/chosen": 12.663227081298828, "logits/rejected": 11.834683418273926, "logps/chosen": -3.5080599784851074, "logps/rejected": -3.286449909210205, "loss": 3.8103, "rewards/accuracies": 0.25, "rewards/chosen": -35.08060073852539, "rewards/margins": -2.21610164642334, "rewards/rejected": -32.864498138427734, "step": 2939 }, { "epoch": 0.40032679738562094, "grad_norm": 45.03895300167199, "learning_rate": 5.996705770536472e-07, "logits/chosen": 11.886669158935547, "logits/rejected": 11.52034854888916, "logps/chosen": -3.5239129066467285, "logps/rejected": -4.041259288787842, "loss": 3.9984, "rewards/accuracies": 1.0, "rewards/chosen": -35.23912811279297, "rewards/margins": 5.173465728759766, "rewards/rejected": -40.412593841552734, "step": 2940 }, { "epoch": 0.40046296296296297, "grad_norm": 36.366243100717526, "learning_rate": 5.995057978860334e-07, "logits/chosen": 12.383687973022461, "logits/rejected": 12.495708465576172, "logps/chosen": -3.6659622192382812, "logps/rejected": -3.8760132789611816, "loss": 3.9924, "rewards/accuracies": 0.5, "rewards/chosen": -36.65962219238281, "rewards/margins": 2.1005115509033203, "rewards/rejected": -38.760135650634766, "step": 2941 }, { "epoch": 0.400599128540305, "grad_norm": 36.68363698214562, "learning_rate": 5.993409736384352e-07, "logits/chosen": 10.643280029296875, "logits/rejected": 12.327489852905273, "logps/chosen": -3.3073415756225586, "logps/rejected": -3.8005900382995605, "loss": 3.8424, "rewards/accuracies": 1.0, "rewards/chosen": -33.07341766357422, "rewards/margins": 4.932482719421387, "rewards/rejected": -38.005897521972656, "step": 2942 }, { "epoch": 0.4007352941176471, "grad_norm": 40.436858409317345, "learning_rate": 5.991761043480964e-07, "logits/chosen": 11.77177619934082, "logits/rejected": 12.355592727661133, "logps/chosen": -3.6754298210144043, "logps/rejected": -3.760988235473633, "loss": 3.9873, "rewards/accuracies": 0.75, "rewards/chosen": -36.75429916381836, "rewards/margins": 0.8555822372436523, "rewards/rejected": -37.60988235473633, "step": 2943 }, { "epoch": 0.4008714596949891, "grad_norm": 41.40818520271916, "learning_rate": 5.990111900522703e-07, "logits/chosen": 12.317327499389648, "logits/rejected": 10.558752059936523, "logps/chosen": -3.4968810081481934, "logps/rejected": -3.2473459243774414, "loss": 3.9243, "rewards/accuracies": 0.0, "rewards/chosen": -34.96881103515625, "rewards/margins": -2.495349407196045, "rewards/rejected": -32.47345733642578, "step": 2944 }, { "epoch": 0.40100762527233114, "grad_norm": 121.22128676143372, "learning_rate": 5.988462307882208e-07, "logits/chosen": 12.102394104003906, "logits/rejected": 12.702166557312012, "logps/chosen": -3.6982250213623047, "logps/rejected": -3.6669249534606934, "loss": 4.9309, "rewards/accuracies": 0.5, "rewards/chosen": -36.98224639892578, "rewards/margins": -0.31299781799316406, "rewards/rejected": -36.66925048828125, "step": 2945 }, { "epoch": 0.4011437908496732, "grad_norm": 35.674310546779864, "learning_rate": 5.986812265932218e-07, "logits/chosen": 12.188457489013672, "logits/rejected": 11.715667724609375, "logps/chosen": -3.5627543926239014, "logps/rejected": -3.430238723754883, "loss": 4.1603, "rewards/accuracies": 0.25, "rewards/chosen": -35.62754440307617, "rewards/margins": -1.3251566886901855, "rewards/rejected": -34.30238723754883, "step": 2946 }, { "epoch": 0.40127995642701525, "grad_norm": 44.86792417010983, "learning_rate": 5.985161775045574e-07, "logits/chosen": 12.20779037475586, "logits/rejected": 12.442832946777344, "logps/chosen": -3.6192617416381836, "logps/rejected": -3.7028045654296875, "loss": 4.2956, "rewards/accuracies": 0.75, "rewards/chosen": -36.1926155090332, "rewards/margins": 0.8354291915893555, "rewards/rejected": -37.028045654296875, "step": 2947 }, { "epoch": 0.4014161220043573, "grad_norm": 35.82483670601355, "learning_rate": 5.983510835595216e-07, "logits/chosen": 11.27992057800293, "logits/rejected": 11.993165016174316, "logps/chosen": -3.307978391647339, "logps/rejected": -3.59171724319458, "loss": 3.9631, "rewards/accuracies": 0.75, "rewards/chosen": -33.07978439331055, "rewards/margins": 2.8373894691467285, "rewards/rejected": -35.91717529296875, "step": 2948 }, { "epoch": 0.40155228758169936, "grad_norm": 47.508173353294524, "learning_rate": 5.981859447954189e-07, "logits/chosen": 11.569472312927246, "logits/rejected": 11.736627578735352, "logps/chosen": -3.424098014831543, "logps/rejected": -3.3608057498931885, "loss": 3.8724, "rewards/accuracies": 0.25, "rewards/chosen": -34.24098205566406, "rewards/margins": -0.6329226493835449, "rewards/rejected": -33.60805892944336, "step": 2949 }, { "epoch": 0.4016884531590414, "grad_norm": 39.12825908329093, "learning_rate": 5.980207612495638e-07, "logits/chosen": 12.11595630645752, "logits/rejected": 11.978803634643555, "logps/chosen": -3.4362168312072754, "logps/rejected": -3.538344144821167, "loss": 4.0286, "rewards/accuracies": 0.75, "rewards/chosen": -34.36216735839844, "rewards/margins": 1.0212750434875488, "rewards/rejected": -35.38344192504883, "step": 2950 }, { "epoch": 0.4018246187363834, "grad_norm": 39.55781945933772, "learning_rate": 5.978555329592808e-07, "logits/chosen": 11.856103897094727, "logits/rejected": 12.193750381469727, "logps/chosen": -3.475226402282715, "logps/rejected": -3.5788443088531494, "loss": 4.1094, "rewards/accuracies": 0.75, "rewards/chosen": -34.75226593017578, "rewards/margins": 1.0361785888671875, "rewards/rejected": -35.78844451904297, "step": 2951 }, { "epoch": 0.4019607843137255, "grad_norm": 37.61922874289873, "learning_rate": 5.976902599619047e-07, "logits/chosen": 12.589776992797852, "logits/rejected": 12.724602699279785, "logps/chosen": -3.506216049194336, "logps/rejected": -3.8434903621673584, "loss": 3.3893, "rewards/accuracies": 0.5, "rewards/chosen": -35.06216049194336, "rewards/margins": 3.3727426528930664, "rewards/rejected": -38.434906005859375, "step": 2952 }, { "epoch": 0.40209694989106753, "grad_norm": 40.40249619266986, "learning_rate": 5.975249422947802e-07, "logits/chosen": 12.092306137084961, "logits/rejected": 13.146343231201172, "logps/chosen": -3.704439163208008, "logps/rejected": -3.830023765563965, "loss": 4.892, "rewards/accuracies": 0.75, "rewards/chosen": -37.04439163208008, "rewards/margins": 1.2558469772338867, "rewards/rejected": -38.30023956298828, "step": 2953 }, { "epoch": 0.40223311546840956, "grad_norm": 41.322462243512625, "learning_rate": 5.973595799952622e-07, "logits/chosen": 12.030647277832031, "logits/rejected": 13.583515167236328, "logps/chosen": -3.537734031677246, "logps/rejected": -3.9082157611846924, "loss": 3.8561, "rewards/accuracies": 1.0, "rewards/chosen": -35.37733840942383, "rewards/margins": 3.704819679260254, "rewards/rejected": -39.082157135009766, "step": 2954 }, { "epoch": 0.40236928104575165, "grad_norm": 39.46199249722008, "learning_rate": 5.971941731007158e-07, "logits/chosen": 12.81882095336914, "logits/rejected": 12.592992782592773, "logps/chosen": -3.8007631301879883, "logps/rejected": -3.898285388946533, "loss": 4.085, "rewards/accuracies": 0.5, "rewards/chosen": -38.00762939453125, "rewards/margins": 0.9752216339111328, "rewards/rejected": -38.982852935791016, "step": 2955 }, { "epoch": 0.4025054466230937, "grad_norm": 62.14167224339994, "learning_rate": 5.97028721648516e-07, "logits/chosen": 12.64621353149414, "logits/rejected": 12.861723899841309, "logps/chosen": -3.6987404823303223, "logps/rejected": -3.9386467933654785, "loss": 3.5229, "rewards/accuracies": 0.75, "rewards/chosen": -36.98740768432617, "rewards/margins": 2.3990588188171387, "rewards/rejected": -39.38646697998047, "step": 2956 }, { "epoch": 0.4026416122004357, "grad_norm": 42.42292081761341, "learning_rate": 5.968632256760477e-07, "logits/chosen": 11.820687294006348, "logits/rejected": 12.367769241333008, "logps/chosen": -3.4985148906707764, "logps/rejected": -3.7547249794006348, "loss": 4.1638, "rewards/accuracies": 0.75, "rewards/chosen": -34.98514938354492, "rewards/margins": 2.562100410461426, "rewards/rejected": -37.54724884033203, "step": 2957 }, { "epoch": 0.4027777777777778, "grad_norm": 37.687192033683495, "learning_rate": 5.966976852207064e-07, "logits/chosen": 12.96273422241211, "logits/rejected": 12.870039939880371, "logps/chosen": -3.777264356613159, "logps/rejected": -3.7123372554779053, "loss": 4.0435, "rewards/accuracies": 0.5, "rewards/chosen": -37.77264404296875, "rewards/margins": -0.6492710113525391, "rewards/rejected": -37.12337112426758, "step": 2958 }, { "epoch": 0.4029139433551198, "grad_norm": 43.88270487369445, "learning_rate": 5.965321003198972e-07, "logits/chosen": 11.94874095916748, "logits/rejected": 12.374734878540039, "logps/chosen": -3.323023557662964, "logps/rejected": -3.540642499923706, "loss": 3.9725, "rewards/accuracies": 0.75, "rewards/chosen": -33.2302360534668, "rewards/margins": 2.176189422607422, "rewards/rejected": -35.40642547607422, "step": 2959 }, { "epoch": 0.40305010893246185, "grad_norm": 39.302134988682475, "learning_rate": 5.963664710110354e-07, "logits/chosen": 11.92548942565918, "logits/rejected": 11.947378158569336, "logps/chosen": -3.5471816062927246, "logps/rejected": -3.7932729721069336, "loss": 4.5892, "rewards/accuracies": 0.75, "rewards/chosen": -35.47181701660156, "rewards/margins": 2.4609122276306152, "rewards/rejected": -37.93273162841797, "step": 2960 }, { "epoch": 0.40318627450980393, "grad_norm": 46.21078814020278, "learning_rate": 5.962007973315462e-07, "logits/chosen": 12.854111671447754, "logits/rejected": 12.460090637207031, "logps/chosen": -3.7049407958984375, "logps/rejected": -3.697526454925537, "loss": 4.2358, "rewards/accuracies": 0.5, "rewards/chosen": -37.04940414428711, "rewards/margins": -0.0741415023803711, "rewards/rejected": -36.97526550292969, "step": 2961 }, { "epoch": 0.40332244008714596, "grad_norm": 47.99121906143567, "learning_rate": 5.960350793188651e-07, "logits/chosen": 11.950684547424316, "logits/rejected": 11.972354888916016, "logps/chosen": -3.769923686981201, "logps/rejected": -4.037425994873047, "loss": 3.595, "rewards/accuracies": 0.75, "rewards/chosen": -37.69923782348633, "rewards/margins": 2.6750240325927734, "rewards/rejected": -40.374263763427734, "step": 2962 }, { "epoch": 0.403458605664488, "grad_norm": 37.875373186309645, "learning_rate": 5.958693170104373e-07, "logits/chosen": 12.80752944946289, "logits/rejected": 12.532975196838379, "logps/chosen": -3.762101173400879, "logps/rejected": -3.4137659072875977, "loss": 4.0675, "rewards/accuracies": 0.25, "rewards/chosen": -37.621009826660156, "rewards/margins": -3.4833498001098633, "rewards/rejected": -34.13766098022461, "step": 2963 }, { "epoch": 0.4035947712418301, "grad_norm": 39.09773490046609, "learning_rate": 5.957035104437183e-07, "logits/chosen": 12.063583374023438, "logits/rejected": 12.852137565612793, "logps/chosen": -3.764310598373413, "logps/rejected": -4.066147804260254, "loss": 3.4307, "rewards/accuracies": 1.0, "rewards/chosen": -37.643104553222656, "rewards/margins": 3.0183706283569336, "rewards/rejected": -40.661476135253906, "step": 2964 }, { "epoch": 0.4037309368191721, "grad_norm": 37.43745199082798, "learning_rate": 5.955376596561735e-07, "logits/chosen": 11.752226829528809, "logits/rejected": 13.616758346557617, "logps/chosen": -3.5740256309509277, "logps/rejected": -3.8099112510681152, "loss": 3.9266, "rewards/accuracies": 0.75, "rewards/chosen": -35.740257263183594, "rewards/margins": 2.3588571548461914, "rewards/rejected": -38.09911346435547, "step": 2965 }, { "epoch": 0.4038671023965142, "grad_norm": 51.542038403776345, "learning_rate": 5.953717646852781e-07, "logits/chosen": 12.770089149475098, "logits/rejected": 12.734301567077637, "logps/chosen": -3.784153938293457, "logps/rejected": -4.056357383728027, "loss": 4.144, "rewards/accuracies": 0.75, "rewards/chosen": -37.84153747558594, "rewards/margins": 2.722036361694336, "rewards/rejected": -40.563575744628906, "step": 2966 }, { "epoch": 0.4040032679738562, "grad_norm": 45.87653817599633, "learning_rate": 5.952058255685175e-07, "logits/chosen": 13.030643463134766, "logits/rejected": 13.148782730102539, "logps/chosen": -3.755239725112915, "logps/rejected": -4.0919508934021, "loss": 4.3069, "rewards/accuracies": 0.75, "rewards/chosen": -37.552398681640625, "rewards/margins": 3.367110252380371, "rewards/rejected": -40.91950607299805, "step": 2967 }, { "epoch": 0.40413943355119825, "grad_norm": 40.99646848204835, "learning_rate": 5.950398423433871e-07, "logits/chosen": 11.827657699584961, "logits/rejected": 11.920858383178711, "logps/chosen": -3.678039073944092, "logps/rejected": -3.637118101119995, "loss": 4.1936, "rewards/accuracies": 0.5, "rewards/chosen": -36.780391693115234, "rewards/margins": -0.409210205078125, "rewards/rejected": -36.37118148803711, "step": 2968 }, { "epoch": 0.40427559912854033, "grad_norm": 43.034933580085806, "learning_rate": 5.94873815047392e-07, "logits/chosen": 12.354414939880371, "logits/rejected": 13.369988441467285, "logps/chosen": -3.7222697734832764, "logps/rejected": -4.253898620605469, "loss": 4.1252, "rewards/accuracies": 1.0, "rewards/chosen": -37.22269821166992, "rewards/margins": 5.316289901733398, "rewards/rejected": -42.53898620605469, "step": 2969 }, { "epoch": 0.40441176470588236, "grad_norm": 74.69973104401501, "learning_rate": 5.947077437180475e-07, "logits/chosen": 12.322283744812012, "logits/rejected": 12.31103229522705, "logps/chosen": -3.6782140731811523, "logps/rejected": -3.449575424194336, "loss": 4.0777, "rewards/accuracies": 0.5, "rewards/chosen": -36.78213882446289, "rewards/margins": -2.286384105682373, "rewards/rejected": -34.49575424194336, "step": 2970 }, { "epoch": 0.4045479302832244, "grad_norm": 122.39644407821295, "learning_rate": 5.94541628392879e-07, "logits/chosen": 12.826603889465332, "logits/rejected": 12.947549819946289, "logps/chosen": -3.8499205112457275, "logps/rejected": -3.8751888275146484, "loss": 4.1226, "rewards/accuracies": 0.5, "rewards/chosen": -38.499202728271484, "rewards/margins": 0.2526836395263672, "rewards/rejected": -38.75188446044922, "step": 2971 }, { "epoch": 0.4046840958605665, "grad_norm": 48.58557905670962, "learning_rate": 5.943754691094213e-07, "logits/chosen": 13.016833305358887, "logits/rejected": 12.866416931152344, "logps/chosen": -3.5690784454345703, "logps/rejected": -3.799248695373535, "loss": 4.9787, "rewards/accuracies": 0.5, "rewards/chosen": -35.6907844543457, "rewards/margins": 2.3017048835754395, "rewards/rejected": -37.992488861083984, "step": 2972 }, { "epoch": 0.4048202614379085, "grad_norm": 48.160223190372555, "learning_rate": 5.942092659052198e-07, "logits/chosen": 12.47049331665039, "logits/rejected": 11.892467498779297, "logps/chosen": -3.7436983585357666, "logps/rejected": -3.9169411659240723, "loss": 4.0134, "rewards/accuracies": 0.75, "rewards/chosen": -37.43698501586914, "rewards/margins": 1.7324285507202148, "rewards/rejected": -39.169410705566406, "step": 2973 }, { "epoch": 0.40495642701525053, "grad_norm": 43.50100811320999, "learning_rate": 5.940430188178293e-07, "logits/chosen": 12.308769226074219, "logits/rejected": 12.70988655090332, "logps/chosen": -3.7582693099975586, "logps/rejected": -3.8907723426818848, "loss": 4.1239, "rewards/accuracies": 0.75, "rewards/chosen": -37.58269500732422, "rewards/margins": 1.3250312805175781, "rewards/rejected": -38.90772247314453, "step": 2974 }, { "epoch": 0.4050925925925926, "grad_norm": 50.66783011025331, "learning_rate": 5.938767278848146e-07, "logits/chosen": 12.376199722290039, "logits/rejected": 12.025306701660156, "logps/chosen": -3.6290688514709473, "logps/rejected": -3.7172904014587402, "loss": 3.6265, "rewards/accuracies": 0.5, "rewards/chosen": -36.290687561035156, "rewards/margins": 0.8822174072265625, "rewards/rejected": -37.17290496826172, "step": 2975 }, { "epoch": 0.40522875816993464, "grad_norm": 42.613340712922046, "learning_rate": 5.937103931437507e-07, "logits/chosen": 12.407629013061523, "logits/rejected": 12.453426361083984, "logps/chosen": -4.138796806335449, "logps/rejected": -3.807173013687134, "loss": 4.4285, "rewards/accuracies": 0.0, "rewards/chosen": -41.387969970703125, "rewards/margins": -3.316239356994629, "rewards/rejected": -38.07173156738281, "step": 2976 }, { "epoch": 0.4053649237472767, "grad_norm": 88.77790533295939, "learning_rate": 5.935440146322223e-07, "logits/chosen": 12.351752281188965, "logits/rejected": 12.879362106323242, "logps/chosen": -3.721820831298828, "logps/rejected": -3.8734726905822754, "loss": 3.4763, "rewards/accuracies": 0.75, "rewards/chosen": -37.21820831298828, "rewards/margins": 1.5165200233459473, "rewards/rejected": -38.73472595214844, "step": 2977 }, { "epoch": 0.40550108932461876, "grad_norm": 39.20538429375585, "learning_rate": 5.933775923878238e-07, "logits/chosen": 12.124895095825195, "logits/rejected": 13.077177047729492, "logps/chosen": -3.6555490493774414, "logps/rejected": -3.943387746810913, "loss": 3.7829, "rewards/accuracies": 1.0, "rewards/chosen": -36.55549240112305, "rewards/margins": 2.878386974334717, "rewards/rejected": -39.433876037597656, "step": 2978 }, { "epoch": 0.4056372549019608, "grad_norm": 42.403330147055634, "learning_rate": 5.9321112644816e-07, "logits/chosen": 11.757706642150879, "logits/rejected": 12.720806121826172, "logps/chosen": -3.6507718563079834, "logps/rejected": -3.9977059364318848, "loss": 4.3688, "rewards/accuracies": 1.0, "rewards/chosen": -36.507720947265625, "rewards/margins": 3.469341278076172, "rewards/rejected": -39.9770622253418, "step": 2979 }, { "epoch": 0.4057734204793028, "grad_norm": 45.153620280803764, "learning_rate": 5.93044616850845e-07, "logits/chosen": 11.496452331542969, "logits/rejected": 12.203235626220703, "logps/chosen": -3.505732297897339, "logps/rejected": -3.7886457443237305, "loss": 4.5161, "rewards/accuracies": 0.75, "rewards/chosen": -35.05732345581055, "rewards/margins": 2.8291358947753906, "rewards/rejected": -37.88645935058594, "step": 2980 }, { "epoch": 0.4059095860566449, "grad_norm": 46.85308428758072, "learning_rate": 5.92878063633503e-07, "logits/chosen": 11.874418258666992, "logits/rejected": 11.866159439086914, "logps/chosen": -3.696289300918579, "logps/rejected": -3.9158387184143066, "loss": 3.863, "rewards/accuracies": 1.0, "rewards/chosen": -36.962890625, "rewards/margins": 2.1954946517944336, "rewards/rejected": -39.15838623046875, "step": 2981 }, { "epoch": 0.40604575163398693, "grad_norm": 42.05980303618415, "learning_rate": 5.927114668337683e-07, "logits/chosen": 12.140542984008789, "logits/rejected": 12.361089706420898, "logps/chosen": -4.001437664031982, "logps/rejected": -3.8236663341522217, "loss": 4.3557, "rewards/accuracies": 0.25, "rewards/chosen": -40.01437759399414, "rewards/margins": -1.7777118682861328, "rewards/rejected": -38.23666763305664, "step": 2982 }, { "epoch": 0.40618191721132896, "grad_norm": 40.86397039763279, "learning_rate": 5.925448264892847e-07, "logits/chosen": 11.872702598571777, "logits/rejected": 11.97492790222168, "logps/chosen": -4.0743913650512695, "logps/rejected": -3.537428379058838, "loss": 4.4001, "rewards/accuracies": 0.25, "rewards/chosen": -40.74391174316406, "rewards/margins": -5.369626045227051, "rewards/rejected": -35.37428665161133, "step": 2983 }, { "epoch": 0.40631808278867104, "grad_norm": 38.8710430229873, "learning_rate": 5.923781426377059e-07, "logits/chosen": 12.076072692871094, "logits/rejected": 12.812715530395508, "logps/chosen": -3.750199794769287, "logps/rejected": -3.9370017051696777, "loss": 3.5977, "rewards/accuracies": 0.75, "rewards/chosen": -37.50199890136719, "rewards/margins": 1.868021011352539, "rewards/rejected": -39.370018005371094, "step": 2984 }, { "epoch": 0.40645424836601307, "grad_norm": 40.66461628350393, "learning_rate": 5.922114153166956e-07, "logits/chosen": 12.295331954956055, "logits/rejected": 12.883697509765625, "logps/chosen": -3.9407906532287598, "logps/rejected": -4.089909553527832, "loss": 4.3328, "rewards/accuracies": 0.75, "rewards/chosen": -39.40790557861328, "rewards/margins": 1.4911870956420898, "rewards/rejected": -40.89909362792969, "step": 2985 }, { "epoch": 0.4065904139433551, "grad_norm": 37.34638058970001, "learning_rate": 5.920446445639272e-07, "logits/chosen": 12.717382431030273, "logits/rejected": 12.605131149291992, "logps/chosen": -3.7520923614501953, "logps/rejected": -4.239675045013428, "loss": 3.924, "rewards/accuracies": 0.75, "rewards/chosen": -37.52092361450195, "rewards/margins": 4.875826835632324, "rewards/rejected": -42.396751403808594, "step": 2986 }, { "epoch": 0.4067265795206972, "grad_norm": 42.13002945514135, "learning_rate": 5.918778304170838e-07, "logits/chosen": 12.338356018066406, "logits/rejected": 12.95121955871582, "logps/chosen": -3.7196927070617676, "logps/rejected": -3.809934616088867, "loss": 4.0384, "rewards/accuracies": 0.5, "rewards/chosen": -37.19692611694336, "rewards/margins": 0.9024205207824707, "rewards/rejected": -38.09934997558594, "step": 2987 }, { "epoch": 0.4068627450980392, "grad_norm": 44.90815331549055, "learning_rate": 5.917109729138586e-07, "logits/chosen": 12.280632019042969, "logits/rejected": 11.411439895629883, "logps/chosen": -4.014224052429199, "logps/rejected": -3.3782784938812256, "loss": 4.3898, "rewards/accuracies": 0.0, "rewards/chosen": -40.142242431640625, "rewards/margins": -6.359457015991211, "rewards/rejected": -33.78278350830078, "step": 2988 }, { "epoch": 0.40699891067538124, "grad_norm": 45.47660407888409, "learning_rate": 5.915440720919545e-07, "logits/chosen": 13.016661643981934, "logits/rejected": 13.433395385742188, "logps/chosen": -4.077242851257324, "logps/rejected": -4.1186394691467285, "loss": 4.329, "rewards/accuracies": 0.75, "rewards/chosen": -40.77242660522461, "rewards/margins": 0.4139680862426758, "rewards/rejected": -41.18639373779297, "step": 2989 }, { "epoch": 0.4071350762527233, "grad_norm": 44.3602360388549, "learning_rate": 5.913771279890838e-07, "logits/chosen": 11.270281791687012, "logits/rejected": 11.992034912109375, "logps/chosen": -3.832709789276123, "logps/rejected": -4.076096057891846, "loss": 3.7096, "rewards/accuracies": 0.75, "rewards/chosen": -38.32709884643555, "rewards/margins": 2.43386173248291, "rewards/rejected": -40.76095962524414, "step": 2990 }, { "epoch": 0.40727124183006536, "grad_norm": 45.95079945040711, "learning_rate": 5.912101406429691e-07, "logits/chosen": 12.238439559936523, "logits/rejected": 12.711929321289062, "logps/chosen": -3.6877331733703613, "logps/rejected": -3.753413677215576, "loss": 3.6295, "rewards/accuracies": 0.5, "rewards/chosen": -36.87733459472656, "rewards/margins": 0.6568050384521484, "rewards/rejected": -37.53413772583008, "step": 2991 }, { "epoch": 0.4074074074074074, "grad_norm": 63.68150510557293, "learning_rate": 5.910431100913427e-07, "logits/chosen": 11.630361557006836, "logits/rejected": 11.378034591674805, "logps/chosen": -3.7753829956054688, "logps/rejected": -3.657259464263916, "loss": 4.3398, "rewards/accuracies": 0.25, "rewards/chosen": -37.75382995605469, "rewards/margins": -1.1812362670898438, "rewards/rejected": -36.572593688964844, "step": 2992 }, { "epoch": 0.40754357298474947, "grad_norm": 50.192737333348376, "learning_rate": 5.908760363719463e-07, "logits/chosen": 10.800881385803223, "logits/rejected": 11.912353515625, "logps/chosen": -3.4432549476623535, "logps/rejected": -3.677565574645996, "loss": 4.263, "rewards/accuracies": 0.75, "rewards/chosen": -34.43254852294922, "rewards/margins": 2.343106746673584, "rewards/rejected": -36.77565383911133, "step": 2993 }, { "epoch": 0.4076797385620915, "grad_norm": 44.98785212415795, "learning_rate": 5.907089195225316e-07, "logits/chosen": 12.426162719726562, "logits/rejected": 12.229134559631348, "logps/chosen": -4.15748405456543, "logps/rejected": -4.175647735595703, "loss": 4.6936, "rewards/accuracies": 0.25, "rewards/chosen": -41.5748405456543, "rewards/margins": 0.18163394927978516, "rewards/rejected": -41.75647735595703, "step": 2994 }, { "epoch": 0.4078159041394335, "grad_norm": 52.8371655109996, "learning_rate": 5.905417595808603e-07, "logits/chosen": 11.648274421691895, "logits/rejected": 11.520671844482422, "logps/chosen": -3.81864070892334, "logps/rejected": -3.860894203186035, "loss": 4.3015, "rewards/accuracies": 0.5, "rewards/chosen": -38.18640899658203, "rewards/margins": 0.4225330352783203, "rewards/rejected": -38.60894012451172, "step": 2995 }, { "epoch": 0.4079520697167756, "grad_norm": 41.90340706153067, "learning_rate": 5.903745565847033e-07, "logits/chosen": 11.450302124023438, "logits/rejected": 11.454873085021973, "logps/chosen": -3.431736946105957, "logps/rejected": -3.922762393951416, "loss": 3.6701, "rewards/accuracies": 0.75, "rewards/chosen": -34.3173713684082, "rewards/margins": 4.910253524780273, "rewards/rejected": -39.227622985839844, "step": 2996 }, { "epoch": 0.40808823529411764, "grad_norm": 54.09725163133465, "learning_rate": 5.902073105718416e-07, "logits/chosen": 11.715368270874023, "logits/rejected": 11.740818977355957, "logps/chosen": -3.4047040939331055, "logps/rejected": -3.5643553733825684, "loss": 3.7346, "rewards/accuracies": 0.75, "rewards/chosen": -34.04704284667969, "rewards/margins": 1.5965113639831543, "rewards/rejected": -35.6435546875, "step": 2997 }, { "epoch": 0.40822440087145967, "grad_norm": 40.044618991096485, "learning_rate": 5.900400215800658e-07, "logits/chosen": 11.80811882019043, "logits/rejected": 12.371519088745117, "logps/chosen": -3.753361225128174, "logps/rejected": -4.191271781921387, "loss": 3.8581, "rewards/accuracies": 0.75, "rewards/chosen": -37.53361511230469, "rewards/margins": 4.379105567932129, "rewards/rejected": -41.9127197265625, "step": 2998 }, { "epoch": 0.40836056644880175, "grad_norm": 41.02851365427566, "learning_rate": 5.898726896471763e-07, "logits/chosen": 12.841657638549805, "logits/rejected": 12.458218574523926, "logps/chosen": -3.7827072143554688, "logps/rejected": -3.736799478530884, "loss": 4.194, "rewards/accuracies": 0.25, "rewards/chosen": -37.82707214355469, "rewards/margins": -0.4590778350830078, "rewards/rejected": -37.36799621582031, "step": 2999 }, { "epoch": 0.4084967320261438, "grad_norm": 43.381407527522654, "learning_rate": 5.89705314810983e-07, "logits/chosen": 12.122405052185059, "logits/rejected": 11.780750274658203, "logps/chosen": -3.7865235805511475, "logps/rejected": -3.7311649322509766, "loss": 3.4559, "rewards/accuracies": 0.75, "rewards/chosen": -37.865234375, "rewards/margins": -0.5535893440246582, "rewards/rejected": -37.3116455078125, "step": 3000 }, { "epoch": 0.4086328976034858, "grad_norm": 47.16031707064229, "learning_rate": 5.895378971093056e-07, "logits/chosen": 11.963903427124023, "logits/rejected": 11.831098556518555, "logps/chosen": -3.4588732719421387, "logps/rejected": -4.04326057434082, "loss": 4.144, "rewards/accuracies": 0.75, "rewards/chosen": -34.58872985839844, "rewards/margins": 5.843872547149658, "rewards/rejected": -40.4326057434082, "step": 3001 }, { "epoch": 0.4087690631808279, "grad_norm": 47.58303286852289, "learning_rate": 5.893704365799738e-07, "logits/chosen": 11.186379432678223, "logits/rejected": 12.54203987121582, "logps/chosen": -3.71354603767395, "logps/rejected": -4.236198425292969, "loss": 4.2384, "rewards/accuracies": 1.0, "rewards/chosen": -37.135459899902344, "rewards/margins": 5.226526260375977, "rewards/rejected": -42.36198425292969, "step": 3002 }, { "epoch": 0.4089052287581699, "grad_norm": 50.651379181102875, "learning_rate": 5.892029332608263e-07, "logits/chosen": 11.25956916809082, "logits/rejected": 10.809272766113281, "logps/chosen": -3.577188491821289, "logps/rejected": -3.6105332374572754, "loss": 4.1976, "rewards/accuracies": 0.5, "rewards/chosen": -35.77188491821289, "rewards/margins": 0.3334474563598633, "rewards/rejected": -36.1053352355957, "step": 3003 }, { "epoch": 0.409041394335512, "grad_norm": 49.038358204670374, "learning_rate": 5.890353871897122e-07, "logits/chosen": 12.317468643188477, "logits/rejected": 12.540275573730469, "logps/chosen": -4.003769397735596, "logps/rejected": -3.973569631576538, "loss": 3.989, "rewards/accuracies": 0.25, "rewards/chosen": -40.03769302368164, "rewards/margins": -0.30199623107910156, "rewards/rejected": -39.735694885253906, "step": 3004 }, { "epoch": 0.40917755991285404, "grad_norm": 40.92804572522297, "learning_rate": 5.888677984044898e-07, "logits/chosen": 10.08181381225586, "logits/rejected": 10.450020790100098, "logps/chosen": -3.5184688568115234, "logps/rejected": -3.499394416809082, "loss": 4.0887, "rewards/accuracies": 0.75, "rewards/chosen": -35.184688568115234, "rewards/margins": -0.19074678421020508, "rewards/rejected": -34.99394226074219, "step": 3005 }, { "epoch": 0.40931372549019607, "grad_norm": 46.396210343239595, "learning_rate": 5.887001669430271e-07, "logits/chosen": 11.366327285766602, "logits/rejected": 11.060358047485352, "logps/chosen": -3.820021152496338, "logps/rejected": -3.6349635124206543, "loss": 4.7543, "rewards/accuracies": 0.5, "rewards/chosen": -38.20021057128906, "rewards/margins": -1.8505773544311523, "rewards/rejected": -36.349632263183594, "step": 3006 }, { "epoch": 0.40944989106753815, "grad_norm": 45.51472536928758, "learning_rate": 5.88532492843202e-07, "logits/chosen": 11.570173263549805, "logits/rejected": 12.074100494384766, "logps/chosen": -3.5323128700256348, "logps/rejected": -3.5675723552703857, "loss": 4.1982, "rewards/accuracies": 0.75, "rewards/chosen": -35.323123931884766, "rewards/margins": 0.352597713470459, "rewards/rejected": -35.675724029541016, "step": 3007 }, { "epoch": 0.4095860566448802, "grad_norm": 60.35694015264992, "learning_rate": 5.883647761429015e-07, "logits/chosen": 11.922136306762695, "logits/rejected": 12.387895584106445, "logps/chosen": -3.937147617340088, "logps/rejected": -4.04072380065918, "loss": 4.2647, "rewards/accuracies": 0.5, "rewards/chosen": -39.37147521972656, "rewards/margins": 1.0357637405395508, "rewards/rejected": -40.40724182128906, "step": 3008 }, { "epoch": 0.4097222222222222, "grad_norm": 63.88901044761529, "learning_rate": 5.88197016880023e-07, "logits/chosen": 11.350622177124023, "logits/rejected": 12.789253234863281, "logps/chosen": -3.441776752471924, "logps/rejected": -4.123124122619629, "loss": 3.9285, "rewards/accuracies": 1.0, "rewards/chosen": -34.41776657104492, "rewards/margins": 6.813477516174316, "rewards/rejected": -41.23124313354492, "step": 3009 }, { "epoch": 0.4098583877995643, "grad_norm": 42.32961556506181, "learning_rate": 5.880292150924726e-07, "logits/chosen": 10.973651885986328, "logits/rejected": 12.366190910339355, "logps/chosen": -4.122970104217529, "logps/rejected": -4.382069110870361, "loss": 3.491, "rewards/accuracies": 0.75, "rewards/chosen": -41.229698181152344, "rewards/margins": 2.5909910202026367, "rewards/rejected": -43.8206901550293, "step": 3010 }, { "epoch": 0.4099945533769063, "grad_norm": 46.81576782137269, "learning_rate": 5.878613708181671e-07, "logits/chosen": 11.264833450317383, "logits/rejected": 12.551521301269531, "logps/chosen": -3.6749086380004883, "logps/rejected": -4.170168876647949, "loss": 3.9886, "rewards/accuracies": 0.75, "rewards/chosen": -36.74908447265625, "rewards/margins": 4.95259952545166, "rewards/rejected": -41.70168685913086, "step": 3011 }, { "epoch": 0.41013071895424835, "grad_norm": 47.05304623031497, "learning_rate": 5.876934840950319e-07, "logits/chosen": 11.839588165283203, "logits/rejected": 12.243701934814453, "logps/chosen": -3.4673924446105957, "logps/rejected": -3.623180389404297, "loss": 3.7385, "rewards/accuracies": 0.75, "rewards/chosen": -34.67392349243164, "rewards/margins": 1.5578784942626953, "rewards/rejected": -36.23180389404297, "step": 3012 }, { "epoch": 0.41026688453159044, "grad_norm": 48.81589417375305, "learning_rate": 5.875255549610023e-07, "logits/chosen": 11.701936721801758, "logits/rejected": 11.980241775512695, "logps/chosen": -3.6952807903289795, "logps/rejected": -3.9811980724334717, "loss": 4.3312, "rewards/accuracies": 0.5, "rewards/chosen": -36.95280838012695, "rewards/margins": 2.859172821044922, "rewards/rejected": -39.811981201171875, "step": 3013 }, { "epoch": 0.41040305010893247, "grad_norm": 42.36856228471769, "learning_rate": 5.873575834540236e-07, "logits/chosen": 11.407410621643066, "logits/rejected": 12.247706413269043, "logps/chosen": -3.4133124351501465, "logps/rejected": -3.626527786254883, "loss": 3.264, "rewards/accuracies": 0.75, "rewards/chosen": -34.13312530517578, "rewards/margins": 2.132154941558838, "rewards/rejected": -36.26527786254883, "step": 3014 }, { "epoch": 0.4105392156862745, "grad_norm": 48.56605502114933, "learning_rate": 5.871895696120502e-07, "logits/chosen": 10.235870361328125, "logits/rejected": 11.584163665771484, "logps/chosen": -3.7393853664398193, "logps/rejected": -3.9183554649353027, "loss": 3.9601, "rewards/accuracies": 0.75, "rewards/chosen": -37.39385223388672, "rewards/margins": 1.7896990776062012, "rewards/rejected": -39.183555603027344, "step": 3015 }, { "epoch": 0.4106753812636166, "grad_norm": 45.78060178115673, "learning_rate": 5.870215134730463e-07, "logits/chosen": 11.09398365020752, "logits/rejected": 11.638614654541016, "logps/chosen": -3.8078832626342773, "logps/rejected": -3.8498566150665283, "loss": 3.3659, "rewards/accuracies": 0.5, "rewards/chosen": -38.078834533691406, "rewards/margins": 0.4197349548339844, "rewards/rejected": -38.49856948852539, "step": 3016 }, { "epoch": 0.4108115468409586, "grad_norm": 46.889133098868996, "learning_rate": 5.868534150749852e-07, "logits/chosen": 12.246086120605469, "logits/rejected": 12.456832885742188, "logps/chosen": -4.142461776733398, "logps/rejected": -4.515660285949707, "loss": 3.4412, "rewards/accuracies": 1.0, "rewards/chosen": -41.42462158203125, "rewards/margins": 3.7319822311401367, "rewards/rejected": -45.15660095214844, "step": 3017 }, { "epoch": 0.41094771241830064, "grad_norm": 45.41434512077852, "learning_rate": 5.866852744558507e-07, "logits/chosen": 10.845845222473145, "logits/rejected": 11.022378921508789, "logps/chosen": -3.5290687084198, "logps/rejected": -3.722733974456787, "loss": 4.0321, "rewards/accuracies": 0.75, "rewards/chosen": -35.290687561035156, "rewards/margins": 1.936652660369873, "rewards/rejected": -37.22734069824219, "step": 3018 }, { "epoch": 0.4110838779956427, "grad_norm": 47.46561560355561, "learning_rate": 5.865170916536353e-07, "logits/chosen": 11.884531021118164, "logits/rejected": 11.911235809326172, "logps/chosen": -3.942477226257324, "logps/rejected": -3.661609172821045, "loss": 3.7719, "rewards/accuracies": 0.25, "rewards/chosen": -39.424774169921875, "rewards/margins": -2.8086819648742676, "rewards/rejected": -36.6160888671875, "step": 3019 }, { "epoch": 0.41122004357298475, "grad_norm": 44.18640242995413, "learning_rate": 5.863488667063411e-07, "logits/chosen": 11.746363639831543, "logits/rejected": 10.988895416259766, "logps/chosen": -3.4679110050201416, "logps/rejected": -3.649958610534668, "loss": 4.1997, "rewards/accuracies": 0.5, "rewards/chosen": -34.67911148071289, "rewards/margins": 1.8204731941223145, "rewards/rejected": -36.49958038330078, "step": 3020 }, { "epoch": 0.4113562091503268, "grad_norm": 41.539570803729085, "learning_rate": 5.861805996519801e-07, "logits/chosen": 11.191763877868652, "logits/rejected": 11.735272407531738, "logps/chosen": -3.3543620109558105, "logps/rejected": -3.7643485069274902, "loss": 3.8843, "rewards/accuracies": 0.75, "rewards/chosen": -33.54362106323242, "rewards/margins": 4.099862575531006, "rewards/rejected": -37.64348602294922, "step": 3021 }, { "epoch": 0.41149237472766886, "grad_norm": 48.74878784097499, "learning_rate": 5.860122905285737e-07, "logits/chosen": 11.565529823303223, "logits/rejected": 12.704214096069336, "logps/chosen": -3.301074504852295, "logps/rejected": -3.69614839553833, "loss": 4.5274, "rewards/accuracies": 0.75, "rewards/chosen": -33.010746002197266, "rewards/margins": 3.9507384300231934, "rewards/rejected": -36.96148681640625, "step": 3022 }, { "epoch": 0.4116285403050109, "grad_norm": 41.88807954830254, "learning_rate": 5.858439393741527e-07, "logits/chosen": 11.699493408203125, "logits/rejected": 12.992374420166016, "logps/chosen": -3.3228824138641357, "logps/rejected": -3.9037952423095703, "loss": 4.1148, "rewards/accuracies": 1.0, "rewards/chosen": -33.22882843017578, "rewards/margins": 5.809127330780029, "rewards/rejected": -39.0379524230957, "step": 3023 }, { "epoch": 0.4117647058823529, "grad_norm": 52.96931869584595, "learning_rate": 5.856755462267573e-07, "logits/chosen": 11.973758697509766, "logits/rejected": 12.736505508422852, "logps/chosen": -3.781895875930786, "logps/rejected": -4.102927207946777, "loss": 3.6023, "rewards/accuracies": 0.75, "rewards/chosen": -37.8189582824707, "rewards/margins": 3.210315704345703, "rewards/rejected": -41.029273986816406, "step": 3024 }, { "epoch": 0.411900871459695, "grad_norm": 44.10387280052245, "learning_rate": 5.855071111244376e-07, "logits/chosen": 12.029648780822754, "logits/rejected": 12.531160354614258, "logps/chosen": -3.904036045074463, "logps/rejected": -4.140020370483398, "loss": 3.7288, "rewards/accuracies": 0.75, "rewards/chosen": -39.04035949707031, "rewards/margins": 2.3598451614379883, "rewards/rejected": -41.400203704833984, "step": 3025 }, { "epoch": 0.41203703703703703, "grad_norm": 44.76459966590855, "learning_rate": 5.853386341052525e-07, "logits/chosen": 10.964466094970703, "logits/rejected": 11.332691192626953, "logps/chosen": -3.5552070140838623, "logps/rejected": -3.753596782684326, "loss": 4.0482, "rewards/accuracies": 0.75, "rewards/chosen": -35.55207061767578, "rewards/margins": 1.9838981628417969, "rewards/rejected": -37.53596496582031, "step": 3026 }, { "epoch": 0.41217320261437906, "grad_norm": 45.43000256055086, "learning_rate": 5.851701152072711e-07, "logits/chosen": 11.192790031433105, "logits/rejected": 11.649639129638672, "logps/chosen": -3.847869634628296, "logps/rejected": -4.170529365539551, "loss": 4.077, "rewards/accuracies": 0.75, "rewards/chosen": -38.478694915771484, "rewards/margins": 3.2265939712524414, "rewards/rejected": -41.705291748046875, "step": 3027 }, { "epoch": 0.41230936819172115, "grad_norm": 48.83986395336143, "learning_rate": 5.850015544685716e-07, "logits/chosen": 13.012462615966797, "logits/rejected": 11.711785316467285, "logps/chosen": -4.00968074798584, "logps/rejected": -3.748253107070923, "loss": 3.7527, "rewards/accuracies": 0.0, "rewards/chosen": -40.0968017578125, "rewards/margins": -2.614272117614746, "rewards/rejected": -37.4825325012207, "step": 3028 }, { "epoch": 0.4124455337690632, "grad_norm": 42.15624936676039, "learning_rate": 5.848329519272414e-07, "logits/chosen": 12.469013214111328, "logits/rejected": 13.315605163574219, "logps/chosen": -4.003867149353027, "logps/rejected": -4.183524131774902, "loss": 3.7554, "rewards/accuracies": 0.5, "rewards/chosen": -40.03866958618164, "rewards/margins": 1.7965726852416992, "rewards/rejected": -41.835243225097656, "step": 3029 }, { "epoch": 0.4125816993464052, "grad_norm": 55.350481572675506, "learning_rate": 5.846643076213781e-07, "logits/chosen": 11.615528106689453, "logits/rejected": 11.795459747314453, "logps/chosen": -3.74904727935791, "logps/rejected": -3.9320218563079834, "loss": 4.0983, "rewards/accuracies": 0.5, "rewards/chosen": -37.490474700927734, "rewards/margins": 1.8297452926635742, "rewards/rejected": -39.320220947265625, "step": 3030 }, { "epoch": 0.4127178649237473, "grad_norm": 40.56555071159819, "learning_rate": 5.84495621589088e-07, "logits/chosen": 11.70538330078125, "logits/rejected": 12.461468696594238, "logps/chosen": -3.4744718074798584, "logps/rejected": -4.101595878601074, "loss": 3.8452, "rewards/accuracies": 1.0, "rewards/chosen": -34.744720458984375, "rewards/margins": 6.271242141723633, "rewards/rejected": -41.015960693359375, "step": 3031 }, { "epoch": 0.4128540305010893, "grad_norm": 54.95582222370779, "learning_rate": 5.843268938684871e-07, "logits/chosen": 12.731993675231934, "logits/rejected": 12.309196472167969, "logps/chosen": -3.8632664680480957, "logps/rejected": -3.9937453269958496, "loss": 4.3447, "rewards/accuracies": 0.5, "rewards/chosen": -38.632659912109375, "rewards/margins": 1.3047914505004883, "rewards/rejected": -39.93745422363281, "step": 3032 }, { "epoch": 0.41299019607843135, "grad_norm": 39.26812790436333, "learning_rate": 5.841581244977009e-07, "logits/chosen": 12.24182415008545, "logits/rejected": 12.635491371154785, "logps/chosen": -3.7823219299316406, "logps/rejected": -3.864710807800293, "loss": 4.0489, "rewards/accuracies": 0.5, "rewards/chosen": -37.823219299316406, "rewards/margins": 0.8238925933837891, "rewards/rejected": -38.64710998535156, "step": 3033 }, { "epoch": 0.41312636165577343, "grad_norm": 42.91213443477985, "learning_rate": 5.839893135148642e-07, "logits/chosen": 12.421903610229492, "logits/rejected": 12.432092666625977, "logps/chosen": -3.737288475036621, "logps/rejected": -3.9979443550109863, "loss": 3.4381, "rewards/accuracies": 0.75, "rewards/chosen": -37.372886657714844, "rewards/margins": 2.606557846069336, "rewards/rejected": -39.97944641113281, "step": 3034 }, { "epoch": 0.41326252723311546, "grad_norm": 41.64336768845708, "learning_rate": 5.838204609581212e-07, "logits/chosen": 12.705876350402832, "logits/rejected": 12.813430786132812, "logps/chosen": -3.485229253768921, "logps/rejected": -3.6401631832122803, "loss": 3.5029, "rewards/accuracies": 0.5, "rewards/chosen": -34.852291107177734, "rewards/margins": 1.5493392944335938, "rewards/rejected": -36.401634216308594, "step": 3035 }, { "epoch": 0.4133986928104575, "grad_norm": 155.52312647384593, "learning_rate": 5.836515668656256e-07, "logits/chosen": 12.826730728149414, "logits/rejected": 13.729387283325195, "logps/chosen": -3.9967398643493652, "logps/rejected": -4.281466484069824, "loss": 3.6179, "rewards/accuracies": 0.75, "rewards/chosen": -39.9673957824707, "rewards/margins": 2.84726619720459, "rewards/rejected": -42.81466293334961, "step": 3036 }, { "epoch": 0.4135348583877996, "grad_norm": 43.345271638074955, "learning_rate": 5.834826312755404e-07, "logits/chosen": 10.906990051269531, "logits/rejected": 12.435783386230469, "logps/chosen": -3.2861950397491455, "logps/rejected": -3.859959840774536, "loss": 3.9626, "rewards/accuracies": 1.0, "rewards/chosen": -32.86195373535156, "rewards/margins": 5.737646579742432, "rewards/rejected": -38.5995979309082, "step": 3037 }, { "epoch": 0.4136710239651416, "grad_norm": 46.71527462133952, "learning_rate": 5.83313654226038e-07, "logits/chosen": 12.975582122802734, "logits/rejected": 12.152759552001953, "logps/chosen": -4.027556419372559, "logps/rejected": -4.051995277404785, "loss": 4.3964, "rewards/accuracies": 0.25, "rewards/chosen": -40.27556610107422, "rewards/margins": 0.24438953399658203, "rewards/rejected": -40.519954681396484, "step": 3038 }, { "epoch": 0.41380718954248363, "grad_norm": 45.32029585506695, "learning_rate": 5.831446357553001e-07, "logits/chosen": 11.665990829467773, "logits/rejected": 12.370136260986328, "logps/chosen": -3.7251181602478027, "logps/rejected": -3.9782795906066895, "loss": 4.1909, "rewards/accuracies": 0.5, "rewards/chosen": -37.251182556152344, "rewards/margins": 2.5316152572631836, "rewards/rejected": -39.782798767089844, "step": 3039 }, { "epoch": 0.4139433551198257, "grad_norm": 45.936911492086224, "learning_rate": 5.829755759015179e-07, "logits/chosen": 11.636369705200195, "logits/rejected": 12.534406661987305, "logps/chosen": -3.92423152923584, "logps/rejected": -4.266040802001953, "loss": 4.0648, "rewards/accuracies": 0.75, "rewards/chosen": -39.24231719970703, "rewards/margins": 3.4180898666381836, "rewards/rejected": -42.66040802001953, "step": 3040 }, { "epoch": 0.41407952069716775, "grad_norm": 44.13274668633771, "learning_rate": 5.828064747028918e-07, "logits/chosen": 12.684121131896973, "logits/rejected": 12.851402282714844, "logps/chosen": -4.357881546020508, "logps/rejected": -4.343508720397949, "loss": 3.6143, "rewards/accuracies": 0.75, "rewards/chosen": -43.578819274902344, "rewards/margins": -0.1437368392944336, "rewards/rejected": -43.435081481933594, "step": 3041 }, { "epoch": 0.41421568627450983, "grad_norm": 41.41535665407388, "learning_rate": 5.826373321976316e-07, "logits/chosen": 12.193729400634766, "logits/rejected": 13.21120548248291, "logps/chosen": -3.983154535293579, "logps/rejected": -4.315655708312988, "loss": 4.1765, "rewards/accuracies": 1.0, "rewards/chosen": -39.831546783447266, "rewards/margins": 3.325010299682617, "rewards/rejected": -43.15655517578125, "step": 3042 }, { "epoch": 0.41435185185185186, "grad_norm": 44.295077466866196, "learning_rate": 5.824681484239565e-07, "logits/chosen": 12.915355682373047, "logits/rejected": 13.161123275756836, "logps/chosen": -4.539334774017334, "logps/rejected": -4.492544174194336, "loss": 4.1061, "rewards/accuracies": 0.5, "rewards/chosen": -45.393348693847656, "rewards/margins": -0.46790599822998047, "rewards/rejected": -44.925437927246094, "step": 3043 }, { "epoch": 0.4144880174291939, "grad_norm": 98.30074422660354, "learning_rate": 5.82298923420095e-07, "logits/chosen": 12.594493865966797, "logits/rejected": 11.698944091796875, "logps/chosen": -4.049975395202637, "logps/rejected": -3.6390674114227295, "loss": 4.1768, "rewards/accuracies": 0.0, "rewards/chosen": -40.49974822998047, "rewards/margins": -4.109077453613281, "rewards/rejected": -36.39067459106445, "step": 3044 }, { "epoch": 0.414624183006536, "grad_norm": 42.141144798857894, "learning_rate": 5.821296572242849e-07, "logits/chosen": 11.859746932983398, "logits/rejected": 12.781740188598633, "logps/chosen": -3.7675094604492188, "logps/rejected": -4.1093597412109375, "loss": 3.8502, "rewards/accuracies": 1.0, "rewards/chosen": -37.67509460449219, "rewards/margins": 3.4185047149658203, "rewards/rejected": -41.093597412109375, "step": 3045 }, { "epoch": 0.414760348583878, "grad_norm": 40.64058921456594, "learning_rate": 5.819603498747733e-07, "logits/chosen": 12.69062614440918, "logits/rejected": 12.875869750976562, "logps/chosen": -4.109967231750488, "logps/rejected": -4.184353828430176, "loss": 3.4746, "rewards/accuracies": 0.5, "rewards/chosen": -41.09967041015625, "rewards/margins": 0.7438650131225586, "rewards/rejected": -41.843536376953125, "step": 3046 }, { "epoch": 0.41489651416122003, "grad_norm": 56.39496179166334, "learning_rate": 5.817910014098164e-07, "logits/chosen": 12.981430053710938, "logits/rejected": 13.504964828491211, "logps/chosen": -3.8503735065460205, "logps/rejected": -4.312138557434082, "loss": 3.9038, "rewards/accuracies": 0.75, "rewards/chosen": -38.50373458862305, "rewards/margins": 4.617652893066406, "rewards/rejected": -43.12138748168945, "step": 3047 }, { "epoch": 0.4150326797385621, "grad_norm": 40.56951436273398, "learning_rate": 5.816216118676801e-07, "logits/chosen": 11.26440715789795, "logits/rejected": 12.174894332885742, "logps/chosen": -3.596992015838623, "logps/rejected": -4.157732009887695, "loss": 4.2401, "rewards/accuracies": 0.75, "rewards/chosen": -35.96992111206055, "rewards/margins": 5.607395172119141, "rewards/rejected": -41.57731628417969, "step": 3048 }, { "epoch": 0.41516884531590414, "grad_norm": 42.9747044936935, "learning_rate": 5.814521812866394e-07, "logits/chosen": 12.23879623413086, "logits/rejected": 12.578729629516602, "logps/chosen": -4.055931091308594, "logps/rejected": -4.206162929534912, "loss": 3.8064, "rewards/accuracies": 0.75, "rewards/chosen": -40.55931091308594, "rewards/margins": 1.5023174285888672, "rewards/rejected": -42.06163024902344, "step": 3049 }, { "epoch": 0.4153050108932462, "grad_norm": 46.2211588138585, "learning_rate": 5.812827097049782e-07, "logits/chosen": 12.19688606262207, "logits/rejected": 12.135004043579102, "logps/chosen": -3.5801916122436523, "logps/rejected": -4.373222351074219, "loss": 4.4, "rewards/accuracies": 1.0, "rewards/chosen": -35.801918029785156, "rewards/margins": 7.930304527282715, "rewards/rejected": -43.73221969604492, "step": 3050 }, { "epoch": 0.41544117647058826, "grad_norm": 45.41502648033263, "learning_rate": 5.811131971609905e-07, "logits/chosen": 12.470441818237305, "logits/rejected": 12.198797225952148, "logps/chosen": -3.8350114822387695, "logps/rejected": -4.142772197723389, "loss": 3.6184, "rewards/accuracies": 0.75, "rewards/chosen": -38.35011672973633, "rewards/margins": 3.0776071548461914, "rewards/rejected": -41.42771911621094, "step": 3051 }, { "epoch": 0.4155773420479303, "grad_norm": 44.07752863378816, "learning_rate": 5.809436436929787e-07, "logits/chosen": 11.191839218139648, "logits/rejected": 12.948949813842773, "logps/chosen": -3.700078010559082, "logps/rejected": -4.062837600708008, "loss": 4.4033, "rewards/accuracies": 1.0, "rewards/chosen": -37.00077819824219, "rewards/margins": 3.627596378326416, "rewards/rejected": -40.62837600708008, "step": 3052 }, { "epoch": 0.4157135076252723, "grad_norm": 41.81652656956423, "learning_rate": 5.807740493392549e-07, "logits/chosen": 12.633930206298828, "logits/rejected": 12.670085906982422, "logps/chosen": -4.0523681640625, "logps/rejected": -4.157597541809082, "loss": 3.192, "rewards/accuracies": 0.75, "rewards/chosen": -40.523681640625, "rewards/margins": 1.0522956848144531, "rewards/rejected": -41.57597351074219, "step": 3053 }, { "epoch": 0.4158496732026144, "grad_norm": 41.15511365507248, "learning_rate": 5.806044141381403e-07, "logits/chosen": 12.738862037658691, "logits/rejected": 13.535770416259766, "logps/chosen": -3.7630491256713867, "logps/rejected": -4.0171613693237305, "loss": 3.7206, "rewards/accuracies": 1.0, "rewards/chosen": -37.630489349365234, "rewards/margins": 2.5411224365234375, "rewards/rejected": -40.17161560058594, "step": 3054 }, { "epoch": 0.41598583877995643, "grad_norm": 44.28266284529917, "learning_rate": 5.804347381279655e-07, "logits/chosen": 12.39631175994873, "logits/rejected": 12.547013282775879, "logps/chosen": -3.815694808959961, "logps/rejected": -3.9418272972106934, "loss": 3.7133, "rewards/accuracies": 0.5, "rewards/chosen": -38.15694808959961, "rewards/margins": 1.2613258361816406, "rewards/rejected": -39.41827392578125, "step": 3055 }, { "epoch": 0.41612200435729846, "grad_norm": 38.12211737581172, "learning_rate": 5.802650213470701e-07, "logits/chosen": 12.602566719055176, "logits/rejected": 13.396383285522461, "logps/chosen": -4.153803825378418, "logps/rejected": -4.239325523376465, "loss": 3.7971, "rewards/accuracies": 0.5, "rewards/chosen": -41.53803634643555, "rewards/margins": 0.8552188873291016, "rewards/rejected": -42.39325714111328, "step": 3056 }, { "epoch": 0.41625816993464054, "grad_norm": 41.62243937673788, "learning_rate": 5.800952638338031e-07, "logits/chosen": 11.882787704467773, "logits/rejected": 12.935266494750977, "logps/chosen": -3.636861562728882, "logps/rejected": -4.2069854736328125, "loss": 4.0179, "rewards/accuracies": 1.0, "rewards/chosen": -36.368614196777344, "rewards/margins": 5.701240062713623, "rewards/rejected": -42.069854736328125, "step": 3057 }, { "epoch": 0.41639433551198257, "grad_norm": 49.83255157145706, "learning_rate": 5.799254656265225e-07, "logits/chosen": 11.977032661437988, "logits/rejected": 11.834268569946289, "logps/chosen": -3.808112144470215, "logps/rejected": -4.066928863525391, "loss": 3.9556, "rewards/accuracies": 0.75, "rewards/chosen": -38.08112335205078, "rewards/margins": 2.5881643295288086, "rewards/rejected": -40.669288635253906, "step": 3058 }, { "epoch": 0.4165305010893246, "grad_norm": 39.94257041438713, "learning_rate": 5.797556267635957e-07, "logits/chosen": 13.387454986572266, "logits/rejected": 14.336336135864258, "logps/chosen": -4.0202956199646, "logps/rejected": -4.219182014465332, "loss": 3.8086, "rewards/accuracies": 0.75, "rewards/chosen": -40.20295715332031, "rewards/margins": 1.9888639450073242, "rewards/rejected": -42.19182205200195, "step": 3059 }, { "epoch": 0.4166666666666667, "grad_norm": 43.06329624693909, "learning_rate": 5.795857472833991e-07, "logits/chosen": 11.50056266784668, "logits/rejected": 11.60666561126709, "logps/chosen": -3.662292718887329, "logps/rejected": -3.6798360347747803, "loss": 4.1542, "rewards/accuracies": 0.5, "rewards/chosen": -36.622928619384766, "rewards/margins": 0.1754322052001953, "rewards/rejected": -36.79835891723633, "step": 3060 }, { "epoch": 0.4168028322440087, "grad_norm": 40.097676140849465, "learning_rate": 5.794158272243185e-07, "logits/chosen": 12.976811408996582, "logits/rejected": 12.635011672973633, "logps/chosen": -4.059825897216797, "logps/rejected": -3.7414026260375977, "loss": 3.6052, "rewards/accuracies": 0.0, "rewards/chosen": -40.59825897216797, "rewards/margins": -3.184229850769043, "rewards/rejected": -37.414024353027344, "step": 3061 }, { "epoch": 0.41693899782135074, "grad_norm": 46.528388703610524, "learning_rate": 5.792458666247486e-07, "logits/chosen": 12.181710243225098, "logits/rejected": 12.478763580322266, "logps/chosen": -4.0969648361206055, "logps/rejected": -4.097316741943359, "loss": 4.5678, "rewards/accuracies": 0.5, "rewards/chosen": -40.96965026855469, "rewards/margins": 0.0035190582275390625, "rewards/rejected": -40.973167419433594, "step": 3062 }, { "epoch": 0.4170751633986928, "grad_norm": 37.29376502023815, "learning_rate": 5.790758655230935e-07, "logits/chosen": 11.499046325683594, "logits/rejected": 12.882829666137695, "logps/chosen": -3.709115505218506, "logps/rejected": -4.241096496582031, "loss": 3.1544, "rewards/accuracies": 0.75, "rewards/chosen": -37.091156005859375, "rewards/margins": 5.31981086730957, "rewards/rejected": -42.41096496582031, "step": 3063 }, { "epoch": 0.41721132897603486, "grad_norm": 57.47940202300386, "learning_rate": 5.789058239577663e-07, "logits/chosen": 11.403568267822266, "logits/rejected": 12.273482322692871, "logps/chosen": -3.63914155960083, "logps/rejected": -4.014731407165527, "loss": 3.9917, "rewards/accuracies": 1.0, "rewards/chosen": -36.39141845703125, "rewards/margins": 3.75589656829834, "rewards/rejected": -40.14731216430664, "step": 3064 }, { "epoch": 0.4173474945533769, "grad_norm": 46.02859757819768, "learning_rate": 5.787357419671895e-07, "logits/chosen": 11.813139915466309, "logits/rejected": 13.430219650268555, "logps/chosen": -3.549255847930908, "logps/rejected": -4.077766418457031, "loss": 3.6446, "rewards/accuracies": 0.75, "rewards/chosen": -35.49256134033203, "rewards/margins": 5.285101413726807, "rewards/rejected": -40.77766036987305, "step": 3065 }, { "epoch": 0.41748366013071897, "grad_norm": 48.59821361207547, "learning_rate": 5.785656195897942e-07, "logits/chosen": 12.15382194519043, "logits/rejected": 11.826515197753906, "logps/chosen": -3.679739475250244, "logps/rejected": -3.661839723587036, "loss": 4.058, "rewards/accuracies": 0.5, "rewards/chosen": -36.797393798828125, "rewards/margins": -0.17899656295776367, "rewards/rejected": -36.6183967590332, "step": 3066 }, { "epoch": 0.417619825708061, "grad_norm": 42.09062531019804, "learning_rate": 5.783954568640211e-07, "logits/chosen": 10.673371315002441, "logits/rejected": 11.369729042053223, "logps/chosen": -3.417442560195923, "logps/rejected": -3.7497596740722656, "loss": 3.2344, "rewards/accuracies": 0.5, "rewards/chosen": -34.1744270324707, "rewards/margins": 3.323169708251953, "rewards/rejected": -37.497596740722656, "step": 3067 }, { "epoch": 0.417755991285403, "grad_norm": 39.79640320547446, "learning_rate": 5.782252538283199e-07, "logits/chosen": 12.26682186126709, "logits/rejected": 12.198899269104004, "logps/chosen": -4.082452774047852, "logps/rejected": -3.9230401515960693, "loss": 3.539, "rewards/accuracies": 0.25, "rewards/chosen": -40.82453155517578, "rewards/margins": -1.594125747680664, "rewards/rejected": -39.23040008544922, "step": 3068 }, { "epoch": 0.4178921568627451, "grad_norm": 43.23554726969983, "learning_rate": 5.780550105211494e-07, "logits/chosen": 12.413753509521484, "logits/rejected": 11.70112419128418, "logps/chosen": -3.8219542503356934, "logps/rejected": -3.5533976554870605, "loss": 4.3238, "rewards/accuracies": 0.0, "rewards/chosen": -38.21954345703125, "rewards/margins": -2.685563087463379, "rewards/rejected": -35.53397750854492, "step": 3069 }, { "epoch": 0.41802832244008714, "grad_norm": 43.467525996722735, "learning_rate": 5.778847269809775e-07, "logits/chosen": 12.50380802154541, "logits/rejected": 12.598344802856445, "logps/chosen": -3.8992912769317627, "logps/rejected": -4.12486457824707, "loss": 3.8011, "rewards/accuracies": 1.0, "rewards/chosen": -38.99291229248047, "rewards/margins": 2.255732536315918, "rewards/rejected": -41.24864196777344, "step": 3070 }, { "epoch": 0.41816448801742917, "grad_norm": 53.255735432242126, "learning_rate": 5.777144032462811e-07, "logits/chosen": 12.280815124511719, "logits/rejected": 12.601287841796875, "logps/chosen": -3.6575613021850586, "logps/rejected": -4.168745994567871, "loss": 4.1897, "rewards/accuracies": 0.75, "rewards/chosen": -36.57561111450195, "rewards/margins": 5.111845016479492, "rewards/rejected": -41.68745422363281, "step": 3071 }, { "epoch": 0.41830065359477125, "grad_norm": 40.3537978396576, "learning_rate": 5.775440393555463e-07, "logits/chosen": 11.854692459106445, "logits/rejected": 13.036506652832031, "logps/chosen": -3.5456764698028564, "logps/rejected": -3.8726577758789062, "loss": 3.9263, "rewards/accuracies": 1.0, "rewards/chosen": -35.456764221191406, "rewards/margins": 3.2698116302490234, "rewards/rejected": -38.72657775878906, "step": 3072 }, { "epoch": 0.4184368191721133, "grad_norm": 45.01363126044915, "learning_rate": 5.773736353472682e-07, "logits/chosen": 10.577239036560059, "logits/rejected": 12.085752487182617, "logps/chosen": -3.275771141052246, "logps/rejected": -3.88059663772583, "loss": 4.0048, "rewards/accuracies": 0.75, "rewards/chosen": -32.757713317871094, "rewards/margins": 6.048254489898682, "rewards/rejected": -38.805965423583984, "step": 3073 }, { "epoch": 0.4185729847494553, "grad_norm": 50.16819400736063, "learning_rate": 5.772031912599509e-07, "logits/chosen": 11.814910888671875, "logits/rejected": 11.402334213256836, "logps/chosen": -3.3972644805908203, "logps/rejected": -3.507495880126953, "loss": 4.3105, "rewards/accuracies": 0.75, "rewards/chosen": -33.9726448059082, "rewards/margins": 1.1023154258728027, "rewards/rejected": -35.07495880126953, "step": 3074 }, { "epoch": 0.4187091503267974, "grad_norm": 42.93223048563655, "learning_rate": 5.770327071321078e-07, "logits/chosen": 12.143207550048828, "logits/rejected": 12.453481674194336, "logps/chosen": -3.8298072814941406, "logps/rejected": -4.167324066162109, "loss": 4.0378, "rewards/accuracies": 0.5, "rewards/chosen": -38.298072814941406, "rewards/margins": 3.3751726150512695, "rewards/rejected": -41.67324447631836, "step": 3075 }, { "epoch": 0.4188453159041394, "grad_norm": 58.58703619826493, "learning_rate": 5.768621830022613e-07, "logits/chosen": 11.305596351623535, "logits/rejected": 11.647863388061523, "logps/chosen": -3.5048460960388184, "logps/rejected": -3.7441463470458984, "loss": 4.0538, "rewards/accuracies": 0.75, "rewards/chosen": -35.0484619140625, "rewards/margins": 2.393002986907959, "rewards/rejected": -37.441463470458984, "step": 3076 }, { "epoch": 0.41898148148148145, "grad_norm": 45.58817472988957, "learning_rate": 5.766916189089425e-07, "logits/chosen": 12.036028861999512, "logits/rejected": 11.532624244689941, "logps/chosen": -3.836970090866089, "logps/rejected": -4.1296892166137695, "loss": 4.0941, "rewards/accuracies": 0.75, "rewards/chosen": -38.36969757080078, "rewards/margins": 2.927191734313965, "rewards/rejected": -41.29689025878906, "step": 3077 }, { "epoch": 0.41911764705882354, "grad_norm": 42.421721142957864, "learning_rate": 5.765210148906918e-07, "logits/chosen": 12.117916107177734, "logits/rejected": 11.446319580078125, "logps/chosen": -3.534160852432251, "logps/rejected": -3.7875194549560547, "loss": 4.3418, "rewards/accuracies": 0.75, "rewards/chosen": -35.34160614013672, "rewards/margins": 2.5335850715637207, "rewards/rejected": -37.87519073486328, "step": 3078 }, { "epoch": 0.41925381263616557, "grad_norm": 47.23570932404173, "learning_rate": 5.763503709860588e-07, "logits/chosen": 11.882987976074219, "logits/rejected": 11.827644348144531, "logps/chosen": -3.991603374481201, "logps/rejected": -4.238987922668457, "loss": 4.2968, "rewards/accuracies": 1.0, "rewards/chosen": -39.91603469848633, "rewards/margins": 2.473843574523926, "rewards/rejected": -42.38987731933594, "step": 3079 }, { "epoch": 0.41938997821350765, "grad_norm": 53.38768120181861, "learning_rate": 5.761796872336016e-07, "logits/chosen": 11.327280044555664, "logits/rejected": 11.959125518798828, "logps/chosen": -3.6220438480377197, "logps/rejected": -3.7731404304504395, "loss": 4.3694, "rewards/accuracies": 0.5, "rewards/chosen": -36.22043991088867, "rewards/margins": 1.510965347290039, "rewards/rejected": -37.73140335083008, "step": 3080 }, { "epoch": 0.4195261437908497, "grad_norm": 41.46492358220943, "learning_rate": 5.760089636718878e-07, "logits/chosen": 11.655474662780762, "logits/rejected": 12.505664825439453, "logps/chosen": -3.6199116706848145, "logps/rejected": -4.091332912445068, "loss": 3.4759, "rewards/accuracies": 1.0, "rewards/chosen": -36.199119567871094, "rewards/margins": 4.714210510253906, "rewards/rejected": -40.913330078125, "step": 3081 }, { "epoch": 0.4196623093681917, "grad_norm": 43.479984407017284, "learning_rate": 5.758382003394938e-07, "logits/chosen": 12.051685333251953, "logits/rejected": 12.143245697021484, "logps/chosen": -3.744657516479492, "logps/rejected": -3.584362506866455, "loss": 4.044, "rewards/accuracies": 0.25, "rewards/chosen": -37.44657516479492, "rewards/margins": -1.6029529571533203, "rewards/rejected": -35.843624114990234, "step": 3082 }, { "epoch": 0.4197984749455338, "grad_norm": 39.333398347566515, "learning_rate": 5.756673972750049e-07, "logits/chosen": 12.503423690795898, "logits/rejected": 13.384033203125, "logps/chosen": -3.8816475868225098, "logps/rejected": -4.3614397048950195, "loss": 3.7544, "rewards/accuracies": 0.75, "rewards/chosen": -38.81647491455078, "rewards/margins": 4.797921180725098, "rewards/rejected": -43.61439514160156, "step": 3083 }, { "epoch": 0.4199346405228758, "grad_norm": 44.295082815646545, "learning_rate": 5.754965545170155e-07, "logits/chosen": 12.064332962036133, "logits/rejected": 13.144866943359375, "logps/chosen": -4.017641067504883, "logps/rejected": -4.439457416534424, "loss": 4.0949, "rewards/accuracies": 0.75, "rewards/chosen": -40.17641067504883, "rewards/margins": 4.218164443969727, "rewards/rejected": -44.39457702636719, "step": 3084 }, { "epoch": 0.42007080610021785, "grad_norm": 46.22825073208411, "learning_rate": 5.75325672104129e-07, "logits/chosen": 12.815486907958984, "logits/rejected": 13.230649948120117, "logps/chosen": -3.554361343383789, "logps/rejected": -4.049062252044678, "loss": 3.5077, "rewards/accuracies": 1.0, "rewards/chosen": -35.54361343383789, "rewards/margins": 4.947009563446045, "rewards/rejected": -40.490623474121094, "step": 3085 }, { "epoch": 0.42020697167755994, "grad_norm": 41.026412545486565, "learning_rate": 5.751547500749575e-07, "logits/chosen": 12.270476341247559, "logits/rejected": 12.60567569732666, "logps/chosen": -3.761134386062622, "logps/rejected": -4.188867092132568, "loss": 4.0624, "rewards/accuracies": 1.0, "rewards/chosen": -37.61134338378906, "rewards/margins": 4.277327537536621, "rewards/rejected": -41.888671875, "step": 3086 }, { "epoch": 0.42034313725490197, "grad_norm": 42.57900905163844, "learning_rate": 5.749837884681226e-07, "logits/chosen": 12.246217727661133, "logits/rejected": 12.762214660644531, "logps/chosen": -4.226069450378418, "logps/rejected": -4.125162601470947, "loss": 4.2098, "rewards/accuracies": 0.5, "rewards/chosen": -42.26069641113281, "rewards/margins": -1.009073257446289, "rewards/rejected": -41.251625061035156, "step": 3087 }, { "epoch": 0.420479302832244, "grad_norm": 51.1865746069432, "learning_rate": 5.74812787322254e-07, "logits/chosen": 12.120418548583984, "logits/rejected": 12.333000183105469, "logps/chosen": -3.8236122131347656, "logps/rejected": -3.655057430267334, "loss": 4.5258, "rewards/accuracies": 0.25, "rewards/chosen": -38.236122131347656, "rewards/margins": -1.6855487823486328, "rewards/rejected": -36.550575256347656, "step": 3088 }, { "epoch": 0.4206154684095861, "grad_norm": 47.775450081520454, "learning_rate": 5.746417466759913e-07, "logits/chosen": 12.617372512817383, "logits/rejected": 12.821346282958984, "logps/chosen": -4.0615034103393555, "logps/rejected": -3.944995164871216, "loss": 4.709, "rewards/accuracies": 0.0, "rewards/chosen": -40.61503601074219, "rewards/margins": -1.1650848388671875, "rewards/rejected": -39.449951171875, "step": 3089 }, { "epoch": 0.4207516339869281, "grad_norm": 43.5776126209677, "learning_rate": 5.744706665679822e-07, "logits/chosen": 12.583818435668945, "logits/rejected": 12.752189636230469, "logps/chosen": -3.733217716217041, "logps/rejected": -3.751295566558838, "loss": 3.8737, "rewards/accuracies": 0.5, "rewards/chosen": -37.332176208496094, "rewards/margins": 0.18078041076660156, "rewards/rejected": -37.51295471191406, "step": 3090 }, { "epoch": 0.42088779956427014, "grad_norm": 40.756221721750585, "learning_rate": 5.742995470368838e-07, "logits/chosen": 12.489255905151367, "logits/rejected": 12.7125244140625, "logps/chosen": -3.893501043319702, "logps/rejected": -4.174614429473877, "loss": 4.0031, "rewards/accuracies": 0.75, "rewards/chosen": -38.93500900268555, "rewards/margins": 2.8111352920532227, "rewards/rejected": -41.74614715576172, "step": 3091 }, { "epoch": 0.4210239651416122, "grad_norm": 45.2469268728316, "learning_rate": 5.74128388121362e-07, "logits/chosen": 12.244974136352539, "logits/rejected": 12.631927490234375, "logps/chosen": -4.011639595031738, "logps/rejected": -4.236814498901367, "loss": 3.8916, "rewards/accuracies": 1.0, "rewards/chosen": -40.11640167236328, "rewards/margins": 2.25174617767334, "rewards/rejected": -42.36814498901367, "step": 3092 }, { "epoch": 0.42116013071895425, "grad_norm": 42.17815338402699, "learning_rate": 5.739571898600916e-07, "logits/chosen": 10.969658851623535, "logits/rejected": 11.760528564453125, "logps/chosen": -3.663647174835205, "logps/rejected": -3.7345621585845947, "loss": 3.717, "rewards/accuracies": 0.5, "rewards/chosen": -36.636470794677734, "rewards/margins": 0.7091503143310547, "rewards/rejected": -37.34562301635742, "step": 3093 }, { "epoch": 0.4212962962962963, "grad_norm": 119.76585376732248, "learning_rate": 5.737859522917561e-07, "logits/chosen": 11.79440689086914, "logits/rejected": 11.21850872039795, "logps/chosen": -3.614315986633301, "logps/rejected": -3.6731042861938477, "loss": 4.0482, "rewards/accuracies": 0.75, "rewards/chosen": -36.143157958984375, "rewards/margins": 0.5878839492797852, "rewards/rejected": -36.731040954589844, "step": 3094 }, { "epoch": 0.42143246187363836, "grad_norm": 41.08850158095762, "learning_rate": 5.736146754550482e-07, "logits/chosen": 11.130887985229492, "logits/rejected": 11.322094917297363, "logps/chosen": -3.3024418354034424, "logps/rejected": -3.6019973754882812, "loss": 4.0816, "rewards/accuracies": 0.75, "rewards/chosen": -33.0244140625, "rewards/margins": 2.9955573081970215, "rewards/rejected": -36.01997375488281, "step": 3095 }, { "epoch": 0.4215686274509804, "grad_norm": 41.30179679870284, "learning_rate": 5.734433593886694e-07, "logits/chosen": 12.851533889770508, "logits/rejected": 12.8731689453125, "logps/chosen": -4.058225631713867, "logps/rejected": -4.134258270263672, "loss": 4.2282, "rewards/accuracies": 0.25, "rewards/chosen": -40.58225631713867, "rewards/margins": 0.7603273391723633, "rewards/rejected": -41.34258270263672, "step": 3096 }, { "epoch": 0.4217047930283224, "grad_norm": 46.227267514361486, "learning_rate": 5.732720041313297e-07, "logits/chosen": 12.36278247833252, "logits/rejected": 12.418694496154785, "logps/chosen": -4.008033275604248, "logps/rejected": -3.695371627807617, "loss": 4.2281, "rewards/accuracies": 0.75, "rewards/chosen": -40.0803337097168, "rewards/margins": -3.1266160011291504, "rewards/rejected": -36.95372009277344, "step": 3097 }, { "epoch": 0.4218409586056645, "grad_norm": 39.28150248692617, "learning_rate": 5.731006097217485e-07, "logits/chosen": 12.393329620361328, "logits/rejected": 11.878378868103027, "logps/chosen": -3.6197986602783203, "logps/rejected": -3.4458260536193848, "loss": 3.7573, "rewards/accuracies": 0.25, "rewards/chosen": -36.19798278808594, "rewards/margins": -1.7397232055664062, "rewards/rejected": -34.45825958251953, "step": 3098 }, { "epoch": 0.42197712418300654, "grad_norm": 67.60867484359409, "learning_rate": 5.729291761986535e-07, "logits/chosen": 12.318424224853516, "logits/rejected": 12.569551467895508, "logps/chosen": -3.4759535789489746, "logps/rejected": -3.56286883354187, "loss": 3.6538, "rewards/accuracies": 0.5, "rewards/chosen": -34.75953674316406, "rewards/margins": 0.8691515922546387, "rewards/rejected": -35.62868881225586, "step": 3099 }, { "epoch": 0.42211328976034856, "grad_norm": 42.168979144062895, "learning_rate": 5.727577036007818e-07, "logits/chosen": 11.717619895935059, "logits/rejected": 13.005786895751953, "logps/chosen": -3.7939059734344482, "logps/rejected": -3.9773075580596924, "loss": 3.5419, "rewards/accuracies": 0.5, "rewards/chosen": -37.93906021118164, "rewards/margins": 1.834014892578125, "rewards/rejected": -39.773075103759766, "step": 3100 }, { "epoch": 0.42224945533769065, "grad_norm": 39.28953399725324, "learning_rate": 5.725861919668789e-07, "logits/chosen": 11.580829620361328, "logits/rejected": 11.247831344604492, "logps/chosen": -3.5409045219421387, "logps/rejected": -3.6080832481384277, "loss": 4.4594, "rewards/accuracies": 0.5, "rewards/chosen": -35.40904235839844, "rewards/margins": 0.6717891693115234, "rewards/rejected": -36.080833435058594, "step": 3101 }, { "epoch": 0.4223856209150327, "grad_norm": 41.018211323148904, "learning_rate": 5.724146413356994e-07, "logits/chosen": 12.439172744750977, "logits/rejected": 12.767133712768555, "logps/chosen": -3.6361641883850098, "logps/rejected": -3.7770650386810303, "loss": 4.2908, "rewards/accuracies": 0.5, "rewards/chosen": -36.36164093017578, "rewards/margins": 1.4090075492858887, "rewards/rejected": -37.770652770996094, "step": 3102 }, { "epoch": 0.4225217864923747, "grad_norm": 48.43051606483415, "learning_rate": 5.722430517460064e-07, "logits/chosen": 11.625391960144043, "logits/rejected": 12.495113372802734, "logps/chosen": -3.73221492767334, "logps/rejected": -4.036181449890137, "loss": 3.2857, "rewards/accuracies": 0.75, "rewards/chosen": -37.32215118408203, "rewards/margins": 3.039663314819336, "rewards/rejected": -40.36181640625, "step": 3103 }, { "epoch": 0.4226579520697168, "grad_norm": 45.28318114129884, "learning_rate": 5.720714232365721e-07, "logits/chosen": 12.828645706176758, "logits/rejected": 13.169839859008789, "logps/chosen": -3.8375794887542725, "logps/rejected": -4.074099540710449, "loss": 4.4121, "rewards/accuracies": 0.75, "rewards/chosen": -38.37579345703125, "rewards/margins": 2.365199089050293, "rewards/rejected": -40.74099349975586, "step": 3104 }, { "epoch": 0.4227941176470588, "grad_norm": 69.9739463782251, "learning_rate": 5.718997558461774e-07, "logits/chosen": 11.852523803710938, "logits/rejected": 12.080062866210938, "logps/chosen": -3.983734130859375, "logps/rejected": -4.4204559326171875, "loss": 3.8709, "rewards/accuracies": 0.75, "rewards/chosen": -39.83734130859375, "rewards/margins": 4.367219924926758, "rewards/rejected": -44.204559326171875, "step": 3105 }, { "epoch": 0.42293028322440085, "grad_norm": 43.57128325187681, "learning_rate": 5.717280496136119e-07, "logits/chosen": 12.585006713867188, "logits/rejected": 11.997760772705078, "logps/chosen": -3.8701257705688477, "logps/rejected": -3.874906063079834, "loss": 4.1145, "rewards/accuracies": 0.5, "rewards/chosen": -38.701255798339844, "rewards/margins": 0.04780101776123047, "rewards/rejected": -38.74905776977539, "step": 3106 }, { "epoch": 0.42306644880174293, "grad_norm": 45.54998765596224, "learning_rate": 5.71556304577674e-07, "logits/chosen": 12.25955581665039, "logits/rejected": 12.36111831665039, "logps/chosen": -3.880568504333496, "logps/rejected": -3.56820011138916, "loss": 4.1012, "rewards/accuracies": 0.25, "rewards/chosen": -38.805686950683594, "rewards/margins": -3.1236839294433594, "rewards/rejected": -35.682003021240234, "step": 3107 }, { "epoch": 0.42320261437908496, "grad_norm": 44.678473823469545, "learning_rate": 5.713845207771711e-07, "logits/chosen": 11.621959686279297, "logits/rejected": 11.729731559753418, "logps/chosen": -3.939432382583618, "logps/rejected": -4.122998237609863, "loss": 3.9569, "rewards/accuracies": 0.75, "rewards/chosen": -39.394325256347656, "rewards/margins": 1.8356590270996094, "rewards/rejected": -41.229984283447266, "step": 3108 }, { "epoch": 0.423338779956427, "grad_norm": 42.080788531168466, "learning_rate": 5.712126982509189e-07, "logits/chosen": 12.439342498779297, "logits/rejected": 12.211825370788574, "logps/chosen": -3.7276253700256348, "logps/rejected": -3.806448459625244, "loss": 4.2345, "rewards/accuracies": 0.5, "rewards/chosen": -37.27625274658203, "rewards/margins": 0.7882308959960938, "rewards/rejected": -38.064483642578125, "step": 3109 }, { "epoch": 0.4234749455337691, "grad_norm": 41.88218992478716, "learning_rate": 5.710408370377424e-07, "logits/chosen": 11.621166229248047, "logits/rejected": 11.566774368286133, "logps/chosen": -3.9298975467681885, "logps/rejected": -4.203738212585449, "loss": 3.6554, "rewards/accuracies": 0.75, "rewards/chosen": -39.29897689819336, "rewards/margins": 2.738408088684082, "rewards/rejected": -42.037384033203125, "step": 3110 }, { "epoch": 0.4236111111111111, "grad_norm": 37.41867999475932, "learning_rate": 5.70868937176475e-07, "logits/chosen": 11.839212417602539, "logits/rejected": 12.466050148010254, "logps/chosen": -3.3376755714416504, "logps/rejected": -3.925487518310547, "loss": 3.7811, "rewards/accuracies": 1.0, "rewards/chosen": -33.37675476074219, "rewards/margins": 5.878118515014648, "rewards/rejected": -39.25487518310547, "step": 3111 }, { "epoch": 0.42374727668845313, "grad_norm": 45.19410766286403, "learning_rate": 5.706969987059587e-07, "logits/chosen": 10.375711441040039, "logits/rejected": 12.30403995513916, "logps/chosen": -3.4268319606781006, "logps/rejected": -3.839566707611084, "loss": 4.2326, "rewards/accuracies": 0.5, "rewards/chosen": -34.26831817626953, "rewards/margins": 4.127346515655518, "rewards/rejected": -38.395668029785156, "step": 3112 }, { "epoch": 0.4238834422657952, "grad_norm": 41.852661936173845, "learning_rate": 5.705250216650446e-07, "logits/chosen": 11.358774185180664, "logits/rejected": 11.489599227905273, "logps/chosen": -3.908588409423828, "logps/rejected": -4.071052074432373, "loss": 3.8602, "rewards/accuracies": 1.0, "rewards/chosen": -39.08588409423828, "rewards/margins": 1.6246395111083984, "rewards/rejected": -40.71052169799805, "step": 3113 }, { "epoch": 0.42401960784313725, "grad_norm": 44.11278542286337, "learning_rate": 5.703530060925922e-07, "logits/chosen": 11.701763153076172, "logits/rejected": 12.602977752685547, "logps/chosen": -3.6603798866271973, "logps/rejected": -4.150274276733398, "loss": 4.6437, "rewards/accuracies": 0.75, "rewards/chosen": -36.603797912597656, "rewards/margins": 4.8989458084106445, "rewards/rejected": -41.50274658203125, "step": 3114 }, { "epoch": 0.4241557734204793, "grad_norm": 57.52295083559343, "learning_rate": 5.7018095202747e-07, "logits/chosen": 12.269664764404297, "logits/rejected": 13.225101470947266, "logps/chosen": -4.263535976409912, "logps/rejected": -4.286041259765625, "loss": 4.1226, "rewards/accuracies": 0.5, "rewards/chosen": -42.63535690307617, "rewards/margins": 0.2250518798828125, "rewards/rejected": -42.860408782958984, "step": 3115 }, { "epoch": 0.42429193899782136, "grad_norm": 40.514693487645054, "learning_rate": 5.70008859508555e-07, "logits/chosen": 10.896628379821777, "logits/rejected": 12.351791381835938, "logps/chosen": -3.77946138381958, "logps/rejected": -3.942493438720703, "loss": 4.1495, "rewards/accuracies": 0.75, "rewards/chosen": -37.794612884521484, "rewards/margins": 1.6303186416625977, "rewards/rejected": -39.42493438720703, "step": 3116 }, { "epoch": 0.4244281045751634, "grad_norm": 41.010321964766554, "learning_rate": 5.698367285747328e-07, "logits/chosen": 11.448660850524902, "logits/rejected": 13.266077995300293, "logps/chosen": -3.8917770385742188, "logps/rejected": -4.099586486816406, "loss": 3.6462, "rewards/accuracies": 0.75, "rewards/chosen": -38.91777038574219, "rewards/margins": 2.078094482421875, "rewards/rejected": -40.99586486816406, "step": 3117 }, { "epoch": 0.4245642701525055, "grad_norm": 45.59296711120201, "learning_rate": 5.696645592648979e-07, "logits/chosen": 11.453412055969238, "logits/rejected": 12.347909927368164, "logps/chosen": -3.8440473079681396, "logps/rejected": -4.096893310546875, "loss": 4.2637, "rewards/accuracies": 1.0, "rewards/chosen": -38.44047546386719, "rewards/margins": 2.5284595489501953, "rewards/rejected": -40.96893310546875, "step": 3118 }, { "epoch": 0.4247004357298475, "grad_norm": 42.27818764065889, "learning_rate": 5.694923516179534e-07, "logits/chosen": 11.184784889221191, "logits/rejected": 11.843301773071289, "logps/chosen": -3.8156254291534424, "logps/rejected": -4.124170303344727, "loss": 3.3906, "rewards/accuracies": 0.75, "rewards/chosen": -38.156253814697266, "rewards/margins": 3.08544921875, "rewards/rejected": -41.241703033447266, "step": 3119 }, { "epoch": 0.42483660130718953, "grad_norm": 41.93887545910711, "learning_rate": 5.693201056728111e-07, "logits/chosen": 12.51956558227539, "logits/rejected": 12.459402084350586, "logps/chosen": -4.115015506744385, "logps/rejected": -3.9724130630493164, "loss": 3.9456, "rewards/accuracies": 0.25, "rewards/chosen": -41.15015411376953, "rewards/margins": -1.4260234832763672, "rewards/rejected": -39.7241325378418, "step": 3120 }, { "epoch": 0.4249727668845316, "grad_norm": 42.92051312683145, "learning_rate": 5.691478214683912e-07, "logits/chosen": 12.293107032775879, "logits/rejected": 12.186111450195312, "logps/chosen": -3.9406661987304688, "logps/rejected": -3.8453893661499023, "loss": 4.2389, "rewards/accuracies": 0.5, "rewards/chosen": -39.40666198730469, "rewards/margins": -0.9527673721313477, "rewards/rejected": -38.453895568847656, "step": 3121 }, { "epoch": 0.42510893246187365, "grad_norm": 39.80458714507087, "learning_rate": 5.689754990436229e-07, "logits/chosen": 11.752762794494629, "logits/rejected": 12.305938720703125, "logps/chosen": -3.4593558311462402, "logps/rejected": -3.7122278213500977, "loss": 3.8084, "rewards/accuracies": 1.0, "rewards/chosen": -34.59355926513672, "rewards/margins": 2.528717517852783, "rewards/rejected": -37.122276306152344, "step": 3122 }, { "epoch": 0.4252450980392157, "grad_norm": 51.01523048387219, "learning_rate": 5.688031384374437e-07, "logits/chosen": 10.953641891479492, "logits/rejected": 11.425886154174805, "logps/chosen": -3.5503365993499756, "logps/rejected": -3.812507152557373, "loss": 4.2529, "rewards/accuracies": 0.75, "rewards/chosen": -35.50336837768555, "rewards/margins": 2.6217050552368164, "rewards/rejected": -38.12507247924805, "step": 3123 }, { "epoch": 0.42538126361655776, "grad_norm": 59.783813666242786, "learning_rate": 5.686307396888002e-07, "logits/chosen": 11.561172485351562, "logits/rejected": 12.093706130981445, "logps/chosen": -3.8513712882995605, "logps/rejected": -3.9856436252593994, "loss": 4.6759, "rewards/accuracies": 0.5, "rewards/chosen": -38.51371383666992, "rewards/margins": 1.3427209854125977, "rewards/rejected": -39.85643768310547, "step": 3124 }, { "epoch": 0.4255174291938998, "grad_norm": 49.93933344649731, "learning_rate": 5.68458302836647e-07, "logits/chosen": 11.415160179138184, "logits/rejected": 12.416720390319824, "logps/chosen": -3.709737777709961, "logps/rejected": -4.104450225830078, "loss": 4.0893, "rewards/accuracies": 1.0, "rewards/chosen": -37.097373962402344, "rewards/margins": 3.947124481201172, "rewards/rejected": -41.04450225830078, "step": 3125 }, { "epoch": 0.4256535947712418, "grad_norm": 43.52666325306571, "learning_rate": 5.682858279199478e-07, "logits/chosen": 12.072285652160645, "logits/rejected": 12.116777420043945, "logps/chosen": -3.8299078941345215, "logps/rejected": -3.960890293121338, "loss": 4.3619, "rewards/accuracies": 0.5, "rewards/chosen": -38.29907989501953, "rewards/margins": 1.3098220825195312, "rewards/rejected": -39.60890197753906, "step": 3126 }, { "epoch": 0.4257897603485839, "grad_norm": 43.11705737684846, "learning_rate": 5.681133149776748e-07, "logits/chosen": 12.51785945892334, "logits/rejected": 12.38254451751709, "logps/chosen": -3.9124655723571777, "logps/rejected": -3.9640235900878906, "loss": 3.8558, "rewards/accuracies": 0.5, "rewards/chosen": -39.124656677246094, "rewards/margins": 0.5155830383300781, "rewards/rejected": -39.640235900878906, "step": 3127 }, { "epoch": 0.42592592592592593, "grad_norm": 40.284627656113415, "learning_rate": 5.679407640488086e-07, "logits/chosen": 12.803531646728516, "logits/rejected": 12.40230941772461, "logps/chosen": -3.8880534172058105, "logps/rejected": -4.008196830749512, "loss": 3.9227, "rewards/accuracies": 0.75, "rewards/chosen": -38.880531311035156, "rewards/margins": 1.2014398574829102, "rewards/rejected": -40.08197021484375, "step": 3128 }, { "epoch": 0.42606209150326796, "grad_norm": 42.00265247363506, "learning_rate": 5.677681751723387e-07, "logits/chosen": 11.96450424194336, "logits/rejected": 12.312677383422852, "logps/chosen": -3.7848398685455322, "logps/rejected": -3.9116036891937256, "loss": 4.1926, "rewards/accuracies": 0.75, "rewards/chosen": -37.8484001159668, "rewards/margins": 1.26763916015625, "rewards/rejected": -39.11603546142578, "step": 3129 }, { "epoch": 0.42619825708061004, "grad_norm": 40.921359130957036, "learning_rate": 5.675955483872627e-07, "logits/chosen": 12.02365493774414, "logits/rejected": 12.240215301513672, "logps/chosen": -4.022693157196045, "logps/rejected": -4.34796142578125, "loss": 4.292, "rewards/accuracies": 0.5, "rewards/chosen": -40.22693634033203, "rewards/margins": 3.252678871154785, "rewards/rejected": -43.4796142578125, "step": 3130 }, { "epoch": 0.4263344226579521, "grad_norm": 41.56219477238556, "learning_rate": 5.674228837325872e-07, "logits/chosen": 12.915899276733398, "logits/rejected": 12.485334396362305, "logps/chosen": -3.8003647327423096, "logps/rejected": -3.8876657485961914, "loss": 4.2469, "rewards/accuracies": 0.5, "rewards/chosen": -38.00364685058594, "rewards/margins": 0.8730096817016602, "rewards/rejected": -38.87665557861328, "step": 3131 }, { "epoch": 0.4264705882352941, "grad_norm": 45.49520407671546, "learning_rate": 5.672501812473272e-07, "logits/chosen": 11.48210334777832, "logits/rejected": 12.954627990722656, "logps/chosen": -3.857224702835083, "logps/rejected": -4.1129865646362305, "loss": 4.372, "rewards/accuracies": 0.75, "rewards/chosen": -38.57224655151367, "rewards/margins": 2.557619094848633, "rewards/rejected": -41.12986755371094, "step": 3132 }, { "epoch": 0.4266067538126362, "grad_norm": 38.905938365614645, "learning_rate": 5.670774409705062e-07, "logits/chosen": 13.081396102905273, "logits/rejected": 12.64825439453125, "logps/chosen": -3.7893762588500977, "logps/rejected": -4.130396842956543, "loss": 4.066, "rewards/accuracies": 0.75, "rewards/chosen": -37.89376449584961, "rewards/margins": 3.4102048873901367, "rewards/rejected": -41.3039665222168, "step": 3133 }, { "epoch": 0.4267429193899782, "grad_norm": 68.8652982748312, "learning_rate": 5.669046629411563e-07, "logits/chosen": 11.973112106323242, "logits/rejected": 11.621082305908203, "logps/chosen": -3.8710312843322754, "logps/rejected": -3.86242938041687, "loss": 4.0339, "rewards/accuracies": 0.5, "rewards/chosen": -38.7103157043457, "rewards/margins": -0.08602046966552734, "rewards/rejected": -38.62429428100586, "step": 3134 }, { "epoch": 0.42687908496732024, "grad_norm": 47.17765539722654, "learning_rate": 5.667318471983183e-07, "logits/chosen": 11.26147174835205, "logits/rejected": 11.919143676757812, "logps/chosen": -3.780832052230835, "logps/rejected": -3.9320614337921143, "loss": 3.9301, "rewards/accuracies": 0.75, "rewards/chosen": -37.808319091796875, "rewards/margins": 1.5122957229614258, "rewards/rejected": -39.32061767578125, "step": 3135 }, { "epoch": 0.42701525054466233, "grad_norm": 67.95142796380317, "learning_rate": 5.665589937810412e-07, "logits/chosen": 11.411666870117188, "logits/rejected": 12.26727294921875, "logps/chosen": -3.7656469345092773, "logps/rejected": -3.9743990898132324, "loss": 4.079, "rewards/accuracies": 0.5, "rewards/chosen": -37.656471252441406, "rewards/margins": 2.0875234603881836, "rewards/rejected": -39.743995666503906, "step": 3136 }, { "epoch": 0.42715141612200436, "grad_norm": 49.80588803185948, "learning_rate": 5.663861027283826e-07, "logits/chosen": 11.746221542358398, "logits/rejected": 12.535385131835938, "logps/chosen": -3.863821029663086, "logps/rejected": -4.28977632522583, "loss": 4.3443, "rewards/accuracies": 1.0, "rewards/chosen": -38.638206481933594, "rewards/margins": 4.259553909301758, "rewards/rejected": -42.897762298583984, "step": 3137 }, { "epoch": 0.4272875816993464, "grad_norm": 43.179197443620225, "learning_rate": 5.662131740794086e-07, "logits/chosen": 12.591543197631836, "logits/rejected": 12.119930267333984, "logps/chosen": -4.059721946716309, "logps/rejected": -3.99507999420166, "loss": 4.2847, "rewards/accuracies": 0.5, "rewards/chosen": -40.59721755981445, "rewards/margins": -0.6464195251464844, "rewards/rejected": -39.95079803466797, "step": 3138 }, { "epoch": 0.42742374727668847, "grad_norm": 41.16074887783367, "learning_rate": 5.660402078731941e-07, "logits/chosen": 11.804483413696289, "logits/rejected": 11.779537200927734, "logps/chosen": -3.7181341648101807, "logps/rejected": -3.8266706466674805, "loss": 4.3203, "rewards/accuracies": 0.75, "rewards/chosen": -37.18134307861328, "rewards/margins": 1.0853633880615234, "rewards/rejected": -38.26670837402344, "step": 3139 }, { "epoch": 0.4275599128540305, "grad_norm": 44.54083128029579, "learning_rate": 5.658672041488222e-07, "logits/chosen": 12.78717041015625, "logits/rejected": 13.168924331665039, "logps/chosen": -4.051745414733887, "logps/rejected": -4.200777530670166, "loss": 4.4488, "rewards/accuracies": 0.5, "rewards/chosen": -40.5174560546875, "rewards/margins": 1.490321159362793, "rewards/rejected": -42.007774353027344, "step": 3140 }, { "epoch": 0.42769607843137253, "grad_norm": 39.229540905012904, "learning_rate": 5.656941629453843e-07, "logits/chosen": 11.46568489074707, "logits/rejected": 11.868818283081055, "logps/chosen": -3.840933084487915, "logps/rejected": -3.9991707801818848, "loss": 4.132, "rewards/accuracies": 0.75, "rewards/chosen": -38.409332275390625, "rewards/margins": 1.5823755264282227, "rewards/rejected": -39.99170684814453, "step": 3141 }, { "epoch": 0.4278322440087146, "grad_norm": 40.0908889139106, "learning_rate": 5.655210843019807e-07, "logits/chosen": 11.073153495788574, "logits/rejected": 11.866044044494629, "logps/chosen": -3.7376441955566406, "logps/rejected": -3.8892195224761963, "loss": 3.4356, "rewards/accuracies": 0.5, "rewards/chosen": -37.37643814086914, "rewards/margins": 1.5157556533813477, "rewards/rejected": -38.89219665527344, "step": 3142 }, { "epoch": 0.42796840958605664, "grad_norm": 39.436816032094875, "learning_rate": 5.6534796825772e-07, "logits/chosen": 11.334085464477539, "logits/rejected": 11.638071060180664, "logps/chosen": -3.7962145805358887, "logps/rejected": -4.045879364013672, "loss": 4.1721, "rewards/accuracies": 1.0, "rewards/chosen": -37.9621467590332, "rewards/margins": 2.4966506958007812, "rewards/rejected": -40.45879364013672, "step": 3143 }, { "epoch": 0.42810457516339867, "grad_norm": 45.123170169730805, "learning_rate": 5.65174814851719e-07, "logits/chosen": 12.42238712310791, "logits/rejected": 12.865684509277344, "logps/chosen": -3.6133623123168945, "logps/rejected": -4.106019020080566, "loss": 4.2086, "rewards/accuracies": 1.0, "rewards/chosen": -36.13362503051758, "rewards/margins": 4.926568984985352, "rewards/rejected": -41.06019592285156, "step": 3144 }, { "epoch": 0.42824074074074076, "grad_norm": 44.628947007874764, "learning_rate": 5.650016241231032e-07, "logits/chosen": 11.947946548461914, "logits/rejected": 12.4725341796875, "logps/chosen": -3.9104366302490234, "logps/rejected": -4.101096153259277, "loss": 3.9335, "rewards/accuracies": 0.75, "rewards/chosen": -39.1043701171875, "rewards/margins": 1.9065942764282227, "rewards/rejected": -41.010963439941406, "step": 3145 }, { "epoch": 0.4283769063180828, "grad_norm": 39.861639253783245, "learning_rate": 5.648283961110065e-07, "logits/chosen": 11.782342910766602, "logits/rejected": 12.763147354125977, "logps/chosen": -3.903337001800537, "logps/rejected": -4.312481880187988, "loss": 4.4925, "rewards/accuracies": 1.0, "rewards/chosen": -39.03337097167969, "rewards/margins": 4.091444969177246, "rewards/rejected": -43.12481689453125, "step": 3146 }, { "epoch": 0.4285130718954248, "grad_norm": 37.77875119439847, "learning_rate": 5.646551308545714e-07, "logits/chosen": 12.521726608276367, "logits/rejected": 12.310125350952148, "logps/chosen": -4.032742023468018, "logps/rejected": -4.250834941864014, "loss": 4.0907, "rewards/accuracies": 0.75, "rewards/chosen": -40.32741928100586, "rewards/margins": 2.1809282302856445, "rewards/rejected": -42.50835037231445, "step": 3147 }, { "epoch": 0.4286492374727669, "grad_norm": 41.295297463731224, "learning_rate": 5.644818283929482e-07, "logits/chosen": 12.316400527954102, "logits/rejected": 13.047235488891602, "logps/chosen": -3.5441787242889404, "logps/rejected": -3.739668607711792, "loss": 3.8731, "rewards/accuracies": 0.25, "rewards/chosen": -35.44178771972656, "rewards/margins": 1.9548988342285156, "rewards/rejected": -37.396690368652344, "step": 3148 }, { "epoch": 0.4287854030501089, "grad_norm": 45.331344022155264, "learning_rate": 5.643084887652964e-07, "logits/chosen": 11.84881591796875, "logits/rejected": 11.7349271774292, "logps/chosen": -3.6736059188842773, "logps/rejected": -4.009833335876465, "loss": 4.2572, "rewards/accuracies": 0.75, "rewards/chosen": -36.736061096191406, "rewards/margins": 3.362274169921875, "rewards/rejected": -40.09833526611328, "step": 3149 }, { "epoch": 0.42892156862745096, "grad_norm": 52.413210721467955, "learning_rate": 5.641351120107833e-07, "logits/chosen": 11.692148208618164, "logits/rejected": 11.501338958740234, "logps/chosen": -3.631395101547241, "logps/rejected": -3.665565013885498, "loss": 3.9078, "rewards/accuracies": 0.5, "rewards/chosen": -36.31394958496094, "rewards/margins": 0.34169864654541016, "rewards/rejected": -36.6556510925293, "step": 3150 }, { "epoch": 0.42905773420479304, "grad_norm": 38.926707235124795, "learning_rate": 5.639616981685849e-07, "logits/chosen": 12.180593490600586, "logits/rejected": 12.857370376586914, "logps/chosen": -3.6664862632751465, "logps/rejected": -4.1262383460998535, "loss": 3.9303, "rewards/accuracies": 1.0, "rewards/chosen": -36.66486358642578, "rewards/margins": 4.597521781921387, "rewards/rejected": -41.26238250732422, "step": 3151 }, { "epoch": 0.42919389978213507, "grad_norm": 64.05420616864212, "learning_rate": 5.637882472778855e-07, "logits/chosen": 11.34669303894043, "logits/rejected": 11.878982543945312, "logps/chosen": -3.7852823734283447, "logps/rejected": -4.048404216766357, "loss": 3.5023, "rewards/accuracies": 1.0, "rewards/chosen": -37.852821350097656, "rewards/margins": 2.6312179565429688, "rewards/rejected": -40.484039306640625, "step": 3152 }, { "epoch": 0.4293300653594771, "grad_norm": 72.58073377751336, "learning_rate": 5.636147593778778e-07, "logits/chosen": 12.016003608703613, "logits/rejected": 12.15298080444336, "logps/chosen": -3.8486080169677734, "logps/rejected": -3.801734209060669, "loss": 4.4601, "rewards/accuracies": 0.25, "rewards/chosen": -38.486080169677734, "rewards/margins": -0.4687376022338867, "rewards/rejected": -38.01734161376953, "step": 3153 }, { "epoch": 0.4294662309368192, "grad_norm": 46.161496687130025, "learning_rate": 5.634412345077626e-07, "logits/chosen": 11.978755950927734, "logits/rejected": 12.663497924804688, "logps/chosen": -4.128460884094238, "logps/rejected": -3.808133602142334, "loss": 3.3404, "rewards/accuracies": 0.25, "rewards/chosen": -41.28460693359375, "rewards/margins": -3.2032699584960938, "rewards/rejected": -38.081336975097656, "step": 3154 }, { "epoch": 0.4296023965141612, "grad_norm": 41.22641326687192, "learning_rate": 5.632676727067496e-07, "logits/chosen": 11.55105972290039, "logits/rejected": 12.568378448486328, "logps/chosen": -3.844705820083618, "logps/rejected": -4.368241310119629, "loss": 3.2922, "rewards/accuracies": 1.0, "rewards/chosen": -38.447059631347656, "rewards/margins": 5.235352516174316, "rewards/rejected": -43.68241500854492, "step": 3155 }, { "epoch": 0.4297385620915033, "grad_norm": 41.80061708942007, "learning_rate": 5.630940740140563e-07, "logits/chosen": 12.92491626739502, "logits/rejected": 11.943330764770508, "logps/chosen": -3.8820998668670654, "logps/rejected": -3.7265491485595703, "loss": 3.787, "rewards/accuracies": 0.0, "rewards/chosen": -38.82099914550781, "rewards/margins": -1.5555105209350586, "rewards/rejected": -37.26548767089844, "step": 3156 }, { "epoch": 0.4298747276688453, "grad_norm": 42.631266570932894, "learning_rate": 5.629204384689088e-07, "logits/chosen": 11.918571472167969, "logits/rejected": 11.552611351013184, "logps/chosen": -4.175202369689941, "logps/rejected": -3.8814971446990967, "loss": 3.8183, "rewards/accuracies": 0.5, "rewards/chosen": -41.75202941894531, "rewards/margins": -2.937056541442871, "rewards/rejected": -38.814971923828125, "step": 3157 }, { "epoch": 0.43001089324618735, "grad_norm": 45.461802528026666, "learning_rate": 5.627467661105416e-07, "logits/chosen": 12.113761901855469, "logits/rejected": 12.731439590454102, "logps/chosen": -3.983457088470459, "logps/rejected": -3.6791677474975586, "loss": 4.1182, "rewards/accuracies": 0.25, "rewards/chosen": -39.834571838378906, "rewards/margins": -3.0428972244262695, "rewards/rejected": -36.79167556762695, "step": 3158 }, { "epoch": 0.43014705882352944, "grad_norm": 45.365629119032576, "learning_rate": 5.625730569781973e-07, "logits/chosen": 12.140748023986816, "logits/rejected": 12.791642189025879, "logps/chosen": -4.035815238952637, "logps/rejected": -4.279618263244629, "loss": 4.2115, "rewards/accuracies": 0.75, "rewards/chosen": -40.358154296875, "rewards/margins": 2.4380311965942383, "rewards/rejected": -42.79618453979492, "step": 3159 }, { "epoch": 0.43028322440087147, "grad_norm": 45.86941883875422, "learning_rate": 5.623993111111267e-07, "logits/chosen": 12.193681716918945, "logits/rejected": 13.489115715026855, "logps/chosen": -3.6194803714752197, "logps/rejected": -4.22367000579834, "loss": 4.5061, "rewards/accuracies": 0.75, "rewards/chosen": -36.194801330566406, "rewards/margins": 6.041895389556885, "rewards/rejected": -42.23670196533203, "step": 3160 }, { "epoch": 0.4304193899782135, "grad_norm": 44.75340074191636, "learning_rate": 5.622255285485897e-07, "logits/chosen": 11.377864837646484, "logits/rejected": 11.807161331176758, "logps/chosen": -3.7087459564208984, "logps/rejected": -3.7675657272338867, "loss": 4.0658, "rewards/accuracies": 0.5, "rewards/chosen": -37.087459564208984, "rewards/margins": 0.588198184967041, "rewards/rejected": -37.6756591796875, "step": 3161 }, { "epoch": 0.4305555555555556, "grad_norm": 42.59912454501393, "learning_rate": 5.620517093298533e-07, "logits/chosen": 11.693363189697266, "logits/rejected": 11.691949844360352, "logps/chosen": -3.6235644817352295, "logps/rejected": -3.5190794467926025, "loss": 3.7078, "rewards/accuracies": 0.5, "rewards/chosen": -36.23564529418945, "rewards/margins": -1.044848918914795, "rewards/rejected": -35.1907958984375, "step": 3162 }, { "epoch": 0.4306917211328976, "grad_norm": 43.41818647620681, "learning_rate": 5.618778534941938e-07, "logits/chosen": 11.616796493530273, "logits/rejected": 12.21744155883789, "logps/chosen": -3.5838093757629395, "logps/rejected": -3.6756978034973145, "loss": 3.8429, "rewards/accuracies": 0.5, "rewards/chosen": -35.838096618652344, "rewards/margins": 0.918881893157959, "rewards/rejected": -36.75697708129883, "step": 3163 }, { "epoch": 0.43082788671023964, "grad_norm": 46.49503870401179, "learning_rate": 5.617039610808952e-07, "logits/chosen": 11.437560081481934, "logits/rejected": 12.520829200744629, "logps/chosen": -3.2589128017425537, "logps/rejected": -3.9730591773986816, "loss": 3.8822, "rewards/accuracies": 1.0, "rewards/chosen": -32.58912658691406, "rewards/margins": 7.141462326049805, "rewards/rejected": -39.7305908203125, "step": 3164 }, { "epoch": 0.4309640522875817, "grad_norm": 38.427601517086835, "learning_rate": 5.615300321292499e-07, "logits/chosen": 12.581554412841797, "logits/rejected": 12.727367401123047, "logps/chosen": -4.006956100463867, "logps/rejected": -3.888965606689453, "loss": 3.7223, "rewards/accuracies": 0.25, "rewards/chosen": -40.06956481933594, "rewards/margins": -1.1799097061157227, "rewards/rejected": -38.88965606689453, "step": 3165 }, { "epoch": 0.43110021786492375, "grad_norm": 47.10398368502016, "learning_rate": 5.613560666785585e-07, "logits/chosen": 12.126562118530273, "logits/rejected": 13.276034355163574, "logps/chosen": -3.5343775749206543, "logps/rejected": -4.458591938018799, "loss": 4.2194, "rewards/accuracies": 1.0, "rewards/chosen": -35.343772888183594, "rewards/margins": 9.242145538330078, "rewards/rejected": -44.58591842651367, "step": 3166 }, { "epoch": 0.4312363834422658, "grad_norm": 46.50702511940634, "learning_rate": 5.611820647681302e-07, "logits/chosen": 11.60509204864502, "logits/rejected": 13.17724323272705, "logps/chosen": -3.8454346656799316, "logps/rejected": -4.319494724273682, "loss": 4.2004, "rewards/accuracies": 0.75, "rewards/chosen": -38.454349517822266, "rewards/margins": 4.740601539611816, "rewards/rejected": -43.194950103759766, "step": 3167 }, { "epoch": 0.43137254901960786, "grad_norm": 56.25512574674775, "learning_rate": 5.61008026437282e-07, "logits/chosen": 12.349039077758789, "logits/rejected": 12.213885307312012, "logps/chosen": -3.901345729827881, "logps/rejected": -3.7649595737457275, "loss": 4.1286, "rewards/accuracies": 0.25, "rewards/chosen": -39.013458251953125, "rewards/margins": -1.3638620376586914, "rewards/rejected": -37.64959716796875, "step": 3168 }, { "epoch": 0.4315087145969499, "grad_norm": 40.88746625925677, "learning_rate": 5.608339517253393e-07, "logits/chosen": 13.008889198303223, "logits/rejected": 12.944174766540527, "logps/chosen": -4.238481521606445, "logps/rejected": -4.4139909744262695, "loss": 3.4649, "rewards/accuracies": 0.75, "rewards/chosen": -42.38481140136719, "rewards/margins": 1.7550992965698242, "rewards/rejected": -44.139915466308594, "step": 3169 }, { "epoch": 0.4316448801742919, "grad_norm": 48.23541271353315, "learning_rate": 5.606598406716357e-07, "logits/chosen": 10.974884033203125, "logits/rejected": 12.670098304748535, "logps/chosen": -3.608804225921631, "logps/rejected": -4.078278541564941, "loss": 3.5254, "rewards/accuracies": 1.0, "rewards/chosen": -36.088043212890625, "rewards/margins": 4.694744110107422, "rewards/rejected": -40.78278732299805, "step": 3170 }, { "epoch": 0.431781045751634, "grad_norm": 45.03478653743853, "learning_rate": 5.604856933155132e-07, "logits/chosen": 12.306785583496094, "logits/rejected": 12.32297134399414, "logps/chosen": -3.7448041439056396, "logps/rejected": -3.7794201374053955, "loss": 4.0856, "rewards/accuracies": 0.5, "rewards/chosen": -37.44804382324219, "rewards/margins": 0.3461580276489258, "rewards/rejected": -37.7942008972168, "step": 3171 }, { "epoch": 0.43191721132897604, "grad_norm": 41.59259095911092, "learning_rate": 5.603115096963215e-07, "logits/chosen": 12.373245239257812, "logits/rejected": 12.739195823669434, "logps/chosen": -4.163665771484375, "logps/rejected": -4.207136631011963, "loss": 3.766, "rewards/accuracies": 0.5, "rewards/chosen": -41.63665771484375, "rewards/margins": 0.4347066879272461, "rewards/rejected": -42.07136535644531, "step": 3172 }, { "epoch": 0.43205337690631807, "grad_norm": 44.643684865047874, "learning_rate": 5.601372898534193e-07, "logits/chosen": 12.667631149291992, "logits/rejected": 14.00520133972168, "logps/chosen": -3.9452641010284424, "logps/rejected": -4.197399616241455, "loss": 3.8644, "rewards/accuracies": 0.5, "rewards/chosen": -39.45264434814453, "rewards/margins": 2.5213518142700195, "rewards/rejected": -41.973995208740234, "step": 3173 }, { "epoch": 0.43218954248366015, "grad_norm": 43.988666712064514, "learning_rate": 5.599630338261725e-07, "logits/chosen": 12.849224090576172, "logits/rejected": 11.09425163269043, "logps/chosen": -4.016623497009277, "logps/rejected": -3.5270626544952393, "loss": 3.8619, "rewards/accuracies": 0.25, "rewards/chosen": -40.16623306274414, "rewards/margins": -4.895608425140381, "rewards/rejected": -35.27062225341797, "step": 3174 }, { "epoch": 0.4323257080610022, "grad_norm": 54.92598377678391, "learning_rate": 5.59788741653956e-07, "logits/chosen": 11.964484214782715, "logits/rejected": 13.037769317626953, "logps/chosen": -3.9993796348571777, "logps/rejected": -4.040605545043945, "loss": 3.7911, "rewards/accuracies": 0.5, "rewards/chosen": -39.99379348754883, "rewards/margins": 0.412259578704834, "rewards/rejected": -40.40605163574219, "step": 3175 }, { "epoch": 0.4324618736383442, "grad_norm": 48.81282120251733, "learning_rate": 5.596144133761526e-07, "logits/chosen": 11.260799407958984, "logits/rejected": 12.283394813537598, "logps/chosen": -3.8420979976654053, "logps/rejected": -4.11955451965332, "loss": 3.7385, "rewards/accuracies": 1.0, "rewards/chosen": -38.420982360839844, "rewards/margins": 2.7745676040649414, "rewards/rejected": -41.19554901123047, "step": 3176 }, { "epoch": 0.4325980392156863, "grad_norm": 60.212956656073956, "learning_rate": 5.594400490321531e-07, "logits/chosen": 13.093339920043945, "logits/rejected": 12.690088272094727, "logps/chosen": -4.095447540283203, "logps/rejected": -4.299562931060791, "loss": 4.4956, "rewards/accuracies": 0.75, "rewards/chosen": -40.95447540283203, "rewards/margins": 2.0411500930786133, "rewards/rejected": -42.995628356933594, "step": 3177 }, { "epoch": 0.4327342047930283, "grad_norm": 44.61674370143788, "learning_rate": 5.592656486613564e-07, "logits/chosen": 12.888470649719238, "logits/rejected": 13.020770072937012, "logps/chosen": -4.037970066070557, "logps/rejected": -4.072187900543213, "loss": 3.5289, "rewards/accuracies": 0.25, "rewards/chosen": -40.37969970703125, "rewards/margins": 0.3421773910522461, "rewards/rejected": -40.72187805175781, "step": 3178 }, { "epoch": 0.43287037037037035, "grad_norm": 40.32016985114983, "learning_rate": 5.590912123031701e-07, "logits/chosen": 12.353992462158203, "logits/rejected": 11.110441207885742, "logps/chosen": -3.8603134155273438, "logps/rejected": -3.727393627166748, "loss": 3.7929, "rewards/accuracies": 0.5, "rewards/chosen": -38.60313415527344, "rewards/margins": -1.329197883605957, "rewards/rejected": -37.27393341064453, "step": 3179 }, { "epoch": 0.43300653594771243, "grad_norm": 69.262524323175, "learning_rate": 5.589167399970092e-07, "logits/chosen": 12.032699584960938, "logits/rejected": 12.021123886108398, "logps/chosen": -3.7905304431915283, "logps/rejected": -4.013195037841797, "loss": 3.8684, "rewards/accuracies": 0.75, "rewards/chosen": -37.905303955078125, "rewards/margins": 2.2266454696655273, "rewards/rejected": -40.13195037841797, "step": 3180 }, { "epoch": 0.43314270152505446, "grad_norm": 41.96788623362944, "learning_rate": 5.587422317822973e-07, "logits/chosen": 12.371065139770508, "logits/rejected": 13.335428237915039, "logps/chosen": -3.9154629707336426, "logps/rejected": -4.305792331695557, "loss": 3.7994, "rewards/accuracies": 0.75, "rewards/chosen": -39.154632568359375, "rewards/margins": 3.9032936096191406, "rewards/rejected": -43.05792236328125, "step": 3181 }, { "epoch": 0.4332788671023965, "grad_norm": 40.725490280549245, "learning_rate": 5.58567687698466e-07, "logits/chosen": 12.39297866821289, "logits/rejected": 12.595630645751953, "logps/chosen": -3.631196975708008, "logps/rejected": -4.245559215545654, "loss": 4.1884, "rewards/accuracies": 1.0, "rewards/chosen": -36.31196594238281, "rewards/margins": 6.143623352050781, "rewards/rejected": -42.455589294433594, "step": 3182 }, { "epoch": 0.4334150326797386, "grad_norm": 51.1180639778519, "learning_rate": 5.58393107784955e-07, "logits/chosen": 11.228775024414062, "logits/rejected": 12.056388854980469, "logps/chosen": -3.1558356285095215, "logps/rejected": -3.478231906890869, "loss": 4.1021, "rewards/accuracies": 0.75, "rewards/chosen": -31.5583553314209, "rewards/margins": 3.223964214324951, "rewards/rejected": -34.782318115234375, "step": 3183 }, { "epoch": 0.4335511982570806, "grad_norm": 44.014565329384254, "learning_rate": 5.582184920812118e-07, "logits/chosen": 11.145193099975586, "logits/rejected": 12.31573486328125, "logps/chosen": -3.216970682144165, "logps/rejected": -3.65533447265625, "loss": 3.7028, "rewards/accuracies": 1.0, "rewards/chosen": -32.169708251953125, "rewards/margins": 4.383638381958008, "rewards/rejected": -36.5533447265625, "step": 3184 }, { "epoch": 0.43368736383442263, "grad_norm": 56.34877272656435, "learning_rate": 5.580438406266926e-07, "logits/chosen": 10.395809173583984, "logits/rejected": 12.19434928894043, "logps/chosen": -3.2476000785827637, "logps/rejected": -3.774911403656006, "loss": 4.6128, "rewards/accuracies": 0.75, "rewards/chosen": -32.47600173950195, "rewards/margins": 5.273114204406738, "rewards/rejected": -37.749114990234375, "step": 3185 }, { "epoch": 0.4338235294117647, "grad_norm": 43.41861294365558, "learning_rate": 5.578691534608611e-07, "logits/chosen": 11.622199058532715, "logits/rejected": 12.024467468261719, "logps/chosen": -3.4812371730804443, "logps/rejected": -3.730591297149658, "loss": 4.2093, "rewards/accuracies": 0.75, "rewards/chosen": -34.81237030029297, "rewards/margins": 2.4935412406921387, "rewards/rejected": -37.305912017822266, "step": 3186 }, { "epoch": 0.43395969498910675, "grad_norm": 39.068779331604496, "learning_rate": 5.576944306231894e-07, "logits/chosen": 12.030187606811523, "logits/rejected": 13.25119400024414, "logps/chosen": -4.108536720275879, "logps/rejected": -4.343456268310547, "loss": 3.4333, "rewards/accuracies": 0.75, "rewards/chosen": -41.08537292480469, "rewards/margins": 2.3491926193237305, "rewards/rejected": -43.43456268310547, "step": 3187 }, { "epoch": 0.4340958605664488, "grad_norm": 45.425090518586515, "learning_rate": 5.575196721531577e-07, "logits/chosen": 11.487119674682617, "logits/rejected": 12.423266410827637, "logps/chosen": -3.395768165588379, "logps/rejected": -3.704195737838745, "loss": 3.928, "rewards/accuracies": 0.75, "rewards/chosen": -33.957679748535156, "rewards/margins": 3.084275245666504, "rewards/rejected": -37.04195785522461, "step": 3188 }, { "epoch": 0.43423202614379086, "grad_norm": 45.53330655444428, "learning_rate": 5.57344878090254e-07, "logits/chosen": 11.11663818359375, "logits/rejected": 11.174105644226074, "logps/chosen": -3.98464298248291, "logps/rejected": -3.8043551445007324, "loss": 4.293, "rewards/accuracies": 0.25, "rewards/chosen": -39.84642791748047, "rewards/margins": -1.8028783798217773, "rewards/rejected": -38.043548583984375, "step": 3189 }, { "epoch": 0.4343681917211329, "grad_norm": 40.354075515747496, "learning_rate": 5.571700484739745e-07, "logits/chosen": 11.910028457641602, "logits/rejected": 13.183998107910156, "logps/chosen": -3.4349498748779297, "logps/rejected": -3.9192965030670166, "loss": 3.7447, "rewards/accuracies": 1.0, "rewards/chosen": -34.3494987487793, "rewards/margins": 4.843465805053711, "rewards/rejected": -39.192962646484375, "step": 3190 }, { "epoch": 0.4345043572984749, "grad_norm": 36.05559455858401, "learning_rate": 5.569951833438233e-07, "logits/chosen": 12.092388153076172, "logits/rejected": 12.422454833984375, "logps/chosen": -3.7181520462036133, "logps/rejected": -3.665266752243042, "loss": 3.9259, "rewards/accuracies": 0.5, "rewards/chosen": -37.1815185546875, "rewards/margins": -0.5288534164428711, "rewards/rejected": -36.65266799926758, "step": 3191 }, { "epoch": 0.434640522875817, "grad_norm": 44.88139885973097, "learning_rate": 5.568202827393127e-07, "logits/chosen": 11.987217903137207, "logits/rejected": 12.897829055786133, "logps/chosen": -3.628945827484131, "logps/rejected": -3.9627227783203125, "loss": 4.1131, "rewards/accuracies": 0.75, "rewards/chosen": -36.289459228515625, "rewards/margins": 3.3377676010131836, "rewards/rejected": -39.627227783203125, "step": 3192 }, { "epoch": 0.43477668845315903, "grad_norm": 44.647979206704385, "learning_rate": 5.566453466999629e-07, "logits/chosen": 12.676460266113281, "logits/rejected": 13.27642822265625, "logps/chosen": -4.127735137939453, "logps/rejected": -4.663050174713135, "loss": 4.4355, "rewards/accuracies": 1.0, "rewards/chosen": -41.27735137939453, "rewards/margins": 5.353153228759766, "rewards/rejected": -46.6305046081543, "step": 3193 }, { "epoch": 0.4349128540305011, "grad_norm": 40.409900486047604, "learning_rate": 5.564703752653022e-07, "logits/chosen": 11.298446655273438, "logits/rejected": 11.827119827270508, "logps/chosen": -3.3855433464050293, "logps/rejected": -3.5719847679138184, "loss": 3.697, "rewards/accuracies": 0.75, "rewards/chosen": -33.85543441772461, "rewards/margins": 1.8644142150878906, "rewards/rejected": -35.7198486328125, "step": 3194 }, { "epoch": 0.43504901960784315, "grad_norm": 37.21933758856733, "learning_rate": 5.562953684748668e-07, "logits/chosen": 11.899248123168945, "logits/rejected": 12.25490951538086, "logps/chosen": -3.5316269397735596, "logps/rejected": -3.9561898708343506, "loss": 3.72, "rewards/accuracies": 1.0, "rewards/chosen": -35.31626892089844, "rewards/margins": 4.245628356933594, "rewards/rejected": -39.56189727783203, "step": 3195 }, { "epoch": 0.4351851851851852, "grad_norm": 37.11054815989596, "learning_rate": 5.56120326368201e-07, "logits/chosen": 11.812249183654785, "logits/rejected": 12.10231876373291, "logps/chosen": -3.704350233078003, "logps/rejected": -3.961446762084961, "loss": 3.7668, "rewards/accuracies": 1.0, "rewards/chosen": -37.04350280761719, "rewards/margins": 2.5709657669067383, "rewards/rejected": -39.61446762084961, "step": 3196 }, { "epoch": 0.43532135076252726, "grad_norm": 51.16390146292856, "learning_rate": 5.559452489848569e-07, "logits/chosen": 11.020139694213867, "logits/rejected": 11.246572494506836, "logps/chosen": -3.824954032897949, "logps/rejected": -3.9410696029663086, "loss": 4.1764, "rewards/accuracies": 0.75, "rewards/chosen": -38.249542236328125, "rewards/margins": 1.1611547470092773, "rewards/rejected": -39.41069793701172, "step": 3197 }, { "epoch": 0.4354575163398693, "grad_norm": 36.55261303517646, "learning_rate": 5.557701363643949e-07, "logits/chosen": 10.799236297607422, "logits/rejected": 10.656768798828125, "logps/chosen": -3.6254348754882812, "logps/rejected": -3.626272201538086, "loss": 4.1176, "rewards/accuracies": 0.25, "rewards/chosen": -36.25434875488281, "rewards/margins": 0.008371353149414062, "rewards/rejected": -36.26272201538086, "step": 3198 }, { "epoch": 0.4355936819172113, "grad_norm": 51.62659235926716, "learning_rate": 5.555949885463827e-07, "logits/chosen": 11.680322647094727, "logits/rejected": 10.775613784790039, "logps/chosen": -3.726893901824951, "logps/rejected": -3.477163791656494, "loss": 4.2973, "rewards/accuracies": 0.25, "rewards/chosen": -37.26893615722656, "rewards/margins": -2.4972991943359375, "rewards/rejected": -34.771636962890625, "step": 3199 }, { "epoch": 0.4357298474945534, "grad_norm": 42.07832102840201, "learning_rate": 5.554198055703968e-07, "logits/chosen": 11.505058288574219, "logits/rejected": 12.372659683227539, "logps/chosen": -3.6875216960906982, "logps/rejected": -4.042481899261475, "loss": 4.1772, "rewards/accuracies": 0.75, "rewards/chosen": -36.875213623046875, "rewards/margins": 3.5496034622192383, "rewards/rejected": -40.42481994628906, "step": 3200 }, { "epoch": 0.43586601307189543, "grad_norm": 55.803731732497596, "learning_rate": 5.552445874760211e-07, "logits/chosen": 12.305273056030273, "logits/rejected": 13.124382972717285, "logps/chosen": -3.9440548419952393, "logps/rejected": -4.229237079620361, "loss": 4.1277, "rewards/accuracies": 1.0, "rewards/chosen": -39.440547943115234, "rewards/margins": 2.8518218994140625, "rewards/rejected": -42.2923698425293, "step": 3201 }, { "epoch": 0.43600217864923746, "grad_norm": 38.13381187740845, "learning_rate": 5.550693343028476e-07, "logits/chosen": 12.59286117553711, "logits/rejected": 12.782624244689941, "logps/chosen": -3.982605218887329, "logps/rejected": -4.254114151000977, "loss": 4.0103, "rewards/accuracies": 1.0, "rewards/chosen": -39.8260498046875, "rewards/margins": 2.715087890625, "rewards/rejected": -42.5411376953125, "step": 3202 }, { "epoch": 0.43613834422657954, "grad_norm": 41.21711159026814, "learning_rate": 5.548940460904762e-07, "logits/chosen": 12.114368438720703, "logits/rejected": 11.81750774383545, "logps/chosen": -3.7885847091674805, "logps/rejected": -3.739839553833008, "loss": 4.11, "rewards/accuracies": 0.5, "rewards/chosen": -37.88584518432617, "rewards/margins": -0.48745059967041016, "rewards/rejected": -37.39839553833008, "step": 3203 }, { "epoch": 0.4362745098039216, "grad_norm": 42.2196889968875, "learning_rate": 5.547187228785148e-07, "logits/chosen": 12.077272415161133, "logits/rejected": 12.59023666381836, "logps/chosen": -3.7965469360351562, "logps/rejected": -4.034156799316406, "loss": 4.3977, "rewards/accuracies": 0.75, "rewards/chosen": -37.96546936035156, "rewards/margins": 2.3760976791381836, "rewards/rejected": -40.34156799316406, "step": 3204 }, { "epoch": 0.4364106753812636, "grad_norm": 50.55592387626742, "learning_rate": 5.545433647065789e-07, "logits/chosen": 11.450543403625488, "logits/rejected": 12.358463287353516, "logps/chosen": -3.881258249282837, "logps/rejected": -4.088961601257324, "loss": 3.5494, "rewards/accuracies": 0.75, "rewards/chosen": -38.812583923339844, "rewards/margins": 2.0770339965820312, "rewards/rejected": -40.889617919921875, "step": 3205 }, { "epoch": 0.4365468409586057, "grad_norm": 36.155055197666925, "learning_rate": 5.543679716142923e-07, "logits/chosen": 11.91528606414795, "logits/rejected": 11.973228454589844, "logps/chosen": -4.061894416809082, "logps/rejected": -3.9698896408081055, "loss": 4.0864, "rewards/accuracies": 0.5, "rewards/chosen": -40.61894607543945, "rewards/margins": -0.920048713684082, "rewards/rejected": -39.69889831542969, "step": 3206 }, { "epoch": 0.4366830065359477, "grad_norm": 42.01509769846752, "learning_rate": 5.541925436412866e-07, "logits/chosen": 12.31037712097168, "logits/rejected": 12.260095596313477, "logps/chosen": -3.8669610023498535, "logps/rejected": -4.067225456237793, "loss": 3.9452, "rewards/accuracies": 0.5, "rewards/chosen": -38.66960906982422, "rewards/margins": 2.002645492553711, "rewards/rejected": -40.6722526550293, "step": 3207 }, { "epoch": 0.43681917211328974, "grad_norm": 39.78584630141543, "learning_rate": 5.540170808272011e-07, "logits/chosen": 11.506538391113281, "logits/rejected": 11.5846586227417, "logps/chosen": -3.940620183944702, "logps/rejected": -3.905902624130249, "loss": 4.0279, "rewards/accuracies": 0.5, "rewards/chosen": -39.40620422363281, "rewards/margins": -0.34717750549316406, "rewards/rejected": -39.059024810791016, "step": 3208 }, { "epoch": 0.43695533769063183, "grad_norm": 66.72798655591824, "learning_rate": 5.53841583211683e-07, "logits/chosen": 11.975733757019043, "logits/rejected": 12.737939834594727, "logps/chosen": -3.955599546432495, "logps/rejected": -4.080987930297852, "loss": 3.4572, "rewards/accuracies": 0.5, "rewards/chosen": -39.555999755859375, "rewards/margins": 1.2538871765136719, "rewards/rejected": -40.80988311767578, "step": 3209 }, { "epoch": 0.43709150326797386, "grad_norm": 37.05083443507579, "learning_rate": 5.536660508343875e-07, "logits/chosen": 11.284944534301758, "logits/rejected": 13.091079711914062, "logps/chosen": -3.5785040855407715, "logps/rejected": -4.2486348152160645, "loss": 3.4784, "rewards/accuracies": 1.0, "rewards/chosen": -35.78504180908203, "rewards/margins": 6.701306343078613, "rewards/rejected": -42.48634719848633, "step": 3210 }, { "epoch": 0.4372276688453159, "grad_norm": 43.38991598297674, "learning_rate": 5.534904837349775e-07, "logits/chosen": 12.384306907653809, "logits/rejected": 12.155795097351074, "logps/chosen": -3.95845365524292, "logps/rejected": -3.685363531112671, "loss": 4.2159, "rewards/accuracies": 0.25, "rewards/chosen": -39.584537506103516, "rewards/margins": -2.7308998107910156, "rewards/rejected": -36.8536376953125, "step": 3211 }, { "epoch": 0.43736383442265797, "grad_norm": 43.89827929833922, "learning_rate": 5.533148819531242e-07, "logits/chosen": 12.51327133178711, "logits/rejected": 12.83979606628418, "logps/chosen": -3.933913230895996, "logps/rejected": -3.852835178375244, "loss": 4.1854, "rewards/accuracies": 0.5, "rewards/chosen": -39.339134216308594, "rewards/margins": -0.8107795715332031, "rewards/rejected": -38.52835464477539, "step": 3212 }, { "epoch": 0.4375, "grad_norm": 40.22986404780276, "learning_rate": 5.531392455285058e-07, "logits/chosen": 11.383723258972168, "logits/rejected": 12.435892105102539, "logps/chosen": -3.523618221282959, "logps/rejected": -3.8583879470825195, "loss": 3.7648, "rewards/accuracies": 0.75, "rewards/chosen": -35.236183166503906, "rewards/margins": 3.347696304321289, "rewards/rejected": -38.58387756347656, "step": 3213 }, { "epoch": 0.43763616557734203, "grad_norm": 37.57143252473384, "learning_rate": 5.52963574500809e-07, "logits/chosen": 12.141368865966797, "logits/rejected": 11.702027320861816, "logps/chosen": -3.840425968170166, "logps/rejected": -3.874695301055908, "loss": 3.937, "rewards/accuracies": 0.25, "rewards/chosen": -38.404258728027344, "rewards/margins": 0.34269237518310547, "rewards/rejected": -38.74695587158203, "step": 3214 }, { "epoch": 0.4377723311546841, "grad_norm": 41.70884366419674, "learning_rate": 5.527878689097282e-07, "logits/chosen": 11.193204879760742, "logits/rejected": 11.337011337280273, "logps/chosen": -3.7638797760009766, "logps/rejected": -3.788083076477051, "loss": 4.0868, "rewards/accuracies": 0.75, "rewards/chosen": -37.63880157470703, "rewards/margins": 0.24203205108642578, "rewards/rejected": -37.880828857421875, "step": 3215 }, { "epoch": 0.43790849673202614, "grad_norm": 39.2036380154595, "learning_rate": 5.526121287949655e-07, "logits/chosen": 11.907485961914062, "logits/rejected": 12.206520080566406, "logps/chosen": -4.05753755569458, "logps/rejected": -4.258022308349609, "loss": 4.2465, "rewards/accuracies": 0.75, "rewards/chosen": -40.57537841796875, "rewards/margins": 2.004842758178711, "rewards/rejected": -42.58021926879883, "step": 3216 }, { "epoch": 0.43804466230936817, "grad_norm": 40.88927222918646, "learning_rate": 5.524363541962308e-07, "logits/chosen": 11.395374298095703, "logits/rejected": 11.463136672973633, "logps/chosen": -3.9045770168304443, "logps/rejected": -3.940971612930298, "loss": 4.4082, "rewards/accuracies": 0.5, "rewards/chosen": -39.045772552490234, "rewards/margins": 0.36394500732421875, "rewards/rejected": -39.40971755981445, "step": 3217 }, { "epoch": 0.43818082788671026, "grad_norm": 42.41497884504579, "learning_rate": 5.522605451532417e-07, "logits/chosen": 12.302644729614258, "logits/rejected": 11.811657905578613, "logps/chosen": -3.783694267272949, "logps/rejected": -3.8648200035095215, "loss": 4.054, "rewards/accuracies": 0.75, "rewards/chosen": -37.836944580078125, "rewards/margins": 0.811255931854248, "rewards/rejected": -38.64820098876953, "step": 3218 }, { "epoch": 0.4383169934640523, "grad_norm": 42.99375010116659, "learning_rate": 5.520847017057239e-07, "logits/chosen": 12.321300506591797, "logits/rejected": 11.627008438110352, "logps/chosen": -4.113719940185547, "logps/rejected": -4.065735816955566, "loss": 3.9846, "rewards/accuracies": 0.25, "rewards/chosen": -41.1371955871582, "rewards/margins": -0.4798393249511719, "rewards/rejected": -40.65735626220703, "step": 3219 }, { "epoch": 0.4384531590413943, "grad_norm": 45.628213988493506, "learning_rate": 5.519088238934106e-07, "logits/chosen": 13.086413383483887, "logits/rejected": 11.964454650878906, "logps/chosen": -4.057681083679199, "logps/rejected": -3.8910114765167236, "loss": 4.2128, "rewards/accuracies": 0.25, "rewards/chosen": -40.576812744140625, "rewards/margins": -1.666696548461914, "rewards/rejected": -38.910118103027344, "step": 3220 }, { "epoch": 0.4385893246187364, "grad_norm": 37.67359510225589, "learning_rate": 5.517329117560429e-07, "logits/chosen": 12.390924453735352, "logits/rejected": 12.561929702758789, "logps/chosen": -3.7565340995788574, "logps/rejected": -3.6666951179504395, "loss": 3.7908, "rewards/accuracies": 0.5, "rewards/chosen": -37.565338134765625, "rewards/margins": -0.8983888626098633, "rewards/rejected": -36.66695022583008, "step": 3221 }, { "epoch": 0.4387254901960784, "grad_norm": 44.48383119781206, "learning_rate": 5.515569653333695e-07, "logits/chosen": 12.24485969543457, "logits/rejected": 12.439346313476562, "logps/chosen": -3.884190320968628, "logps/rejected": -3.790300130844116, "loss": 3.742, "rewards/accuracies": 0.75, "rewards/chosen": -38.84190368652344, "rewards/margins": -0.9389028549194336, "rewards/rejected": -37.90299987792969, "step": 3222 }, { "epoch": 0.43886165577342046, "grad_norm": 40.79131214406864, "learning_rate": 5.513809846651469e-07, "logits/chosen": 12.391302108764648, "logits/rejected": 12.215005874633789, "logps/chosen": -3.894651174545288, "logps/rejected": -3.77683162689209, "loss": 3.9387, "rewards/accuracies": 0.25, "rewards/chosen": -38.946510314941406, "rewards/margins": -1.1781949996948242, "rewards/rejected": -37.768314361572266, "step": 3223 }, { "epoch": 0.43899782135076254, "grad_norm": 40.15797521578804, "learning_rate": 5.512049697911397e-07, "logits/chosen": 11.980891227722168, "logits/rejected": 11.493049621582031, "logps/chosen": -3.5689918994903564, "logps/rejected": -3.718510627746582, "loss": 3.8963, "rewards/accuracies": 0.5, "rewards/chosen": -35.689918518066406, "rewards/margins": 1.4951868057250977, "rewards/rejected": -37.18510437011719, "step": 3224 }, { "epoch": 0.43913398692810457, "grad_norm": 40.35472416997442, "learning_rate": 5.510289207511196e-07, "logits/chosen": 12.390260696411133, "logits/rejected": 13.474309921264648, "logps/chosen": -3.8775830268859863, "logps/rejected": -4.292979717254639, "loss": 3.7593, "rewards/accuracies": 1.0, "rewards/chosen": -38.77582931518555, "rewards/margins": 4.153965950012207, "rewards/rejected": -42.92979431152344, "step": 3225 }, { "epoch": 0.4392701525054466, "grad_norm": 40.24238419661422, "learning_rate": 5.508528375848664e-07, "logits/chosen": 10.934279441833496, "logits/rejected": 12.139820098876953, "logps/chosen": -3.433319568634033, "logps/rejected": -3.727113962173462, "loss": 3.8972, "rewards/accuracies": 1.0, "rewards/chosen": -34.33319854736328, "rewards/margins": 2.9379425048828125, "rewards/rejected": -37.271141052246094, "step": 3226 }, { "epoch": 0.4394063180827887, "grad_norm": 42.42509673043752, "learning_rate": 5.506767203321676e-07, "logits/chosen": 12.358787536621094, "logits/rejected": 12.975711822509766, "logps/chosen": -4.36161994934082, "logps/rejected": -4.661220550537109, "loss": 4.0936, "rewards/accuracies": 0.75, "rewards/chosen": -43.61619567871094, "rewards/margins": 2.9960107803344727, "rewards/rejected": -46.612205505371094, "step": 3227 }, { "epoch": 0.4395424836601307, "grad_norm": 43.588469333533034, "learning_rate": 5.505005690328184e-07, "logits/chosen": 11.913581848144531, "logits/rejected": 12.539617538452148, "logps/chosen": -3.321824073791504, "logps/rejected": -3.6915881633758545, "loss": 3.8515, "rewards/accuracies": 0.75, "rewards/chosen": -33.218238830566406, "rewards/margins": 3.6976399421691895, "rewards/rejected": -36.9158821105957, "step": 3228 }, { "epoch": 0.43967864923747274, "grad_norm": 44.83954343800918, "learning_rate": 5.503243837266215e-07, "logits/chosen": 10.972291946411133, "logits/rejected": 11.625852584838867, "logps/chosen": -3.430316686630249, "logps/rejected": -3.44016170501709, "loss": 3.8388, "rewards/accuracies": 0.5, "rewards/chosen": -34.30316925048828, "rewards/margins": 0.09845209121704102, "rewards/rejected": -34.40161895751953, "step": 3229 }, { "epoch": 0.4398148148148148, "grad_norm": 42.37647385115626, "learning_rate": 5.501481644533875e-07, "logits/chosen": 12.550762176513672, "logits/rejected": 12.067604064941406, "logps/chosen": -3.3431787490844727, "logps/rejected": -3.362992763519287, "loss": 4.2121, "rewards/accuracies": 0.5, "rewards/chosen": -33.431785583496094, "rewards/margins": 0.19814062118530273, "rewards/rejected": -33.62992858886719, "step": 3230 }, { "epoch": 0.43995098039215685, "grad_norm": 44.34929857714179, "learning_rate": 5.499719112529347e-07, "logits/chosen": 11.053274154663086, "logits/rejected": 12.298870086669922, "logps/chosen": -3.3684287071228027, "logps/rejected": -3.705345392227173, "loss": 3.9978, "rewards/accuracies": 1.0, "rewards/chosen": -33.684288024902344, "rewards/margins": 3.3691649436950684, "rewards/rejected": -37.05345153808594, "step": 3231 }, { "epoch": 0.4400871459694989, "grad_norm": 36.46559863154646, "learning_rate": 5.497956241650888e-07, "logits/chosen": 11.16893196105957, "logits/rejected": 11.87385368347168, "logps/chosen": -3.481797695159912, "logps/rejected": -3.9151530265808105, "loss": 3.9863, "rewards/accuracies": 1.0, "rewards/chosen": -34.81797409057617, "rewards/margins": 4.333555221557617, "rewards/rejected": -39.15153121948242, "step": 3232 }, { "epoch": 0.44022331154684097, "grad_norm": 47.74035612090186, "learning_rate": 5.496193032296834e-07, "logits/chosen": 11.801982879638672, "logits/rejected": 11.984548568725586, "logps/chosen": -3.6785736083984375, "logps/rejected": -3.6109261512756348, "loss": 4.7097, "rewards/accuracies": 0.25, "rewards/chosen": -36.785736083984375, "rewards/margins": -0.6764745712280273, "rewards/rejected": -36.10926055908203, "step": 3233 }, { "epoch": 0.440359477124183, "grad_norm": 41.392465718916995, "learning_rate": 5.494429484865597e-07, "logits/chosen": 11.573580741882324, "logits/rejected": 13.125448226928711, "logps/chosen": -3.7449774742126465, "logps/rejected": -4.032344341278076, "loss": 4.0829, "rewards/accuracies": 1.0, "rewards/chosen": -37.44977569580078, "rewards/margins": 2.8736648559570312, "rewards/rejected": -40.32344055175781, "step": 3234 }, { "epoch": 0.4404956427015251, "grad_norm": 38.3812807495628, "learning_rate": 5.492665599755664e-07, "logits/chosen": 12.573631286621094, "logits/rejected": 11.81444263458252, "logps/chosen": -3.542482852935791, "logps/rejected": -3.3736839294433594, "loss": 3.8656, "rewards/accuracies": 0.5, "rewards/chosen": -35.42483139038086, "rewards/margins": -1.6879911422729492, "rewards/rejected": -33.736839294433594, "step": 3235 }, { "epoch": 0.4406318082788671, "grad_norm": 40.91922136267212, "learning_rate": 5.490901377365601e-07, "logits/chosen": 11.068486213684082, "logits/rejected": 11.764094352722168, "logps/chosen": -3.769813060760498, "logps/rejected": -3.707613468170166, "loss": 4.5937, "rewards/accuracies": 0.25, "rewards/chosen": -37.69812774658203, "rewards/margins": -0.6219949722290039, "rewards/rejected": -37.076133728027344, "step": 3236 }, { "epoch": 0.44076797385620914, "grad_norm": 40.36929453491899, "learning_rate": 5.489136818094048e-07, "logits/chosen": 11.026873588562012, "logits/rejected": 11.204631805419922, "logps/chosen": -3.419236183166504, "logps/rejected": -3.4444026947021484, "loss": 3.7699, "rewards/accuracies": 0.75, "rewards/chosen": -34.192359924316406, "rewards/margins": 0.2516651153564453, "rewards/rejected": -34.44402313232422, "step": 3237 }, { "epoch": 0.4409041394335512, "grad_norm": 45.08828506475369, "learning_rate": 5.487371922339721e-07, "logits/chosen": 11.22414779663086, "logits/rejected": 11.608840942382812, "logps/chosen": -3.5185883045196533, "logps/rejected": -3.695847749710083, "loss": 3.6713, "rewards/accuracies": 0.5, "rewards/chosen": -35.185882568359375, "rewards/margins": 1.7725954055786133, "rewards/rejected": -36.95847702026367, "step": 3238 }, { "epoch": 0.44104030501089325, "grad_norm": 40.482807173675376, "learning_rate": 5.485606690501414e-07, "logits/chosen": 11.963138580322266, "logits/rejected": 11.879731178283691, "logps/chosen": -3.7693774700164795, "logps/rejected": -3.8518989086151123, "loss": 4.1243, "rewards/accuracies": 0.75, "rewards/chosen": -37.69377517700195, "rewards/margins": 0.8252139091491699, "rewards/rejected": -38.51898956298828, "step": 3239 }, { "epoch": 0.4411764705882353, "grad_norm": 65.15815964070181, "learning_rate": 5.483841122977995e-07, "logits/chosen": 12.461904525756836, "logits/rejected": 13.130359649658203, "logps/chosen": -3.938333034515381, "logps/rejected": -4.14324426651001, "loss": 3.9657, "rewards/accuracies": 1.0, "rewards/chosen": -39.383331298828125, "rewards/margins": 2.0491113662719727, "rewards/rejected": -41.43244171142578, "step": 3240 }, { "epoch": 0.44131263616557737, "grad_norm": 38.049033357368984, "learning_rate": 5.482075220168408e-07, "logits/chosen": 10.75558853149414, "logits/rejected": 11.594195365905762, "logps/chosen": -3.6174962520599365, "logps/rejected": -3.941183567047119, "loss": 3.7105, "rewards/accuracies": 0.75, "rewards/chosen": -36.17496109008789, "rewards/margins": 3.2368717193603516, "rewards/rejected": -39.411834716796875, "step": 3241 }, { "epoch": 0.4414488017429194, "grad_norm": 41.017643681532235, "learning_rate": 5.480308982471674e-07, "logits/chosen": 12.311722755432129, "logits/rejected": 11.773965835571289, "logps/chosen": -4.017672538757324, "logps/rejected": -3.821913242340088, "loss": 3.7396, "rewards/accuracies": 0.25, "rewards/chosen": -40.17672348022461, "rewards/margins": -1.9575920104980469, "rewards/rejected": -38.21913146972656, "step": 3242 }, { "epoch": 0.4415849673202614, "grad_norm": 44.21114966209893, "learning_rate": 5.47854241028689e-07, "logits/chosen": 11.879565238952637, "logits/rejected": 11.539456367492676, "logps/chosen": -3.9308815002441406, "logps/rejected": -4.175505638122559, "loss": 4.2462, "rewards/accuracies": 0.75, "rewards/chosen": -39.308815002441406, "rewards/margins": 2.446242332458496, "rewards/rejected": -41.75505828857422, "step": 3243 }, { "epoch": 0.4417211328976035, "grad_norm": 35.24373505342013, "learning_rate": 5.476775504013227e-07, "logits/chosen": 12.167964935302734, "logits/rejected": 12.446353912353516, "logps/chosen": -3.6413073539733887, "logps/rejected": -4.014691352844238, "loss": 3.4913, "rewards/accuracies": 0.75, "rewards/chosen": -36.41307067871094, "rewards/margins": 3.7338409423828125, "rewards/rejected": -40.146915435791016, "step": 3244 }, { "epoch": 0.44185729847494554, "grad_norm": 41.55857063316894, "learning_rate": 5.475008264049931e-07, "logits/chosen": 11.492818832397461, "logits/rejected": 12.020492553710938, "logps/chosen": -3.964531898498535, "logps/rejected": -3.9477152824401855, "loss": 4.3189, "rewards/accuracies": 0.5, "rewards/chosen": -39.64531707763672, "rewards/margins": -0.1681661605834961, "rewards/rejected": -39.477149963378906, "step": 3245 }, { "epoch": 0.44199346405228757, "grad_norm": 41.332221438507446, "learning_rate": 5.473240690796325e-07, "logits/chosen": 10.738057136535645, "logits/rejected": 10.435908317565918, "logps/chosen": -3.488760232925415, "logps/rejected": -3.336489677429199, "loss": 4.0954, "rewards/accuracies": 0.5, "rewards/chosen": -34.887603759765625, "rewards/margins": -1.522705078125, "rewards/rejected": -33.364898681640625, "step": 3246 }, { "epoch": 0.44212962962962965, "grad_norm": 36.411306294103966, "learning_rate": 5.471472784651806e-07, "logits/chosen": 11.493154525756836, "logits/rejected": 12.29326343536377, "logps/chosen": -3.648489236831665, "logps/rejected": -3.6645615100860596, "loss": 4.2081, "rewards/accuracies": 0.5, "rewards/chosen": -36.484893798828125, "rewards/margins": 0.1607217788696289, "rewards/rejected": -36.64561462402344, "step": 3247 }, { "epoch": 0.4422657952069717, "grad_norm": 38.013188319861385, "learning_rate": 5.46970454601585e-07, "logits/chosen": 10.832340240478516, "logits/rejected": 10.977014541625977, "logps/chosen": -3.0362348556518555, "logps/rejected": -3.487429141998291, "loss": 3.2756, "rewards/accuracies": 1.0, "rewards/chosen": -30.362348556518555, "rewards/margins": 4.511940002441406, "rewards/rejected": -34.874290466308594, "step": 3248 }, { "epoch": 0.4424019607843137, "grad_norm": 41.619523222804155, "learning_rate": 5.467935975288003e-07, "logits/chosen": 12.0911226272583, "logits/rejected": 12.2140474319458, "logps/chosen": -3.840872287750244, "logps/rejected": -3.8563122749328613, "loss": 3.7532, "rewards/accuracies": 0.5, "rewards/chosen": -38.40872573852539, "rewards/margins": 0.15439891815185547, "rewards/rejected": -38.56312561035156, "step": 3249 }, { "epoch": 0.4425381263616558, "grad_norm": 40.62197373543977, "learning_rate": 5.466167072867887e-07, "logits/chosen": 11.042304992675781, "logits/rejected": 12.284414291381836, "logps/chosen": -3.626694679260254, "logps/rejected": -4.222927570343018, "loss": 3.2091, "rewards/accuracies": 1.0, "rewards/chosen": -36.266944885253906, "rewards/margins": 5.962332725524902, "rewards/rejected": -42.229278564453125, "step": 3250 }, { "epoch": 0.4426742919389978, "grad_norm": 39.41530644371327, "learning_rate": 5.464397839155202e-07, "logits/chosen": 11.920934677124023, "logits/rejected": 12.27362060546875, "logps/chosen": -4.041813373565674, "logps/rejected": -4.111460208892822, "loss": 3.6318, "rewards/accuracies": 0.75, "rewards/chosen": -40.41813659667969, "rewards/margins": 0.6964654922485352, "rewards/rejected": -41.114601135253906, "step": 3251 }, { "epoch": 0.44281045751633985, "grad_norm": 39.424613044953894, "learning_rate": 5.462628274549721e-07, "logits/chosen": 12.097058296203613, "logits/rejected": 12.464775085449219, "logps/chosen": -3.779977798461914, "logps/rejected": -3.8972415924072266, "loss": 3.7987, "rewards/accuracies": 0.5, "rewards/chosen": -37.799781799316406, "rewards/margins": 1.172635555267334, "rewards/rejected": -38.972415924072266, "step": 3252 }, { "epoch": 0.44294662309368193, "grad_norm": 39.408449948813036, "learning_rate": 5.460858379451289e-07, "logits/chosen": 11.303228378295898, "logits/rejected": 10.783882141113281, "logps/chosen": -3.6694421768188477, "logps/rejected": -3.5320656299591064, "loss": 3.8177, "rewards/accuracies": 0.5, "rewards/chosen": -36.694419860839844, "rewards/margins": -1.373763084411621, "rewards/rejected": -35.320655822753906, "step": 3253 }, { "epoch": 0.44308278867102396, "grad_norm": 42.52205640923982, "learning_rate": 5.459088154259834e-07, "logits/chosen": 11.5491361618042, "logits/rejected": 11.440589904785156, "logps/chosen": -3.5229265689849854, "logps/rejected": -3.5806069374084473, "loss": 4.3244, "rewards/accuracies": 0.5, "rewards/chosen": -35.22926330566406, "rewards/margins": 0.5768041610717773, "rewards/rejected": -35.806068420410156, "step": 3254 }, { "epoch": 0.443218954248366, "grad_norm": 37.01105785546383, "learning_rate": 5.457317599375347e-07, "logits/chosen": 11.528165817260742, "logits/rejected": 12.063438415527344, "logps/chosen": -3.6404106616973877, "logps/rejected": -3.8008885383605957, "loss": 3.7069, "rewards/accuracies": 0.75, "rewards/chosen": -36.40410614013672, "rewards/margins": 1.6047792434692383, "rewards/rejected": -38.00888442993164, "step": 3255 }, { "epoch": 0.4433551198257081, "grad_norm": 40.753642667131146, "learning_rate": 5.455546715197903e-07, "logits/chosen": 10.523447036743164, "logits/rejected": 11.920137405395508, "logps/chosen": -3.2688910961151123, "logps/rejected": -3.7087907791137695, "loss": 4.0129, "rewards/accuracies": 0.75, "rewards/chosen": -32.68891143798828, "rewards/margins": 4.398995399475098, "rewards/rejected": -37.08790588378906, "step": 3256 }, { "epoch": 0.4434912854030501, "grad_norm": 40.55802432122838, "learning_rate": 5.453775502127647e-07, "logits/chosen": 12.200538635253906, "logits/rejected": 11.60420036315918, "logps/chosen": -3.6292166709899902, "logps/rejected": -3.704684257507324, "loss": 4.0722, "rewards/accuracies": 0.75, "rewards/chosen": -36.29216766357422, "rewards/margins": 0.7546753883361816, "rewards/rejected": -37.046844482421875, "step": 3257 }, { "epoch": 0.44362745098039214, "grad_norm": 38.88233152779957, "learning_rate": 5.4520039605648e-07, "logits/chosen": 13.330930709838867, "logits/rejected": 12.176355361938477, "logps/chosen": -3.962681293487549, "logps/rejected": -3.812473773956299, "loss": 3.671, "rewards/accuracies": 0.25, "rewards/chosen": -39.62681579589844, "rewards/margins": -1.5020761489868164, "rewards/rejected": -38.12474060058594, "step": 3258 }, { "epoch": 0.4437636165577342, "grad_norm": 71.28312041771973, "learning_rate": 5.450232090909654e-07, "logits/chosen": 12.104111671447754, "logits/rejected": 12.097421646118164, "logps/chosen": -3.666698455810547, "logps/rejected": -3.7770020961761475, "loss": 4.2697, "rewards/accuracies": 0.5, "rewards/chosen": -36.66698455810547, "rewards/margins": 1.1030349731445312, "rewards/rejected": -37.77001953125, "step": 3259 }, { "epoch": 0.44389978213507625, "grad_norm": 38.98087217958977, "learning_rate": 5.448459893562581e-07, "logits/chosen": 11.58969783782959, "logits/rejected": 11.556396484375, "logps/chosen": -3.311436891555786, "logps/rejected": -3.567166328430176, "loss": 3.6708, "rewards/accuracies": 0.75, "rewards/chosen": -33.1143684387207, "rewards/margins": 2.5572938919067383, "rewards/rejected": -35.671661376953125, "step": 3260 }, { "epoch": 0.4440359477124183, "grad_norm": 38.37912326219454, "learning_rate": 5.44668736892402e-07, "logits/chosen": 11.379069328308105, "logits/rejected": 12.573020935058594, "logps/chosen": -3.4981279373168945, "logps/rejected": -3.9101243019104004, "loss": 3.774, "rewards/accuracies": 0.75, "rewards/chosen": -34.98127746582031, "rewards/margins": 4.1199631690979, "rewards/rejected": -39.10124206542969, "step": 3261 }, { "epoch": 0.44417211328976036, "grad_norm": 39.15749579167488, "learning_rate": 5.444914517394491e-07, "logits/chosen": 12.05772590637207, "logits/rejected": 11.82867431640625, "logps/chosen": -3.6950125694274902, "logps/rejected": -3.7829864025115967, "loss": 3.9843, "rewards/accuracies": 0.5, "rewards/chosen": -36.95012664794922, "rewards/margins": 0.8797397613525391, "rewards/rejected": -37.829864501953125, "step": 3262 }, { "epoch": 0.4443082788671024, "grad_norm": 42.774532393824266, "learning_rate": 5.443141339374583e-07, "logits/chosen": 11.443922996520996, "logits/rejected": 11.287174224853516, "logps/chosen": -3.5329344272613525, "logps/rejected": -3.4063220024108887, "loss": 4.3402, "rewards/accuracies": 0.5, "rewards/chosen": -35.329345703125, "rewards/margins": -1.266125202178955, "rewards/rejected": -34.06321716308594, "step": 3263 }, { "epoch": 0.4444444444444444, "grad_norm": 38.96715334876277, "learning_rate": 5.44136783526496e-07, "logits/chosen": 10.501554489135742, "logits/rejected": 10.587709426879883, "logps/chosen": -3.5596399307250977, "logps/rejected": -3.3974547386169434, "loss": 3.6225, "rewards/accuracies": 0.25, "rewards/chosen": -35.596397399902344, "rewards/margins": -1.6218504905700684, "rewards/rejected": -33.97454833984375, "step": 3264 }, { "epoch": 0.4445806100217865, "grad_norm": 40.9296076690427, "learning_rate": 5.43959400546636e-07, "logits/chosen": 11.879911422729492, "logits/rejected": 13.036760330200195, "logps/chosen": -3.8484489917755127, "logps/rejected": -4.070675373077393, "loss": 4.1711, "rewards/accuracies": 0.5, "rewards/chosen": -38.48448944091797, "rewards/margins": 2.222261428833008, "rewards/rejected": -40.70675277709961, "step": 3265 }, { "epoch": 0.44471677559912853, "grad_norm": 45.500731286768364, "learning_rate": 5.437819850379594e-07, "logits/chosen": 11.94595718383789, "logits/rejected": 11.667693138122559, "logps/chosen": -3.8891749382019043, "logps/rejected": -3.7377607822418213, "loss": 4.2671, "rewards/accuracies": 0.5, "rewards/chosen": -38.891746520996094, "rewards/margins": -1.514139175415039, "rewards/rejected": -37.37760925292969, "step": 3266 }, { "epoch": 0.44485294117647056, "grad_norm": 39.39664378343658, "learning_rate": 5.43604537040555e-07, "logits/chosen": 11.192422866821289, "logits/rejected": 11.456832885742188, "logps/chosen": -3.664641857147217, "logps/rejected": -3.4892702102661133, "loss": 3.7787, "rewards/accuracies": 0.0, "rewards/chosen": -36.64641571044922, "rewards/margins": -1.7537150382995605, "rewards/rejected": -34.8927001953125, "step": 3267 }, { "epoch": 0.44498910675381265, "grad_norm": 37.19694267585269, "learning_rate": 5.434270565945181e-07, "logits/chosen": 11.995662689208984, "logits/rejected": 12.018308639526367, "logps/chosen": -3.5993874073028564, "logps/rejected": -3.6596851348876953, "loss": 4.0365, "rewards/accuracies": 0.5, "rewards/chosen": -35.993873596191406, "rewards/margins": 0.6029801368713379, "rewards/rejected": -36.59685516357422, "step": 3268 }, { "epoch": 0.4451252723311547, "grad_norm": 51.64919085216926, "learning_rate": 5.432495437399524e-07, "logits/chosen": 11.399150848388672, "logits/rejected": 11.114823341369629, "logps/chosen": -3.9403228759765625, "logps/rejected": -3.7474231719970703, "loss": 3.8853, "rewards/accuracies": 0.5, "rewards/chosen": -39.403228759765625, "rewards/margins": -1.9289956092834473, "rewards/rejected": -37.47422790527344, "step": 3269 }, { "epoch": 0.4452614379084967, "grad_norm": 44.291449780304085, "learning_rate": 5.430719985169681e-07, "logits/chosen": 12.289253234863281, "logits/rejected": 12.21464729309082, "logps/chosen": -3.703805446624756, "logps/rejected": -3.8713314533233643, "loss": 4.0623, "rewards/accuracies": 0.5, "rewards/chosen": -37.038055419921875, "rewards/margins": 1.6752614974975586, "rewards/rejected": -38.71331787109375, "step": 3270 }, { "epoch": 0.4453976034858388, "grad_norm": 48.465757158528255, "learning_rate": 5.428944209656831e-07, "logits/chosen": 11.413712501525879, "logits/rejected": 11.24728012084961, "logps/chosen": -3.6727042198181152, "logps/rejected": -3.6014857292175293, "loss": 4.284, "rewards/accuracies": 0.5, "rewards/chosen": -36.72704315185547, "rewards/margins": -0.7121868133544922, "rewards/rejected": -36.014854431152344, "step": 3271 }, { "epoch": 0.4455337690631808, "grad_norm": 43.77775693053812, "learning_rate": 5.427168111262225e-07, "logits/chosen": 12.664066314697266, "logits/rejected": 12.223762512207031, "logps/chosen": -3.621170997619629, "logps/rejected": -3.4554052352905273, "loss": 4.2797, "rewards/accuracies": 0.25, "rewards/chosen": -36.211708068847656, "rewards/margins": -1.6576533317565918, "rewards/rejected": -34.554054260253906, "step": 3272 }, { "epoch": 0.4456699346405229, "grad_norm": 51.93938749609234, "learning_rate": 5.425391690387187e-07, "logits/chosen": 11.878671646118164, "logits/rejected": 12.098355293273926, "logps/chosen": -3.9483184814453125, "logps/rejected": -3.9786312580108643, "loss": 3.8259, "rewards/accuracies": 0.75, "rewards/chosen": -39.483184814453125, "rewards/margins": 0.3031291961669922, "rewards/rejected": -39.78631591796875, "step": 3273 }, { "epoch": 0.44580610021786493, "grad_norm": 43.683166424690675, "learning_rate": 5.423614947433115e-07, "logits/chosen": 11.358657836914062, "logits/rejected": 12.321455001831055, "logps/chosen": -3.2648532390594482, "logps/rejected": -4.084358215332031, "loss": 4.4261, "rewards/accuracies": 1.0, "rewards/chosen": -32.64853286743164, "rewards/margins": 8.195048332214355, "rewards/rejected": -40.84358215332031, "step": 3274 }, { "epoch": 0.44594226579520696, "grad_norm": 45.247892005206836, "learning_rate": 5.421837882801477e-07, "logits/chosen": 12.319190979003906, "logits/rejected": 12.135429382324219, "logps/chosen": -3.7142481803894043, "logps/rejected": -4.000479221343994, "loss": 3.9098, "rewards/accuracies": 0.75, "rewards/chosen": -37.14248275756836, "rewards/margins": 2.862309455871582, "rewards/rejected": -40.004791259765625, "step": 3275 }, { "epoch": 0.44607843137254904, "grad_norm": 42.460627719084634, "learning_rate": 5.420060496893818e-07, "logits/chosen": 12.695833206176758, "logits/rejected": 12.778505325317383, "logps/chosen": -3.947857141494751, "logps/rejected": -4.073494911193848, "loss": 4.0443, "rewards/accuracies": 1.0, "rewards/chosen": -39.47856903076172, "rewards/margins": 1.2563791275024414, "rewards/rejected": -40.73495101928711, "step": 3276 }, { "epoch": 0.4462145969498911, "grad_norm": 45.409931870536504, "learning_rate": 5.418282790111748e-07, "logits/chosen": 12.184606552124023, "logits/rejected": 12.699227333068848, "logps/chosen": -3.4437379837036133, "logps/rejected": -3.641573667526245, "loss": 3.7997, "rewards/accuracies": 0.75, "rewards/chosen": -34.4373779296875, "rewards/margins": 1.9783573150634766, "rewards/rejected": -36.41573715209961, "step": 3277 }, { "epoch": 0.4463507625272331, "grad_norm": 39.29272263953686, "learning_rate": 5.416504762856961e-07, "logits/chosen": 11.411872863769531, "logits/rejected": 12.183149337768555, "logps/chosen": -3.3062000274658203, "logps/rejected": -3.964881658554077, "loss": 3.3989, "rewards/accuracies": 1.0, "rewards/chosen": -33.0620002746582, "rewards/margins": 6.58681583404541, "rewards/rejected": -39.64881896972656, "step": 3278 }, { "epoch": 0.4464869281045752, "grad_norm": 44.943916840263434, "learning_rate": 5.414726415531213e-07, "logits/chosen": 11.882475852966309, "logits/rejected": 12.21280288696289, "logps/chosen": -3.705094814300537, "logps/rejected": -4.001079559326172, "loss": 4.0496, "rewards/accuracies": 0.75, "rewards/chosen": -37.05094909667969, "rewards/margins": 2.9598493576049805, "rewards/rejected": -40.01079559326172, "step": 3279 }, { "epoch": 0.4466230936819172, "grad_norm": 43.554640017248516, "learning_rate": 5.412947748536337e-07, "logits/chosen": 11.915605545043945, "logits/rejected": 11.89805793762207, "logps/chosen": -3.6519699096679688, "logps/rejected": -3.9856467247009277, "loss": 3.8033, "rewards/accuracies": 0.5, "rewards/chosen": -36.51969909667969, "rewards/margins": 3.3367691040039062, "rewards/rejected": -39.856468200683594, "step": 3280 }, { "epoch": 0.44675925925925924, "grad_norm": 43.37007624385939, "learning_rate": 5.411168762274238e-07, "logits/chosen": 12.497661590576172, "logits/rejected": 11.986326217651367, "logps/chosen": -3.6523244380950928, "logps/rejected": -3.669506311416626, "loss": 3.7159, "rewards/accuracies": 0.75, "rewards/chosen": -36.52324295043945, "rewards/margins": 0.17181730270385742, "rewards/rejected": -36.69506072998047, "step": 3281 }, { "epoch": 0.44689542483660133, "grad_norm": 41.37431371260132, "learning_rate": 5.409389457146891e-07, "logits/chosen": 12.55249309539795, "logits/rejected": 12.737373352050781, "logps/chosen": -3.633131504058838, "logps/rejected": -3.798003911972046, "loss": 3.3454, "rewards/accuracies": 0.75, "rewards/chosen": -36.33131408691406, "rewards/margins": 1.6487231254577637, "rewards/rejected": -37.98004150390625, "step": 3282 }, { "epoch": 0.44703159041394336, "grad_norm": 41.95833857319995, "learning_rate": 5.407609833556348e-07, "logits/chosen": 11.828615188598633, "logits/rejected": 11.860872268676758, "logps/chosen": -3.7995505332946777, "logps/rejected": -3.8676536083221436, "loss": 4.4328, "rewards/accuracies": 0.75, "rewards/chosen": -37.995506286621094, "rewards/margins": 0.6810312271118164, "rewards/rejected": -38.676536560058594, "step": 3283 }, { "epoch": 0.4471677559912854, "grad_norm": 41.128608485005046, "learning_rate": 5.405829891904727e-07, "logits/chosen": 12.869179725646973, "logits/rejected": 12.210868835449219, "logps/chosen": -3.950277805328369, "logps/rejected": -4.0183000564575195, "loss": 3.9527, "rewards/accuracies": 0.5, "rewards/chosen": -39.502777099609375, "rewards/margins": 0.6802253723144531, "rewards/rejected": -40.183006286621094, "step": 3284 }, { "epoch": 0.44730392156862747, "grad_norm": 45.16862060522999, "learning_rate": 5.404049632594221e-07, "logits/chosen": 12.145421981811523, "logits/rejected": 12.619623184204102, "logps/chosen": -3.6261446475982666, "logps/rejected": -3.8547728061676025, "loss": 3.856, "rewards/accuracies": 0.75, "rewards/chosen": -36.261444091796875, "rewards/margins": 2.2862815856933594, "rewards/rejected": -38.5477294921875, "step": 3285 }, { "epoch": 0.4474400871459695, "grad_norm": 42.39607521965418, "learning_rate": 5.402269056027094e-07, "logits/chosen": 12.650949478149414, "logits/rejected": 12.521112442016602, "logps/chosen": -3.7603530883789062, "logps/rejected": -3.8211865425109863, "loss": 4.0433, "rewards/accuracies": 0.5, "rewards/chosen": -37.60353088378906, "rewards/margins": 0.6083364486694336, "rewards/rejected": -38.21186447143555, "step": 3286 }, { "epoch": 0.44757625272331153, "grad_norm": 46.99639288729564, "learning_rate": 5.400488162605684e-07, "logits/chosen": 10.883243560791016, "logits/rejected": 12.539440155029297, "logps/chosen": -3.5748744010925293, "logps/rejected": -4.150556564331055, "loss": 4.0294, "rewards/accuracies": 1.0, "rewards/chosen": -35.748741149902344, "rewards/margins": 5.756820201873779, "rewards/rejected": -41.50556182861328, "step": 3287 }, { "epoch": 0.4477124183006536, "grad_norm": 51.41429939105844, "learning_rate": 5.398706952732396e-07, "logits/chosen": 11.252199172973633, "logits/rejected": 12.010986328125, "logps/chosen": -3.4825103282928467, "logps/rejected": -3.591193199157715, "loss": 4.302, "rewards/accuracies": 0.75, "rewards/chosen": -34.825103759765625, "rewards/margins": 1.086827278137207, "rewards/rejected": -35.911930084228516, "step": 3288 }, { "epoch": 0.44784858387799564, "grad_norm": 54.80865060038196, "learning_rate": 5.396925426809709e-07, "logits/chosen": 11.060140609741211, "logits/rejected": 12.63404369354248, "logps/chosen": -3.482736349105835, "logps/rejected": -3.781865119934082, "loss": 2.9827, "rewards/accuracies": 0.75, "rewards/chosen": -34.82736587524414, "rewards/margins": 2.991288661956787, "rewards/rejected": -37.81865310668945, "step": 3289 }, { "epoch": 0.44798474945533767, "grad_norm": 48.502421955892416, "learning_rate": 5.395143585240178e-07, "logits/chosen": 12.20252799987793, "logits/rejected": 11.960296630859375, "logps/chosen": -3.525783061981201, "logps/rejected": -3.6146960258483887, "loss": 4.6376, "rewards/accuracies": 0.5, "rewards/chosen": -35.25782775878906, "rewards/margins": 0.8891305923461914, "rewards/rejected": -36.1469612121582, "step": 3290 }, { "epoch": 0.44812091503267976, "grad_norm": 43.44860122150233, "learning_rate": 5.393361428426419e-07, "logits/chosen": 11.541865348815918, "logits/rejected": 12.614497184753418, "logps/chosen": -3.6808722019195557, "logps/rejected": -3.6405482292175293, "loss": 3.8967, "rewards/accuracies": 0.25, "rewards/chosen": -36.80872344970703, "rewards/margins": -0.4032421112060547, "rewards/rejected": -36.405479431152344, "step": 3291 }, { "epoch": 0.4482570806100218, "grad_norm": 49.03589939843623, "learning_rate": 5.391578956771127e-07, "logits/chosen": 11.91088581085205, "logits/rejected": 12.75193977355957, "logps/chosen": -3.5072455406188965, "logps/rejected": -3.8564820289611816, "loss": 4.4808, "rewards/accuracies": 0.75, "rewards/chosen": -35.07245635986328, "rewards/margins": 3.4923653602600098, "rewards/rejected": -38.5648193359375, "step": 3292 }, { "epoch": 0.4483932461873638, "grad_norm": 46.85454337918073, "learning_rate": 5.38979617067707e-07, "logits/chosen": 11.381628036499023, "logits/rejected": 11.725374221801758, "logps/chosen": -3.4554624557495117, "logps/rejected": -3.7312841415405273, "loss": 3.9629, "rewards/accuracies": 0.75, "rewards/chosen": -34.55462646484375, "rewards/margins": 2.758216381072998, "rewards/rejected": -37.312843322753906, "step": 3293 }, { "epoch": 0.4485294117647059, "grad_norm": 47.49518404249649, "learning_rate": 5.388013070547078e-07, "logits/chosen": 12.042547225952148, "logits/rejected": 13.738862991333008, "logps/chosen": -3.5740866661071777, "logps/rejected": -3.851820468902588, "loss": 3.8746, "rewards/accuracies": 1.0, "rewards/chosen": -35.740867614746094, "rewards/margins": 2.777337074279785, "rewards/rejected": -38.51820373535156, "step": 3294 }, { "epoch": 0.4486655773420479, "grad_norm": 55.532669364182944, "learning_rate": 5.386229656784058e-07, "logits/chosen": 13.179028511047363, "logits/rejected": 12.831448554992676, "logps/chosen": -3.905587911605835, "logps/rejected": -4.194244384765625, "loss": 4.5687, "rewards/accuracies": 0.75, "rewards/chosen": -39.055877685546875, "rewards/margins": 2.886563301086426, "rewards/rejected": -41.94244384765625, "step": 3295 }, { "epoch": 0.44880174291938996, "grad_norm": 48.67896710366081, "learning_rate": 5.38444592979099e-07, "logits/chosen": 13.027257919311523, "logits/rejected": 13.4690580368042, "logps/chosen": -3.9099931716918945, "logps/rejected": -3.9834327697753906, "loss": 4.0618, "rewards/accuracies": 0.75, "rewards/chosen": -39.09992980957031, "rewards/margins": 0.7343931198120117, "rewards/rejected": -39.83432388305664, "step": 3296 }, { "epoch": 0.44893790849673204, "grad_norm": 44.17504112286811, "learning_rate": 5.382661889970921e-07, "logits/chosen": 13.128421783447266, "logits/rejected": 11.99659252166748, "logps/chosen": -4.168913841247559, "logps/rejected": -3.9133758544921875, "loss": 4.1596, "rewards/accuracies": 0.25, "rewards/chosen": -41.68914031982422, "rewards/margins": -2.5553789138793945, "rewards/rejected": -39.133758544921875, "step": 3297 }, { "epoch": 0.44907407407407407, "grad_norm": 44.7132369742489, "learning_rate": 5.380877537726966e-07, "logits/chosen": 12.584524154663086, "logits/rejected": 12.851057052612305, "logps/chosen": -3.679658889770508, "logps/rejected": -3.8904268741607666, "loss": 3.8965, "rewards/accuracies": 0.75, "rewards/chosen": -36.79658889770508, "rewards/margins": 2.1076793670654297, "rewards/rejected": -38.904266357421875, "step": 3298 }, { "epoch": 0.4492102396514161, "grad_norm": 41.98752176465873, "learning_rate": 5.379092873462319e-07, "logits/chosen": 11.606311798095703, "logits/rejected": 11.838432312011719, "logps/chosen": -3.422452688217163, "logps/rejected": -3.557445764541626, "loss": 3.8093, "rewards/accuracies": 0.5, "rewards/chosen": -34.22452926635742, "rewards/margins": 1.349930763244629, "rewards/rejected": -35.574459075927734, "step": 3299 }, { "epoch": 0.4493464052287582, "grad_norm": 43.14413269137607, "learning_rate": 5.377307897580238e-07, "logits/chosen": 12.36050033569336, "logits/rejected": 12.492757797241211, "logps/chosen": -3.939664363861084, "logps/rejected": -4.006279945373535, "loss": 3.9708, "rewards/accuracies": 0.25, "rewards/chosen": -39.396644592285156, "rewards/margins": 0.6661539077758789, "rewards/rejected": -40.06279754638672, "step": 3300 }, { "epoch": 0.4494825708061002, "grad_norm": 39.75004588162795, "learning_rate": 5.375522610484051e-07, "logits/chosen": 12.00485610961914, "logits/rejected": 12.55975341796875, "logps/chosen": -3.335023880004883, "logps/rejected": -3.7959647178649902, "loss": 3.9401, "rewards/accuracies": 1.0, "rewards/chosen": -33.35023498535156, "rewards/margins": 4.609408378601074, "rewards/rejected": -37.95964813232422, "step": 3301 }, { "epoch": 0.44961873638344224, "grad_norm": 42.3553693268486, "learning_rate": 5.373737012577161e-07, "logits/chosen": 12.114786148071289, "logits/rejected": 12.373607635498047, "logps/chosen": -4.128081321716309, "logps/rejected": -3.84126615524292, "loss": 4.1663, "rewards/accuracies": 0.5, "rewards/chosen": -41.28081512451172, "rewards/margins": -2.868152618408203, "rewards/rejected": -38.41265869140625, "step": 3302 }, { "epoch": 0.4497549019607843, "grad_norm": 46.12841100646754, "learning_rate": 5.371951104263037e-07, "logits/chosen": 11.690505981445312, "logits/rejected": 12.409335136413574, "logps/chosen": -3.5273773670196533, "logps/rejected": -3.7050561904907227, "loss": 4.2959, "rewards/accuracies": 0.75, "rewards/chosen": -35.273773193359375, "rewards/margins": 1.7767877578735352, "rewards/rejected": -37.050559997558594, "step": 3303 }, { "epoch": 0.44989106753812635, "grad_norm": 38.84467208141676, "learning_rate": 5.37016488594522e-07, "logits/chosen": 11.596672058105469, "logits/rejected": 12.062446594238281, "logps/chosen": -3.7795932292938232, "logps/rejected": -3.9374563694000244, "loss": 3.6302, "rewards/accuracies": 0.5, "rewards/chosen": -37.795936584472656, "rewards/margins": 1.5786314010620117, "rewards/rejected": -39.37456512451172, "step": 3304 }, { "epoch": 0.4500272331154684, "grad_norm": 43.127035460765974, "learning_rate": 5.368378358027322e-07, "logits/chosen": 12.375846862792969, "logits/rejected": 12.126626968383789, "logps/chosen": -3.812985420227051, "logps/rejected": -3.834397077560425, "loss": 3.7611, "rewards/accuracies": 0.75, "rewards/chosen": -38.12985610961914, "rewards/margins": 0.21411514282226562, "rewards/rejected": -38.343971252441406, "step": 3305 }, { "epoch": 0.45016339869281047, "grad_norm": 38.58919065799729, "learning_rate": 5.366591520913024e-07, "logits/chosen": 12.200740814208984, "logits/rejected": 12.767505645751953, "logps/chosen": -3.462839126586914, "logps/rejected": -3.832200288772583, "loss": 4.0997, "rewards/accuracies": 0.75, "rewards/chosen": -34.62839126586914, "rewards/margins": 3.6936120986938477, "rewards/rejected": -38.32200241088867, "step": 3306 }, { "epoch": 0.4502995642701525, "grad_norm": 40.42706535264678, "learning_rate": 5.364804375006072e-07, "logits/chosen": 11.497604370117188, "logits/rejected": 12.24514389038086, "logps/chosen": -3.491184711456299, "logps/rejected": -3.901614189147949, "loss": 4.1574, "rewards/accuracies": 1.0, "rewards/chosen": -34.91184997558594, "rewards/margins": 4.104294776916504, "rewards/rejected": -39.016143798828125, "step": 3307 }, { "epoch": 0.4504357298474945, "grad_norm": 42.15361028760141, "learning_rate": 5.363016920710294e-07, "logits/chosen": 12.166011810302734, "logits/rejected": 12.765434265136719, "logps/chosen": -3.698969841003418, "logps/rejected": -4.0951104164123535, "loss": 4.2998, "rewards/accuracies": 0.75, "rewards/chosen": -36.98969650268555, "rewards/margins": 3.9614062309265137, "rewards/rejected": -40.95110321044922, "step": 3308 }, { "epoch": 0.4505718954248366, "grad_norm": 39.123380648143076, "learning_rate": 5.361229158429573e-07, "logits/chosen": 11.940580368041992, "logits/rejected": 12.18350601196289, "logps/chosen": -3.4267702102661133, "logps/rejected": -3.7324748039245605, "loss": 3.8373, "rewards/accuracies": 0.75, "rewards/chosen": -34.2677001953125, "rewards/margins": 3.0570459365844727, "rewards/rejected": -37.324745178222656, "step": 3309 }, { "epoch": 0.45070806100217864, "grad_norm": 39.415447693637695, "learning_rate": 5.359441088567872e-07, "logits/chosen": 11.704290390014648, "logits/rejected": 12.62682819366455, "logps/chosen": -3.744248151779175, "logps/rejected": -3.9472198486328125, "loss": 3.8866, "rewards/accuracies": 0.75, "rewards/chosen": -37.442481994628906, "rewards/margins": 2.0297164916992188, "rewards/rejected": -39.472198486328125, "step": 3310 }, { "epoch": 0.4508442265795207, "grad_norm": 66.16101968193024, "learning_rate": 5.357652711529221e-07, "logits/chosen": 11.618368148803711, "logits/rejected": 12.231115341186523, "logps/chosen": -3.4034230709075928, "logps/rejected": -3.5848312377929688, "loss": 3.6192, "rewards/accuracies": 0.5, "rewards/chosen": -34.03423309326172, "rewards/margins": 1.8140811920166016, "rewards/rejected": -35.84831237792969, "step": 3311 }, { "epoch": 0.45098039215686275, "grad_norm": 57.29173956357481, "learning_rate": 5.355864027717717e-07, "logits/chosen": 12.22265625, "logits/rejected": 11.963029861450195, "logps/chosen": -3.803647518157959, "logps/rejected": -3.976776361465454, "loss": 3.8548, "rewards/accuracies": 0.75, "rewards/chosen": -38.036476135253906, "rewards/margins": 1.7312870025634766, "rewards/rejected": -39.767765045166016, "step": 3312 }, { "epoch": 0.4511165577342048, "grad_norm": 36.99381996328578, "learning_rate": 5.354075037537527e-07, "logits/chosen": 11.551752090454102, "logits/rejected": 12.49758529663086, "logps/chosen": -3.6948366165161133, "logps/rejected": -3.892664670944214, "loss": 4.0921, "rewards/accuracies": 0.5, "rewards/chosen": -36.9483642578125, "rewards/margins": 1.9782791137695312, "rewards/rejected": -38.92664337158203, "step": 3313 }, { "epoch": 0.45125272331154687, "grad_norm": 41.09448424888812, "learning_rate": 5.35228574139289e-07, "logits/chosen": 12.279787063598633, "logits/rejected": 12.41286849975586, "logps/chosen": -3.287365436553955, "logps/rejected": -3.5547852516174316, "loss": 4.2978, "rewards/accuracies": 0.75, "rewards/chosen": -32.8736572265625, "rewards/margins": 2.674196720123291, "rewards/rejected": -35.5478515625, "step": 3314 }, { "epoch": 0.4513888888888889, "grad_norm": 44.623457281288644, "learning_rate": 5.350496139688112e-07, "logits/chosen": 11.624902725219727, "logits/rejected": 12.240243911743164, "logps/chosen": -3.1847105026245117, "logps/rejected": -3.69235897064209, "loss": 3.5046, "rewards/accuracies": 0.75, "rewards/chosen": -31.847105026245117, "rewards/margins": 5.076484680175781, "rewards/rejected": -36.92359161376953, "step": 3315 }, { "epoch": 0.4515250544662309, "grad_norm": 46.34978332974565, "learning_rate": 5.348706232827569e-07, "logits/chosen": 12.183792114257812, "logits/rejected": 12.960570335388184, "logps/chosen": -4.051898956298828, "logps/rejected": -3.9912452697753906, "loss": 4.2605, "rewards/accuracies": 0.5, "rewards/chosen": -40.51898956298828, "rewards/margins": -0.6065387725830078, "rewards/rejected": -39.912452697753906, "step": 3316 }, { "epoch": 0.451661220043573, "grad_norm": 41.987570436917494, "learning_rate": 5.346916021215702e-07, "logits/chosen": 11.749786376953125, "logits/rejected": 12.928302764892578, "logps/chosen": -3.6328978538513184, "logps/rejected": -3.8828744888305664, "loss": 4.1696, "rewards/accuracies": 0.5, "rewards/chosen": -36.328975677490234, "rewards/margins": 2.4997682571411133, "rewards/rejected": -38.82874298095703, "step": 3317 }, { "epoch": 0.45179738562091504, "grad_norm": 38.47243895905539, "learning_rate": 5.345125505257028e-07, "logits/chosen": 11.894529342651367, "logits/rejected": 13.333023071289062, "logps/chosen": -3.3544342517852783, "logps/rejected": -3.897935390472412, "loss": 3.9451, "rewards/accuracies": 1.0, "rewards/chosen": -33.544342041015625, "rewards/margins": 5.43501091003418, "rewards/rejected": -38.97935485839844, "step": 3318 }, { "epoch": 0.45193355119825707, "grad_norm": 54.09314019470622, "learning_rate": 5.343334685356126e-07, "logits/chosen": 12.850021362304688, "logits/rejected": 12.46993350982666, "logps/chosen": -3.634139060974121, "logps/rejected": -4.029174327850342, "loss": 4.1693, "rewards/accuracies": 0.75, "rewards/chosen": -36.341392517089844, "rewards/margins": 3.9503531455993652, "rewards/rejected": -40.291744232177734, "step": 3319 }, { "epoch": 0.45206971677559915, "grad_norm": 43.011199348705674, "learning_rate": 5.341543561917648e-07, "logits/chosen": 12.038776397705078, "logits/rejected": 12.534502029418945, "logps/chosen": -3.719761610031128, "logps/rejected": -3.8855366706848145, "loss": 3.6353, "rewards/accuracies": 0.75, "rewards/chosen": -37.19761657714844, "rewards/margins": 1.657752513885498, "rewards/rejected": -38.855369567871094, "step": 3320 }, { "epoch": 0.4522058823529412, "grad_norm": 79.70759181160754, "learning_rate": 5.339752135346313e-07, "logits/chosen": 12.687021255493164, "logits/rejected": 13.015199661254883, "logps/chosen": -3.7617886066436768, "logps/rejected": -3.937654972076416, "loss": 4.3013, "rewards/accuracies": 0.75, "rewards/chosen": -37.61788558959961, "rewards/margins": 1.7586641311645508, "rewards/rejected": -39.376548767089844, "step": 3321 }, { "epoch": 0.4523420479302832, "grad_norm": 40.26573252437262, "learning_rate": 5.337960406046909e-07, "logits/chosen": 12.288516998291016, "logits/rejected": 13.075908660888672, "logps/chosen": -3.7636775970458984, "logps/rejected": -4.123052597045898, "loss": 4.1119, "rewards/accuracies": 0.75, "rewards/chosen": -37.636775970458984, "rewards/margins": 3.5937509536743164, "rewards/rejected": -41.23052978515625, "step": 3322 }, { "epoch": 0.4524782135076253, "grad_norm": 45.44727454342077, "learning_rate": 5.336168374424291e-07, "logits/chosen": 11.860246658325195, "logits/rejected": 11.94219970703125, "logps/chosen": -3.6565184593200684, "logps/rejected": -3.7029309272766113, "loss": 4.3994, "rewards/accuracies": 0.5, "rewards/chosen": -36.565181732177734, "rewards/margins": 0.4641265869140625, "rewards/rejected": -37.02931213378906, "step": 3323 }, { "epoch": 0.4526143790849673, "grad_norm": 44.13436505559247, "learning_rate": 5.334376040883384e-07, "logits/chosen": 12.510379791259766, "logits/rejected": 12.731191635131836, "logps/chosen": -3.8897266387939453, "logps/rejected": -3.830641984939575, "loss": 4.5353, "rewards/accuracies": 0.5, "rewards/chosen": -38.89726257324219, "rewards/margins": -0.5908451080322266, "rewards/rejected": -38.306419372558594, "step": 3324 }, { "epoch": 0.45275054466230935, "grad_norm": 36.95828052914994, "learning_rate": 5.33258340582918e-07, "logits/chosen": 12.157089233398438, "logits/rejected": 12.395182609558105, "logps/chosen": -3.7171835899353027, "logps/rejected": -3.8599517345428467, "loss": 3.7092, "rewards/accuracies": 0.75, "rewards/chosen": -37.171836853027344, "rewards/margins": 1.4276800155639648, "rewards/rejected": -38.599517822265625, "step": 3325 }, { "epoch": 0.45288671023965144, "grad_norm": 100.03355998524165, "learning_rate": 5.330790469666742e-07, "logits/chosen": 12.504005432128906, "logits/rejected": 12.602596282958984, "logps/chosen": -3.7361700534820557, "logps/rejected": -4.201509475708008, "loss": 3.6699, "rewards/accuracies": 1.0, "rewards/chosen": -37.36170196533203, "rewards/margins": 4.653390884399414, "rewards/rejected": -42.01509094238281, "step": 3326 }, { "epoch": 0.45302287581699346, "grad_norm": 40.35983386591998, "learning_rate": 5.328997232801195e-07, "logits/chosen": 12.245917320251465, "logits/rejected": 12.904108047485352, "logps/chosen": -3.7521369457244873, "logps/rejected": -4.1912689208984375, "loss": 4.2443, "rewards/accuracies": 1.0, "rewards/chosen": -37.52136993408203, "rewards/margins": 4.391322135925293, "rewards/rejected": -41.912689208984375, "step": 3327 }, { "epoch": 0.4531590413943355, "grad_norm": 38.35830624682495, "learning_rate": 5.327203695637738e-07, "logits/chosen": 12.081085205078125, "logits/rejected": 12.61504077911377, "logps/chosen": -3.5979385375976562, "logps/rejected": -3.916785478591919, "loss": 3.9993, "rewards/accuracies": 0.5, "rewards/chosen": -35.97938537597656, "rewards/margins": 3.188469409942627, "rewards/rejected": -39.16785430908203, "step": 3328 }, { "epoch": 0.4532952069716776, "grad_norm": 39.23495367586424, "learning_rate": 5.325409858581636e-07, "logits/chosen": 12.720917701721191, "logits/rejected": 13.08431339263916, "logps/chosen": -3.7217612266540527, "logps/rejected": -3.7470083236694336, "loss": 3.3557, "rewards/accuracies": 0.75, "rewards/chosen": -37.21760940551758, "rewards/margins": 0.2524728775024414, "rewards/rejected": -37.4700813293457, "step": 3329 }, { "epoch": 0.4534313725490196, "grad_norm": 38.86979177224782, "learning_rate": 5.32361572203822e-07, "logits/chosen": 12.376304626464844, "logits/rejected": 12.820598602294922, "logps/chosen": -3.766183376312256, "logps/rejected": -4.361769199371338, "loss": 3.641, "rewards/accuracies": 1.0, "rewards/chosen": -37.661834716796875, "rewards/margins": 5.955859184265137, "rewards/rejected": -43.61769104003906, "step": 3330 }, { "epoch": 0.45356753812636164, "grad_norm": 51.95161256846664, "learning_rate": 5.32182128641289e-07, "logits/chosen": 10.514961242675781, "logits/rejected": 11.421280860900879, "logps/chosen": -3.34712553024292, "logps/rejected": -3.7875306606292725, "loss": 4.2453, "rewards/accuracies": 1.0, "rewards/chosen": -33.471256256103516, "rewards/margins": 4.404050827026367, "rewards/rejected": -37.87530517578125, "step": 3331 }, { "epoch": 0.4537037037037037, "grad_norm": 40.16594242529411, "learning_rate": 5.320026552111115e-07, "logits/chosen": 12.025083541870117, "logits/rejected": 13.23661994934082, "logps/chosen": -3.908750534057617, "logps/rejected": -4.145949363708496, "loss": 4.097, "rewards/accuracies": 0.25, "rewards/chosen": -39.087501525878906, "rewards/margins": 2.3719897270202637, "rewards/rejected": -41.459495544433594, "step": 3332 }, { "epoch": 0.45383986928104575, "grad_norm": 38.311872936806274, "learning_rate": 5.318231519538429e-07, "logits/chosen": 11.74582290649414, "logits/rejected": 12.578132629394531, "logps/chosen": -3.7702107429504395, "logps/rejected": -3.995371103286743, "loss": 3.9984, "rewards/accuracies": 0.5, "rewards/chosen": -37.702110290527344, "rewards/margins": 2.251603126525879, "rewards/rejected": -39.953712463378906, "step": 3333 }, { "epoch": 0.4539760348583878, "grad_norm": 40.278298149439834, "learning_rate": 5.316436189100434e-07, "logits/chosen": 12.54178237915039, "logits/rejected": 12.434086799621582, "logps/chosen": -3.8695192337036133, "logps/rejected": -4.091255187988281, "loss": 4.4715, "rewards/accuracies": 0.5, "rewards/chosen": -38.6951904296875, "rewards/margins": 2.217357635498047, "rewards/rejected": -40.91254806518555, "step": 3334 }, { "epoch": 0.45411220043572986, "grad_norm": 42.68156870532942, "learning_rate": 5.314640561202801e-07, "logits/chosen": 12.163434982299805, "logits/rejected": 12.963239669799805, "logps/chosen": -3.910871982574463, "logps/rejected": -4.159568786621094, "loss": 4.545, "rewards/accuracies": 0.75, "rewards/chosen": -39.10871887207031, "rewards/margins": 2.4869699478149414, "rewards/rejected": -41.5956916809082, "step": 3335 }, { "epoch": 0.4542483660130719, "grad_norm": 38.42244077810904, "learning_rate": 5.312844636251266e-07, "logits/chosen": 12.265748023986816, "logits/rejected": 12.238750457763672, "logps/chosen": -4.028924942016602, "logps/rejected": -3.8168044090270996, "loss": 3.5293, "rewards/accuracies": 0.25, "rewards/chosen": -40.289249420166016, "rewards/margins": -2.121204376220703, "rewards/rejected": -38.16804504394531, "step": 3336 }, { "epoch": 0.4543845315904139, "grad_norm": 40.06474196633496, "learning_rate": 5.311048414651634e-07, "logits/chosen": 11.75798225402832, "logits/rejected": 12.071493148803711, "logps/chosen": -3.602612018585205, "logps/rejected": -4.015848159790039, "loss": 4.3962, "rewards/accuracies": 0.75, "rewards/chosen": -36.026119232177734, "rewards/margins": 4.132360458374023, "rewards/rejected": -40.158477783203125, "step": 3337 }, { "epoch": 0.454520697167756, "grad_norm": 42.413937960891374, "learning_rate": 5.309251896809774e-07, "logits/chosen": 12.57927131652832, "logits/rejected": 12.339738845825195, "logps/chosen": -3.755152940750122, "logps/rejected": -3.9508397579193115, "loss": 4.0345, "rewards/accuracies": 0.5, "rewards/chosen": -37.55152893066406, "rewards/margins": 1.9568681716918945, "rewards/rejected": -39.50839614868164, "step": 3338 }, { "epoch": 0.45465686274509803, "grad_norm": 39.203292158017675, "learning_rate": 5.307455083131627e-07, "logits/chosen": 12.62319278717041, "logits/rejected": 12.185844421386719, "logps/chosen": -3.9543404579162598, "logps/rejected": -3.9624099731445312, "loss": 4.2922, "rewards/accuracies": 0.5, "rewards/chosen": -39.54340362548828, "rewards/margins": 0.08069419860839844, "rewards/rejected": -39.62409973144531, "step": 3339 }, { "epoch": 0.45479302832244006, "grad_norm": 42.03732224350431, "learning_rate": 5.305657974023194e-07, "logits/chosen": 12.112590789794922, "logits/rejected": 12.350981712341309, "logps/chosen": -3.9605588912963867, "logps/rejected": -3.8101325035095215, "loss": 4.4191, "rewards/accuracies": 0.25, "rewards/chosen": -39.605587005615234, "rewards/margins": -1.5042648315429688, "rewards/rejected": -38.101322174072266, "step": 3340 }, { "epoch": 0.45492919389978215, "grad_norm": 50.593158845862924, "learning_rate": 5.30386056989055e-07, "logits/chosen": 12.699705123901367, "logits/rejected": 12.821460723876953, "logps/chosen": -3.8690624237060547, "logps/rejected": -4.242857933044434, "loss": 4.1149, "rewards/accuracies": 0.75, "rewards/chosen": -38.69062423706055, "rewards/margins": 3.7379579544067383, "rewards/rejected": -42.42858123779297, "step": 3341 }, { "epoch": 0.4550653594771242, "grad_norm": 38.854749217695634, "learning_rate": 5.302062871139835e-07, "logits/chosen": 12.133899688720703, "logits/rejected": 12.219890594482422, "logps/chosen": -4.042120933532715, "logps/rejected": -4.043102741241455, "loss": 4.0932, "rewards/accuracies": 0.5, "rewards/chosen": -40.421207427978516, "rewards/margins": 0.009820938110351562, "rewards/rejected": -40.431026458740234, "step": 3342 }, { "epoch": 0.4552015250544662, "grad_norm": 40.83695318238431, "learning_rate": 5.300264878177248e-07, "logits/chosen": 12.03807258605957, "logits/rejected": 12.098474502563477, "logps/chosen": -3.9600605964660645, "logps/rejected": -4.1104254722595215, "loss": 3.9314, "rewards/accuracies": 0.5, "rewards/chosen": -39.60060501098633, "rewards/margins": 1.5036506652832031, "rewards/rejected": -41.10425567626953, "step": 3343 }, { "epoch": 0.4553376906318083, "grad_norm": 41.8663024590647, "learning_rate": 5.298466591409066e-07, "logits/chosen": 11.675911903381348, "logits/rejected": 11.811044692993164, "logps/chosen": -3.975247859954834, "logps/rejected": -4.242142677307129, "loss": 3.5215, "rewards/accuracies": 0.5, "rewards/chosen": -39.752479553222656, "rewards/margins": 2.6689529418945312, "rewards/rejected": -42.42142868041992, "step": 3344 }, { "epoch": 0.4554738562091503, "grad_norm": 42.24830159109431, "learning_rate": 5.296668011241624e-07, "logits/chosen": 13.059240341186523, "logits/rejected": 13.227174758911133, "logps/chosen": -4.2952880859375, "logps/rejected": -4.12033748626709, "loss": 4.4263, "rewards/accuracies": 0.25, "rewards/chosen": -42.952880859375, "rewards/margins": -1.7495040893554688, "rewards/rejected": -41.20337677001953, "step": 3345 }, { "epoch": 0.45561002178649235, "grad_norm": 46.27830722566627, "learning_rate": 5.294869138081325e-07, "logits/chosen": 12.557533264160156, "logits/rejected": 12.045933723449707, "logps/chosen": -3.8587021827697754, "logps/rejected": -3.812403917312622, "loss": 3.8027, "rewards/accuracies": 0.5, "rewards/chosen": -38.58702087402344, "rewards/margins": -0.4629802703857422, "rewards/rejected": -38.12404251098633, "step": 3346 }, { "epoch": 0.45574618736383443, "grad_norm": 38.76369602025192, "learning_rate": 5.293069972334642e-07, "logits/chosen": 12.78555965423584, "logits/rejected": 12.673121452331543, "logps/chosen": -4.018141269683838, "logps/rejected": -4.021487712860107, "loss": 3.7698, "rewards/accuracies": 0.75, "rewards/chosen": -40.18141174316406, "rewards/margins": 0.03346729278564453, "rewards/rejected": -40.21487808227539, "step": 3347 }, { "epoch": 0.45588235294117646, "grad_norm": 39.21720970832205, "learning_rate": 5.29127051440811e-07, "logits/chosen": 11.446659088134766, "logits/rejected": 12.4288330078125, "logps/chosen": -3.5574514865875244, "logps/rejected": -3.903782844543457, "loss": 4.0609, "rewards/accuracies": 0.75, "rewards/chosen": -35.57451629638672, "rewards/margins": 3.463313579559326, "rewards/rejected": -39.03782653808594, "step": 3348 }, { "epoch": 0.45601851851851855, "grad_norm": 62.130899776688395, "learning_rate": 5.289470764708331e-07, "logits/chosen": 12.410511016845703, "logits/rejected": 12.099395751953125, "logps/chosen": -3.6023902893066406, "logps/rejected": -4.079877853393555, "loss": 4.1672, "rewards/accuracies": 0.75, "rewards/chosen": -36.023902893066406, "rewards/margins": 4.774872779846191, "rewards/rejected": -40.79877471923828, "step": 3349 }, { "epoch": 0.4561546840958606, "grad_norm": 40.15793491638258, "learning_rate": 5.287670723641975e-07, "logits/chosen": 12.114818572998047, "logits/rejected": 12.114603042602539, "logps/chosen": -3.7258846759796143, "logps/rejected": -3.8119890689849854, "loss": 3.6081, "rewards/accuracies": 0.75, "rewards/chosen": -37.258846282958984, "rewards/margins": 0.8610420227050781, "rewards/rejected": -38.11988830566406, "step": 3350 }, { "epoch": 0.4562908496732026, "grad_norm": 40.10717497663959, "learning_rate": 5.285870391615775e-07, "logits/chosen": 11.960485458374023, "logits/rejected": 13.02291488647461, "logps/chosen": -3.845907211303711, "logps/rejected": -4.147300720214844, "loss": 3.9531, "rewards/accuracies": 0.75, "rewards/chosen": -38.45907211303711, "rewards/margins": 3.013936996459961, "rewards/rejected": -41.47300720214844, "step": 3351 }, { "epoch": 0.4564270152505447, "grad_norm": 40.342892429979116, "learning_rate": 5.284069769036529e-07, "logits/chosen": 11.343781471252441, "logits/rejected": 12.314722061157227, "logps/chosen": -3.744962215423584, "logps/rejected": -4.099261283874512, "loss": 3.9993, "rewards/accuracies": 1.0, "rewards/chosen": -37.449623107910156, "rewards/margins": 3.5429887771606445, "rewards/rejected": -40.99261474609375, "step": 3352 }, { "epoch": 0.4565631808278867, "grad_norm": 44.597645468130786, "learning_rate": 5.282268856311107e-07, "logits/chosen": 12.595481872558594, "logits/rejected": 12.282109260559082, "logps/chosen": -4.108396530151367, "logps/rejected": -4.280102729797363, "loss": 4.0811, "rewards/accuracies": 0.5, "rewards/chosen": -41.08396911621094, "rewards/margins": 1.7170591354370117, "rewards/rejected": -42.801029205322266, "step": 3353 }, { "epoch": 0.45669934640522875, "grad_norm": 43.19174401193402, "learning_rate": 5.280467653846436e-07, "logits/chosen": 11.750411033630371, "logits/rejected": 12.788885116577148, "logps/chosen": -3.914423704147339, "logps/rejected": -4.3784403800964355, "loss": 4.1012, "rewards/accuracies": 0.75, "rewards/chosen": -39.14423751831055, "rewards/margins": 4.640168190002441, "rewards/rejected": -43.78440856933594, "step": 3354 }, { "epoch": 0.45683551198257083, "grad_norm": 39.568746590075044, "learning_rate": 5.278666162049514e-07, "logits/chosen": 12.447067260742188, "logits/rejected": 12.988788604736328, "logps/chosen": -3.617621421813965, "logps/rejected": -4.120757579803467, "loss": 3.9063, "rewards/accuracies": 0.75, "rewards/chosen": -36.17621612548828, "rewards/margins": 5.031359672546387, "rewards/rejected": -41.207576751708984, "step": 3355 }, { "epoch": 0.45697167755991286, "grad_norm": 41.6372374907535, "learning_rate": 5.276864381327403e-07, "logits/chosen": 11.136417388916016, "logits/rejected": 12.102191925048828, "logps/chosen": -3.6569266319274902, "logps/rejected": -4.087935447692871, "loss": 3.7579, "rewards/accuracies": 0.75, "rewards/chosen": -36.56926727294922, "rewards/margins": 4.310086250305176, "rewards/rejected": -40.879356384277344, "step": 3356 }, { "epoch": 0.4571078431372549, "grad_norm": 47.16798387346113, "learning_rate": 5.275062312087232e-07, "logits/chosen": 11.798192977905273, "logits/rejected": 12.723298072814941, "logps/chosen": -3.596907615661621, "logps/rejected": -3.9423210620880127, "loss": 4.6442, "rewards/accuracies": 1.0, "rewards/chosen": -35.969078063964844, "rewards/margins": 3.4541330337524414, "rewards/rejected": -39.42321014404297, "step": 3357 }, { "epoch": 0.457244008714597, "grad_norm": 39.07078576781219, "learning_rate": 5.27325995473619e-07, "logits/chosen": 12.32817268371582, "logits/rejected": 12.725391387939453, "logps/chosen": -3.6593780517578125, "logps/rejected": -3.6465704441070557, "loss": 3.5751, "rewards/accuracies": 0.5, "rewards/chosen": -36.593780517578125, "rewards/margins": -0.12807464599609375, "rewards/rejected": -36.46570587158203, "step": 3358 }, { "epoch": 0.457380174291939, "grad_norm": 40.87112688824326, "learning_rate": 5.271457309681537e-07, "logits/chosen": 11.827510833740234, "logits/rejected": 11.992925643920898, "logps/chosen": -3.8944571018218994, "logps/rejected": -3.962923288345337, "loss": 3.547, "rewards/accuracies": 0.5, "rewards/chosen": -38.94457244873047, "rewards/margins": 0.6846628189086914, "rewards/rejected": -39.629234313964844, "step": 3359 }, { "epoch": 0.45751633986928103, "grad_norm": 52.42207319307192, "learning_rate": 5.269654377330595e-07, "logits/chosen": 11.695663452148438, "logits/rejected": 11.774526596069336, "logps/chosen": -3.754793643951416, "logps/rejected": -3.920374870300293, "loss": 3.7578, "rewards/accuracies": 1.0, "rewards/chosen": -37.547935485839844, "rewards/margins": 1.6558151245117188, "rewards/rejected": -39.20375061035156, "step": 3360 }, { "epoch": 0.4576525054466231, "grad_norm": 43.9937257290641, "learning_rate": 5.267851158090752e-07, "logits/chosen": 12.77176284790039, "logits/rejected": 12.565835952758789, "logps/chosen": -3.9629063606262207, "logps/rejected": -4.161349296569824, "loss": 3.6542, "rewards/accuracies": 0.75, "rewards/chosen": -39.62906265258789, "rewards/margins": 1.9844303131103516, "rewards/rejected": -41.613494873046875, "step": 3361 }, { "epoch": 0.45778867102396514, "grad_norm": 56.05348720243652, "learning_rate": 5.266047652369458e-07, "logits/chosen": 10.723344802856445, "logits/rejected": 11.394651412963867, "logps/chosen": -3.290618896484375, "logps/rejected": -3.67659068107605, "loss": 3.7962, "rewards/accuracies": 1.0, "rewards/chosen": -32.90618896484375, "rewards/margins": 3.85971736907959, "rewards/rejected": -36.765907287597656, "step": 3362 }, { "epoch": 0.4579248366013072, "grad_norm": 48.782828291021154, "learning_rate": 5.264243860574232e-07, "logits/chosen": 12.122642517089844, "logits/rejected": 12.550098419189453, "logps/chosen": -3.377110719680786, "logps/rejected": -3.810664415359497, "loss": 3.5292, "rewards/accuracies": 0.75, "rewards/chosen": -33.7711067199707, "rewards/margins": 4.335537433624268, "rewards/rejected": -38.10664367675781, "step": 3363 }, { "epoch": 0.45806100217864926, "grad_norm": 46.40592292529506, "learning_rate": 5.262439783112657e-07, "logits/chosen": 11.84217357635498, "logits/rejected": 13.082982063293457, "logps/chosen": -3.4769046306610107, "logps/rejected": -3.5385005474090576, "loss": 3.9632, "rewards/accuracies": 0.5, "rewards/chosen": -34.769046783447266, "rewards/margins": 0.6159586906433105, "rewards/rejected": -35.38500213623047, "step": 3364 }, { "epoch": 0.4581971677559913, "grad_norm": 40.16652890358664, "learning_rate": 5.260635420392376e-07, "logits/chosen": 11.785634994506836, "logits/rejected": 11.717391967773438, "logps/chosen": -3.6467206478118896, "logps/rejected": -3.900757312774658, "loss": 3.6658, "rewards/accuracies": 0.75, "rewards/chosen": -36.46720886230469, "rewards/margins": 2.5403690338134766, "rewards/rejected": -39.00757598876953, "step": 3365 }, { "epoch": 0.4583333333333333, "grad_norm": 45.78417724911779, "learning_rate": 5.258830772821102e-07, "logits/chosen": 12.173372268676758, "logits/rejected": 12.251413345336914, "logps/chosen": -3.8420987129211426, "logps/rejected": -4.290562152862549, "loss": 4.0191, "rewards/accuracies": 1.0, "rewards/chosen": -38.420989990234375, "rewards/margins": 4.484631538391113, "rewards/rejected": -42.90562057495117, "step": 3366 }, { "epoch": 0.4584694989106754, "grad_norm": 46.89177814090491, "learning_rate": 5.257025840806609e-07, "logits/chosen": 11.525579452514648, "logits/rejected": 12.144004821777344, "logps/chosen": -3.084980010986328, "logps/rejected": -3.4508156776428223, "loss": 3.9885, "rewards/accuracies": 1.0, "rewards/chosen": -30.84980010986328, "rewards/margins": 3.6583547592163086, "rewards/rejected": -34.508155822753906, "step": 3367 }, { "epoch": 0.45860566448801743, "grad_norm": 49.645359541871535, "learning_rate": 5.255220624756737e-07, "logits/chosen": 12.646093368530273, "logits/rejected": 12.733316421508789, "logps/chosen": -4.241401672363281, "logps/rejected": -3.9604876041412354, "loss": 4.9371, "rewards/accuracies": 0.75, "rewards/chosen": -42.41401290893555, "rewards/margins": -2.8091373443603516, "rewards/rejected": -39.60487747192383, "step": 3368 }, { "epoch": 0.45874183006535946, "grad_norm": 50.426661066808435, "learning_rate": 5.253415125079389e-07, "logits/chosen": 11.894072532653809, "logits/rejected": 12.036515235900879, "logps/chosen": -3.9217121601104736, "logps/rejected": -3.841322422027588, "loss": 3.8511, "rewards/accuracies": 0.25, "rewards/chosen": -39.21712112426758, "rewards/margins": -0.8038973808288574, "rewards/rejected": -38.41322326660156, "step": 3369 }, { "epoch": 0.45887799564270154, "grad_norm": 48.536879929406105, "learning_rate": 5.251609342182531e-07, "logits/chosen": 11.206321716308594, "logits/rejected": 11.578123092651367, "logps/chosen": -3.229914426803589, "logps/rejected": -3.359462261199951, "loss": 3.5548, "rewards/accuracies": 0.75, "rewards/chosen": -32.29914474487305, "rewards/margins": 1.2954792976379395, "rewards/rejected": -33.59462356567383, "step": 3370 }, { "epoch": 0.45901416122004357, "grad_norm": 43.86167301891828, "learning_rate": 5.249803276474198e-07, "logits/chosen": 12.110834121704102, "logits/rejected": 10.962285041809082, "logps/chosen": -3.6917974948883057, "logps/rejected": -3.6918745040893555, "loss": 4.1094, "rewards/accuracies": 0.5, "rewards/chosen": -36.91797637939453, "rewards/margins": 0.0007677078247070312, "rewards/rejected": -36.91874313354492, "step": 3371 }, { "epoch": 0.4591503267973856, "grad_norm": 52.03391374646334, "learning_rate": 5.247996928362484e-07, "logits/chosen": 10.894156455993652, "logits/rejected": 11.125534057617188, "logps/chosen": -3.0251502990722656, "logps/rejected": -3.1862645149230957, "loss": 4.6085, "rewards/accuracies": 0.5, "rewards/chosen": -30.251502990722656, "rewards/margins": 1.6111440658569336, "rewards/rejected": -31.862646102905273, "step": 3372 }, { "epoch": 0.4592864923747277, "grad_norm": 45.34507846548467, "learning_rate": 5.246190298255546e-07, "logits/chosen": 11.855437278747559, "logits/rejected": 11.669437408447266, "logps/chosen": -3.5052738189697266, "logps/rejected": -3.726314067840576, "loss": 4.2799, "rewards/accuracies": 0.75, "rewards/chosen": -35.052738189697266, "rewards/margins": 2.2104034423828125, "rewards/rejected": -37.26314163208008, "step": 3373 }, { "epoch": 0.4594226579520697, "grad_norm": 40.83955987497063, "learning_rate": 5.244383386561612e-07, "logits/chosen": 10.809688568115234, "logits/rejected": 12.260763168334961, "logps/chosen": -3.728062629699707, "logps/rejected": -4.320479393005371, "loss": 3.6424, "rewards/accuracies": 0.75, "rewards/chosen": -37.2806282043457, "rewards/margins": 5.924167633056641, "rewards/rejected": -43.204795837402344, "step": 3374 }, { "epoch": 0.45955882352941174, "grad_norm": 37.92772984476031, "learning_rate": 5.242576193688964e-07, "logits/chosen": 11.521065711975098, "logits/rejected": 11.631719589233398, "logps/chosen": -3.288377285003662, "logps/rejected": -3.500009059906006, "loss": 3.6939, "rewards/accuracies": 1.0, "rewards/chosen": -32.88377380371094, "rewards/margins": 2.1163196563720703, "rewards/rejected": -35.000091552734375, "step": 3375 }, { "epoch": 0.4596949891067538, "grad_norm": 43.3994927042262, "learning_rate": 5.240768720045952e-07, "logits/chosen": 11.918943405151367, "logits/rejected": 10.917838096618652, "logps/chosen": -3.203751564025879, "logps/rejected": -3.1973094940185547, "loss": 3.6954, "rewards/accuracies": 0.5, "rewards/chosen": -32.037513732910156, "rewards/margins": -0.06442117691040039, "rewards/rejected": -31.973094940185547, "step": 3376 }, { "epoch": 0.45983115468409586, "grad_norm": 52.19387078209244, "learning_rate": 5.238960966040995e-07, "logits/chosen": 12.174410820007324, "logits/rejected": 12.430336952209473, "logps/chosen": -3.430471420288086, "logps/rejected": -3.8902926445007324, "loss": 3.7582, "rewards/accuracies": 1.0, "rewards/chosen": -34.304718017578125, "rewards/margins": 4.598212718963623, "rewards/rejected": -38.90292739868164, "step": 3377 }, { "epoch": 0.4599673202614379, "grad_norm": 41.36054975093174, "learning_rate": 5.237152932082563e-07, "logits/chosen": 10.590534210205078, "logits/rejected": 10.764674186706543, "logps/chosen": -3.131366491317749, "logps/rejected": -3.5276365280151367, "loss": 3.7764, "rewards/accuracies": 0.75, "rewards/chosen": -31.313663482666016, "rewards/margins": 3.962700366973877, "rewards/rejected": -35.276363372802734, "step": 3378 }, { "epoch": 0.46010348583877997, "grad_norm": 53.64479351303284, "learning_rate": 5.235344618579202e-07, "logits/chosen": 10.937421798706055, "logits/rejected": 11.615188598632812, "logps/chosen": -3.4780056476593018, "logps/rejected": -3.537879705429077, "loss": 4.2387, "rewards/accuracies": 0.75, "rewards/chosen": -34.780059814453125, "rewards/margins": 0.5987386703491211, "rewards/rejected": -35.3787956237793, "step": 3379 }, { "epoch": 0.460239651416122, "grad_norm": 44.953576011487925, "learning_rate": 5.233536025939512e-07, "logits/chosen": 11.826117515563965, "logits/rejected": 12.270573616027832, "logps/chosen": -3.565256357192993, "logps/rejected": -3.760033369064331, "loss": 4.0912, "rewards/accuracies": 0.75, "rewards/chosen": -35.652565002441406, "rewards/margins": 1.9477672576904297, "rewards/rejected": -37.6003303527832, "step": 3380 }, { "epoch": 0.460375816993464, "grad_norm": 42.30830818163368, "learning_rate": 5.231727154572162e-07, "logits/chosen": 11.465068817138672, "logits/rejected": 11.912384033203125, "logps/chosen": -3.4270567893981934, "logps/rejected": -3.70632266998291, "loss": 3.5977, "rewards/accuracies": 0.75, "rewards/chosen": -34.27056884765625, "rewards/margins": 2.792658805847168, "rewards/rejected": -37.06322479248047, "step": 3381 }, { "epoch": 0.4605119825708061, "grad_norm": 52.68061975159956, "learning_rate": 5.229918004885877e-07, "logits/chosen": 11.913739204406738, "logits/rejected": 12.45856761932373, "logps/chosen": -3.6737043857574463, "logps/rejected": -3.7397308349609375, "loss": 4.2605, "rewards/accuracies": 0.75, "rewards/chosen": -36.73704528808594, "rewards/margins": 0.6602659225463867, "rewards/rejected": -37.397308349609375, "step": 3382 }, { "epoch": 0.46064814814814814, "grad_norm": 41.191288307220226, "learning_rate": 5.228108577289454e-07, "logits/chosen": 11.678964614868164, "logits/rejected": 12.354301452636719, "logps/chosen": -3.2827653884887695, "logps/rejected": -3.515000820159912, "loss": 3.5736, "rewards/accuracies": 0.75, "rewards/chosen": -32.82765197753906, "rewards/margins": 2.322354316711426, "rewards/rejected": -35.15000915527344, "step": 3383 }, { "epoch": 0.46078431372549017, "grad_norm": 42.774144528599344, "learning_rate": 5.226298872191746e-07, "logits/chosen": 10.896635055541992, "logits/rejected": 11.221199035644531, "logps/chosen": -3.4916014671325684, "logps/rejected": -3.7049617767333984, "loss": 3.9457, "rewards/accuracies": 0.75, "rewards/chosen": -34.916015625, "rewards/margins": 2.133605480194092, "rewards/rejected": -37.049617767333984, "step": 3384 }, { "epoch": 0.46092047930283225, "grad_norm": 40.51834366527705, "learning_rate": 5.22448889000167e-07, "logits/chosen": 11.872345924377441, "logits/rejected": 12.161725997924805, "logps/chosen": -3.7288498878479004, "logps/rejected": -3.7508203983306885, "loss": 4.2304, "rewards/accuracies": 0.5, "rewards/chosen": -37.28849792480469, "rewards/margins": 0.21970701217651367, "rewards/rejected": -37.50820541381836, "step": 3385 }, { "epoch": 0.4610566448801743, "grad_norm": 46.874973667616366, "learning_rate": 5.222678631128209e-07, "logits/chosen": 12.014081954956055, "logits/rejected": 12.025592803955078, "logps/chosen": -3.564110517501831, "logps/rejected": -3.5802550315856934, "loss": 4.2286, "rewards/accuracies": 0.5, "rewards/chosen": -35.64110565185547, "rewards/margins": 0.16144561767578125, "rewards/rejected": -35.80255126953125, "step": 3386 }, { "epoch": 0.46119281045751637, "grad_norm": 41.076105585676366, "learning_rate": 5.220868095980405e-07, "logits/chosen": 10.8316650390625, "logits/rejected": 11.938880920410156, "logps/chosen": -3.7077181339263916, "logps/rejected": -3.949309825897217, "loss": 3.1182, "rewards/accuracies": 1.0, "rewards/chosen": -37.07718276977539, "rewards/margins": 2.41591739654541, "rewards/rejected": -39.49310302734375, "step": 3387 }, { "epoch": 0.4613289760348584, "grad_norm": 39.38938288083675, "learning_rate": 5.219057284967362e-07, "logits/chosen": 10.863094329833984, "logits/rejected": 11.407859802246094, "logps/chosen": -3.2695138454437256, "logps/rejected": -3.4749884605407715, "loss": 4.1083, "rewards/accuracies": 0.5, "rewards/chosen": -32.69513702392578, "rewards/margins": 2.054745674133301, "rewards/rejected": -34.74988555908203, "step": 3388 }, { "epoch": 0.4614651416122004, "grad_norm": 41.36866722696816, "learning_rate": 5.217246198498248e-07, "logits/chosen": 11.573238372802734, "logits/rejected": 12.20505142211914, "logps/chosen": -3.2359657287597656, "logps/rejected": -3.898375988006592, "loss": 3.9265, "rewards/accuracies": 0.75, "rewards/chosen": -32.359657287597656, "rewards/margins": 6.624104022979736, "rewards/rejected": -38.983760833740234, "step": 3389 }, { "epoch": 0.4616013071895425, "grad_norm": 41.66728101726026, "learning_rate": 5.215434836982295e-07, "logits/chosen": 10.839580535888672, "logits/rejected": 10.911687850952148, "logps/chosen": -2.9197824001312256, "logps/rejected": -3.310377597808838, "loss": 4.2232, "rewards/accuracies": 1.0, "rewards/chosen": -29.197824478149414, "rewards/margins": 3.9059510231018066, "rewards/rejected": -33.10377502441406, "step": 3390 }, { "epoch": 0.46173747276688454, "grad_norm": 45.94288504082527, "learning_rate": 5.213623200828792e-07, "logits/chosen": 11.666830062866211, "logits/rejected": 10.805365562438965, "logps/chosen": -3.850818634033203, "logps/rejected": -3.826674461364746, "loss": 3.9797, "rewards/accuracies": 0.5, "rewards/chosen": -38.50818634033203, "rewards/margins": -0.2414393424987793, "rewards/rejected": -38.266746520996094, "step": 3391 }, { "epoch": 0.46187363834422657, "grad_norm": 39.76113001471024, "learning_rate": 5.211811290447096e-07, "logits/chosen": 11.614063262939453, "logits/rejected": 12.571125030517578, "logps/chosen": -3.339705467224121, "logps/rejected": -3.761805772781372, "loss": 4.2558, "rewards/accuracies": 1.0, "rewards/chosen": -33.39705276489258, "rewards/margins": 4.221003532409668, "rewards/rejected": -37.61805725097656, "step": 3392 }, { "epoch": 0.46200980392156865, "grad_norm": 40.809415916580924, "learning_rate": 5.209999106246623e-07, "logits/chosen": 11.531930923461914, "logits/rejected": 12.385468482971191, "logps/chosen": -3.654022693634033, "logps/rejected": -3.940974473953247, "loss": 4.3626, "rewards/accuracies": 0.75, "rewards/chosen": -36.540225982666016, "rewards/margins": 2.8695173263549805, "rewards/rejected": -39.40974426269531, "step": 3393 }, { "epoch": 0.4621459694989107, "grad_norm": 40.294536365322564, "learning_rate": 5.208186648636849e-07, "logits/chosen": 10.676359176635742, "logits/rejected": 10.977112770080566, "logps/chosen": -3.2685623168945312, "logps/rejected": -3.4098615646362305, "loss": 3.9045, "rewards/accuracies": 0.75, "rewards/chosen": -32.68561935424805, "rewards/margins": 1.4129953384399414, "rewards/rejected": -34.09861755371094, "step": 3394 }, { "epoch": 0.4622821350762527, "grad_norm": 37.630024737805556, "learning_rate": 5.206373918027314e-07, "logits/chosen": 11.829764366149902, "logits/rejected": 12.497066497802734, "logps/chosen": -3.7205142974853516, "logps/rejected": -4.08969783782959, "loss": 3.6601, "rewards/accuracies": 0.75, "rewards/chosen": -37.205142974853516, "rewards/margins": 3.6918344497680664, "rewards/rejected": -40.896976470947266, "step": 3395 }, { "epoch": 0.4624183006535948, "grad_norm": 46.85877699242211, "learning_rate": 5.204560914827621e-07, "logits/chosen": 12.573060989379883, "logits/rejected": 12.447071075439453, "logps/chosen": -3.6601762771606445, "logps/rejected": -3.8478236198425293, "loss": 3.8149, "rewards/accuracies": 0.75, "rewards/chosen": -36.60176467895508, "rewards/margins": 1.876471996307373, "rewards/rejected": -38.47823715209961, "step": 3396 }, { "epoch": 0.4625544662309368, "grad_norm": 68.5734783453595, "learning_rate": 5.202747639447432e-07, "logits/chosen": 11.729419708251953, "logits/rejected": 12.589120864868164, "logps/chosen": -3.388528347015381, "logps/rejected": -3.8805532455444336, "loss": 4.1072, "rewards/accuracies": 0.75, "rewards/chosen": -33.885284423828125, "rewards/margins": 4.920248031616211, "rewards/rejected": -38.8055305480957, "step": 3397 }, { "epoch": 0.46269063180827885, "grad_norm": 46.82203491429482, "learning_rate": 5.200934092296472e-07, "logits/chosen": 12.126544952392578, "logits/rejected": 12.557674407958984, "logps/chosen": -3.7793710231781006, "logps/rejected": -4.052112579345703, "loss": 4.6817, "rewards/accuracies": 0.75, "rewards/chosen": -37.79370880126953, "rewards/margins": 2.7274169921875, "rewards/rejected": -40.52112579345703, "step": 3398 }, { "epoch": 0.46282679738562094, "grad_norm": 41.62887156462356, "learning_rate": 5.199120273784527e-07, "logits/chosen": 12.590139389038086, "logits/rejected": 13.070640563964844, "logps/chosen": -3.921919822692871, "logps/rejected": -4.308528900146484, "loss": 3.8475, "rewards/accuracies": 1.0, "rewards/chosen": -39.219200134277344, "rewards/margins": 3.8660879135131836, "rewards/rejected": -43.08528518676758, "step": 3399 }, { "epoch": 0.46296296296296297, "grad_norm": 37.54399067889563, "learning_rate": 5.197306184321443e-07, "logits/chosen": 11.367298126220703, "logits/rejected": 12.706720352172852, "logps/chosen": -3.635859727859497, "logps/rejected": -4.053225040435791, "loss": 4.1694, "rewards/accuracies": 1.0, "rewards/chosen": -36.35859680175781, "rewards/margins": 4.173654556274414, "rewards/rejected": -40.532249450683594, "step": 3400 }, { "epoch": 0.463099128540305, "grad_norm": 43.914018663880114, "learning_rate": 5.195491824317132e-07, "logits/chosen": 11.57719612121582, "logits/rejected": 11.803951263427734, "logps/chosen": -3.886079788208008, "logps/rejected": -3.812520980834961, "loss": 3.6715, "rewards/accuracies": 0.25, "rewards/chosen": -38.86079788208008, "rewards/margins": -0.7355861663818359, "rewards/rejected": -38.12520980834961, "step": 3401 }, { "epoch": 0.4632352941176471, "grad_norm": 37.66595629688338, "learning_rate": 5.19367719418156e-07, "logits/chosen": 12.232637405395508, "logits/rejected": 11.490921974182129, "logps/chosen": -3.754917621612549, "logps/rejected": -3.6480958461761475, "loss": 4.2682, "rewards/accuracies": 0.75, "rewards/chosen": -37.54917526245117, "rewards/margins": -1.0682172775268555, "rewards/rejected": -36.48095703125, "step": 3402 }, { "epoch": 0.4633714596949891, "grad_norm": 69.80238458510048, "learning_rate": 5.191862294324758e-07, "logits/chosen": 11.937353134155273, "logits/rejected": 12.22140884399414, "logps/chosen": -3.6838340759277344, "logps/rejected": -3.813248872756958, "loss": 3.4784, "rewards/accuracies": 0.75, "rewards/chosen": -36.838340759277344, "rewards/margins": 1.2941474914550781, "rewards/rejected": -38.13248825073242, "step": 3403 }, { "epoch": 0.46350762527233114, "grad_norm": 43.91951808770979, "learning_rate": 5.190047125156819e-07, "logits/chosen": 10.978511810302734, "logits/rejected": 11.885984420776367, "logps/chosen": -3.6632156372070312, "logps/rejected": -3.999948740005493, "loss": 4.0404, "rewards/accuracies": 0.75, "rewards/chosen": -36.63215637207031, "rewards/margins": 3.3673324584960938, "rewards/rejected": -39.999488830566406, "step": 3404 }, { "epoch": 0.4636437908496732, "grad_norm": 39.94381212990558, "learning_rate": 5.188231687087895e-07, "logits/chosen": 12.721665382385254, "logits/rejected": 12.805924415588379, "logps/chosen": -4.237588882446289, "logps/rejected": -4.202586650848389, "loss": 4.1255, "rewards/accuracies": 0.5, "rewards/chosen": -42.375892639160156, "rewards/margins": -0.3500242233276367, "rewards/rejected": -42.02586364746094, "step": 3405 }, { "epoch": 0.46377995642701525, "grad_norm": 44.625393376161874, "learning_rate": 5.1864159805282e-07, "logits/chosen": 12.90829086303711, "logits/rejected": 12.214174270629883, "logps/chosen": -4.252951622009277, "logps/rejected": -3.6106929779052734, "loss": 3.936, "rewards/accuracies": 0.0, "rewards/chosen": -42.529518127441406, "rewards/margins": -6.422591209411621, "rewards/rejected": -36.10692596435547, "step": 3406 }, { "epoch": 0.4639161220043573, "grad_norm": 41.635041005640524, "learning_rate": 5.184600005888007e-07, "logits/chosen": 12.791704177856445, "logits/rejected": 11.898159980773926, "logps/chosen": -3.8855419158935547, "logps/rejected": -3.6572399139404297, "loss": 4.0182, "rewards/accuracies": 0.25, "rewards/chosen": -38.85541915893555, "rewards/margins": -2.2830190658569336, "rewards/rejected": -36.5723991394043, "step": 3407 }, { "epoch": 0.46405228758169936, "grad_norm": 56.93544497879635, "learning_rate": 5.18278376357765e-07, "logits/chosen": 12.49710464477539, "logits/rejected": 12.177699089050293, "logps/chosen": -4.062712669372559, "logps/rejected": -3.988903045654297, "loss": 3.8843, "rewards/accuracies": 0.25, "rewards/chosen": -40.62712860107422, "rewards/margins": -0.738095760345459, "rewards/rejected": -39.88903045654297, "step": 3408 }, { "epoch": 0.4641884531590414, "grad_norm": 40.61187576747203, "learning_rate": 5.180967254007525e-07, "logits/chosen": 12.237934112548828, "logits/rejected": 12.166388511657715, "logps/chosen": -4.233726501464844, "logps/rejected": -4.234616756439209, "loss": 3.9744, "rewards/accuracies": 0.5, "rewards/chosen": -42.33726501464844, "rewards/margins": 0.008900642395019531, "rewards/rejected": -42.346168518066406, "step": 3409 }, { "epoch": 0.4643246187363834, "grad_norm": 176.17643887582855, "learning_rate": 5.179150477588087e-07, "logits/chosen": 12.043395042419434, "logits/rejected": 13.233728408813477, "logps/chosen": -3.611600875854492, "logps/rejected": -4.176373481750488, "loss": 4.0801, "rewards/accuracies": 1.0, "rewards/chosen": -36.11601257324219, "rewards/margins": 5.647727966308594, "rewards/rejected": -41.76374053955078, "step": 3410 }, { "epoch": 0.4644607843137255, "grad_norm": 59.33893062095665, "learning_rate": 5.177333434729852e-07, "logits/chosen": 11.402551651000977, "logits/rejected": 12.262815475463867, "logps/chosen": -3.8235936164855957, "logps/rejected": -4.103151798248291, "loss": 4.4749, "rewards/accuracies": 1.0, "rewards/chosen": -38.235939025878906, "rewards/margins": 2.7955780029296875, "rewards/rejected": -41.031517028808594, "step": 3411 }, { "epoch": 0.46459694989106753, "grad_norm": 40.14867938076047, "learning_rate": 5.175516125843395e-07, "logits/chosen": 13.008378982543945, "logits/rejected": 12.979881286621094, "logps/chosen": -3.9968574047088623, "logps/rejected": -3.8203954696655273, "loss": 4.2304, "rewards/accuracies": 0.25, "rewards/chosen": -39.96857452392578, "rewards/margins": -1.7646198272705078, "rewards/rejected": -38.203956604003906, "step": 3412 }, { "epoch": 0.46473311546840956, "grad_norm": 40.79472864519207, "learning_rate": 5.173698551339352e-07, "logits/chosen": 11.99004077911377, "logits/rejected": 11.55985164642334, "logps/chosen": -3.62734317779541, "logps/rejected": -3.7135705947875977, "loss": 3.6147, "rewards/accuracies": 0.5, "rewards/chosen": -36.27342987060547, "rewards/margins": 0.8622746467590332, "rewards/rejected": -37.135704040527344, "step": 3413 }, { "epoch": 0.46486928104575165, "grad_norm": 41.9936145806591, "learning_rate": 5.171880711628421e-07, "logits/chosen": 12.288921356201172, "logits/rejected": 13.525308609008789, "logps/chosen": -3.909806966781616, "logps/rejected": -4.4273786544799805, "loss": 4.035, "rewards/accuracies": 0.75, "rewards/chosen": -39.09806823730469, "rewards/margins": 5.175718307495117, "rewards/rejected": -44.27378845214844, "step": 3414 }, { "epoch": 0.4650054466230937, "grad_norm": 43.082988233704164, "learning_rate": 5.170062607121356e-07, "logits/chosen": 12.216144561767578, "logits/rejected": 12.508198738098145, "logps/chosen": -3.6287131309509277, "logps/rejected": -3.7448716163635254, "loss": 4.3035, "rewards/accuracies": 0.25, "rewards/chosen": -36.28712844848633, "rewards/margins": 1.1615843772888184, "rewards/rejected": -37.44871520996094, "step": 3415 }, { "epoch": 0.4651416122004357, "grad_norm": 40.86256387086207, "learning_rate": 5.168244238228971e-07, "logits/chosen": 12.140303611755371, "logits/rejected": 12.478742599487305, "logps/chosen": -3.6992874145507812, "logps/rejected": -3.6521847248077393, "loss": 4.0184, "rewards/accuracies": 0.5, "rewards/chosen": -36.99287414550781, "rewards/margins": -0.47102832794189453, "rewards/rejected": -36.521846771240234, "step": 3416 }, { "epoch": 0.4652777777777778, "grad_norm": 41.40110245512274, "learning_rate": 5.166425605362145e-07, "logits/chosen": 11.79318618774414, "logits/rejected": 12.48523235321045, "logps/chosen": -3.599886894226074, "logps/rejected": -4.304900169372559, "loss": 3.8422, "rewards/accuracies": 1.0, "rewards/chosen": -35.99886703491211, "rewards/margins": 7.050130844116211, "rewards/rejected": -43.04899978637695, "step": 3417 }, { "epoch": 0.4654139433551198, "grad_norm": 45.25404837002435, "learning_rate": 5.164606708931812e-07, "logits/chosen": 12.455072402954102, "logits/rejected": 11.984313011169434, "logps/chosen": -3.859463691711426, "logps/rejected": -3.701307773590088, "loss": 4.8474, "rewards/accuracies": 0.25, "rewards/chosen": -38.59463882446289, "rewards/margins": -1.5815625190734863, "rewards/rejected": -37.01307678222656, "step": 3418 }, { "epoch": 0.46555010893246185, "grad_norm": 45.22525584384557, "learning_rate": 5.162787549348966e-07, "logits/chosen": 12.579527854919434, "logits/rejected": 12.862834930419922, "logps/chosen": -4.027754783630371, "logps/rejected": -4.306049346923828, "loss": 4.382, "rewards/accuracies": 0.75, "rewards/chosen": -40.277549743652344, "rewards/margins": 2.7829484939575195, "rewards/rejected": -43.06049728393555, "step": 3419 }, { "epoch": 0.46568627450980393, "grad_norm": 37.878846062377974, "learning_rate": 5.160968127024662e-07, "logits/chosen": 11.999288558959961, "logits/rejected": 12.437897682189941, "logps/chosen": -3.6550345420837402, "logps/rejected": -3.891099214553833, "loss": 3.4878, "rewards/accuracies": 0.75, "rewards/chosen": -36.55034637451172, "rewards/margins": 2.3606462478637695, "rewards/rejected": -38.91099166870117, "step": 3420 }, { "epoch": 0.46582244008714596, "grad_norm": 41.26843173687984, "learning_rate": 5.159148442370013e-07, "logits/chosen": 13.341052055358887, "logits/rejected": 13.191226959228516, "logps/chosen": -4.221863746643066, "logps/rejected": -3.9681308269500732, "loss": 3.5197, "rewards/accuracies": 0.25, "rewards/chosen": -42.21863555908203, "rewards/margins": -2.5373306274414062, "rewards/rejected": -39.68130874633789, "step": 3421 }, { "epoch": 0.465958605664488, "grad_norm": 45.41509057935392, "learning_rate": 5.157328495796191e-07, "logits/chosen": 12.387565612792969, "logits/rejected": 12.948755264282227, "logps/chosen": -3.6043107509613037, "logps/rejected": -3.765554189682007, "loss": 4.6128, "rewards/accuracies": 0.75, "rewards/chosen": -36.04310607910156, "rewards/margins": 1.6124348640441895, "rewards/rejected": -37.655540466308594, "step": 3422 }, { "epoch": 0.4660947712418301, "grad_norm": 37.70936427124554, "learning_rate": 5.15550828771443e-07, "logits/chosen": 12.491806030273438, "logits/rejected": 12.4366455078125, "logps/chosen": -4.125157356262207, "logps/rejected": -3.9530794620513916, "loss": 3.7973, "rewards/accuracies": 0.5, "rewards/chosen": -41.25157165527344, "rewards/margins": -1.7207775115966797, "rewards/rejected": -39.530792236328125, "step": 3423 }, { "epoch": 0.4662309368191721, "grad_norm": 37.84947558492331, "learning_rate": 5.153687818536019e-07, "logits/chosen": 11.722929954528809, "logits/rejected": 12.504097938537598, "logps/chosen": -3.4347305297851562, "logps/rejected": -3.7906153202056885, "loss": 3.7845, "rewards/accuracies": 0.5, "rewards/chosen": -34.34730529785156, "rewards/margins": 3.558849334716797, "rewards/rejected": -37.906150817871094, "step": 3424 }, { "epoch": 0.4663671023965142, "grad_norm": 42.70089866629184, "learning_rate": 5.15186708867231e-07, "logits/chosen": 12.479761123657227, "logits/rejected": 12.770116806030273, "logps/chosen": -3.617825508117676, "logps/rejected": -3.9527440071105957, "loss": 4.1745, "rewards/accuracies": 1.0, "rewards/chosen": -36.178253173828125, "rewards/margins": 3.349184513092041, "rewards/rejected": -39.52743911743164, "step": 3425 }, { "epoch": 0.4665032679738562, "grad_norm": 45.65944421174613, "learning_rate": 5.15004609853471e-07, "logits/chosen": 12.338655471801758, "logits/rejected": 11.87096881866455, "logps/chosen": -3.8461718559265137, "logps/rejected": -3.858234405517578, "loss": 4.1567, "rewards/accuracies": 0.5, "rewards/chosen": -38.46171951293945, "rewards/margins": 0.12062406539916992, "rewards/rejected": -38.58234405517578, "step": 3426 }, { "epoch": 0.46663943355119825, "grad_norm": 36.52134797542273, "learning_rate": 5.148224848534687e-07, "logits/chosen": 11.466304779052734, "logits/rejected": 11.962337493896484, "logps/chosen": -3.83561372756958, "logps/rejected": -3.7200875282287598, "loss": 3.7874, "rewards/accuracies": 0.5, "rewards/chosen": -38.356136322021484, "rewards/margins": -1.1552600860595703, "rewards/rejected": -37.20087432861328, "step": 3427 }, { "epoch": 0.46677559912854033, "grad_norm": 39.596232962531346, "learning_rate": 5.146403339083769e-07, "logits/chosen": 12.197062492370605, "logits/rejected": 12.629419326782227, "logps/chosen": -3.9223251342773438, "logps/rejected": -3.881877899169922, "loss": 4.2931, "rewards/accuracies": 0.25, "rewards/chosen": -39.22325134277344, "rewards/margins": -0.40447330474853516, "rewards/rejected": -38.81877899169922, "step": 3428 }, { "epoch": 0.46691176470588236, "grad_norm": 47.13311438004208, "learning_rate": 5.14458157059354e-07, "logits/chosen": 11.928288459777832, "logits/rejected": 12.47056770324707, "logps/chosen": -3.5674238204956055, "logps/rejected": -4.108214378356934, "loss": 4.5122, "rewards/accuracies": 0.75, "rewards/chosen": -35.67423629760742, "rewards/margins": 5.407902717590332, "rewards/rejected": -41.08213806152344, "step": 3429 }, { "epoch": 0.4670479302832244, "grad_norm": 59.792909240103775, "learning_rate": 5.142759543475644e-07, "logits/chosen": 12.753326416015625, "logits/rejected": 12.497968673706055, "logps/chosen": -3.964632749557495, "logps/rejected": -3.9604599475860596, "loss": 4.4334, "rewards/accuracies": 0.5, "rewards/chosen": -39.646324157714844, "rewards/margins": -0.04172515869140625, "rewards/rejected": -39.60459899902344, "step": 3430 }, { "epoch": 0.4671840958605665, "grad_norm": 45.70924676926573, "learning_rate": 5.140937258141782e-07, "logits/chosen": 12.451992988586426, "logits/rejected": 12.248503684997559, "logps/chosen": -3.841050863265991, "logps/rejected": -4.202611923217773, "loss": 4.4519, "rewards/accuracies": 1.0, "rewards/chosen": -38.41050720214844, "rewards/margins": 3.615610122680664, "rewards/rejected": -42.026119232177734, "step": 3431 }, { "epoch": 0.4673202614379085, "grad_norm": 36.58272323400498, "learning_rate": 5.139114715003718e-07, "logits/chosen": 13.31658935546875, "logits/rejected": 12.391091346740723, "logps/chosen": -3.7134780883789062, "logps/rejected": -3.9985740184783936, "loss": 3.8882, "rewards/accuracies": 0.75, "rewards/chosen": -37.1347770690918, "rewards/margins": 2.8509626388549805, "rewards/rejected": -39.985740661621094, "step": 3432 }, { "epoch": 0.46745642701525053, "grad_norm": 41.55058783746385, "learning_rate": 5.137291914473266e-07, "logits/chosen": 12.12319564819336, "logits/rejected": 11.491918563842773, "logps/chosen": -3.692826986312866, "logps/rejected": -3.5649101734161377, "loss": 4.4175, "rewards/accuracies": 0.5, "rewards/chosen": -36.92826843261719, "rewards/margins": -1.2791681289672852, "rewards/rejected": -35.64910125732422, "step": 3433 }, { "epoch": 0.4675925925925926, "grad_norm": 41.20753009759207, "learning_rate": 5.135468856962304e-07, "logits/chosen": 11.463201522827148, "logits/rejected": 12.847164154052734, "logps/chosen": -3.436166524887085, "logps/rejected": -3.8387999534606934, "loss": 4.0576, "rewards/accuracies": 0.75, "rewards/chosen": -34.361663818359375, "rewards/margins": 4.026335716247559, "rewards/rejected": -38.38800048828125, "step": 3434 }, { "epoch": 0.46772875816993464, "grad_norm": 42.208929717342016, "learning_rate": 5.133645542882771e-07, "logits/chosen": 12.391775131225586, "logits/rejected": 13.034862518310547, "logps/chosen": -3.8001019954681396, "logps/rejected": -3.8441197872161865, "loss": 3.8568, "rewards/accuracies": 0.75, "rewards/chosen": -38.00102233886719, "rewards/margins": 0.44017934799194336, "rewards/rejected": -38.441200256347656, "step": 3435 }, { "epoch": 0.4678649237472767, "grad_norm": 38.87988230560163, "learning_rate": 5.131821972646655e-07, "logits/chosen": 13.436777114868164, "logits/rejected": 13.311502456665039, "logps/chosen": -4.238035202026367, "logps/rejected": -4.393669128417969, "loss": 3.8444, "rewards/accuracies": 0.75, "rewards/chosen": -42.38035202026367, "rewards/margins": 1.5563364028930664, "rewards/rejected": -43.93668746948242, "step": 3436 }, { "epoch": 0.46800108932461876, "grad_norm": 42.10451277384072, "learning_rate": 5.129998146666008e-07, "logits/chosen": 11.768087387084961, "logits/rejected": 12.086435317993164, "logps/chosen": -3.6577401161193848, "logps/rejected": -4.04461669921875, "loss": 4.1387, "rewards/accuracies": 1.0, "rewards/chosen": -36.57740020751953, "rewards/margins": 3.868764877319336, "rewards/rejected": -40.4461669921875, "step": 3437 }, { "epoch": 0.4681372549019608, "grad_norm": 39.941235518427355, "learning_rate": 5.128174065352941e-07, "logits/chosen": 13.06273078918457, "logits/rejected": 12.982098579406738, "logps/chosen": -4.218629837036133, "logps/rejected": -3.914825916290283, "loss": 3.7811, "rewards/accuracies": 0.25, "rewards/chosen": -42.18629837036133, "rewards/margins": -3.038041114807129, "rewards/rejected": -39.148258209228516, "step": 3438 }, { "epoch": 0.4682734204793028, "grad_norm": 39.668842402553416, "learning_rate": 5.126349729119617e-07, "logits/chosen": 11.300654411315918, "logits/rejected": 12.096988677978516, "logps/chosen": -3.775989532470703, "logps/rejected": -4.133059501647949, "loss": 4.051, "rewards/accuracies": 0.75, "rewards/chosen": -37.75989532470703, "rewards/margins": 3.5707039833068848, "rewards/rejected": -41.33060073852539, "step": 3439 }, { "epoch": 0.4684095860566449, "grad_norm": 52.00208378787748, "learning_rate": 5.124525138378262e-07, "logits/chosen": 12.121341705322266, "logits/rejected": 12.148767471313477, "logps/chosen": -3.925236940383911, "logps/rejected": -4.0630998611450195, "loss": 4.0495, "rewards/accuracies": 0.5, "rewards/chosen": -39.25236892700195, "rewards/margins": 1.3786334991455078, "rewards/rejected": -40.631004333496094, "step": 3440 }, { "epoch": 0.46854575163398693, "grad_norm": 39.63030941841743, "learning_rate": 5.122700293541155e-07, "logits/chosen": 12.48970890045166, "logits/rejected": 12.424577713012695, "logps/chosen": -3.8582942485809326, "logps/rejected": -3.960453987121582, "loss": 4.2723, "rewards/accuracies": 0.75, "rewards/chosen": -38.582942962646484, "rewards/margins": 1.0215959548950195, "rewards/rejected": -39.60453796386719, "step": 3441 }, { "epoch": 0.46868191721132896, "grad_norm": 39.52448433604294, "learning_rate": 5.120875195020637e-07, "logits/chosen": 12.090179443359375, "logits/rejected": 14.269755363464355, "logps/chosen": -3.818544864654541, "logps/rejected": -4.517070293426514, "loss": 3.7184, "rewards/accuracies": 0.75, "rewards/chosen": -38.185447692871094, "rewards/margins": 6.985252380371094, "rewards/rejected": -45.17070007324219, "step": 3442 }, { "epoch": 0.46881808278867104, "grad_norm": 42.72363199557707, "learning_rate": 5.119049843229105e-07, "logits/chosen": 12.421587944030762, "logits/rejected": 12.452239990234375, "logps/chosen": -3.9393773078918457, "logps/rejected": -4.007088661193848, "loss": 3.4168, "rewards/accuracies": 0.25, "rewards/chosen": -39.393775939941406, "rewards/margins": 0.6771135330200195, "rewards/rejected": -40.07088851928711, "step": 3443 }, { "epoch": 0.46895424836601307, "grad_norm": 38.28535912131288, "learning_rate": 5.117224238579009e-07, "logits/chosen": 12.4561185836792, "logits/rejected": 12.388940811157227, "logps/chosen": -3.8086659908294678, "logps/rejected": -3.8782403469085693, "loss": 4.0039, "rewards/accuracies": 0.5, "rewards/chosen": -38.08666229248047, "rewards/margins": 0.6957449913024902, "rewards/rejected": -38.782405853271484, "step": 3444 }, { "epoch": 0.4690904139433551, "grad_norm": 45.88907766123835, "learning_rate": 5.115398381482862e-07, "logits/chosen": 12.183954238891602, "logits/rejected": 12.303359985351562, "logps/chosen": -3.850806951522827, "logps/rejected": -3.650766372680664, "loss": 3.6919, "rewards/accuracies": 0.25, "rewards/chosen": -38.50807189941406, "rewards/margins": -2.00040864944458, "rewards/rejected": -36.50766372680664, "step": 3445 }, { "epoch": 0.4692265795206972, "grad_norm": 42.626249153822656, "learning_rate": 5.11357227235323e-07, "logits/chosen": 12.827947616577148, "logits/rejected": 12.570892333984375, "logps/chosen": -3.9592533111572266, "logps/rejected": -4.1248087882995605, "loss": 4.2026, "rewards/accuracies": 1.0, "rewards/chosen": -39.592533111572266, "rewards/margins": 1.6555538177490234, "rewards/rejected": -41.24808883666992, "step": 3446 }, { "epoch": 0.4693627450980392, "grad_norm": 40.2752765368841, "learning_rate": 5.111745911602739e-07, "logits/chosen": 11.734051704406738, "logits/rejected": 12.581592559814453, "logps/chosen": -3.7324228286743164, "logps/rejected": -3.976604461669922, "loss": 3.6018, "rewards/accuracies": 0.75, "rewards/chosen": -37.3242301940918, "rewards/margins": 2.441814422607422, "rewards/rejected": -39.76604461669922, "step": 3447 }, { "epoch": 0.46949891067538124, "grad_norm": 45.7049188647798, "learning_rate": 5.109919299644069e-07, "logits/chosen": 12.332794189453125, "logits/rejected": 12.965343475341797, "logps/chosen": -3.7559661865234375, "logps/rejected": -4.136157035827637, "loss": 4.2359, "rewards/accuracies": 1.0, "rewards/chosen": -37.559661865234375, "rewards/margins": 3.8019094467163086, "rewards/rejected": -41.361572265625, "step": 3448 }, { "epoch": 0.4696350762527233, "grad_norm": 40.437699968140095, "learning_rate": 5.108092436889959e-07, "logits/chosen": 12.426504135131836, "logits/rejected": 12.282845497131348, "logps/chosen": -3.75433349609375, "logps/rejected": -4.308453559875488, "loss": 3.3398, "rewards/accuracies": 1.0, "rewards/chosen": -37.543331146240234, "rewards/margins": 5.541203498840332, "rewards/rejected": -43.08453369140625, "step": 3449 }, { "epoch": 0.46977124183006536, "grad_norm": 57.061136590834664, "learning_rate": 5.106265323753203e-07, "logits/chosen": 12.113241195678711, "logits/rejected": 12.58254623413086, "logps/chosen": -3.6164326667785645, "logps/rejected": -3.604398250579834, "loss": 3.9982, "rewards/accuracies": 0.25, "rewards/chosen": -36.16432571411133, "rewards/margins": -0.1203460693359375, "rewards/rejected": -36.043983459472656, "step": 3450 }, { "epoch": 0.4699074074074074, "grad_norm": 42.116455946493396, "learning_rate": 5.104437960646652e-07, "logits/chosen": 12.01574993133545, "logits/rejected": 12.95781135559082, "logps/chosen": -3.872694492340088, "logps/rejected": -4.201231002807617, "loss": 3.895, "rewards/accuracies": 1.0, "rewards/chosen": -38.72694396972656, "rewards/margins": 3.2853660583496094, "rewards/rejected": -42.012306213378906, "step": 3451 }, { "epoch": 0.47004357298474947, "grad_norm": 41.919339230487616, "learning_rate": 5.102610347983216e-07, "logits/chosen": 12.37060260772705, "logits/rejected": 12.73317813873291, "logps/chosen": -3.8941421508789062, "logps/rejected": -4.052318096160889, "loss": 4.4521, "rewards/accuracies": 0.75, "rewards/chosen": -38.94142532348633, "rewards/margins": 1.5817575454711914, "rewards/rejected": -40.5231819152832, "step": 3452 }, { "epoch": 0.4701797385620915, "grad_norm": 37.92314805665782, "learning_rate": 5.100782486175857e-07, "logits/chosen": 12.594902038574219, "logits/rejected": 12.803524017333984, "logps/chosen": -3.660679817199707, "logps/rejected": -4.2406840324401855, "loss": 4.1847, "rewards/accuracies": 0.75, "rewards/chosen": -36.6068000793457, "rewards/margins": 5.800040245056152, "rewards/rejected": -42.406837463378906, "step": 3453 }, { "epoch": 0.4703159041394335, "grad_norm": 43.8290279900757, "learning_rate": 5.098954375637595e-07, "logits/chosen": 11.62678337097168, "logits/rejected": 11.939079284667969, "logps/chosen": -3.730250835418701, "logps/rejected": -3.9448742866516113, "loss": 3.8938, "rewards/accuracies": 0.75, "rewards/chosen": -37.30250930786133, "rewards/margins": 2.1462326049804688, "rewards/rejected": -39.4487419128418, "step": 3454 }, { "epoch": 0.4704520697167756, "grad_norm": 40.7349569948887, "learning_rate": 5.097126016781508e-07, "logits/chosen": 12.287374496459961, "logits/rejected": 13.02037239074707, "logps/chosen": -3.6560869216918945, "logps/rejected": -4.021799564361572, "loss": 3.9421, "rewards/accuracies": 1.0, "rewards/chosen": -36.56087112426758, "rewards/margins": 3.6571264266967773, "rewards/rejected": -40.21799850463867, "step": 3455 }, { "epoch": 0.47058823529411764, "grad_norm": 51.29630871586455, "learning_rate": 5.09529741002073e-07, "logits/chosen": 12.071168899536133, "logits/rejected": 13.038068771362305, "logps/chosen": -3.3194351196289062, "logps/rejected": -3.902331829071045, "loss": 3.9074, "rewards/accuracies": 1.0, "rewards/chosen": -33.19435119628906, "rewards/margins": 5.8289666175842285, "rewards/rejected": -39.0233154296875, "step": 3456 }, { "epoch": 0.47072440087145967, "grad_norm": 44.23102975924152, "learning_rate": 5.093468555768446e-07, "logits/chosen": 11.690942764282227, "logits/rejected": 11.898439407348633, "logps/chosen": -3.6405205726623535, "logps/rejected": -3.7712044715881348, "loss": 3.7875, "rewards/accuracies": 0.75, "rewards/chosen": -36.40520477294922, "rewards/margins": 1.3068394660949707, "rewards/rejected": -37.71204376220703, "step": 3457 }, { "epoch": 0.47086056644880175, "grad_norm": 38.756690422963175, "learning_rate": 5.091639454437905e-07, "logits/chosen": 12.475274085998535, "logits/rejected": 12.387304306030273, "logps/chosen": -3.8238630294799805, "logps/rejected": -4.183985710144043, "loss": 3.6614, "rewards/accuracies": 1.0, "rewards/chosen": -38.23863220214844, "rewards/margins": 3.6012277603149414, "rewards/rejected": -41.83985900878906, "step": 3458 }, { "epoch": 0.4709967320261438, "grad_norm": 44.112238793545465, "learning_rate": 5.089810106442405e-07, "logits/chosen": 12.966625213623047, "logits/rejected": 14.025144577026367, "logps/chosen": -3.9100029468536377, "logps/rejected": -4.119258880615234, "loss": 3.9837, "rewards/accuracies": 0.5, "rewards/chosen": -39.10002899169922, "rewards/margins": 2.092559814453125, "rewards/rejected": -41.192588806152344, "step": 3459 }, { "epoch": 0.4711328976034858, "grad_norm": 55.69360108905607, "learning_rate": 5.087980512195303e-07, "logits/chosen": 13.22441291809082, "logits/rejected": 12.739684104919434, "logps/chosen": -3.9700374603271484, "logps/rejected": -3.9867682456970215, "loss": 3.6071, "rewards/accuracies": 0.75, "rewards/chosen": -39.700374603271484, "rewards/margins": 0.16730880737304688, "rewards/rejected": -39.86768341064453, "step": 3460 }, { "epoch": 0.4712690631808279, "grad_norm": 47.55661865616739, "learning_rate": 5.086150672110012e-07, "logits/chosen": 12.010684967041016, "logits/rejected": 12.806931495666504, "logps/chosen": -3.6317453384399414, "logps/rejected": -3.799886465072632, "loss": 4.462, "rewards/accuracies": 0.75, "rewards/chosen": -36.31745529174805, "rewards/margins": 1.6814098358154297, "rewards/rejected": -37.99886703491211, "step": 3461 }, { "epoch": 0.4714052287581699, "grad_norm": 43.216301854643916, "learning_rate": 5.084320586599997e-07, "logits/chosen": 11.670766830444336, "logits/rejected": 11.943243026733398, "logps/chosen": -3.5400214195251465, "logps/rejected": -3.9694879055023193, "loss": 3.5526, "rewards/accuracies": 1.0, "rewards/chosen": -35.40021514892578, "rewards/margins": 4.294666290283203, "rewards/rejected": -39.694881439208984, "step": 3462 }, { "epoch": 0.471541394335512, "grad_norm": 45.49065825197859, "learning_rate": 5.082490256078784e-07, "logits/chosen": 11.91767692565918, "logits/rejected": 12.757207870483398, "logps/chosen": -3.802906036376953, "logps/rejected": -4.185632705688477, "loss": 3.8581, "rewards/accuracies": 0.75, "rewards/chosen": -38.02906036376953, "rewards/margins": 3.8272666931152344, "rewards/rejected": -41.856327056884766, "step": 3463 }, { "epoch": 0.47167755991285404, "grad_norm": 49.1218907146656, "learning_rate": 5.080659680959947e-07, "logits/chosen": 10.859769821166992, "logits/rejected": 11.581829071044922, "logps/chosen": -3.2205400466918945, "logps/rejected": -3.762773036956787, "loss": 4.3127, "rewards/accuracies": 1.0, "rewards/chosen": -32.20540237426758, "rewards/margins": 5.422330379486084, "rewards/rejected": -37.62773132324219, "step": 3464 }, { "epoch": 0.47181372549019607, "grad_norm": 52.94044525066833, "learning_rate": 5.078828861657125e-07, "logits/chosen": 11.376958847045898, "logits/rejected": 11.766765594482422, "logps/chosen": -3.478665828704834, "logps/rejected": -3.803950309753418, "loss": 4.5453, "rewards/accuracies": 0.75, "rewards/chosen": -34.786659240722656, "rewards/margins": 3.252847671508789, "rewards/rejected": -38.03950500488281, "step": 3465 }, { "epoch": 0.47194989106753815, "grad_norm": 44.10559128140569, "learning_rate": 5.076997798584003e-07, "logits/chosen": 11.95348834991455, "logits/rejected": 12.573917388916016, "logps/chosen": -3.5096349716186523, "logps/rejected": -3.8710579872131348, "loss": 3.9636, "rewards/accuracies": 0.75, "rewards/chosen": -35.096351623535156, "rewards/margins": 3.6142287254333496, "rewards/rejected": -38.71057891845703, "step": 3466 }, { "epoch": 0.4720860566448802, "grad_norm": 46.110833029356115, "learning_rate": 5.075166492154325e-07, "logits/chosen": 12.594071388244629, "logits/rejected": 12.330011367797852, "logps/chosen": -3.758490800857544, "logps/rejected": -3.6218600273132324, "loss": 4.2671, "rewards/accuracies": 0.25, "rewards/chosen": -37.58490753173828, "rewards/margins": -1.3663086891174316, "rewards/rejected": -36.218597412109375, "step": 3467 }, { "epoch": 0.4722222222222222, "grad_norm": 48.462637375738026, "learning_rate": 5.073334942781893e-07, "logits/chosen": 12.165179252624512, "logits/rejected": 12.607924461364746, "logps/chosen": -3.6219098567962646, "logps/rejected": -3.819511890411377, "loss": 3.7997, "rewards/accuracies": 0.5, "rewards/chosen": -36.21910095214844, "rewards/margins": 1.9760193824768066, "rewards/rejected": -38.19511795043945, "step": 3468 }, { "epoch": 0.4723583877995643, "grad_norm": 41.77883758011458, "learning_rate": 5.071503150880556e-07, "logits/chosen": 11.073319435119629, "logits/rejected": 11.463754653930664, "logps/chosen": -3.284435749053955, "logps/rejected": -3.463080406188965, "loss": 3.7737, "rewards/accuracies": 0.5, "rewards/chosen": -32.844356536865234, "rewards/margins": 1.7864465713500977, "rewards/rejected": -34.630802154541016, "step": 3469 }, { "epoch": 0.4724945533769063, "grad_norm": 53.25761976384952, "learning_rate": 5.069671116864226e-07, "logits/chosen": 12.007633209228516, "logits/rejected": 12.902024269104004, "logps/chosen": -3.454573392868042, "logps/rejected": -4.069989204406738, "loss": 4.106, "rewards/accuracies": 1.0, "rewards/chosen": -34.545738220214844, "rewards/margins": 6.154156684875488, "rewards/rejected": -40.69989013671875, "step": 3470 }, { "epoch": 0.47263071895424835, "grad_norm": 44.762293992877595, "learning_rate": 5.067838841146865e-07, "logits/chosen": 12.281204223632812, "logits/rejected": 13.287631034851074, "logps/chosen": -4.243897438049316, "logps/rejected": -4.3681159019470215, "loss": 3.885, "rewards/accuracies": 0.5, "rewards/chosen": -42.43897247314453, "rewards/margins": 1.2421875, "rewards/rejected": -43.68115997314453, "step": 3471 }, { "epoch": 0.47276688453159044, "grad_norm": 44.5981884477738, "learning_rate": 5.06600632414249e-07, "logits/chosen": 12.156570434570312, "logits/rejected": 12.382379531860352, "logps/chosen": -3.5293431282043457, "logps/rejected": -3.5301125049591064, "loss": 4.0678, "rewards/accuracies": 0.5, "rewards/chosen": -35.29343032836914, "rewards/margins": 0.007692813873291016, "rewards/rejected": -35.301124572753906, "step": 3472 }, { "epoch": 0.47290305010893247, "grad_norm": 47.58417577253182, "learning_rate": 5.064173566265177e-07, "logits/chosen": 11.398226737976074, "logits/rejected": 11.73028564453125, "logps/chosen": -3.5616238117218018, "logps/rejected": -3.6207854747772217, "loss": 4.5427, "rewards/accuracies": 0.5, "rewards/chosen": -35.61623764038086, "rewards/margins": 0.5916152000427246, "rewards/rejected": -36.207855224609375, "step": 3473 }, { "epoch": 0.4730392156862745, "grad_norm": 43.99073160480905, "learning_rate": 5.062340567929048e-07, "logits/chosen": 11.366084098815918, "logits/rejected": 12.931180953979492, "logps/chosen": -3.6116557121276855, "logps/rejected": -3.869054079055786, "loss": 4.0339, "rewards/accuracies": 0.75, "rewards/chosen": -36.11655807495117, "rewards/margins": 2.5739822387695312, "rewards/rejected": -38.6905403137207, "step": 3474 }, { "epoch": 0.4731753812636166, "grad_norm": 48.80170474753227, "learning_rate": 5.060507329548286e-07, "logits/chosen": 12.849895477294922, "logits/rejected": 12.077442169189453, "logps/chosen": -3.9102492332458496, "logps/rejected": -3.856394052505493, "loss": 4.3374, "rewards/accuracies": 0.5, "rewards/chosen": -39.10249328613281, "rewards/margins": -0.5385551452636719, "rewards/rejected": -38.563941955566406, "step": 3475 }, { "epoch": 0.4733115468409586, "grad_norm": 42.666601368129456, "learning_rate": 5.058673851537127e-07, "logits/chosen": 11.518157958984375, "logits/rejected": 11.99515151977539, "logps/chosen": -3.6975293159484863, "logps/rejected": -3.91675066947937, "loss": 3.7272, "rewards/accuracies": 0.75, "rewards/chosen": -36.97529220581055, "rewards/margins": 2.192215919494629, "rewards/rejected": -39.16750717163086, "step": 3476 }, { "epoch": 0.47344771241830064, "grad_norm": 40.95436461128085, "learning_rate": 5.056840134309862e-07, "logits/chosen": 11.449361801147461, "logits/rejected": 12.592647552490234, "logps/chosen": -3.2665233612060547, "logps/rejected": -3.680511951446533, "loss": 3.6022, "rewards/accuracies": 1.0, "rewards/chosen": -32.66523361206055, "rewards/margins": 4.139885902404785, "rewards/rejected": -36.805118560791016, "step": 3477 }, { "epoch": 0.4735838779956427, "grad_norm": 47.95946715696173, "learning_rate": 5.05500617828083e-07, "logits/chosen": 11.760547637939453, "logits/rejected": 12.55750846862793, "logps/chosen": -3.8876655101776123, "logps/rejected": -4.216768264770508, "loss": 3.7259, "rewards/accuracies": 1.0, "rewards/chosen": -38.87665557861328, "rewards/margins": 3.2910232543945312, "rewards/rejected": -42.16767883300781, "step": 3478 }, { "epoch": 0.47372004357298475, "grad_norm": 61.69824203062534, "learning_rate": 5.053171983864433e-07, "logits/chosen": 11.900402069091797, "logits/rejected": 12.341737747192383, "logps/chosen": -3.8129377365112305, "logps/rejected": -3.956951856613159, "loss": 4.6257, "rewards/accuracies": 0.75, "rewards/chosen": -38.12937927246094, "rewards/margins": 1.4401378631591797, "rewards/rejected": -39.56951904296875, "step": 3479 }, { "epoch": 0.4738562091503268, "grad_norm": 47.077070172963005, "learning_rate": 5.05133755147512e-07, "logits/chosen": 12.196468353271484, "logits/rejected": 12.183940887451172, "logps/chosen": -3.8061490058898926, "logps/rejected": -4.122611999511719, "loss": 4.1747, "rewards/accuracies": 0.75, "rewards/chosen": -38.061492919921875, "rewards/margins": 3.1646289825439453, "rewards/rejected": -41.22611999511719, "step": 3480 }, { "epoch": 0.47399237472766886, "grad_norm": 40.426193125534915, "learning_rate": 5.049502881527398e-07, "logits/chosen": 12.247762680053711, "logits/rejected": 11.975564002990723, "logps/chosen": -3.5964622497558594, "logps/rejected": -3.9703176021575928, "loss": 3.8249, "rewards/accuracies": 0.75, "rewards/chosen": -35.96461868286133, "rewards/margins": 3.738554000854492, "rewards/rejected": -39.70317459106445, "step": 3481 }, { "epoch": 0.4741285403050109, "grad_norm": 52.343368931203926, "learning_rate": 5.047667974435823e-07, "logits/chosen": 11.55677604675293, "logits/rejected": 12.083403587341309, "logps/chosen": -3.8564352989196777, "logps/rejected": -4.060118675231934, "loss": 4.2023, "rewards/accuracies": 1.0, "rewards/chosen": -38.564353942871094, "rewards/margins": 2.0368337631225586, "rewards/rejected": -40.60118865966797, "step": 3482 }, { "epoch": 0.4742647058823529, "grad_norm": 43.48515570459564, "learning_rate": 5.04583283061501e-07, "logits/chosen": 12.190165519714355, "logits/rejected": 11.670365333557129, "logps/chosen": -3.5001392364501953, "logps/rejected": -3.8559069633483887, "loss": 4.4366, "rewards/accuracies": 1.0, "rewards/chosen": -35.00138854980469, "rewards/margins": 3.557680130004883, "rewards/rejected": -38.5590705871582, "step": 3483 }, { "epoch": 0.474400871459695, "grad_norm": 52.47913525100792, "learning_rate": 5.043997450479622e-07, "logits/chosen": 11.137260437011719, "logits/rejected": 11.88235855102539, "logps/chosen": -3.449737071990967, "logps/rejected": -3.5914218425750732, "loss": 3.3365, "rewards/accuracies": 0.75, "rewards/chosen": -34.49736785888672, "rewards/margins": 1.4168481826782227, "rewards/rejected": -35.91421890258789, "step": 3484 }, { "epoch": 0.47453703703703703, "grad_norm": 51.30511891750424, "learning_rate": 5.042161834444383e-07, "logits/chosen": 12.492728233337402, "logits/rejected": 12.781244277954102, "logps/chosen": -3.7502710819244385, "logps/rejected": -3.82391095161438, "loss": 3.7422, "rewards/accuracies": 0.5, "rewards/chosen": -37.502708435058594, "rewards/margins": 0.7363996505737305, "rewards/rejected": -38.23910903930664, "step": 3485 }, { "epoch": 0.47467320261437906, "grad_norm": 45.02687457788781, "learning_rate": 5.040325982924062e-07, "logits/chosen": 11.758543014526367, "logits/rejected": 11.767305374145508, "logps/chosen": -3.606889247894287, "logps/rejected": -3.7835476398468018, "loss": 3.4033, "rewards/accuracies": 1.0, "rewards/chosen": -36.06889343261719, "rewards/margins": 1.7665824890136719, "rewards/rejected": -37.83547592163086, "step": 3486 }, { "epoch": 0.47480936819172115, "grad_norm": 41.96354916389152, "learning_rate": 5.038489896333485e-07, "logits/chosen": 11.545456886291504, "logits/rejected": 11.524782180786133, "logps/chosen": -4.0318827629089355, "logps/rejected": -3.862412214279175, "loss": 4.3335, "rewards/accuracies": 0.25, "rewards/chosen": -40.31882858276367, "rewards/margins": -1.6947050094604492, "rewards/rejected": -38.624122619628906, "step": 3487 }, { "epoch": 0.4749455337690632, "grad_norm": 50.4248497571284, "learning_rate": 5.036653575087533e-07, "logits/chosen": 11.832893371582031, "logits/rejected": 10.418866157531738, "logps/chosen": -3.747529983520508, "logps/rejected": -3.7007956504821777, "loss": 4.4778, "rewards/accuracies": 0.5, "rewards/chosen": -37.475303649902344, "rewards/margins": -0.4673452377319336, "rewards/rejected": -37.007957458496094, "step": 3488 }, { "epoch": 0.4750816993464052, "grad_norm": 43.68047585462919, "learning_rate": 5.034817019601135e-07, "logits/chosen": 12.014225006103516, "logits/rejected": 12.355489730834961, "logps/chosen": -4.066751956939697, "logps/rejected": -3.950878858566284, "loss": 3.824, "rewards/accuracies": 0.25, "rewards/chosen": -40.66752243041992, "rewards/margins": -1.1587333679199219, "rewards/rejected": -39.5087890625, "step": 3489 }, { "epoch": 0.4752178649237473, "grad_norm": 65.04400493342318, "learning_rate": 5.032980230289279e-07, "logits/chosen": 12.091022491455078, "logits/rejected": 12.231176376342773, "logps/chosen": -3.809943675994873, "logps/rejected": -3.9385180473327637, "loss": 4.8551, "rewards/accuracies": 0.75, "rewards/chosen": -38.09943389892578, "rewards/margins": 1.2857446670532227, "rewards/rejected": -39.38517761230469, "step": 3490 }, { "epoch": 0.4753540305010893, "grad_norm": 43.79436137445352, "learning_rate": 5.031143207567001e-07, "logits/chosen": 11.345588684082031, "logits/rejected": 11.765430450439453, "logps/chosen": -3.626291275024414, "logps/rejected": -3.9734859466552734, "loss": 3.8831, "rewards/accuracies": 0.75, "rewards/chosen": -36.26291275024414, "rewards/margins": 3.4719467163085938, "rewards/rejected": -39.73486328125, "step": 3491 }, { "epoch": 0.47549019607843135, "grad_norm": 45.07492708796708, "learning_rate": 5.029305951849391e-07, "logits/chosen": 11.325843811035156, "logits/rejected": 11.42584228515625, "logps/chosen": -3.74947452545166, "logps/rejected": -4.0123186111450195, "loss": 3.9089, "rewards/accuracies": 0.5, "rewards/chosen": -37.494747161865234, "rewards/margins": 2.6284408569335938, "rewards/rejected": -40.12318420410156, "step": 3492 }, { "epoch": 0.47562636165577343, "grad_norm": 43.41960065673161, "learning_rate": 5.027468463551594e-07, "logits/chosen": 11.81930160522461, "logits/rejected": 12.185911178588867, "logps/chosen": -3.869952917098999, "logps/rejected": -3.854452610015869, "loss": 3.8125, "rewards/accuracies": 0.5, "rewards/chosen": -38.69953155517578, "rewards/margins": -0.15500354766845703, "rewards/rejected": -38.544525146484375, "step": 3493 }, { "epoch": 0.47576252723311546, "grad_norm": 44.22361224250434, "learning_rate": 5.025630743088804e-07, "logits/chosen": 11.602495193481445, "logits/rejected": 11.635064125061035, "logps/chosen": -3.67983078956604, "logps/rejected": -3.6782405376434326, "loss": 4.4902, "rewards/accuracies": 0.75, "rewards/chosen": -36.798309326171875, "rewards/margins": -0.015903472900390625, "rewards/rejected": -36.782405853271484, "step": 3494 }, { "epoch": 0.4758986928104575, "grad_norm": 44.243393349148455, "learning_rate": 5.023792790876269e-07, "logits/chosen": 11.76329231262207, "logits/rejected": 12.558062553405762, "logps/chosen": -3.5760512351989746, "logps/rejected": -4.068709850311279, "loss": 4.0599, "rewards/accuracies": 1.0, "rewards/chosen": -35.76051330566406, "rewards/margins": 4.926587104797363, "rewards/rejected": -40.687103271484375, "step": 3495 }, { "epoch": 0.4760348583877996, "grad_norm": 46.146782077157354, "learning_rate": 5.021954607329291e-07, "logits/chosen": 12.985897064208984, "logits/rejected": 12.722444534301758, "logps/chosen": -4.006645202636719, "logps/rejected": -4.206429481506348, "loss": 3.6711, "rewards/accuracies": 0.5, "rewards/chosen": -40.06645202636719, "rewards/margins": 1.9978437423706055, "rewards/rejected": -42.06429672241211, "step": 3496 }, { "epoch": 0.4761710239651416, "grad_norm": 45.031602009489816, "learning_rate": 5.02011619286322e-07, "logits/chosen": 12.69804573059082, "logits/rejected": 12.579259872436523, "logps/chosen": -4.129644393920898, "logps/rejected": -4.098203659057617, "loss": 4.5111, "rewards/accuracies": 0.5, "rewards/chosen": -41.29644775390625, "rewards/margins": -0.31441307067871094, "rewards/rejected": -40.982032775878906, "step": 3497 }, { "epoch": 0.47630718954248363, "grad_norm": 42.58684253283392, "learning_rate": 5.018277547893465e-07, "logits/chosen": 12.909274101257324, "logits/rejected": 13.232503890991211, "logps/chosen": -3.8586983680725098, "logps/rejected": -4.444545745849609, "loss": 4.005, "rewards/accuracies": 1.0, "rewards/chosen": -38.58698272705078, "rewards/margins": 5.858473777770996, "rewards/rejected": -44.445457458496094, "step": 3498 }, { "epoch": 0.4764433551198257, "grad_norm": 44.08339491189729, "learning_rate": 5.016438672835481e-07, "logits/chosen": 12.368638038635254, "logits/rejected": 12.152393341064453, "logps/chosen": -4.018139839172363, "logps/rejected": -4.459920406341553, "loss": 3.8372, "rewards/accuracies": 0.75, "rewards/chosen": -40.181396484375, "rewards/margins": 4.417808532714844, "rewards/rejected": -44.599205017089844, "step": 3499 }, { "epoch": 0.47657952069716775, "grad_norm": 43.53970861503491, "learning_rate": 5.014599568104776e-07, "logits/chosen": 12.973453521728516, "logits/rejected": 12.705853462219238, "logps/chosen": -3.7092227935791016, "logps/rejected": -4.091135025024414, "loss": 4.145, "rewards/accuracies": 0.75, "rewards/chosen": -37.092227935791016, "rewards/margins": 3.819119453430176, "rewards/rejected": -40.911346435546875, "step": 3500 }, { "epoch": 0.47671568627450983, "grad_norm": 42.646487341121926, "learning_rate": 5.012760234116912e-07, "logits/chosen": 12.241739273071289, "logits/rejected": 12.216365814208984, "logps/chosen": -3.9216113090515137, "logps/rejected": -3.9767630100250244, "loss": 3.8439, "rewards/accuracies": 0.5, "rewards/chosen": -39.21611022949219, "rewards/margins": 0.551518440246582, "rewards/rejected": -39.76762771606445, "step": 3501 }, { "epoch": 0.47685185185185186, "grad_norm": 46.757004765273116, "learning_rate": 5.010920671287501e-07, "logits/chosen": 11.796041488647461, "logits/rejected": 12.49293327331543, "logps/chosen": -3.861351490020752, "logps/rejected": -3.985126495361328, "loss": 3.9517, "rewards/accuracies": 0.75, "rewards/chosen": -38.6135139465332, "rewards/margins": 1.2377510070800781, "rewards/rejected": -39.85126495361328, "step": 3502 }, { "epoch": 0.4769880174291939, "grad_norm": 42.13718791838605, "learning_rate": 5.00908088003221e-07, "logits/chosen": 12.989959716796875, "logits/rejected": 13.06469440460205, "logps/chosen": -4.077593803405762, "logps/rejected": -4.109524726867676, "loss": 3.7714, "rewards/accuracies": 0.5, "rewards/chosen": -40.77593994140625, "rewards/margins": 0.3193092346191406, "rewards/rejected": -41.095245361328125, "step": 3503 }, { "epoch": 0.477124183006536, "grad_norm": 44.438165889551385, "learning_rate": 5.007240860766751e-07, "logits/chosen": 12.125844955444336, "logits/rejected": 12.552591323852539, "logps/chosen": -4.2153167724609375, "logps/rejected": -4.483528137207031, "loss": 3.7911, "rewards/accuracies": 1.0, "rewards/chosen": -42.153167724609375, "rewards/margins": 2.6821165084838867, "rewards/rejected": -44.83528137207031, "step": 3504 }, { "epoch": 0.477260348583878, "grad_norm": 51.67812918918693, "learning_rate": 5.005400613906894e-07, "logits/chosen": 11.936090469360352, "logits/rejected": 13.395362854003906, "logps/chosen": -3.5870163440704346, "logps/rejected": -3.875542163848877, "loss": 4.5059, "rewards/accuracies": 0.5, "rewards/chosen": -35.87016296386719, "rewards/margins": 2.8852548599243164, "rewards/rejected": -38.75542068481445, "step": 3505 }, { "epoch": 0.47739651416122003, "grad_norm": 64.96055770125729, "learning_rate": 5.003560139868457e-07, "logits/chosen": 12.806816101074219, "logits/rejected": 13.109579086303711, "logps/chosen": -4.161996841430664, "logps/rejected": -4.255047798156738, "loss": 4.5619, "rewards/accuracies": 0.75, "rewards/chosen": -41.619972229003906, "rewards/margins": 0.930511474609375, "rewards/rejected": -42.55048370361328, "step": 3506 }, { "epoch": 0.4775326797385621, "grad_norm": 47.14341700328676, "learning_rate": 5.001719439067312e-07, "logits/chosen": 12.200410842895508, "logits/rejected": 12.633358001708984, "logps/chosen": -3.626732587814331, "logps/rejected": -3.9137802124023438, "loss": 4.7483, "rewards/accuracies": 0.75, "rewards/chosen": -36.26732635498047, "rewards/margins": 2.8704776763916016, "rewards/rejected": -39.13780212402344, "step": 3507 }, { "epoch": 0.47766884531590414, "grad_norm": 43.75299233285623, "learning_rate": 4.999878511919378e-07, "logits/chosen": 11.864341735839844, "logits/rejected": 13.881298065185547, "logps/chosen": -3.905734062194824, "logps/rejected": -4.640348434448242, "loss": 3.6464, "rewards/accuracies": 1.0, "rewards/chosen": -39.057342529296875, "rewards/margins": 7.346139907836914, "rewards/rejected": -46.403480529785156, "step": 3508 }, { "epoch": 0.4778050108932462, "grad_norm": 44.93561905067308, "learning_rate": 4.998037358840632e-07, "logits/chosen": 11.476189613342285, "logits/rejected": 13.461118698120117, "logps/chosen": -3.949709415435791, "logps/rejected": -4.111830234527588, "loss": 4.2062, "rewards/accuracies": 0.75, "rewards/chosen": -39.497093200683594, "rewards/margins": 1.621206283569336, "rewards/rejected": -41.11830139160156, "step": 3509 }, { "epoch": 0.47794117647058826, "grad_norm": 44.700733092664564, "learning_rate": 4.996195980247091e-07, "logits/chosen": 12.255202293395996, "logits/rejected": 12.025875091552734, "logps/chosen": -3.6299405097961426, "logps/rejected": -3.958530902862549, "loss": 3.7299, "rewards/accuracies": 0.75, "rewards/chosen": -36.299407958984375, "rewards/margins": 3.2859058380126953, "rewards/rejected": -39.58531188964844, "step": 3510 }, { "epoch": 0.4780773420479303, "grad_norm": 41.22816826751256, "learning_rate": 4.994354376554836e-07, "logits/chosen": 12.988462448120117, "logits/rejected": 13.080317497253418, "logps/chosen": -4.032963752746582, "logps/rejected": -4.633332252502441, "loss": 3.8156, "rewards/accuracies": 1.0, "rewards/chosen": -40.32963562011719, "rewards/margins": 6.00368595123291, "rewards/rejected": -46.33332061767578, "step": 3511 }, { "epoch": 0.4782135076252723, "grad_norm": 47.610234568178676, "learning_rate": 4.99251254817999e-07, "logits/chosen": 13.005508422851562, "logits/rejected": 13.983731269836426, "logps/chosen": -4.169189453125, "logps/rejected": -4.114564895629883, "loss": 3.9306, "rewards/accuracies": 0.5, "rewards/chosen": -41.69189453125, "rewards/margins": -0.5462446212768555, "rewards/rejected": -41.145652770996094, "step": 3512 }, { "epoch": 0.4783496732026144, "grad_norm": 49.72185297204499, "learning_rate": 4.99067049553873e-07, "logits/chosen": 12.777220726013184, "logits/rejected": 13.200166702270508, "logps/chosen": -4.116327285766602, "logps/rejected": -4.114206314086914, "loss": 4.2964, "rewards/accuracies": 0.75, "rewards/chosen": -41.163272857666016, "rewards/margins": -0.02120685577392578, "rewards/rejected": -41.142066955566406, "step": 3513 }, { "epoch": 0.47848583877995643, "grad_norm": 50.06841328939565, "learning_rate": 4.988828219047282e-07, "logits/chosen": 12.497295379638672, "logits/rejected": 12.423290252685547, "logps/chosen": -4.283738136291504, "logps/rejected": -4.158769607543945, "loss": 3.9417, "rewards/accuracies": 0.25, "rewards/chosen": -42.837379455566406, "rewards/margins": -1.2496843338012695, "rewards/rejected": -41.58769226074219, "step": 3514 }, { "epoch": 0.47862200435729846, "grad_norm": 46.55087923710159, "learning_rate": 4.986985719121923e-07, "logits/chosen": 11.828707695007324, "logits/rejected": 13.31447982788086, "logps/chosen": -4.0437726974487305, "logps/rejected": -4.459965705871582, "loss": 3.4364, "rewards/accuracies": 1.0, "rewards/chosen": -40.43772888183594, "rewards/margins": 4.161928176879883, "rewards/rejected": -44.59965515136719, "step": 3515 }, { "epoch": 0.47875816993464054, "grad_norm": 46.533081673045146, "learning_rate": 4.985142996178984e-07, "logits/chosen": 13.446540832519531, "logits/rejected": 13.797381401062012, "logps/chosen": -4.069091796875, "logps/rejected": -4.319952964782715, "loss": 4.0496, "rewards/accuracies": 0.5, "rewards/chosen": -40.69091796875, "rewards/margins": 2.5086097717285156, "rewards/rejected": -43.199527740478516, "step": 3516 }, { "epoch": 0.47889433551198257, "grad_norm": 47.089129647625434, "learning_rate": 4.983300050634841e-07, "logits/chosen": 13.038769721984863, "logits/rejected": 12.72958755493164, "logps/chosen": -4.410326957702637, "logps/rejected": -4.438958168029785, "loss": 3.6967, "rewards/accuracies": 0.5, "rewards/chosen": -44.103271484375, "rewards/margins": 0.28630828857421875, "rewards/rejected": -44.38957595825195, "step": 3517 }, { "epoch": 0.4790305010893246, "grad_norm": 49.291819450632296, "learning_rate": 4.981456882905924e-07, "logits/chosen": 12.49184799194336, "logits/rejected": 13.497576713562012, "logps/chosen": -4.109191417694092, "logps/rejected": -4.102299690246582, "loss": 4.1218, "rewards/accuracies": 0.5, "rewards/chosen": -41.09191131591797, "rewards/margins": -0.06891441345214844, "rewards/rejected": -41.02299499511719, "step": 3518 }, { "epoch": 0.4791666666666667, "grad_norm": 46.66159970041809, "learning_rate": 4.979613493408711e-07, "logits/chosen": 11.236282348632812, "logits/rejected": 11.716537475585938, "logps/chosen": -3.428483486175537, "logps/rejected": -3.7603302001953125, "loss": 4.1593, "rewards/accuracies": 0.75, "rewards/chosen": -34.28483581542969, "rewards/margins": 3.3184690475463867, "rewards/rejected": -37.603302001953125, "step": 3519 }, { "epoch": 0.4793028322440087, "grad_norm": 44.79030239010586, "learning_rate": 4.977769882559731e-07, "logits/chosen": 12.716320037841797, "logits/rejected": 12.365806579589844, "logps/chosen": -3.8286421298980713, "logps/rejected": -3.9077296257019043, "loss": 4.5177, "rewards/accuracies": 0.75, "rewards/chosen": -38.28641891479492, "rewards/margins": 0.7908763885498047, "rewards/rejected": -39.07729721069336, "step": 3520 }, { "epoch": 0.47943899782135074, "grad_norm": 51.21870159788441, "learning_rate": 4.975926050775565e-07, "logits/chosen": 11.235718727111816, "logits/rejected": 11.672706604003906, "logps/chosen": -4.054842948913574, "logps/rejected": -4.189540863037109, "loss": 3.7652, "rewards/accuracies": 0.75, "rewards/chosen": -40.54842758178711, "rewards/margins": 1.3469829559326172, "rewards/rejected": -41.895408630371094, "step": 3521 }, { "epoch": 0.4795751633986928, "grad_norm": 45.18165252660419, "learning_rate": 4.97408199847284e-07, "logits/chosen": 12.15424919128418, "logits/rejected": 12.724010467529297, "logps/chosen": -3.8729283809661865, "logps/rejected": -4.163315773010254, "loss": 4.2385, "rewards/accuracies": 0.5, "rewards/chosen": -38.729286193847656, "rewards/margins": 2.903871536254883, "rewards/rejected": -41.633155822753906, "step": 3522 }, { "epoch": 0.47971132897603486, "grad_norm": 42.508767730253176, "learning_rate": 4.972237726068236e-07, "logits/chosen": 11.872096061706543, "logits/rejected": 12.599332809448242, "logps/chosen": -3.8207755088806152, "logps/rejected": -4.211994647979736, "loss": 4.276, "rewards/accuracies": 0.75, "rewards/chosen": -38.20775604248047, "rewards/margins": 3.9121885299682617, "rewards/rejected": -42.11994171142578, "step": 3523 }, { "epoch": 0.4798474945533769, "grad_norm": 44.447815288691714, "learning_rate": 4.970393233978481e-07, "logits/chosen": 13.269689559936523, "logits/rejected": 13.244810104370117, "logps/chosen": -4.269576072692871, "logps/rejected": -4.501667022705078, "loss": 3.9473, "rewards/accuracies": 0.75, "rewards/chosen": -42.695762634277344, "rewards/margins": 2.3209095001220703, "rewards/rejected": -45.01667404174805, "step": 3524 }, { "epoch": 0.47998366013071897, "grad_norm": 40.370483225517795, "learning_rate": 4.968548522620353e-07, "logits/chosen": 11.890459060668945, "logits/rejected": 12.773843765258789, "logps/chosen": -3.8893074989318848, "logps/rejected": -4.402711391448975, "loss": 4.1511, "rewards/accuracies": 0.75, "rewards/chosen": -38.89307403564453, "rewards/margins": 5.134037971496582, "rewards/rejected": -44.02711486816406, "step": 3525 }, { "epoch": 0.480119825708061, "grad_norm": 46.8591762718492, "learning_rate": 4.966703592410681e-07, "logits/chosen": 11.983207702636719, "logits/rejected": 12.781761169433594, "logps/chosen": -3.7274889945983887, "logps/rejected": -3.788747787475586, "loss": 3.9809, "rewards/accuracies": 0.75, "rewards/chosen": -37.27488708496094, "rewards/margins": 0.6125869750976562, "rewards/rejected": -37.88747787475586, "step": 3526 }, { "epoch": 0.480255991285403, "grad_norm": 46.40955837241026, "learning_rate": 4.964858443766341e-07, "logits/chosen": 12.669668197631836, "logits/rejected": 11.822920799255371, "logps/chosen": -4.022997856140137, "logps/rejected": -4.024096488952637, "loss": 4.4754, "rewards/accuracies": 0.5, "rewards/chosen": -40.22998046875, "rewards/margins": 0.010987281799316406, "rewards/rejected": -40.240966796875, "step": 3527 }, { "epoch": 0.4803921568627451, "grad_norm": 45.34592176870964, "learning_rate": 4.96301307710426e-07, "logits/chosen": 12.081718444824219, "logits/rejected": 12.334567070007324, "logps/chosen": -3.969679355621338, "logps/rejected": -4.04118013381958, "loss": 3.6414, "rewards/accuracies": 0.75, "rewards/chosen": -39.69679260253906, "rewards/margins": 0.7150058746337891, "rewards/rejected": -40.411800384521484, "step": 3528 }, { "epoch": 0.48052832244008714, "grad_norm": 43.27686249374847, "learning_rate": 4.961167492841414e-07, "logits/chosen": 12.761405944824219, "logits/rejected": 13.355031967163086, "logps/chosen": -3.970335006713867, "logps/rejected": -4.173633098602295, "loss": 3.9775, "rewards/accuracies": 0.5, "rewards/chosen": -39.70335006713867, "rewards/margins": 2.0329818725585938, "rewards/rejected": -41.736331939697266, "step": 3529 }, { "epoch": 0.48066448801742917, "grad_norm": 40.69458203339966, "learning_rate": 4.959321691394828e-07, "logits/chosen": 11.948124885559082, "logits/rejected": 12.86489200592041, "logps/chosen": -4.174182891845703, "logps/rejected": -4.3195905685424805, "loss": 3.6798, "rewards/accuracies": 0.75, "rewards/chosen": -41.74182891845703, "rewards/margins": 1.4540777206420898, "rewards/rejected": -43.19590759277344, "step": 3530 }, { "epoch": 0.48080065359477125, "grad_norm": 47.521610119745944, "learning_rate": 4.957475673181576e-07, "logits/chosen": 11.83499526977539, "logits/rejected": 12.619794845581055, "logps/chosen": -3.751966953277588, "logps/rejected": -3.8906397819519043, "loss": 4.66, "rewards/accuracies": 0.75, "rewards/chosen": -37.51966857910156, "rewards/margins": 1.3867292404174805, "rewards/rejected": -38.90639877319336, "step": 3531 }, { "epoch": 0.4809368191721133, "grad_norm": 44.91828213193647, "learning_rate": 4.955629438618782e-07, "logits/chosen": 13.076985359191895, "logits/rejected": 12.643661499023438, "logps/chosen": -4.036026477813721, "logps/rejected": -3.845686912536621, "loss": 3.5562, "rewards/accuracies": 0.25, "rewards/chosen": -40.36026382446289, "rewards/margins": -1.9033946990966797, "rewards/rejected": -38.456871032714844, "step": 3532 }, { "epoch": 0.4810729847494553, "grad_norm": 47.181648568766796, "learning_rate": 4.953782988123615e-07, "logits/chosen": 13.688179969787598, "logits/rejected": 13.357622146606445, "logps/chosen": -4.287514686584473, "logps/rejected": -4.083556652069092, "loss": 4.6281, "rewards/accuracies": 0.25, "rewards/chosen": -42.875144958496094, "rewards/margins": -2.0395774841308594, "rewards/rejected": -40.835567474365234, "step": 3533 }, { "epoch": 0.4812091503267974, "grad_norm": 52.663904962154945, "learning_rate": 4.951936322113299e-07, "logits/chosen": 11.369107246398926, "logits/rejected": 12.146186828613281, "logps/chosen": -3.595536947250366, "logps/rejected": -4.021071434020996, "loss": 4.3141, "rewards/accuracies": 1.0, "rewards/chosen": -35.95537185668945, "rewards/margins": 4.255342483520508, "rewards/rejected": -40.21071243286133, "step": 3534 }, { "epoch": 0.4813453159041394, "grad_norm": 52.28666223466453, "learning_rate": 4.950089441005102e-07, "logits/chosen": 13.07957649230957, "logits/rejected": 13.176177978515625, "logps/chosen": -4.456065654754639, "logps/rejected": -4.511307716369629, "loss": 3.8589, "rewards/accuracies": 0.75, "rewards/chosen": -44.5606575012207, "rewards/margins": 0.5524168014526367, "rewards/rejected": -45.11307144165039, "step": 3535 }, { "epoch": 0.48148148148148145, "grad_norm": 39.67308420168177, "learning_rate": 4.948242345216343e-07, "logits/chosen": 11.832839965820312, "logits/rejected": 12.580440521240234, "logps/chosen": -3.7073538303375244, "logps/rejected": -3.693882465362549, "loss": 3.9936, "rewards/accuracies": 0.5, "rewards/chosen": -37.07353973388672, "rewards/margins": -0.13471317291259766, "rewards/rejected": -36.93882369995117, "step": 3536 }, { "epoch": 0.48161764705882354, "grad_norm": 38.320016981900814, "learning_rate": 4.946395035164387e-07, "logits/chosen": 12.837193489074707, "logits/rejected": 12.506457328796387, "logps/chosen": -3.9787769317626953, "logps/rejected": -3.7720251083374023, "loss": 4.0294, "rewards/accuracies": 0.0, "rewards/chosen": -39.78776550292969, "rewards/margins": -2.0675172805786133, "rewards/rejected": -37.720252990722656, "step": 3537 }, { "epoch": 0.48175381263616557, "grad_norm": 39.36991310361599, "learning_rate": 4.94454751126665e-07, "logits/chosen": 12.028985977172852, "logits/rejected": 12.811285972595215, "logps/chosen": -3.8986124992370605, "logps/rejected": -4.433712005615234, "loss": 3.4825, "rewards/accuracies": 0.75, "rewards/chosen": -38.986122131347656, "rewards/margins": 5.350996971130371, "rewards/rejected": -44.337120056152344, "step": 3538 }, { "epoch": 0.48188997821350765, "grad_norm": 41.900899325157084, "learning_rate": 4.942699773940595e-07, "logits/chosen": 12.364437103271484, "logits/rejected": 13.067447662353516, "logps/chosen": -3.6250901222229004, "logps/rejected": -4.028138160705566, "loss": 4.4353, "rewards/accuracies": 1.0, "rewards/chosen": -36.25090026855469, "rewards/margins": 4.030486106872559, "rewards/rejected": -40.28138732910156, "step": 3539 }, { "epoch": 0.4820261437908497, "grad_norm": 45.876410833806084, "learning_rate": 4.940851823603733e-07, "logits/chosen": 13.542045593261719, "logits/rejected": 12.09347915649414, "logps/chosen": -4.564789772033691, "logps/rejected": -4.221229553222656, "loss": 4.3319, "rewards/accuracies": 0.25, "rewards/chosen": -45.64789581298828, "rewards/margins": -3.4356021881103516, "rewards/rejected": -42.21229553222656, "step": 3540 }, { "epoch": 0.4821623093681917, "grad_norm": 41.788248074271166, "learning_rate": 4.939003660673625e-07, "logits/chosen": 11.202617645263672, "logits/rejected": 13.327274322509766, "logps/chosen": -3.6133310794830322, "logps/rejected": -4.13970947265625, "loss": 3.8793, "rewards/accuracies": 1.0, "rewards/chosen": -36.13330841064453, "rewards/margins": 5.263786315917969, "rewards/rejected": -41.3970947265625, "step": 3541 }, { "epoch": 0.4822984749455338, "grad_norm": 42.918568787651395, "learning_rate": 4.937155285567879e-07, "logits/chosen": 12.58751106262207, "logits/rejected": 12.478883743286133, "logps/chosen": -3.884431838989258, "logps/rejected": -4.192852020263672, "loss": 3.7104, "rewards/accuracies": 1.0, "rewards/chosen": -38.84431457519531, "rewards/margins": 3.084199905395508, "rewards/rejected": -41.92851638793945, "step": 3542 }, { "epoch": 0.4824346405228758, "grad_norm": 44.712702683739934, "learning_rate": 4.935306698704148e-07, "logits/chosen": 13.640189170837402, "logits/rejected": 13.907655715942383, "logps/chosen": -4.607356071472168, "logps/rejected": -4.551753044128418, "loss": 4.2717, "rewards/accuracies": 0.25, "rewards/chosen": -46.07355499267578, "rewards/margins": -0.5560274124145508, "rewards/rejected": -45.51753234863281, "step": 3543 }, { "epoch": 0.48257080610021785, "grad_norm": 40.00153983234914, "learning_rate": 4.933457900500138e-07, "logits/chosen": 11.340171813964844, "logits/rejected": 11.686779022216797, "logps/chosen": -4.055015563964844, "logps/rejected": -4.032522201538086, "loss": 4.2312, "rewards/accuracies": 0.5, "rewards/chosen": -40.55015563964844, "rewards/margins": -0.2249298095703125, "rewards/rejected": -40.325225830078125, "step": 3544 }, { "epoch": 0.48270697167755994, "grad_norm": 40.92105208910108, "learning_rate": 4.931608891373599e-07, "logits/chosen": 12.324609756469727, "logits/rejected": 13.470251083374023, "logps/chosen": -3.8032095432281494, "logps/rejected": -4.137235641479492, "loss": 4.54, "rewards/accuracies": 0.75, "rewards/chosen": -38.03209686279297, "rewards/margins": 3.340256690979004, "rewards/rejected": -41.372352600097656, "step": 3545 }, { "epoch": 0.48284313725490197, "grad_norm": 42.386478539159164, "learning_rate": 4.92975967174233e-07, "logits/chosen": 11.887628555297852, "logits/rejected": 11.948454856872559, "logps/chosen": -4.331608772277832, "logps/rejected": -4.4940667152404785, "loss": 3.7552, "rewards/accuracies": 1.0, "rewards/chosen": -43.31608963012695, "rewards/margins": 1.6245803833007812, "rewards/rejected": -44.940670013427734, "step": 3546 }, { "epoch": 0.482979302832244, "grad_norm": 42.755297095421405, "learning_rate": 4.927910242024178e-07, "logits/chosen": 12.573522567749023, "logits/rejected": 12.019991874694824, "logps/chosen": -3.939002752304077, "logps/rejected": -3.884133815765381, "loss": 3.8634, "rewards/accuracies": 0.5, "rewards/chosen": -39.39002990722656, "rewards/margins": -0.5486907958984375, "rewards/rejected": -38.841339111328125, "step": 3547 }, { "epoch": 0.4831154684095861, "grad_norm": 45.397858089119204, "learning_rate": 4.926060602637037e-07, "logits/chosen": 12.000389099121094, "logits/rejected": 12.224279403686523, "logps/chosen": -4.073457717895508, "logps/rejected": -4.074176788330078, "loss": 4.06, "rewards/accuracies": 0.5, "rewards/chosen": -40.73457717895508, "rewards/margins": 0.007190704345703125, "rewards/rejected": -40.74176788330078, "step": 3548 }, { "epoch": 0.4832516339869281, "grad_norm": 39.34717636086848, "learning_rate": 4.924210753998847e-07, "logits/chosen": 13.294403076171875, "logits/rejected": 13.156927108764648, "logps/chosen": -4.426900386810303, "logps/rejected": -4.188103675842285, "loss": 4.1699, "rewards/accuracies": 0.25, "rewards/chosen": -44.269004821777344, "rewards/margins": -2.387972831726074, "rewards/rejected": -41.88103103637695, "step": 3549 }, { "epoch": 0.48338779956427014, "grad_norm": 42.058122856475634, "learning_rate": 4.922360696527599e-07, "logits/chosen": 12.128764152526855, "logits/rejected": 12.041800498962402, "logps/chosen": -3.8771872520446777, "logps/rejected": -4.058915615081787, "loss": 4.3048, "rewards/accuracies": 0.5, "rewards/chosen": -38.771873474121094, "rewards/margins": 1.8172845840454102, "rewards/rejected": -40.58915710449219, "step": 3550 }, { "epoch": 0.4835239651416122, "grad_norm": 39.59373054477959, "learning_rate": 4.920510430641327e-07, "logits/chosen": 11.923447608947754, "logits/rejected": 12.862533569335938, "logps/chosen": -3.944695472717285, "logps/rejected": -4.52623176574707, "loss": 3.9153, "rewards/accuracies": 0.75, "rewards/chosen": -39.446956634521484, "rewards/margins": 5.815359115600586, "rewards/rejected": -45.26231384277344, "step": 3551 }, { "epoch": 0.48366013071895425, "grad_norm": 44.94573848102199, "learning_rate": 4.918659956758113e-07, "logits/chosen": 12.231815338134766, "logits/rejected": 13.016324996948242, "logps/chosen": -3.8495004177093506, "logps/rejected": -4.19316291809082, "loss": 3.5413, "rewards/accuracies": 0.75, "rewards/chosen": -38.49500274658203, "rewards/margins": 3.4366273880004883, "rewards/rejected": -41.9316291809082, "step": 3552 }, { "epoch": 0.4837962962962963, "grad_norm": 40.903017277385274, "learning_rate": 4.916809275296089e-07, "logits/chosen": 12.485429763793945, "logits/rejected": 13.03194808959961, "logps/chosen": -4.279457092285156, "logps/rejected": -4.336952209472656, "loss": 4.1433, "rewards/accuracies": 0.75, "rewards/chosen": -42.79457092285156, "rewards/margins": 0.5749492645263672, "rewards/rejected": -43.36952209472656, "step": 3553 }, { "epoch": 0.48393246187363836, "grad_norm": 44.33713902071101, "learning_rate": 4.914958386673431e-07, "logits/chosen": 12.310977935791016, "logits/rejected": 13.404500961303711, "logps/chosen": -4.147943019866943, "logps/rejected": -4.348886489868164, "loss": 4.0286, "rewards/accuracies": 0.75, "rewards/chosen": -41.47943115234375, "rewards/margins": 2.009431838989258, "rewards/rejected": -43.488861083984375, "step": 3554 }, { "epoch": 0.4840686274509804, "grad_norm": 45.342625038372645, "learning_rate": 4.91310729130836e-07, "logits/chosen": 11.925050735473633, "logits/rejected": 12.345195770263672, "logps/chosen": -3.9591312408447266, "logps/rejected": -4.056234359741211, "loss": 3.8629, "rewards/accuracies": 0.5, "rewards/chosen": -39.591312408447266, "rewards/margins": 0.9710283279418945, "rewards/rejected": -40.562339782714844, "step": 3555 }, { "epoch": 0.4842047930283224, "grad_norm": 40.63874253675354, "learning_rate": 4.911255989619151e-07, "logits/chosen": 12.249048233032227, "logits/rejected": 12.799575805664062, "logps/chosen": -4.089324951171875, "logps/rejected": -4.237204551696777, "loss": 4.5403, "rewards/accuracies": 0.5, "rewards/chosen": -40.893253326416016, "rewards/margins": 1.4787893295288086, "rewards/rejected": -42.372039794921875, "step": 3556 }, { "epoch": 0.4843409586056645, "grad_norm": 39.78928377889329, "learning_rate": 4.90940448202412e-07, "logits/chosen": 12.622936248779297, "logits/rejected": 12.354547500610352, "logps/chosen": -3.792010545730591, "logps/rejected": -4.2163801193237305, "loss": 4.1271, "rewards/accuracies": 0.75, "rewards/chosen": -37.92010498046875, "rewards/margins": 4.243694305419922, "rewards/rejected": -42.16379928588867, "step": 3557 }, { "epoch": 0.48447712418300654, "grad_norm": 39.821266684581616, "learning_rate": 4.907552768941626e-07, "logits/chosen": 12.829028129577637, "logits/rejected": 12.54764175415039, "logps/chosen": -4.090705871582031, "logps/rejected": -4.265454292297363, "loss": 3.861, "rewards/accuracies": 0.75, "rewards/chosen": -40.90705871582031, "rewards/margins": 1.747483253479004, "rewards/rejected": -42.654541015625, "step": 3558 }, { "epoch": 0.48461328976034856, "grad_norm": 40.71345612418106, "learning_rate": 4.905700850790083e-07, "logits/chosen": 12.830158233642578, "logits/rejected": 12.761628150939941, "logps/chosen": -3.8862006664276123, "logps/rejected": -4.05052375793457, "loss": 3.9229, "rewards/accuracies": 0.5, "rewards/chosen": -38.86200714111328, "rewards/margins": 1.643233299255371, "rewards/rejected": -40.50524139404297, "step": 3559 }, { "epoch": 0.48474945533769065, "grad_norm": 43.81097532116408, "learning_rate": 4.903848727987947e-07, "logits/chosen": 13.502894401550293, "logits/rejected": 12.706890106201172, "logps/chosen": -3.8845226764678955, "logps/rejected": -4.271222114562988, "loss": 3.9898, "rewards/accuracies": 0.75, "rewards/chosen": -38.8452262878418, "rewards/margins": 3.866992950439453, "rewards/rejected": -42.71221923828125, "step": 3560 }, { "epoch": 0.4848856209150327, "grad_norm": 53.23816533544071, "learning_rate": 4.901996400953718e-07, "logits/chosen": 12.670356750488281, "logits/rejected": 12.72992992401123, "logps/chosen": -3.8841590881347656, "logps/rejected": -3.833590507507324, "loss": 4.3235, "rewards/accuracies": 0.25, "rewards/chosen": -38.841590881347656, "rewards/margins": -0.5056858062744141, "rewards/rejected": -38.335906982421875, "step": 3561 }, { "epoch": 0.4850217864923747, "grad_norm": 42.97180753333471, "learning_rate": 4.900143870105948e-07, "logits/chosen": 12.668394088745117, "logits/rejected": 13.46351432800293, "logps/chosen": -4.010141372680664, "logps/rejected": -4.293050765991211, "loss": 4.348, "rewards/accuracies": 0.75, "rewards/chosen": -40.101417541503906, "rewards/margins": 2.8290939331054688, "rewards/rejected": -42.93050765991211, "step": 3562 }, { "epoch": 0.4851579520697168, "grad_norm": 70.47147617491622, "learning_rate": 4.898291135863229e-07, "logits/chosen": 11.70004653930664, "logits/rejected": 13.048599243164062, "logps/chosen": -3.8671200275421143, "logps/rejected": -4.409513473510742, "loss": 3.7872, "rewards/accuracies": 1.0, "rewards/chosen": -38.671199798583984, "rewards/margins": 5.423935890197754, "rewards/rejected": -44.09513854980469, "step": 3563 }, { "epoch": 0.4852941176470588, "grad_norm": 44.49683730905046, "learning_rate": 4.896438198644203e-07, "logits/chosen": 12.46169662475586, "logits/rejected": 13.519834518432617, "logps/chosen": -4.090347766876221, "logps/rejected": -4.322592735290527, "loss": 4.1709, "rewards/accuracies": 0.5, "rewards/chosen": -40.903480529785156, "rewards/margins": 2.3224496841430664, "rewards/rejected": -43.225929260253906, "step": 3564 }, { "epoch": 0.48543028322440085, "grad_norm": 45.6496534558429, "learning_rate": 4.894585058867555e-07, "logits/chosen": 11.956340789794922, "logits/rejected": 12.095725059509277, "logps/chosen": -3.8642539978027344, "logps/rejected": -3.855358600616455, "loss": 4.2941, "rewards/accuracies": 0.5, "rewards/chosen": -38.642539978027344, "rewards/margins": -0.08895397186279297, "rewards/rejected": -38.553585052490234, "step": 3565 }, { "epoch": 0.48556644880174293, "grad_norm": 49.336445179517696, "learning_rate": 4.892731716952019e-07, "logits/chosen": 11.73318862915039, "logits/rejected": 12.785731315612793, "logps/chosen": -4.054356575012207, "logps/rejected": -4.56743049621582, "loss": 3.5393, "rewards/accuracies": 0.75, "rewards/chosen": -40.5435676574707, "rewards/margins": 5.130736351013184, "rewards/rejected": -45.6743049621582, "step": 3566 }, { "epoch": 0.48570261437908496, "grad_norm": 42.482601086513455, "learning_rate": 4.890878173316373e-07, "logits/chosen": 12.649177551269531, "logits/rejected": 13.250774383544922, "logps/chosen": -4.091997146606445, "logps/rejected": -4.203021049499512, "loss": 4.1124, "rewards/accuracies": 0.5, "rewards/chosen": -40.91996765136719, "rewards/margins": 1.1102428436279297, "rewards/rejected": -42.03021240234375, "step": 3567 }, { "epoch": 0.485838779956427, "grad_norm": 77.6633603192001, "learning_rate": 4.889024428379437e-07, "logits/chosen": 11.856048583984375, "logits/rejected": 12.940277099609375, "logps/chosen": -4.034348964691162, "logps/rejected": -4.400735855102539, "loss": 4.0369, "rewards/accuracies": 0.75, "rewards/chosen": -40.34349060058594, "rewards/margins": 3.663869857788086, "rewards/rejected": -44.00735855102539, "step": 3568 }, { "epoch": 0.4859749455337691, "grad_norm": 42.94343603714046, "learning_rate": 4.887170482560085e-07, "logits/chosen": 12.339406967163086, "logits/rejected": 12.668835639953613, "logps/chosen": -4.017819404602051, "logps/rejected": -4.227836608886719, "loss": 3.1377, "rewards/accuracies": 0.75, "rewards/chosen": -40.178192138671875, "rewards/margins": 2.100172996520996, "rewards/rejected": -42.27836608886719, "step": 3569 }, { "epoch": 0.4861111111111111, "grad_norm": 44.01764989402811, "learning_rate": 4.885316336277227e-07, "logits/chosen": 13.332216262817383, "logits/rejected": 12.501276969909668, "logps/chosen": -3.9417548179626465, "logps/rejected": -4.168813705444336, "loss": 4.0261, "rewards/accuracies": 0.5, "rewards/chosen": -39.41754913330078, "rewards/margins": 2.2705917358398438, "rewards/rejected": -41.688140869140625, "step": 3570 }, { "epoch": 0.48624727668845313, "grad_norm": 50.91623815469352, "learning_rate": 4.883461989949827e-07, "logits/chosen": 11.632366180419922, "logits/rejected": 11.85812759399414, "logps/chosen": -4.315801620483398, "logps/rejected": -4.157212257385254, "loss": 5.0412, "rewards/accuracies": 0.5, "rewards/chosen": -43.15801239013672, "rewards/margins": -1.5858917236328125, "rewards/rejected": -41.572120666503906, "step": 3571 }, { "epoch": 0.4863834422657952, "grad_norm": 41.09978217729035, "learning_rate": 4.881607443996887e-07, "logits/chosen": 11.617401123046875, "logits/rejected": 12.132286071777344, "logps/chosen": -4.150032997131348, "logps/rejected": -4.346196174621582, "loss": 3.8718, "rewards/accuracies": 0.75, "rewards/chosen": -41.500328063964844, "rewards/margins": 1.961629867553711, "rewards/rejected": -43.46195983886719, "step": 3572 }, { "epoch": 0.48651960784313725, "grad_norm": 45.479821103426794, "learning_rate": 4.879752698837457e-07, "logits/chosen": 11.551995277404785, "logits/rejected": 12.554159164428711, "logps/chosen": -4.003592491149902, "logps/rejected": -4.145700454711914, "loss": 3.6799, "rewards/accuracies": 0.75, "rewards/chosen": -40.035926818847656, "rewards/margins": 1.4210748672485352, "rewards/rejected": -41.457000732421875, "step": 3573 }, { "epoch": 0.4866557734204793, "grad_norm": 40.28497566940644, "learning_rate": 4.877897754890634e-07, "logits/chosen": 12.340330123901367, "logits/rejected": 13.749619483947754, "logps/chosen": -3.8460354804992676, "logps/rejected": -4.653850555419922, "loss": 4.1861, "rewards/accuracies": 1.0, "rewards/chosen": -38.46035385131836, "rewards/margins": 8.078149795532227, "rewards/rejected": -46.53850555419922, "step": 3574 }, { "epoch": 0.48679193899782136, "grad_norm": 54.58161416311633, "learning_rate": 4.876042612575554e-07, "logits/chosen": 11.22657585144043, "logits/rejected": 12.334356307983398, "logps/chosen": -3.9906625747680664, "logps/rejected": -4.136508941650391, "loss": 3.483, "rewards/accuracies": 0.75, "rewards/chosen": -39.90662384033203, "rewards/margins": 1.4584684371948242, "rewards/rejected": -41.36509323120117, "step": 3575 }, { "epoch": 0.4869281045751634, "grad_norm": 47.331086525130814, "learning_rate": 4.874187272311406e-07, "logits/chosen": 12.268279075622559, "logits/rejected": 12.736673355102539, "logps/chosen": -3.932511329650879, "logps/rejected": -4.045567512512207, "loss": 4.1036, "rewards/accuracies": 0.75, "rewards/chosen": -39.325111389160156, "rewards/margins": 1.1305665969848633, "rewards/rejected": -40.4556770324707, "step": 3576 }, { "epoch": 0.4870642701525055, "grad_norm": 53.33254556406957, "learning_rate": 4.872331734517418e-07, "logits/chosen": 12.106728553771973, "logits/rejected": 12.407251358032227, "logps/chosen": -3.49489688873291, "logps/rejected": -3.817809581756592, "loss": 3.6827, "rewards/accuracies": 0.75, "rewards/chosen": -34.948970794677734, "rewards/margins": 3.2291250228881836, "rewards/rejected": -38.17809295654297, "step": 3577 }, { "epoch": 0.4872004357298475, "grad_norm": 44.39469362473974, "learning_rate": 4.870475999612863e-07, "logits/chosen": 13.212667465209961, "logits/rejected": 13.8515625, "logps/chosen": -4.2098894119262695, "logps/rejected": -4.293159008026123, "loss": 4.0909, "rewards/accuracies": 0.5, "rewards/chosen": -42.09889221191406, "rewards/margins": 0.8326988220214844, "rewards/rejected": -42.93159103393555, "step": 3578 }, { "epoch": 0.48733660130718953, "grad_norm": 48.217184311939384, "learning_rate": 4.86862006801706e-07, "logits/chosen": 11.17374038696289, "logits/rejected": 12.371330261230469, "logps/chosen": -3.7105135917663574, "logps/rejected": -4.1276140213012695, "loss": 4.0182, "rewards/accuracies": 1.0, "rewards/chosen": -37.10513687133789, "rewards/margins": 4.171006202697754, "rewards/rejected": -41.276145935058594, "step": 3579 }, { "epoch": 0.4874727668845316, "grad_norm": 45.55930948764133, "learning_rate": 4.866763940149374e-07, "logits/chosen": 11.328369140625, "logits/rejected": 13.128944396972656, "logps/chosen": -3.603731393814087, "logps/rejected": -4.119388580322266, "loss": 3.8102, "rewards/accuracies": 0.75, "rewards/chosen": -36.037315368652344, "rewards/margins": 5.15656852722168, "rewards/rejected": -41.19388198852539, "step": 3580 }, { "epoch": 0.48760893246187365, "grad_norm": 48.35598665941022, "learning_rate": 4.864907616429211e-07, "logits/chosen": 11.52708911895752, "logits/rejected": 12.256908416748047, "logps/chosen": -3.422105312347412, "logps/rejected": -3.723551034927368, "loss": 4.1657, "rewards/accuracies": 0.75, "rewards/chosen": -34.22105407714844, "rewards/margins": 3.0144567489624023, "rewards/rejected": -37.235511779785156, "step": 3581 }, { "epoch": 0.4877450980392157, "grad_norm": 61.774843201981504, "learning_rate": 4.863051097276021e-07, "logits/chosen": 12.816868782043457, "logits/rejected": 12.726381301879883, "logps/chosen": -4.023530006408691, "logps/rejected": -4.075393199920654, "loss": 3.8798, "rewards/accuracies": 0.5, "rewards/chosen": -40.23529815673828, "rewards/margins": 0.5186309814453125, "rewards/rejected": -40.753929138183594, "step": 3582 }, { "epoch": 0.48788126361655776, "grad_norm": 40.646803424520314, "learning_rate": 4.861194383109301e-07, "logits/chosen": 11.167440414428711, "logits/rejected": 11.410205841064453, "logps/chosen": -3.5357985496520996, "logps/rejected": -3.830202579498291, "loss": 3.6291, "rewards/accuracies": 0.75, "rewards/chosen": -35.35798645019531, "rewards/margins": 2.9440393447875977, "rewards/rejected": -38.302024841308594, "step": 3583 }, { "epoch": 0.4880174291938998, "grad_norm": 43.78642519510219, "learning_rate": 4.859337474348594e-07, "logits/chosen": 12.875094413757324, "logits/rejected": 11.653100967407227, "logps/chosen": -3.842914342880249, "logps/rejected": -3.915722608566284, "loss": 3.8281, "rewards/accuracies": 0.5, "rewards/chosen": -38.42914581298828, "rewards/margins": 0.7280802726745605, "rewards/rejected": -39.157222747802734, "step": 3584 }, { "epoch": 0.4881535947712418, "grad_norm": 48.122137180006135, "learning_rate": 4.85748037141348e-07, "logits/chosen": 13.14626693725586, "logits/rejected": 12.843570709228516, "logps/chosen": -3.744422435760498, "logps/rejected": -3.8711650371551514, "loss": 3.5813, "rewards/accuracies": 0.75, "rewards/chosen": -37.44422149658203, "rewards/margins": 1.2674269676208496, "rewards/rejected": -38.711647033691406, "step": 3585 }, { "epoch": 0.4882897603485839, "grad_norm": 40.820531218678376, "learning_rate": 4.855623074723588e-07, "logits/chosen": 12.236190795898438, "logits/rejected": 12.590635299682617, "logps/chosen": -3.9531102180480957, "logps/rejected": -3.994581699371338, "loss": 4.3125, "rewards/accuracies": 0.5, "rewards/chosen": -39.53110122680664, "rewards/margins": 0.41471385955810547, "rewards/rejected": -39.94581604003906, "step": 3586 }, { "epoch": 0.48842592592592593, "grad_norm": 45.6690650399442, "learning_rate": 4.85376558469859e-07, "logits/chosen": 12.385250091552734, "logits/rejected": 12.855293273925781, "logps/chosen": -3.8593051433563232, "logps/rejected": -4.068575859069824, "loss": 3.7979, "rewards/accuracies": 0.75, "rewards/chosen": -38.59305191040039, "rewards/margins": 2.0927047729492188, "rewards/rejected": -40.685760498046875, "step": 3587 }, { "epoch": 0.48856209150326796, "grad_norm": 50.69965906325273, "learning_rate": 4.8519079017582e-07, "logits/chosen": 11.804948806762695, "logits/rejected": 12.468982696533203, "logps/chosen": -3.966425895690918, "logps/rejected": -4.168476104736328, "loss": 4.1721, "rewards/accuracies": 0.75, "rewards/chosen": -39.66426086425781, "rewards/margins": 2.020503044128418, "rewards/rejected": -41.68476104736328, "step": 3588 }, { "epoch": 0.48869825708061004, "grad_norm": 46.757131343070455, "learning_rate": 4.850050026322179e-07, "logits/chosen": 11.23250961303711, "logits/rejected": 11.128129959106445, "logps/chosen": -3.7660436630249023, "logps/rejected": -3.8087880611419678, "loss": 4.7698, "rewards/accuracies": 0.75, "rewards/chosen": -37.660438537597656, "rewards/margins": 0.4274439811706543, "rewards/rejected": -38.0878791809082, "step": 3589 }, { "epoch": 0.4888344226579521, "grad_norm": 39.92158904709587, "learning_rate": 4.848191958810328e-07, "logits/chosen": 12.085264205932617, "logits/rejected": 12.894136428833008, "logps/chosen": -3.8568308353424072, "logps/rejected": -3.8010621070861816, "loss": 3.7313, "rewards/accuracies": 0.5, "rewards/chosen": -38.56830978393555, "rewards/margins": -0.5576896667480469, "rewards/rejected": -38.0106201171875, "step": 3590 }, { "epoch": 0.4889705882352941, "grad_norm": 35.70254523363031, "learning_rate": 4.846333699642491e-07, "logits/chosen": 11.618501663208008, "logits/rejected": 12.519720077514648, "logps/chosen": -3.9290852546691895, "logps/rejected": -4.10926628112793, "loss": 3.8869, "rewards/accuracies": 0.5, "rewards/chosen": -39.290855407714844, "rewards/margins": 1.801809310913086, "rewards/rejected": -41.0926628112793, "step": 3591 }, { "epoch": 0.4891067538126362, "grad_norm": 45.01069665255829, "learning_rate": 4.84447524923856e-07, "logits/chosen": 11.554561614990234, "logits/rejected": 13.238739013671875, "logps/chosen": -3.788649082183838, "logps/rejected": -4.090733528137207, "loss": 3.434, "rewards/accuracies": 0.5, "rewards/chosen": -37.8864860534668, "rewards/margins": 3.020847797393799, "rewards/rejected": -40.9073371887207, "step": 3592 }, { "epoch": 0.4892429193899782, "grad_norm": 40.81545430838261, "learning_rate": 4.842616608018465e-07, "logits/chosen": 12.73332405090332, "logits/rejected": 12.044697761535645, "logps/chosen": -3.438659191131592, "logps/rejected": -3.441678047180176, "loss": 3.6103, "rewards/accuracies": 0.75, "rewards/chosen": -34.38658905029297, "rewards/margins": 0.030187129974365234, "rewards/rejected": -34.416778564453125, "step": 3593 }, { "epoch": 0.48937908496732024, "grad_norm": 35.69938517247591, "learning_rate": 4.840757776402183e-07, "logits/chosen": 12.299941062927246, "logits/rejected": 12.853135108947754, "logps/chosen": -3.717705249786377, "logps/rejected": -4.188276767730713, "loss": 3.4276, "rewards/accuracies": 1.0, "rewards/chosen": -37.17705535888672, "rewards/margins": 4.705716133117676, "rewards/rejected": -41.88276672363281, "step": 3594 }, { "epoch": 0.48951525054466233, "grad_norm": 46.369692928407694, "learning_rate": 4.838898754809731e-07, "logits/chosen": 12.293954849243164, "logits/rejected": 12.696745872497559, "logps/chosen": -3.66324520111084, "logps/rejected": -4.014133453369141, "loss": 3.6196, "rewards/accuracies": 1.0, "rewards/chosen": -36.63245391845703, "rewards/margins": 3.5088844299316406, "rewards/rejected": -40.141334533691406, "step": 3595 }, { "epoch": 0.48965141612200436, "grad_norm": 34.7830555284663, "learning_rate": 4.837039543661173e-07, "logits/chosen": 11.477226257324219, "logits/rejected": 11.75749397277832, "logps/chosen": -3.6462268829345703, "logps/rejected": -3.8120763301849365, "loss": 3.3798, "rewards/accuracies": 0.75, "rewards/chosen": -36.4622688293457, "rewards/margins": 1.658492088317871, "rewards/rejected": -38.12076187133789, "step": 3596 }, { "epoch": 0.4897875816993464, "grad_norm": 45.38358143812613, "learning_rate": 4.835180143376608e-07, "logits/chosen": 12.081473350524902, "logits/rejected": 12.202523231506348, "logps/chosen": -3.638889789581299, "logps/rejected": -3.972073793411255, "loss": 4.591, "rewards/accuracies": 0.5, "rewards/chosen": -36.38890075683594, "rewards/margins": 3.3318376541137695, "rewards/rejected": -39.720733642578125, "step": 3597 }, { "epoch": 0.48992374727668847, "grad_norm": 54.544526969768874, "learning_rate": 4.833320554376187e-07, "logits/chosen": 11.407493591308594, "logits/rejected": 11.713443756103516, "logps/chosen": -3.4494175910949707, "logps/rejected": -3.5929272174835205, "loss": 4.0163, "rewards/accuracies": 0.75, "rewards/chosen": -34.49417495727539, "rewards/margins": 1.4350948333740234, "rewards/rejected": -35.92927169799805, "step": 3598 }, { "epoch": 0.4900599128540305, "grad_norm": 41.65756767413356, "learning_rate": 4.8314607770801e-07, "logits/chosen": 12.740840911865234, "logits/rejected": 12.383522033691406, "logps/chosen": -3.9817912578582764, "logps/rejected": -3.785168409347534, "loss": 4.0329, "rewards/accuracies": 0.25, "rewards/chosen": -39.81791305541992, "rewards/margins": -1.9662294387817383, "rewards/rejected": -37.8516845703125, "step": 3599 }, { "epoch": 0.49019607843137253, "grad_norm": 39.07958301870276, "learning_rate": 4.829600811908576e-07, "logits/chosen": 12.7568359375, "logits/rejected": 12.851913452148438, "logps/chosen": -3.9591732025146484, "logps/rejected": -4.252134799957275, "loss": 3.8651, "rewards/accuracies": 0.75, "rewards/chosen": -39.59172821044922, "rewards/margins": 2.9296188354492188, "rewards/rejected": -42.52134704589844, "step": 3600 }, { "epoch": 0.4903322440087146, "grad_norm": 36.476726815808185, "learning_rate": 4.827740659281892e-07, "logits/chosen": 13.173310279846191, "logits/rejected": 13.158341407775879, "logps/chosen": -4.242666721343994, "logps/rejected": -4.358550071716309, "loss": 3.7868, "rewards/accuracies": 0.5, "rewards/chosen": -42.42667007446289, "rewards/margins": 1.1588287353515625, "rewards/rejected": -43.58549880981445, "step": 3601 }, { "epoch": 0.49046840958605664, "grad_norm": 46.76419089128796, "learning_rate": 4.825880319620363e-07, "logits/chosen": 13.462784767150879, "logits/rejected": 13.045059204101562, "logps/chosen": -4.331754684448242, "logps/rejected": -4.252030372619629, "loss": 4.6448, "rewards/accuracies": 0.5, "rewards/chosen": -43.31754684448242, "rewards/margins": -0.7972421646118164, "rewards/rejected": -42.520301818847656, "step": 3602 }, { "epoch": 0.49060457516339867, "grad_norm": 40.941518774477785, "learning_rate": 4.824019793344349e-07, "logits/chosen": 13.486305236816406, "logits/rejected": 13.410064697265625, "logps/chosen": -4.239832878112793, "logps/rejected": -4.072455406188965, "loss": 4.0816, "rewards/accuracies": 0.25, "rewards/chosen": -42.3983268737793, "rewards/margins": -1.6737785339355469, "rewards/rejected": -40.72454833984375, "step": 3603 }, { "epoch": 0.49074074074074076, "grad_norm": 75.15420300773475, "learning_rate": 4.822159080874253e-07, "logits/chosen": 12.913549423217773, "logits/rejected": 13.665302276611328, "logps/chosen": -3.9711689949035645, "logps/rejected": -4.2206621170043945, "loss": 4.0576, "rewards/accuracies": 0.75, "rewards/chosen": -39.71168899536133, "rewards/margins": 2.4949331283569336, "rewards/rejected": -42.20661926269531, "step": 3604 }, { "epoch": 0.4908769063180828, "grad_norm": 37.991883162865115, "learning_rate": 4.820298182630514e-07, "logits/chosen": 12.802173614501953, "logits/rejected": 13.081767082214355, "logps/chosen": -4.143242835998535, "logps/rejected": -4.160233020782471, "loss": 3.9256, "rewards/accuracies": 0.75, "rewards/chosen": -41.432430267333984, "rewards/margins": 0.16990089416503906, "rewards/rejected": -41.602333068847656, "step": 3605 }, { "epoch": 0.4910130718954248, "grad_norm": 44.059028954944665, "learning_rate": 4.818437099033621e-07, "logits/chosen": 12.89122200012207, "logits/rejected": 14.611612319946289, "logps/chosen": -4.238453388214111, "logps/rejected": -4.687515735626221, "loss": 4.2075, "rewards/accuracies": 1.0, "rewards/chosen": -42.38453674316406, "rewards/margins": 4.490623474121094, "rewards/rejected": -46.875160217285156, "step": 3606 }, { "epoch": 0.4911492374727669, "grad_norm": 37.775834689014346, "learning_rate": 4.816575830504101e-07, "logits/chosen": 14.068122863769531, "logits/rejected": 13.19336986541748, "logps/chosen": -4.277204513549805, "logps/rejected": -4.294069766998291, "loss": 4.0247, "rewards/accuracies": 0.75, "rewards/chosen": -42.77204132080078, "rewards/margins": 0.1686573028564453, "rewards/rejected": -42.940696716308594, "step": 3607 }, { "epoch": 0.4912854030501089, "grad_norm": 50.91254603622724, "learning_rate": 4.814714377462521e-07, "logits/chosen": 13.184629440307617, "logits/rejected": 13.578484535217285, "logps/chosen": -3.7879722118377686, "logps/rejected": -4.092336654663086, "loss": 4.0461, "rewards/accuracies": 0.75, "rewards/chosen": -37.879722595214844, "rewards/margins": 3.043644905090332, "rewards/rejected": -40.92336654663086, "step": 3608 }, { "epoch": 0.49142156862745096, "grad_norm": 42.499545719565695, "learning_rate": 4.812852740329493e-07, "logits/chosen": 13.012434005737305, "logits/rejected": 14.111292839050293, "logps/chosen": -4.220440864562988, "logps/rejected": -4.631773471832275, "loss": 4.2102, "rewards/accuracies": 0.75, "rewards/chosen": -42.20440673828125, "rewards/margins": 4.1133270263671875, "rewards/rejected": -46.31773376464844, "step": 3609 }, { "epoch": 0.49155773420479304, "grad_norm": 44.46846611930786, "learning_rate": 4.81099091952567e-07, "logits/chosen": 13.745759963989258, "logits/rejected": 13.090807914733887, "logps/chosen": -4.434674263000488, "logps/rejected": -4.187482833862305, "loss": 3.5813, "rewards/accuracies": 0.25, "rewards/chosen": -44.346744537353516, "rewards/margins": -2.471914291381836, "rewards/rejected": -41.87482833862305, "step": 3610 }, { "epoch": 0.49169389978213507, "grad_norm": 44.035347930869555, "learning_rate": 4.809128915471744e-07, "logits/chosen": 13.136090278625488, "logits/rejected": 13.065397262573242, "logps/chosen": -3.995112895965576, "logps/rejected": -4.371241092681885, "loss": 3.6037, "rewards/accuracies": 0.75, "rewards/chosen": -39.95112991333008, "rewards/margins": 3.761281967163086, "rewards/rejected": -43.71240997314453, "step": 3611 }, { "epoch": 0.4918300653594771, "grad_norm": 44.681575098467604, "learning_rate": 4.807266728588452e-07, "logits/chosen": 13.20020866394043, "logits/rejected": 14.575326919555664, "logps/chosen": -4.241211891174316, "logps/rejected": -4.639222145080566, "loss": 4.1879, "rewards/accuracies": 0.75, "rewards/chosen": -42.4121208190918, "rewards/margins": 3.980099678039551, "rewards/rejected": -46.39221954345703, "step": 3612 }, { "epoch": 0.4919662309368192, "grad_norm": 59.6551333898904, "learning_rate": 4.80540435929657e-07, "logits/chosen": 12.04963493347168, "logits/rejected": 12.554746627807617, "logps/chosen": -3.9309051036834717, "logps/rejected": -3.7966413497924805, "loss": 4.4357, "rewards/accuracies": 0.25, "rewards/chosen": -39.309051513671875, "rewards/margins": -1.342637062072754, "rewards/rejected": -37.96641540527344, "step": 3613 }, { "epoch": 0.4921023965141612, "grad_norm": 42.439197056189386, "learning_rate": 4.803541808016915e-07, "logits/chosen": 11.853347778320312, "logits/rejected": 12.532573699951172, "logps/chosen": -3.8905110359191895, "logps/rejected": -4.328180313110352, "loss": 4.3971, "rewards/accuracies": 1.0, "rewards/chosen": -38.90510940551758, "rewards/margins": 4.376692771911621, "rewards/rejected": -43.281803131103516, "step": 3614 }, { "epoch": 0.4922385620915033, "grad_norm": 44.443858605875505, "learning_rate": 4.801679075170347e-07, "logits/chosen": 13.138927459716797, "logits/rejected": 13.997875213623047, "logps/chosen": -4.1922454833984375, "logps/rejected": -4.4101033210754395, "loss": 3.7146, "rewards/accuracies": 0.75, "rewards/chosen": -41.922454833984375, "rewards/margins": 2.178579330444336, "rewards/rejected": -44.10103225708008, "step": 3615 }, { "epoch": 0.4923747276688453, "grad_norm": 41.09477892601272, "learning_rate": 4.799816161177763e-07, "logits/chosen": 12.312461853027344, "logits/rejected": 12.648859024047852, "logps/chosen": -4.097790241241455, "logps/rejected": -4.056838035583496, "loss": 3.7164, "rewards/accuracies": 0.25, "rewards/chosen": -40.977901458740234, "rewards/margins": -0.4095277786254883, "rewards/rejected": -40.56837463378906, "step": 3616 }, { "epoch": 0.49251089324618735, "grad_norm": 45.35666298598317, "learning_rate": 4.797953066460108e-07, "logits/chosen": 13.15707015991211, "logits/rejected": 12.983362197875977, "logps/chosen": -4.162110328674316, "logps/rejected": -4.222053527832031, "loss": 4.0226, "rewards/accuracies": 0.5, "rewards/chosen": -41.62110137939453, "rewards/margins": 0.599431037902832, "rewards/rejected": -42.22053527832031, "step": 3617 }, { "epoch": 0.49264705882352944, "grad_norm": 39.82222401419105, "learning_rate": 4.796089791438362e-07, "logits/chosen": 11.678411483764648, "logits/rejected": 12.1737642288208, "logps/chosen": -3.863224983215332, "logps/rejected": -4.07841157913208, "loss": 3.7787, "rewards/accuracies": 0.5, "rewards/chosen": -38.63224792480469, "rewards/margins": 2.151866912841797, "rewards/rejected": -40.78411865234375, "step": 3618 }, { "epoch": 0.49278322440087147, "grad_norm": 42.05522727077336, "learning_rate": 4.794226336533546e-07, "logits/chosen": 12.344881057739258, "logits/rejected": 13.259302139282227, "logps/chosen": -3.816603422164917, "logps/rejected": -4.257147312164307, "loss": 3.4769, "rewards/accuracies": 1.0, "rewards/chosen": -38.16603469848633, "rewards/margins": 4.405439376831055, "rewards/rejected": -42.57147216796875, "step": 3619 }, { "epoch": 0.4929193899782135, "grad_norm": 43.84048454026954, "learning_rate": 4.792362702166725e-07, "logits/chosen": 13.079690933227539, "logits/rejected": 14.193428039550781, "logps/chosen": -3.96915602684021, "logps/rejected": -4.526117324829102, "loss": 3.766, "rewards/accuracies": 0.75, "rewards/chosen": -39.691558837890625, "rewards/margins": 5.569611549377441, "rewards/rejected": -45.26116943359375, "step": 3620 }, { "epoch": 0.4930555555555556, "grad_norm": 47.94471468527951, "learning_rate": 4.790498888759e-07, "logits/chosen": 12.387723922729492, "logits/rejected": 12.271574020385742, "logps/chosen": -3.88879132270813, "logps/rejected": -3.82848858833313, "loss": 3.5944, "rewards/accuracies": 0.75, "rewards/chosen": -38.88791275024414, "rewards/margins": -0.6030282974243164, "rewards/rejected": -38.28488540649414, "step": 3621 }, { "epoch": 0.4931917211328976, "grad_norm": 63.44855001069848, "learning_rate": 4.788634896731519e-07, "logits/chosen": 13.152255058288574, "logits/rejected": 12.887451171875, "logps/chosen": -4.013358116149902, "logps/rejected": -3.9679665565490723, "loss": 4.3945, "rewards/accuracies": 0.25, "rewards/chosen": -40.133583068847656, "rewards/margins": -0.4539165496826172, "rewards/rejected": -39.679664611816406, "step": 3622 }, { "epoch": 0.49332788671023964, "grad_norm": 41.08327326733613, "learning_rate": 4.786770726505463e-07, "logits/chosen": 13.224098205566406, "logits/rejected": 13.176983833312988, "logps/chosen": -4.111879348754883, "logps/rejected": -4.294615745544434, "loss": 3.7899, "rewards/accuracies": 0.5, "rewards/chosen": -41.118797302246094, "rewards/margins": 1.8273563385009766, "rewards/rejected": -42.94615173339844, "step": 3623 }, { "epoch": 0.4934640522875817, "grad_norm": 43.52368893544025, "learning_rate": 4.784906378502058e-07, "logits/chosen": 13.015095710754395, "logits/rejected": 11.957561492919922, "logps/chosen": -4.086888313293457, "logps/rejected": -3.8107542991638184, "loss": 4.4691, "rewards/accuracies": 0.25, "rewards/chosen": -40.86888122558594, "rewards/margins": -2.761338233947754, "rewards/rejected": -38.1075439453125, "step": 3624 }, { "epoch": 0.49360021786492375, "grad_norm": 45.51322522890714, "learning_rate": 4.783041853142568e-07, "logits/chosen": 13.222976684570312, "logits/rejected": 13.403240203857422, "logps/chosen": -4.061263084411621, "logps/rejected": -4.162005424499512, "loss": 4.0237, "rewards/accuracies": 0.5, "rewards/chosen": -40.612632751464844, "rewards/margins": 1.0074195861816406, "rewards/rejected": -41.620052337646484, "step": 3625 }, { "epoch": 0.4937363834422658, "grad_norm": 41.48549177553, "learning_rate": 4.7811771508483e-07, "logits/chosen": 12.868762969970703, "logits/rejected": 12.296405792236328, "logps/chosen": -3.930818557739258, "logps/rejected": -3.759766101837158, "loss": 3.7801, "rewards/accuracies": 0.5, "rewards/chosen": -39.30818176269531, "rewards/margins": -1.7105212211608887, "rewards/rejected": -37.59766387939453, "step": 3626 }, { "epoch": 0.49387254901960786, "grad_norm": 39.90688201490852, "learning_rate": 4.779312272040597e-07, "logits/chosen": 12.977607727050781, "logits/rejected": 12.808938026428223, "logps/chosen": -3.9975037574768066, "logps/rejected": -4.092711448669434, "loss": 4.2484, "rewards/accuracies": 0.5, "rewards/chosen": -39.97503662109375, "rewards/margins": 0.9520750045776367, "rewards/rejected": -40.9271125793457, "step": 3627 }, { "epoch": 0.4940087145969499, "grad_norm": 44.200835837767315, "learning_rate": 4.777447217140845e-07, "logits/chosen": 13.054315567016602, "logits/rejected": 13.966224670410156, "logps/chosen": -4.186939239501953, "logps/rejected": -4.104217529296875, "loss": 4.2467, "rewards/accuracies": 0.25, "rewards/chosen": -41.86939239501953, "rewards/margins": -0.8272190093994141, "rewards/rejected": -41.042171478271484, "step": 3628 }, { "epoch": 0.4941448801742919, "grad_norm": 69.7452133515267, "learning_rate": 4.775581986570467e-07, "logits/chosen": 13.631887435913086, "logits/rejected": 13.958526611328125, "logps/chosen": -4.192777633666992, "logps/rejected": -4.385406494140625, "loss": 4.4816, "rewards/accuracies": 0.75, "rewards/chosen": -41.92778015136719, "rewards/margins": 1.9262847900390625, "rewards/rejected": -43.85406494140625, "step": 3629 }, { "epoch": 0.494281045751634, "grad_norm": 38.35570823591908, "learning_rate": 4.773716580750926e-07, "logits/chosen": 12.788797378540039, "logits/rejected": 12.918365478515625, "logps/chosen": -3.7933061122894287, "logps/rejected": -4.000850677490234, "loss": 3.7696, "rewards/accuracies": 0.75, "rewards/chosen": -37.93305969238281, "rewards/margins": 2.0754470825195312, "rewards/rejected": -40.008506774902344, "step": 3630 }, { "epoch": 0.49441721132897604, "grad_norm": 41.511526637023074, "learning_rate": 4.771851000103731e-07, "logits/chosen": 12.857290267944336, "logits/rejected": 13.449976921081543, "logps/chosen": -4.04212760925293, "logps/rejected": -4.078619003295898, "loss": 3.8717, "rewards/accuracies": 0.75, "rewards/chosen": -40.4212760925293, "rewards/margins": 0.3649120330810547, "rewards/rejected": -40.786190032958984, "step": 3631 }, { "epoch": 0.49455337690631807, "grad_norm": 271.3873265791679, "learning_rate": 4.769985245050421e-07, "logits/chosen": 12.78514575958252, "logits/rejected": 13.666471481323242, "logps/chosen": -4.3248209953308105, "logps/rejected": -4.776779651641846, "loss": 4.0992, "rewards/accuracies": 1.0, "rewards/chosen": -43.24821472167969, "rewards/margins": 4.519585609436035, "rewards/rejected": -47.767799377441406, "step": 3632 }, { "epoch": 0.49468954248366015, "grad_norm": 42.32707414579426, "learning_rate": 4.768119316012581e-07, "logits/chosen": 13.563660621643066, "logits/rejected": 13.821952819824219, "logps/chosen": -3.9455795288085938, "logps/rejected": -4.4001665115356445, "loss": 3.9632, "rewards/accuracies": 0.75, "rewards/chosen": -39.45579528808594, "rewards/margins": 4.545872688293457, "rewards/rejected": -44.001670837402344, "step": 3633 }, { "epoch": 0.4948257080610022, "grad_norm": 42.84844820047568, "learning_rate": 4.766253213411832e-07, "logits/chosen": 13.336100578308105, "logits/rejected": 13.510920524597168, "logps/chosen": -4.193099021911621, "logps/rejected": -4.248369216918945, "loss": 4.2434, "rewards/accuracies": 0.5, "rewards/chosen": -41.930992126464844, "rewards/margins": 0.5526981353759766, "rewards/rejected": -42.48368835449219, "step": 3634 }, { "epoch": 0.4949618736383442, "grad_norm": 69.29336233436246, "learning_rate": 4.764386937669835e-07, "logits/chosen": 12.651723861694336, "logits/rejected": 12.463504791259766, "logps/chosen": -3.920175552368164, "logps/rejected": -3.889129161834717, "loss": 4.0066, "rewards/accuracies": 0.5, "rewards/chosen": -39.20175552368164, "rewards/margins": -0.31046485900878906, "rewards/rejected": -38.89128875732422, "step": 3635 }, { "epoch": 0.4950980392156863, "grad_norm": 47.713715968511906, "learning_rate": 4.7625204892082906e-07, "logits/chosen": 14.016647338867188, "logits/rejected": 13.367256164550781, "logps/chosen": -4.28581428527832, "logps/rejected": -4.137618064880371, "loss": 4.0029, "rewards/accuracies": 0.0, "rewards/chosen": -42.85814666748047, "rewards/margins": -1.4819650650024414, "rewards/rejected": -41.37617874145508, "step": 3636 }, { "epoch": 0.4952342047930283, "grad_norm": 48.89618672233015, "learning_rate": 4.7606538684489397e-07, "logits/chosen": 12.294546127319336, "logits/rejected": 13.80963134765625, "logps/chosen": -3.6206963062286377, "logps/rejected": -4.154263496398926, "loss": 4.0834, "rewards/accuracies": 0.5, "rewards/chosen": -36.20696258544922, "rewards/margins": 5.335672378540039, "rewards/rejected": -41.542633056640625, "step": 3637 }, { "epoch": 0.49537037037037035, "grad_norm": 47.65605894185939, "learning_rate": 4.7587870758135595e-07, "logits/chosen": 13.831582069396973, "logits/rejected": 13.869840621948242, "logps/chosen": -4.390349388122559, "logps/rejected": -4.596140384674072, "loss": 3.8782, "rewards/accuracies": 0.5, "rewards/chosen": -43.90349578857422, "rewards/margins": 2.0579090118408203, "rewards/rejected": -45.961402893066406, "step": 3638 }, { "epoch": 0.49550653594771243, "grad_norm": 42.94368635226968, "learning_rate": 4.756920111723966e-07, "logits/chosen": 13.190711975097656, "logits/rejected": 14.192343711853027, "logps/chosen": -4.154629707336426, "logps/rejected": -4.527549743652344, "loss": 4.2142, "rewards/accuracies": 0.75, "rewards/chosen": -41.54629898071289, "rewards/margins": 3.7291994094848633, "rewards/rejected": -45.27549743652344, "step": 3639 }, { "epoch": 0.49564270152505446, "grad_norm": 38.39982428403969, "learning_rate": 4.7550529766020177e-07, "logits/chosen": 13.506172180175781, "logits/rejected": 13.365150451660156, "logps/chosen": -4.811961650848389, "logps/rejected": -4.490442276000977, "loss": 4.2569, "rewards/accuracies": 0.25, "rewards/chosen": -48.1196174621582, "rewards/margins": -3.215195655822754, "rewards/rejected": -44.904422760009766, "step": 3640 }, { "epoch": 0.4957788671023965, "grad_norm": 37.63209595844195, "learning_rate": 4.753185670869608e-07, "logits/chosen": 13.158219337463379, "logits/rejected": 14.327792167663574, "logps/chosen": -4.0919318199157715, "logps/rejected": -4.535531997680664, "loss": 3.7702, "rewards/accuracies": 0.75, "rewards/chosen": -40.919315338134766, "rewards/margins": 4.436004638671875, "rewards/rejected": -45.35531997680664, "step": 3641 }, { "epoch": 0.4959150326797386, "grad_norm": 42.9205284492727, "learning_rate": 4.751318194948669e-07, "logits/chosen": 13.623373031616211, "logits/rejected": 14.103918075561523, "logps/chosen": -3.971813440322876, "logps/rejected": -4.22743034362793, "loss": 3.8522, "rewards/accuracies": 0.75, "rewards/chosen": -39.71813201904297, "rewards/margins": 2.5561742782592773, "rewards/rejected": -42.27430725097656, "step": 3642 }, { "epoch": 0.4960511982570806, "grad_norm": 43.76624190078221, "learning_rate": 4.7494505492611746e-07, "logits/chosen": 13.838420867919922, "logits/rejected": 13.196243286132812, "logps/chosen": -4.221129417419434, "logps/rejected": -4.364253044128418, "loss": 3.6258, "rewards/accuracies": 0.75, "rewards/chosen": -42.2112922668457, "rewards/margins": 1.431239128112793, "rewards/rejected": -43.64253234863281, "step": 3643 }, { "epoch": 0.49618736383442263, "grad_norm": 44.44314406562405, "learning_rate": 4.7475827342291337e-07, "logits/chosen": 13.897222518920898, "logits/rejected": 13.508634567260742, "logps/chosen": -4.376197814941406, "logps/rejected": -4.461635589599609, "loss": 3.9468, "rewards/accuracies": 0.25, "rewards/chosen": -43.76197814941406, "rewards/margins": 0.8543777465820312, "rewards/rejected": -44.616355895996094, "step": 3644 }, { "epoch": 0.4963235294117647, "grad_norm": 41.581431729060625, "learning_rate": 4.7457147502745927e-07, "logits/chosen": 11.879669189453125, "logits/rejected": 13.681852340698242, "logps/chosen": -3.4934492111206055, "logps/rejected": -3.8960394859313965, "loss": 3.8524, "rewards/accuracies": 0.75, "rewards/chosen": -34.93449401855469, "rewards/margins": 4.0259013175964355, "rewards/rejected": -38.96039581298828, "step": 3645 }, { "epoch": 0.49645969498910675, "grad_norm": 60.626546603922016, "learning_rate": 4.743846597819641e-07, "logits/chosen": 13.224807739257812, "logits/rejected": 13.206745147705078, "logps/chosen": -4.186176300048828, "logps/rejected": -4.101374626159668, "loss": 4.0026, "rewards/accuracies": 0.25, "rewards/chosen": -41.86176681518555, "rewards/margins": -0.8480205535888672, "rewards/rejected": -41.01374816894531, "step": 3646 }, { "epoch": 0.4965958605664488, "grad_norm": 41.36222613150951, "learning_rate": 4.741978277286402e-07, "logits/chosen": 13.674235343933105, "logits/rejected": 13.526906967163086, "logps/chosen": -4.0250325202941895, "logps/rejected": -3.9066052436828613, "loss": 3.8755, "rewards/accuracies": 0.25, "rewards/chosen": -40.25032424926758, "rewards/margins": -1.184274673461914, "rewards/rejected": -39.0660514831543, "step": 3647 }, { "epoch": 0.49673202614379086, "grad_norm": 68.6291631237764, "learning_rate": 4.7401097890970375e-07, "logits/chosen": 12.349125862121582, "logits/rejected": 13.125404357910156, "logps/chosen": -3.665558338165283, "logps/rejected": -4.141709327697754, "loss": 3.9871, "rewards/accuracies": 1.0, "rewards/chosen": -36.655582427978516, "rewards/margins": 4.761510848999023, "rewards/rejected": -41.417091369628906, "step": 3648 }, { "epoch": 0.4968681917211329, "grad_norm": 49.79384460094825, "learning_rate": 4.7382411336737485e-07, "logits/chosen": 13.236014366149902, "logits/rejected": 13.526944160461426, "logps/chosen": -4.088935852050781, "logps/rejected": -4.086911201477051, "loss": 3.7746, "rewards/accuracies": 0.75, "rewards/chosen": -40.88936233520508, "rewards/margins": -0.020252227783203125, "rewards/rejected": -40.869110107421875, "step": 3649 }, { "epoch": 0.4970043572984749, "grad_norm": 43.693973008567376, "learning_rate": 4.7363723114387735e-07, "logits/chosen": 13.369612693786621, "logits/rejected": 14.196830749511719, "logps/chosen": -4.107294082641602, "logps/rejected": -4.47043514251709, "loss": 4.118, "rewards/accuracies": 0.75, "rewards/chosen": -41.072940826416016, "rewards/margins": 3.631406784057617, "rewards/rejected": -44.704349517822266, "step": 3650 }, { "epoch": 0.497140522875817, "grad_norm": 43.233499038052294, "learning_rate": 4.734503322814387e-07, "logits/chosen": 13.385032653808594, "logits/rejected": 13.29157543182373, "logps/chosen": -3.821423292160034, "logps/rejected": -4.108145713806152, "loss": 3.3622, "rewards/accuracies": 0.5, "rewards/chosen": -38.2142333984375, "rewards/margins": 2.8672218322753906, "rewards/rejected": -41.081451416015625, "step": 3651 }, { "epoch": 0.49727668845315903, "grad_norm": 51.04373656116317, "learning_rate": 4.732634168222903e-07, "logits/chosen": 13.762264251708984, "logits/rejected": 14.424827575683594, "logps/chosen": -4.062238693237305, "logps/rejected": -4.484561920166016, "loss": 3.5102, "rewards/accuracies": 1.0, "rewards/chosen": -40.62239074707031, "rewards/margins": 4.223231315612793, "rewards/rejected": -44.845619201660156, "step": 3652 }, { "epoch": 0.4974128540305011, "grad_norm": 44.51305505195798, "learning_rate": 4.7307648480866744e-07, "logits/chosen": 13.78179931640625, "logits/rejected": 14.028362274169922, "logps/chosen": -4.0297722816467285, "logps/rejected": -4.348050117492676, "loss": 4.1079, "rewards/accuracies": 1.0, "rewards/chosen": -40.29772186279297, "rewards/margins": 3.1827802658081055, "rewards/rejected": -43.480506896972656, "step": 3653 }, { "epoch": 0.49754901960784315, "grad_norm": 46.94322551020859, "learning_rate": 4.7288953628280853e-07, "logits/chosen": 13.392260551452637, "logits/rejected": 13.42049789428711, "logps/chosen": -3.958456516265869, "logps/rejected": -4.0100579261779785, "loss": 4.214, "rewards/accuracies": 0.5, "rewards/chosen": -39.584564208984375, "rewards/margins": 0.516014575958252, "rewards/rejected": -40.10057830810547, "step": 3654 }, { "epoch": 0.4976851851851852, "grad_norm": 41.28638406178001, "learning_rate": 4.727025712869566e-07, "logits/chosen": 12.1654052734375, "logits/rejected": 13.411556243896484, "logps/chosen": -3.7646021842956543, "logps/rejected": -4.119340896606445, "loss": 3.9064, "rewards/accuracies": 1.0, "rewards/chosen": -37.64602279663086, "rewards/margins": 3.5473833084106445, "rewards/rejected": -41.19340515136719, "step": 3655 }, { "epoch": 0.49782135076252726, "grad_norm": 54.69517277050749, "learning_rate": 4.7251558986335764e-07, "logits/chosen": 12.527865409851074, "logits/rejected": 14.051859855651855, "logps/chosen": -3.9190289974212646, "logps/rejected": -4.233142852783203, "loss": 4.3832, "rewards/accuracies": 0.75, "rewards/chosen": -39.19029235839844, "rewards/margins": 3.1411361694335938, "rewards/rejected": -42.33142852783203, "step": 3656 }, { "epoch": 0.4979575163398693, "grad_norm": 42.408775259816714, "learning_rate": 4.723285920542617e-07, "logits/chosen": 13.157910346984863, "logits/rejected": 13.153568267822266, "logps/chosen": -4.215002059936523, "logps/rejected": -4.581214427947998, "loss": 4.2318, "rewards/accuracies": 0.75, "rewards/chosen": -42.15001678466797, "rewards/margins": 3.6621217727661133, "rewards/rejected": -45.81214141845703, "step": 3657 }, { "epoch": 0.4980936819172113, "grad_norm": 44.434423755036846, "learning_rate": 4.7214157790192253e-07, "logits/chosen": 13.477712631225586, "logits/rejected": 13.728313446044922, "logps/chosen": -4.365365028381348, "logps/rejected": -4.287693500518799, "loss": 4.395, "rewards/accuracies": 0.5, "rewards/chosen": -43.653648376464844, "rewards/margins": -0.7767152786254883, "rewards/rejected": -42.87693786621094, "step": 3658 }, { "epoch": 0.4982298474945534, "grad_norm": 55.76004379913498, "learning_rate": 4.7195454744859756e-07, "logits/chosen": 13.637968063354492, "logits/rejected": 13.779790878295898, "logps/chosen": -4.212878227233887, "logps/rejected": -4.225864410400391, "loss": 3.8552, "rewards/accuracies": 0.75, "rewards/chosen": -42.1287841796875, "rewards/margins": 0.12986183166503906, "rewards/rejected": -42.258644104003906, "step": 3659 }, { "epoch": 0.49836601307189543, "grad_norm": 51.26657031118885, "learning_rate": 4.717675007365477e-07, "logits/chosen": 13.19723129272461, "logits/rejected": 12.92292594909668, "logps/chosen": -4.06201171875, "logps/rejected": -4.33429479598999, "loss": 3.7132, "rewards/accuracies": 0.75, "rewards/chosen": -40.6201171875, "rewards/margins": 2.7228288650512695, "rewards/rejected": -43.34294891357422, "step": 3660 }, { "epoch": 0.49850217864923746, "grad_norm": 42.0566957299926, "learning_rate": 4.71580437808038e-07, "logits/chosen": 14.404533386230469, "logits/rejected": 13.600954055786133, "logps/chosen": -4.090231895446777, "logps/rejected": -4.343366622924805, "loss": 3.4209, "rewards/accuracies": 0.5, "rewards/chosen": -40.902320861816406, "rewards/margins": 2.5313453674316406, "rewards/rejected": -43.43366622924805, "step": 3661 }, { "epoch": 0.49863834422657954, "grad_norm": 40.733495901186075, "learning_rate": 4.7139335870533645e-07, "logits/chosen": 11.916091918945312, "logits/rejected": 12.578929901123047, "logps/chosen": -3.744502067565918, "logps/rejected": -3.8657073974609375, "loss": 3.833, "rewards/accuracies": 0.5, "rewards/chosen": -37.44502258300781, "rewards/margins": 1.2120532989501953, "rewards/rejected": -38.657073974609375, "step": 3662 }, { "epoch": 0.4987745098039216, "grad_norm": 94.69765936479719, "learning_rate": 4.712062634707155e-07, "logits/chosen": 14.156486511230469, "logits/rejected": 14.564245223999023, "logps/chosen": -4.293422698974609, "logps/rejected": -4.504341125488281, "loss": 4.319, "rewards/accuracies": 0.75, "rewards/chosen": -42.934226989746094, "rewards/margins": 2.109185218811035, "rewards/rejected": -45.04341125488281, "step": 3663 }, { "epoch": 0.4989106753812636, "grad_norm": 56.73645482712831, "learning_rate": 4.710191521464507e-07, "logits/chosen": 13.0728759765625, "logits/rejected": 13.461669921875, "logps/chosen": -3.952371835708618, "logps/rejected": -4.117164611816406, "loss": 4.1224, "rewards/accuracies": 0.5, "rewards/chosen": -39.523719787597656, "rewards/margins": 1.6479244232177734, "rewards/rejected": -41.1716423034668, "step": 3664 }, { "epoch": 0.4990468409586057, "grad_norm": 42.21486580833282, "learning_rate": 4.708320247748214e-07, "logits/chosen": 13.777204513549805, "logits/rejected": 12.722220420837402, "logps/chosen": -4.511855125427246, "logps/rejected": -4.175630569458008, "loss": 4.415, "rewards/accuracies": 0.5, "rewards/chosen": -45.11854934692383, "rewards/margins": -3.362245559692383, "rewards/rejected": -41.75630187988281, "step": 3665 }, { "epoch": 0.4991830065359477, "grad_norm": 41.9148492679243, "learning_rate": 4.7064488139811063e-07, "logits/chosen": 13.897566795349121, "logits/rejected": 13.988751411437988, "logps/chosen": -4.30222225189209, "logps/rejected": -4.455034255981445, "loss": 4.1988, "rewards/accuracies": 0.5, "rewards/chosen": -43.02222442626953, "rewards/margins": 1.5281238555908203, "rewards/rejected": -44.55034637451172, "step": 3666 }, { "epoch": 0.49931917211328974, "grad_norm": 44.43047068219649, "learning_rate": 4.704577220586049e-07, "logits/chosen": 13.094587326049805, "logits/rejected": 14.604320526123047, "logps/chosen": -3.969153881072998, "logps/rejected": -4.296929836273193, "loss": 3.692, "rewards/accuracies": 0.75, "rewards/chosen": -39.6915397644043, "rewards/margins": 3.277759552001953, "rewards/rejected": -42.96929931640625, "step": 3667 }, { "epoch": 0.49945533769063183, "grad_norm": 47.16303370463401, "learning_rate": 4.702705467985945e-07, "logits/chosen": 13.434588432312012, "logits/rejected": 13.146039009094238, "logps/chosen": -4.3114471435546875, "logps/rejected": -4.06776762008667, "loss": 4.1406, "rewards/accuracies": 0.25, "rewards/chosen": -43.114471435546875, "rewards/margins": -2.436796188354492, "rewards/rejected": -40.677677154541016, "step": 3668 }, { "epoch": 0.49959150326797386, "grad_norm": 234.92035545923977, "learning_rate": 4.700833556603731e-07, "logits/chosen": 14.576725959777832, "logits/rejected": 13.470520973205566, "logps/chosen": -4.29417610168457, "logps/rejected": -4.463493347167969, "loss": 5.1751, "rewards/accuracies": 0.5, "rewards/chosen": -42.9417610168457, "rewards/margins": 1.6931724548339844, "rewards/rejected": -44.63493347167969, "step": 3669 }, { "epoch": 0.4997276688453159, "grad_norm": 52.27433924905606, "learning_rate": 4.6989614868623835e-07, "logits/chosen": 13.20169448852539, "logits/rejected": 13.444509506225586, "logps/chosen": -3.888117551803589, "logps/rejected": -4.0761003494262695, "loss": 3.5624, "rewards/accuracies": 0.75, "rewards/chosen": -38.88117599487305, "rewards/margins": 1.8798274993896484, "rewards/rejected": -40.76100540161133, "step": 3670 }, { "epoch": 0.49986383442265797, "grad_norm": 39.62761104129738, "learning_rate": 4.69708925918491e-07, "logits/chosen": 13.404590606689453, "logits/rejected": 12.681800842285156, "logps/chosen": -4.341012001037598, "logps/rejected": -4.196652889251709, "loss": 4.3066, "rewards/accuracies": 0.25, "rewards/chosen": -43.41012191772461, "rewards/margins": -1.4435911178588867, "rewards/rejected": -41.966529846191406, "step": 3671 }, { "epoch": 0.5, "grad_norm": 47.84745091212864, "learning_rate": 4.695216873994355e-07, "logits/chosen": 14.208749771118164, "logits/rejected": 13.87468147277832, "logps/chosen": -4.246612071990967, "logps/rejected": -4.069141387939453, "loss": 4.2799, "rewards/accuracies": 0.25, "rewards/chosen": -42.466121673583984, "rewards/margins": -1.774709701538086, "rewards/rejected": -40.69141387939453, "step": 3672 }, { "epoch": 0.5001361655773421, "grad_norm": 40.959166751463954, "learning_rate": 4.693344331713802e-07, "logits/chosen": 14.602622032165527, "logits/rejected": 14.551128387451172, "logps/chosen": -4.060085296630859, "logps/rejected": -4.380573272705078, "loss": 3.8641, "rewards/accuracies": 0.75, "rewards/chosen": -40.600852966308594, "rewards/margins": 3.2048792839050293, "rewards/rejected": -43.80573272705078, "step": 3673 }, { "epoch": 0.5002723311546841, "grad_norm": 42.259616879459195, "learning_rate": 4.6914716327663653e-07, "logits/chosen": 13.518564224243164, "logits/rejected": 13.98322868347168, "logps/chosen": -3.8887453079223633, "logps/rejected": -4.010677337646484, "loss": 3.782, "rewards/accuracies": 0.5, "rewards/chosen": -38.887451171875, "rewards/margins": 1.2193260192871094, "rewards/rejected": -40.106781005859375, "step": 3674 }, { "epoch": 0.5004084967320261, "grad_norm": 48.66092745584224, "learning_rate": 4.689598777575197e-07, "logits/chosen": 12.265223503112793, "logits/rejected": 12.899017333984375, "logps/chosen": -4.286829948425293, "logps/rejected": -4.3290019035339355, "loss": 4.4517, "rewards/accuracies": 0.5, "rewards/chosen": -42.86830139160156, "rewards/margins": 0.4217195510864258, "rewards/rejected": -43.29001998901367, "step": 3675 }, { "epoch": 0.5005446623093682, "grad_norm": 42.12593972377677, "learning_rate": 4.687725766563485e-07, "logits/chosen": 12.523219108581543, "logits/rejected": 14.159463882446289, "logps/chosen": -4.23138427734375, "logps/rejected": -4.697672367095947, "loss": 3.7097, "rewards/accuracies": 0.75, "rewards/chosen": -42.3138427734375, "rewards/margins": 4.662879943847656, "rewards/rejected": -46.976722717285156, "step": 3676 }, { "epoch": 0.5006808278867102, "grad_norm": 38.75080941471876, "learning_rate": 4.6858526001544517e-07, "logits/chosen": 12.792425155639648, "logits/rejected": 14.02318286895752, "logps/chosen": -4.1382365226745605, "logps/rejected": -4.612808704376221, "loss": 3.8245, "rewards/accuracies": 1.0, "rewards/chosen": -41.38236618041992, "rewards/margins": 4.745718955993652, "rewards/rejected": -46.12808609008789, "step": 3677 }, { "epoch": 0.5008169934640523, "grad_norm": 50.04875982779437, "learning_rate": 4.683979278771353e-07, "logits/chosen": 13.747146606445312, "logits/rejected": 14.65859317779541, "logps/chosen": -4.369171619415283, "logps/rejected": -4.7452263832092285, "loss": 3.874, "rewards/accuracies": 0.75, "rewards/chosen": -43.691715240478516, "rewards/margins": 3.7605485916137695, "rewards/rejected": -47.45226287841797, "step": 3678 }, { "epoch": 0.5009531590413944, "grad_norm": 51.48495732864612, "learning_rate": 4.6821058028374833e-07, "logits/chosen": 13.68797492980957, "logits/rejected": 13.996757507324219, "logps/chosen": -4.051016807556152, "logps/rejected": -4.378458499908447, "loss": 3.9839, "rewards/accuracies": 0.75, "rewards/chosen": -40.510169982910156, "rewards/margins": 3.2744178771972656, "rewards/rejected": -43.784584045410156, "step": 3679 }, { "epoch": 0.5010893246187363, "grad_norm": 39.26423514659384, "learning_rate": 4.6802321727761696e-07, "logits/chosen": 13.904593467712402, "logits/rejected": 13.721930503845215, "logps/chosen": -4.080116271972656, "logps/rejected": -4.04910135269165, "loss": 3.9181, "rewards/accuracies": 0.75, "rewards/chosen": -40.80116271972656, "rewards/margins": -0.3101482391357422, "rewards/rejected": -40.49101257324219, "step": 3680 }, { "epoch": 0.5012254901960784, "grad_norm": 44.03518011850916, "learning_rate": 4.678358389010772e-07, "logits/chosen": 13.760818481445312, "logits/rejected": 14.413858413696289, "logps/chosen": -4.27269172668457, "logps/rejected": -4.727659702301025, "loss": 3.6645, "rewards/accuracies": 0.75, "rewards/chosen": -42.72691345214844, "rewards/margins": 4.549681663513184, "rewards/rejected": -47.27659606933594, "step": 3681 }, { "epoch": 0.5013616557734205, "grad_norm": 41.73584606606235, "learning_rate": 4.67648445196469e-07, "logits/chosen": 12.561681747436523, "logits/rejected": 14.224727630615234, "logps/chosen": -3.704871654510498, "logps/rejected": -4.084285736083984, "loss": 4.0291, "rewards/accuracies": 0.75, "rewards/chosen": -37.0487174987793, "rewards/margins": 3.794139862060547, "rewards/rejected": -40.842857360839844, "step": 3682 }, { "epoch": 0.5014978213507625, "grad_norm": 44.984140722689204, "learning_rate": 4.6746103620613545e-07, "logits/chosen": 13.778182029724121, "logits/rejected": 14.209762573242188, "logps/chosen": -4.014027118682861, "logps/rejected": -4.182784080505371, "loss": 4.5321, "rewards/accuracies": 0.5, "rewards/chosen": -40.14027404785156, "rewards/margins": 1.6875696182250977, "rewards/rejected": -41.827842712402344, "step": 3683 }, { "epoch": 0.5016339869281046, "grad_norm": 39.95799594258726, "learning_rate": 4.672736119724231e-07, "logits/chosen": 13.029081344604492, "logits/rejected": 12.827863693237305, "logps/chosen": -4.083498954772949, "logps/rejected": -3.799959659576416, "loss": 4.1098, "rewards/accuracies": 0.25, "rewards/chosen": -40.834991455078125, "rewards/margins": -2.835391044616699, "rewards/rejected": -37.999595642089844, "step": 3684 }, { "epoch": 0.5017701525054467, "grad_norm": 47.2499413606826, "learning_rate": 4.6708617253768203e-07, "logits/chosen": 14.296309471130371, "logits/rejected": 14.099776268005371, "logps/chosen": -4.472154140472412, "logps/rejected": -4.4759979248046875, "loss": 4.0829, "rewards/accuracies": 0.5, "rewards/chosen": -44.72154235839844, "rewards/margins": 0.03843975067138672, "rewards/rejected": -44.75998306274414, "step": 3685 }, { "epoch": 0.5019063180827886, "grad_norm": 39.54933274635924, "learning_rate": 4.6689871794426575e-07, "logits/chosen": 13.571187019348145, "logits/rejected": 13.624645233154297, "logps/chosen": -4.083020210266113, "logps/rejected": -3.9390416145324707, "loss": 4.0642, "rewards/accuracies": 0.5, "rewards/chosen": -40.8302001953125, "rewards/margins": -1.439784049987793, "rewards/rejected": -39.39041519165039, "step": 3686 }, { "epoch": 0.5020424836601307, "grad_norm": 40.383110221675075, "learning_rate": 4.6671124823453114e-07, "logits/chosen": 13.614217758178711, "logits/rejected": 14.375622749328613, "logps/chosen": -3.8175101280212402, "logps/rejected": -4.389740943908691, "loss": 4.2087, "rewards/accuracies": 1.0, "rewards/chosen": -38.17510223388672, "rewards/margins": 5.7223100662231445, "rewards/rejected": -43.89740753173828, "step": 3687 }, { "epoch": 0.5021786492374728, "grad_norm": 40.15446854883581, "learning_rate": 4.6652376345083854e-07, "logits/chosen": 13.403884887695312, "logits/rejected": 13.312314987182617, "logps/chosen": -4.045193195343018, "logps/rejected": -3.958155632019043, "loss": 4.0719, "rewards/accuracies": 0.25, "rewards/chosen": -40.451927185058594, "rewards/margins": -0.8703746795654297, "rewards/rejected": -39.5815544128418, "step": 3688 }, { "epoch": 0.5023148148148148, "grad_norm": 41.730358780798554, "learning_rate": 4.6633626363555177e-07, "logits/chosen": 12.877015113830566, "logits/rejected": 12.59243392944336, "logps/chosen": -3.9414854049682617, "logps/rejected": -4.147829532623291, "loss": 3.1964, "rewards/accuracies": 1.0, "rewards/chosen": -39.41485595703125, "rewards/margins": 2.063441276550293, "rewards/rejected": -41.478294372558594, "step": 3689 }, { "epoch": 0.5024509803921569, "grad_norm": 53.37434326954374, "learning_rate": 4.661487488310378e-07, "logits/chosen": 12.438682556152344, "logits/rejected": 14.99273681640625, "logps/chosen": -3.794966697692871, "logps/rejected": -4.489683151245117, "loss": 4.3632, "rewards/accuracies": 1.0, "rewards/chosen": -37.949668884277344, "rewards/margins": 6.947161674499512, "rewards/rejected": -44.89683151245117, "step": 3690 }, { "epoch": 0.5025871459694989, "grad_norm": 48.109787350908555, "learning_rate": 4.6596121907966726e-07, "logits/chosen": 13.101426124572754, "logits/rejected": 14.289615631103516, "logps/chosen": -3.8813486099243164, "logps/rejected": -4.3044538497924805, "loss": 4.3586, "rewards/accuracies": 1.0, "rewards/chosen": -38.81348419189453, "rewards/margins": 4.231053352355957, "rewards/rejected": -43.04454040527344, "step": 3691 }, { "epoch": 0.5027233115468409, "grad_norm": 39.37318122407898, "learning_rate": 4.657736744238141e-07, "logits/chosen": 13.750665664672852, "logits/rejected": 15.23480224609375, "logps/chosen": -4.038015842437744, "logps/rejected": -4.306301116943359, "loss": 3.4793, "rewards/accuracies": 0.75, "rewards/chosen": -40.38016128540039, "rewards/margins": 2.6828536987304688, "rewards/rejected": -43.063011169433594, "step": 3692 }, { "epoch": 0.502859477124183, "grad_norm": 42.14012235107318, "learning_rate": 4.655861149058554e-07, "logits/chosen": 13.097085952758789, "logits/rejected": 13.206972122192383, "logps/chosen": -4.014247417449951, "logps/rejected": -4.001865386962891, "loss": 4.4687, "rewards/accuracies": 0.5, "rewards/chosen": -40.14247512817383, "rewards/margins": -0.12382316589355469, "rewards/rejected": -40.018653869628906, "step": 3693 }, { "epoch": 0.5029956427015251, "grad_norm": 62.01450078181563, "learning_rate": 4.6539854056817194e-07, "logits/chosen": 13.818737030029297, "logits/rejected": 14.421567916870117, "logps/chosen": -3.92671275138855, "logps/rejected": -4.472270488739014, "loss": 3.6972, "rewards/accuracies": 0.75, "rewards/chosen": -39.26712417602539, "rewards/margins": 5.455580711364746, "rewards/rejected": -44.72270965576172, "step": 3694 }, { "epoch": 0.503131808278867, "grad_norm": 42.72062915958278, "learning_rate": 4.6521095145314773e-07, "logits/chosen": 12.848318099975586, "logits/rejected": 14.075523376464844, "logps/chosen": -3.984142541885376, "logps/rejected": -4.456246852874756, "loss": 4.1163, "rewards/accuracies": 1.0, "rewards/chosen": -39.841426849365234, "rewards/margins": 4.721043586730957, "rewards/rejected": -44.562469482421875, "step": 3695 }, { "epoch": 0.5032679738562091, "grad_norm": 42.31901379958263, "learning_rate": 4.650233476031698e-07, "logits/chosen": 13.842992782592773, "logits/rejected": 13.748260498046875, "logps/chosen": -4.0636210441589355, "logps/rejected": -4.267724514007568, "loss": 4.0544, "rewards/accuracies": 0.75, "rewards/chosen": -40.63621520996094, "rewards/margins": 2.0410327911376953, "rewards/rejected": -42.67724609375, "step": 3696 }, { "epoch": 0.5034041394335512, "grad_norm": 35.84314234438237, "learning_rate": 4.648357290606292e-07, "logits/chosen": 14.05630874633789, "logits/rejected": 13.733668327331543, "logps/chosen": -4.1278581619262695, "logps/rejected": -4.154338836669922, "loss": 3.7863, "rewards/accuracies": 0.5, "rewards/chosen": -41.27857971191406, "rewards/margins": 0.26480674743652344, "rewards/rejected": -41.54338836669922, "step": 3697 }, { "epoch": 0.5035403050108932, "grad_norm": 41.73750932974349, "learning_rate": 4.6464809586791966e-07, "logits/chosen": 12.517267227172852, "logits/rejected": 14.039623260498047, "logps/chosen": -3.4238522052764893, "logps/rejected": -3.9721171855926514, "loss": 3.9601, "rewards/accuracies": 0.75, "rewards/chosen": -34.238525390625, "rewards/margins": 5.4826483726501465, "rewards/rejected": -39.72117233276367, "step": 3698 }, { "epoch": 0.5036764705882353, "grad_norm": 41.752741678810835, "learning_rate": 4.644604480674383e-07, "logits/chosen": 13.866613388061523, "logits/rejected": 14.033361434936523, "logps/chosen": -4.244813442230225, "logps/rejected": -4.430887699127197, "loss": 4.2171, "rewards/accuracies": 1.0, "rewards/chosen": -42.44813537597656, "rewards/margins": 1.8607406616210938, "rewards/rejected": -44.308876037597656, "step": 3699 }, { "epoch": 0.5038126361655774, "grad_norm": 40.86814412452365, "learning_rate": 4.6427278570158607e-07, "logits/chosen": 12.964727401733398, "logits/rejected": 13.467775344848633, "logps/chosen": -3.6793668270111084, "logps/rejected": -3.8856759071350098, "loss": 3.4096, "rewards/accuracies": 0.75, "rewards/chosen": -36.793670654296875, "rewards/margins": 2.0630903244018555, "rewards/rejected": -38.85675811767578, "step": 3700 }, { "epoch": 0.5039488017429193, "grad_norm": 44.4092818680088, "learning_rate": 4.6408510881276656e-07, "logits/chosen": 13.350761413574219, "logits/rejected": 13.82803726196289, "logps/chosen": -4.141530990600586, "logps/rejected": -4.274497985839844, "loss": 4.079, "rewards/accuracies": 0.75, "rewards/chosen": -41.415313720703125, "rewards/margins": 1.3296699523925781, "rewards/rejected": -42.74497985839844, "step": 3701 }, { "epoch": 0.5040849673202614, "grad_norm": 52.505356832887095, "learning_rate": 4.6389741744338693e-07, "logits/chosen": 13.002243041992188, "logits/rejected": 13.341839790344238, "logps/chosen": -3.9959051609039307, "logps/rejected": -4.232540607452393, "loss": 4.4517, "rewards/accuracies": 0.75, "rewards/chosen": -39.95905303955078, "rewards/margins": 2.3663511276245117, "rewards/rejected": -42.325401306152344, "step": 3702 }, { "epoch": 0.5042211328976035, "grad_norm": 43.804618030794984, "learning_rate": 4.6370971163585765e-07, "logits/chosen": 13.984241485595703, "logits/rejected": 13.255672454833984, "logps/chosen": -4.361522197723389, "logps/rejected": -4.266129493713379, "loss": 4.2622, "rewards/accuracies": 0.5, "rewards/chosen": -43.61522674560547, "rewards/margins": -0.9539308547973633, "rewards/rejected": -42.661293029785156, "step": 3703 }, { "epoch": 0.5043572984749455, "grad_norm": 43.42505069916912, "learning_rate": 4.635219914325924e-07, "logits/chosen": 12.053701400756836, "logits/rejected": 12.511979103088379, "logps/chosen": -3.505237102508545, "logps/rejected": -3.8742475509643555, "loss": 3.9705, "rewards/accuracies": 0.75, "rewards/chosen": -35.0523681640625, "rewards/margins": 3.6901063919067383, "rewards/rejected": -38.74247741699219, "step": 3704 }, { "epoch": 0.5044934640522876, "grad_norm": 37.8438148802597, "learning_rate": 4.6333425687600813e-07, "logits/chosen": 13.328079223632812, "logits/rejected": 13.405557632446289, "logps/chosen": -3.8887720108032227, "logps/rejected": -4.235349655151367, "loss": 3.8782, "rewards/accuracies": 1.0, "rewards/chosen": -38.887718200683594, "rewards/margins": 3.465773582458496, "rewards/rejected": -42.353492736816406, "step": 3705 }, { "epoch": 0.5046296296296297, "grad_norm": 38.69494416739187, "learning_rate": 4.6314650800852496e-07, "logits/chosen": 13.282451629638672, "logits/rejected": 12.564393997192383, "logps/chosen": -3.539337158203125, "logps/rejected": -3.4124813079833984, "loss": 4.0831, "rewards/accuracies": 0.5, "rewards/chosen": -35.39337158203125, "rewards/margins": -1.2685608863830566, "rewards/rejected": -34.12480926513672, "step": 3706 }, { "epoch": 0.5047657952069716, "grad_norm": 74.7158178805641, "learning_rate": 4.6295874487256645e-07, "logits/chosen": 13.115743637084961, "logits/rejected": 13.94134521484375, "logps/chosen": -4.2655029296875, "logps/rejected": -4.5273847579956055, "loss": 4.1476, "rewards/accuracies": 0.75, "rewards/chosen": -42.655029296875, "rewards/margins": 2.618816375732422, "rewards/rejected": -45.273841857910156, "step": 3707 }, { "epoch": 0.5049019607843137, "grad_norm": 43.3142646223414, "learning_rate": 4.627709675105589e-07, "logits/chosen": 14.242101669311523, "logits/rejected": 13.670628547668457, "logps/chosen": -4.248115539550781, "logps/rejected": -3.9576785564422607, "loss": 4.0537, "rewards/accuracies": 0.25, "rewards/chosen": -42.48115539550781, "rewards/margins": -2.904369354248047, "rewards/rejected": -39.5767822265625, "step": 3708 }, { "epoch": 0.5050381263616558, "grad_norm": 39.53461821222633, "learning_rate": 4.625831759649326e-07, "logits/chosen": 13.332639694213867, "logits/rejected": 13.621783256530762, "logps/chosen": -4.160191535949707, "logps/rejected": -4.300412178039551, "loss": 3.3895, "rewards/accuracies": 0.75, "rewards/chosen": -41.6019172668457, "rewards/margins": 1.402205467224121, "rewards/rejected": -43.004119873046875, "step": 3709 }, { "epoch": 0.5051742919389978, "grad_norm": 38.95391996536984, "learning_rate": 4.623953702781203e-07, "logits/chosen": 13.304704666137695, "logits/rejected": 14.266590118408203, "logps/chosen": -3.9939775466918945, "logps/rejected": -4.301227569580078, "loss": 3.7099, "rewards/accuracies": 0.75, "rewards/chosen": -39.93977355957031, "rewards/margins": 3.0724945068359375, "rewards/rejected": -43.01226806640625, "step": 3710 }, { "epoch": 0.5053104575163399, "grad_norm": 46.70643601858337, "learning_rate": 4.622075504925582e-07, "logits/chosen": 13.366399765014648, "logits/rejected": 14.64451789855957, "logps/chosen": -3.9892611503601074, "logps/rejected": -4.376150131225586, "loss": 4.5193, "rewards/accuracies": 0.75, "rewards/chosen": -39.89261245727539, "rewards/margins": 3.8688888549804688, "rewards/rejected": -43.76150131225586, "step": 3711 }, { "epoch": 0.5054466230936819, "grad_norm": 40.735125654382756, "learning_rate": 4.62019716650686e-07, "logits/chosen": 13.794113159179688, "logits/rejected": 13.94243335723877, "logps/chosen": -4.3372802734375, "logps/rejected": -4.180885314941406, "loss": 4.2922, "rewards/accuracies": 0.25, "rewards/chosen": -43.372802734375, "rewards/margins": -1.5639495849609375, "rewards/rejected": -41.8088493347168, "step": 3712 }, { "epoch": 0.505582788671024, "grad_norm": 45.51262715016004, "learning_rate": 4.6183186879494603e-07, "logits/chosen": 13.074796676635742, "logits/rejected": 14.027867317199707, "logps/chosen": -3.636723279953003, "logps/rejected": -4.209498405456543, "loss": 4.1604, "rewards/accuracies": 1.0, "rewards/chosen": -36.36723327636719, "rewards/margins": 5.727753162384033, "rewards/rejected": -42.09498596191406, "step": 3713 }, { "epoch": 0.505718954248366, "grad_norm": 44.88167726987399, "learning_rate": 4.616440069677843e-07, "logits/chosen": 13.59646987915039, "logits/rejected": 13.24359130859375, "logps/chosen": -3.9143011569976807, "logps/rejected": -3.871829032897949, "loss": 4.5332, "rewards/accuracies": 0.25, "rewards/chosen": -39.14301300048828, "rewards/margins": -0.42472362518310547, "rewards/rejected": -38.71828842163086, "step": 3714 }, { "epoch": 0.5058551198257081, "grad_norm": 73.76908823233126, "learning_rate": 4.6145613121164955e-07, "logits/chosen": 13.123186111450195, "logits/rejected": 14.254874229431152, "logps/chosen": -3.657670259475708, "logps/rejected": -4.091545104980469, "loss": 3.99, "rewards/accuracies": 0.75, "rewards/chosen": -36.57670211791992, "rewards/margins": 4.3387451171875, "rewards/rejected": -40.91544723510742, "step": 3715 }, { "epoch": 0.5059912854030502, "grad_norm": 39.93636047045096, "learning_rate": 4.6126824156899404e-07, "logits/chosen": 14.074762344360352, "logits/rejected": 14.12474250793457, "logps/chosen": -3.8407022953033447, "logps/rejected": -3.994398593902588, "loss": 3.8316, "rewards/accuracies": 0.75, "rewards/chosen": -38.40702438354492, "rewards/margins": 1.5369606018066406, "rewards/rejected": -39.94398498535156, "step": 3716 }, { "epoch": 0.5061274509803921, "grad_norm": 37.43005500713085, "learning_rate": 4.6108033808227295e-07, "logits/chosen": 13.809640884399414, "logits/rejected": 14.438422203063965, "logps/chosen": -4.103566646575928, "logps/rejected": -4.335619926452637, "loss": 3.8701, "rewards/accuracies": 0.75, "rewards/chosen": -41.035667419433594, "rewards/margins": 2.320530891418457, "rewards/rejected": -43.356197357177734, "step": 3717 }, { "epoch": 0.5062636165577342, "grad_norm": 44.77662255721623, "learning_rate": 4.608924207939444e-07, "logits/chosen": 12.983809471130371, "logits/rejected": 12.426156044006348, "logps/chosen": -3.8606982231140137, "logps/rejected": -3.5935957431793213, "loss": 3.5544, "rewards/accuracies": 0.0, "rewards/chosen": -38.60697937011719, "rewards/margins": -2.6710238456726074, "rewards/rejected": -35.93595886230469, "step": 3718 }, { "epoch": 0.5063997821350763, "grad_norm": 41.09362564835582, "learning_rate": 4.6070448974647015e-07, "logits/chosen": 13.357497215270996, "logits/rejected": 13.317059516906738, "logps/chosen": -3.6075401306152344, "logps/rejected": -3.7878575325012207, "loss": 3.3075, "rewards/accuracies": 0.5, "rewards/chosen": -36.075401306152344, "rewards/margins": 1.8031740188598633, "rewards/rejected": -37.878578186035156, "step": 3719 }, { "epoch": 0.5065359477124183, "grad_norm": 46.02627301906842, "learning_rate": 4.605165449823146e-07, "logits/chosen": 13.208311080932617, "logits/rejected": 14.352481842041016, "logps/chosen": -3.6461269855499268, "logps/rejected": -4.927330017089844, "loss": 4.0928, "rewards/accuracies": 1.0, "rewards/chosen": -36.46126937866211, "rewards/margins": 12.812033653259277, "rewards/rejected": -49.27330017089844, "step": 3720 }, { "epoch": 0.5066721132897604, "grad_norm": 38.777369367357544, "learning_rate": 4.6032858654394555e-07, "logits/chosen": 13.553915023803711, "logits/rejected": 13.757390975952148, "logps/chosen": -3.8741607666015625, "logps/rejected": -3.8888773918151855, "loss": 3.5955, "rewards/accuracies": 0.5, "rewards/chosen": -38.741607666015625, "rewards/margins": 0.14716529846191406, "rewards/rejected": -38.88877487182617, "step": 3721 }, { "epoch": 0.5068082788671024, "grad_norm": 42.848586760417625, "learning_rate": 4.6014061447383367e-07, "logits/chosen": 13.852407455444336, "logits/rejected": 14.211554527282715, "logps/chosen": -4.015633583068848, "logps/rejected": -3.922208786010742, "loss": 3.909, "rewards/accuracies": 0.75, "rewards/chosen": -40.15633773803711, "rewards/margins": -0.9342451095581055, "rewards/rejected": -39.22209167480469, "step": 3722 }, { "epoch": 0.5069444444444444, "grad_norm": 42.019225175254135, "learning_rate": 4.5995262881445277e-07, "logits/chosen": 13.128633499145508, "logits/rejected": 14.197549819946289, "logps/chosen": -3.6689975261688232, "logps/rejected": -4.132142066955566, "loss": 4.102, "rewards/accuracies": 1.0, "rewards/chosen": -36.689971923828125, "rewards/margins": 4.631448268890381, "rewards/rejected": -41.32142639160156, "step": 3723 }, { "epoch": 0.5070806100217865, "grad_norm": 43.9810473487319, "learning_rate": 4.597646296082798e-07, "logits/chosen": 13.783561706542969, "logits/rejected": 13.993221282958984, "logps/chosen": -4.180574893951416, "logps/rejected": -4.066995143890381, "loss": 3.9633, "rewards/accuracies": 0.25, "rewards/chosen": -41.805747985839844, "rewards/margins": -1.1357975006103516, "rewards/rejected": -40.669952392578125, "step": 3724 }, { "epoch": 0.5072167755991286, "grad_norm": 49.953599025701955, "learning_rate": 4.595766168977949e-07, "logits/chosen": 12.772207260131836, "logits/rejected": 13.44686508178711, "logps/chosen": -3.741025447845459, "logps/rejected": -3.8650007247924805, "loss": 3.475, "rewards/accuracies": 0.5, "rewards/chosen": -37.410255432128906, "rewards/margins": 1.239755630493164, "rewards/rejected": -38.65000915527344, "step": 3725 }, { "epoch": 0.5073529411764706, "grad_norm": 42.60498551091767, "learning_rate": 4.593885907254807e-07, "logits/chosen": 13.878173828125, "logits/rejected": 13.917863845825195, "logps/chosen": -4.151470184326172, "logps/rejected": -3.8104612827301025, "loss": 3.959, "rewards/accuracies": 0.25, "rewards/chosen": -41.51470184326172, "rewards/margins": -3.410085678100586, "rewards/rejected": -38.1046142578125, "step": 3726 }, { "epoch": 0.5074891067538126, "grad_norm": 40.20286313332982, "learning_rate": 4.5920055113382376e-07, "logits/chosen": 13.355401039123535, "logits/rejected": 13.444543838500977, "logps/chosen": -3.629484176635742, "logps/rejected": -3.8298232555389404, "loss": 3.8875, "rewards/accuracies": 0.75, "rewards/chosen": -36.29484176635742, "rewards/margins": 2.003392219543457, "rewards/rejected": -38.29823303222656, "step": 3727 }, { "epoch": 0.5076252723311547, "grad_norm": 53.667188802127505, "learning_rate": 4.5901249816531287e-07, "logits/chosen": 13.806768417358398, "logits/rejected": 13.621940612792969, "logps/chosen": -4.499809265136719, "logps/rejected": -3.884488344192505, "loss": 4.8854, "rewards/accuracies": 0.0, "rewards/chosen": -44.99809265136719, "rewards/margins": -6.1532087326049805, "rewards/rejected": -38.84488296508789, "step": 3728 }, { "epoch": 0.5077614379084967, "grad_norm": 38.18104787404733, "learning_rate": 4.5882443186244006e-07, "logits/chosen": 13.25516414642334, "logits/rejected": 13.780010223388672, "logps/chosen": -4.002775192260742, "logps/rejected": -4.051166534423828, "loss": 3.6704, "rewards/accuracies": 0.5, "rewards/chosen": -40.02775573730469, "rewards/margins": 0.48390865325927734, "rewards/rejected": -40.511661529541016, "step": 3729 }, { "epoch": 0.5078976034858388, "grad_norm": 40.837326523250105, "learning_rate": 4.586363522677008e-07, "logits/chosen": 14.672773361206055, "logits/rejected": 15.062165260314941, "logps/chosen": -4.315737724304199, "logps/rejected": -4.22947883605957, "loss": 3.986, "rewards/accuracies": 0.5, "rewards/chosen": -43.157379150390625, "rewards/margins": -0.8625926971435547, "rewards/rejected": -42.29478454589844, "step": 3730 }, { "epoch": 0.5080337690631809, "grad_norm": 42.02179658814086, "learning_rate": 4.58448259423593e-07, "logits/chosen": 13.773031234741211, "logits/rejected": 14.291923522949219, "logps/chosen": -3.6931698322296143, "logps/rejected": -3.742191791534424, "loss": 4.0641, "rewards/accuracies": 0.5, "rewards/chosen": -36.93170166015625, "rewards/margins": 0.4902191162109375, "rewards/rejected": -37.42191696166992, "step": 3731 }, { "epoch": 0.5081699346405228, "grad_norm": 69.41717544498198, "learning_rate": 4.582601533726178e-07, "logits/chosen": 14.02354621887207, "logits/rejected": 15.073356628417969, "logps/chosen": -3.911680221557617, "logps/rejected": -4.51957893371582, "loss": 3.5264, "rewards/accuracies": 1.0, "rewards/chosen": -39.116798400878906, "rewards/margins": 6.078991889953613, "rewards/rejected": -45.1957893371582, "step": 3732 }, { "epoch": 0.5083061002178649, "grad_norm": 47.8010469745176, "learning_rate": 4.580720341572794e-07, "logits/chosen": 13.398582458496094, "logits/rejected": 13.381519317626953, "logps/chosen": -3.6556031703948975, "logps/rejected": -3.9989662170410156, "loss": 4.0113, "rewards/accuracies": 0.75, "rewards/chosen": -36.5560302734375, "rewards/margins": 3.433629035949707, "rewards/rejected": -39.98965835571289, "step": 3733 }, { "epoch": 0.508442265795207, "grad_norm": 37.67312645381683, "learning_rate": 4.578839018200849e-07, "logits/chosen": 13.237997055053711, "logits/rejected": 13.388985633850098, "logps/chosen": -3.702047348022461, "logps/rejected": -3.9773404598236084, "loss": 3.7526, "rewards/accuracies": 0.75, "rewards/chosen": -37.020469665527344, "rewards/margins": 2.7529311180114746, "rewards/rejected": -39.77340316772461, "step": 3734 }, { "epoch": 0.508578431372549, "grad_norm": 41.534328497289145, "learning_rate": 4.576957564035442e-07, "logits/chosen": 12.594297409057617, "logits/rejected": 14.048774719238281, "logps/chosen": -3.5562961101531982, "logps/rejected": -4.130877494812012, "loss": 3.5235, "rewards/accuracies": 1.0, "rewards/chosen": -35.56296157836914, "rewards/margins": 5.7458086013793945, "rewards/rejected": -41.30876922607422, "step": 3735 }, { "epoch": 0.5087145969498911, "grad_norm": 42.82574822258276, "learning_rate": 4.5750759795017053e-07, "logits/chosen": 13.156829833984375, "logits/rejected": 13.996488571166992, "logps/chosen": -4.033574104309082, "logps/rejected": -4.101782321929932, "loss": 4.1658, "rewards/accuracies": 0.75, "rewards/chosen": -40.33574295043945, "rewards/margins": 0.6820812225341797, "rewards/rejected": -41.017822265625, "step": 3736 }, { "epoch": 0.5088507625272332, "grad_norm": 45.974386398279215, "learning_rate": 4.5731942650247975e-07, "logits/chosen": 14.881345748901367, "logits/rejected": 13.893817901611328, "logps/chosen": -4.360410213470459, "logps/rejected": -4.143202304840088, "loss": 4.2351, "rewards/accuracies": 0.25, "rewards/chosen": -43.604103088378906, "rewards/margins": -2.17208194732666, "rewards/rejected": -41.43202209472656, "step": 3737 }, { "epoch": 0.5089869281045751, "grad_norm": 45.99901873976779, "learning_rate": 4.5713124210299065e-07, "logits/chosen": 12.701223373413086, "logits/rejected": 13.444134712219238, "logps/chosen": -3.618600368499756, "logps/rejected": -3.847452163696289, "loss": 4.3688, "rewards/accuracies": 0.75, "rewards/chosen": -36.186004638671875, "rewards/margins": 2.2885193824768066, "rewards/rejected": -38.47452163696289, "step": 3738 }, { "epoch": 0.5091230936819172, "grad_norm": 46.712024863503935, "learning_rate": 4.5694304479422525e-07, "logits/chosen": 13.960208892822266, "logits/rejected": 14.646142959594727, "logps/chosen": -4.086427688598633, "logps/rejected": -4.298659324645996, "loss": 3.6698, "rewards/accuracies": 0.75, "rewards/chosen": -40.86427307128906, "rewards/margins": 2.1223134994506836, "rewards/rejected": -42.98658752441406, "step": 3739 }, { "epoch": 0.5092592592592593, "grad_norm": 64.43471798793891, "learning_rate": 4.567548346187081e-07, "logits/chosen": 13.874932289123535, "logits/rejected": 13.597454071044922, "logps/chosen": -4.078609466552734, "logps/rejected": -4.039462566375732, "loss": 3.6893, "rewards/accuracies": 0.25, "rewards/chosen": -40.78609848022461, "rewards/margins": -0.39147281646728516, "rewards/rejected": -40.394622802734375, "step": 3740 }, { "epoch": 0.5093954248366013, "grad_norm": 39.29926521864065, "learning_rate": 4.5656661161896695e-07, "logits/chosen": 12.932511329650879, "logits/rejected": 13.16743278503418, "logps/chosen": -3.686081886291504, "logps/rejected": -4.153251647949219, "loss": 3.8413, "rewards/accuracies": 0.75, "rewards/chosen": -36.860816955566406, "rewards/margins": 4.671701431274414, "rewards/rejected": -41.53252029418945, "step": 3741 }, { "epoch": 0.5095315904139434, "grad_norm": 46.46698837638492, "learning_rate": 4.563783758375323e-07, "logits/chosen": 13.678722381591797, "logits/rejected": 14.763172149658203, "logps/chosen": -3.7931060791015625, "logps/rejected": -4.337477207183838, "loss": 3.8046, "rewards/accuracies": 0.75, "rewards/chosen": -37.93106460571289, "rewards/margins": 5.443709373474121, "rewards/rejected": -43.37477111816406, "step": 3742 }, { "epoch": 0.5096677559912854, "grad_norm": 40.1443840605163, "learning_rate": 4.5619012731693765e-07, "logits/chosen": 13.2493314743042, "logits/rejected": 14.326849937438965, "logps/chosen": -3.7758007049560547, "logps/rejected": -4.008132457733154, "loss": 3.8024, "rewards/accuracies": 0.5, "rewards/chosen": -37.75800704956055, "rewards/margins": 2.3233184814453125, "rewards/rejected": -40.081321716308594, "step": 3743 }, { "epoch": 0.5098039215686274, "grad_norm": 49.42223874116889, "learning_rate": 4.5600186609971923e-07, "logits/chosen": 14.431095123291016, "logits/rejected": 14.047449111938477, "logps/chosen": -3.980996608734131, "logps/rejected": -3.8963589668273926, "loss": 4.4369, "rewards/accuracies": 0.5, "rewards/chosen": -39.809967041015625, "rewards/margins": -0.8463764190673828, "rewards/rejected": -38.963592529296875, "step": 3744 }, { "epoch": 0.5099400871459695, "grad_norm": 45.227895414461514, "learning_rate": 4.5581359222841626e-07, "logits/chosen": 13.516876220703125, "logits/rejected": 14.179786682128906, "logps/chosen": -3.9974541664123535, "logps/rejected": -4.307974815368652, "loss": 4.5707, "rewards/accuracies": 0.75, "rewards/chosen": -39.974544525146484, "rewards/margins": 3.1052026748657227, "rewards/rejected": -43.07974624633789, "step": 3745 }, { "epoch": 0.5100762527233116, "grad_norm": 40.420312587569484, "learning_rate": 4.5562530574557076e-07, "logits/chosen": 12.606269836425781, "logits/rejected": 13.457939147949219, "logps/chosen": -3.4870169162750244, "logps/rejected": -3.8844854831695557, "loss": 4.1911, "rewards/accuracies": 0.75, "rewards/chosen": -34.87017059326172, "rewards/margins": 3.9746856689453125, "rewards/rejected": -38.84485626220703, "step": 3746 }, { "epoch": 0.5102124183006536, "grad_norm": 39.962760567987274, "learning_rate": 4.5543700669372755e-07, "logits/chosen": 12.807247161865234, "logits/rejected": 13.986152648925781, "logps/chosen": -3.8043458461761475, "logps/rejected": -4.211319923400879, "loss": 3.3029, "rewards/accuracies": 0.75, "rewards/chosen": -38.04345703125, "rewards/margins": 4.069744110107422, "rewards/rejected": -42.11320114135742, "step": 3747 }, { "epoch": 0.5103485838779956, "grad_norm": 39.86621756948613, "learning_rate": 4.5524869511543453e-07, "logits/chosen": 12.995777130126953, "logits/rejected": 13.169120788574219, "logps/chosen": -4.149503707885742, "logps/rejected": -4.162654399871826, "loss": 4.199, "rewards/accuracies": 0.5, "rewards/chosen": -41.49504089355469, "rewards/margins": 0.13150310516357422, "rewards/rejected": -41.62654495239258, "step": 3748 }, { "epoch": 0.5104847494553377, "grad_norm": 51.171296244446204, "learning_rate": 4.550603710532422e-07, "logits/chosen": 13.813095092773438, "logits/rejected": 14.144981384277344, "logps/chosen": -4.019534111022949, "logps/rejected": -4.435127258300781, "loss": 3.3648, "rewards/accuracies": 1.0, "rewards/chosen": -40.19533920288086, "rewards/margins": 4.155933380126953, "rewards/rejected": -44.35127258300781, "step": 3749 }, { "epoch": 0.5106209150326797, "grad_norm": 43.326375832378965, "learning_rate": 4.5487203454970375e-07, "logits/chosen": 13.003101348876953, "logits/rejected": 13.475452423095703, "logps/chosen": -3.9335134029388428, "logps/rejected": -4.1839213371276855, "loss": 3.8189, "rewards/accuracies": 1.0, "rewards/chosen": -39.33513641357422, "rewards/margins": 2.5040807723999023, "rewards/rejected": -41.83921432495117, "step": 3750 }, { "epoch": 0.5107570806100218, "grad_norm": 44.22870432747467, "learning_rate": 4.5468368564737565e-07, "logits/chosen": 13.438145637512207, "logits/rejected": 13.96610164642334, "logps/chosen": -3.844022750854492, "logps/rejected": -4.199101448059082, "loss": 4.3151, "rewards/accuracies": 0.75, "rewards/chosen": -38.440223693847656, "rewards/margins": 3.5507898330688477, "rewards/rejected": -41.99101638793945, "step": 3751 }, { "epoch": 0.5108932461873639, "grad_norm": 36.71127767216826, "learning_rate": 4.544953243888167e-07, "logits/chosen": 13.281365394592285, "logits/rejected": 13.847874641418457, "logps/chosen": -3.575277328491211, "logps/rejected": -4.249441146850586, "loss": 3.2852, "rewards/accuracies": 1.0, "rewards/chosen": -35.75277328491211, "rewards/margins": 6.741639137268066, "rewards/rejected": -42.49441146850586, "step": 3752 }, { "epoch": 0.5110294117647058, "grad_norm": 46.03334916765751, "learning_rate": 4.543069508165887e-07, "logits/chosen": 13.78405475616455, "logits/rejected": 12.732243537902832, "logps/chosen": -3.7609596252441406, "logps/rejected": -3.813760995864868, "loss": 4.327, "rewards/accuracies": 0.5, "rewards/chosen": -37.609596252441406, "rewards/margins": 0.5280156135559082, "rewards/rejected": -38.137611389160156, "step": 3753 }, { "epoch": 0.5111655773420479, "grad_norm": 41.882485462887836, "learning_rate": 4.541185649732563e-07, "logits/chosen": 12.577780723571777, "logits/rejected": 12.914665222167969, "logps/chosen": -3.7134745121002197, "logps/rejected": -3.8965134620666504, "loss": 3.9341, "rewards/accuracies": 1.0, "rewards/chosen": -37.134742736816406, "rewards/margins": 1.8303909301757812, "rewards/rejected": -38.96513366699219, "step": 3754 }, { "epoch": 0.51130174291939, "grad_norm": 41.756760354304106, "learning_rate": 4.539301669013868e-07, "logits/chosen": 13.712383270263672, "logits/rejected": 13.475481033325195, "logps/chosen": -3.980180501937866, "logps/rejected": -3.8870198726654053, "loss": 4.2222, "rewards/accuracies": 0.25, "rewards/chosen": -39.80180358886719, "rewards/margins": -0.9316043853759766, "rewards/rejected": -38.870201110839844, "step": 3755 }, { "epoch": 0.511437908496732, "grad_norm": 43.90302368997525, "learning_rate": 4.5374175664355033e-07, "logits/chosen": 13.259364128112793, "logits/rejected": 14.022928237915039, "logps/chosen": -3.916048765182495, "logps/rejected": -4.1144561767578125, "loss": 4.0527, "rewards/accuracies": 0.5, "rewards/chosen": -39.16048812866211, "rewards/margins": 1.9840717315673828, "rewards/rejected": -41.144561767578125, "step": 3756 }, { "epoch": 0.5115740740740741, "grad_norm": 40.34713431691, "learning_rate": 4.535533342423196e-07, "logits/chosen": 13.092153549194336, "logits/rejected": 13.926685333251953, "logps/chosen": -4.003016471862793, "logps/rejected": -4.3267693519592285, "loss": 3.7807, "rewards/accuracies": 1.0, "rewards/chosen": -40.03016662597656, "rewards/margins": 3.237527847290039, "rewards/rejected": -43.26769256591797, "step": 3757 }, { "epoch": 0.5117102396514162, "grad_norm": 41.09896147192328, "learning_rate": 4.5336489974027044e-07, "logits/chosen": 13.344023704528809, "logits/rejected": 13.47143840789795, "logps/chosen": -3.873793601989746, "logps/rejected": -3.9866271018981934, "loss": 4.1763, "rewards/accuracies": 0.75, "rewards/chosen": -38.737937927246094, "rewards/margins": 1.1283340454101562, "rewards/rejected": -39.86627197265625, "step": 3758 }, { "epoch": 0.5118464052287581, "grad_norm": 44.24120824891093, "learning_rate": 4.53176453179981e-07, "logits/chosen": 12.891623497009277, "logits/rejected": 13.857327461242676, "logps/chosen": -3.9412360191345215, "logps/rejected": -4.129944801330566, "loss": 4.5152, "rewards/accuracies": 0.75, "rewards/chosen": -39.41236114501953, "rewards/margins": 1.8870887756347656, "rewards/rejected": -41.2994499206543, "step": 3759 }, { "epoch": 0.5119825708061002, "grad_norm": 38.514353913998534, "learning_rate": 4.5298799460403244e-07, "logits/chosen": 12.361040115356445, "logits/rejected": 13.088747024536133, "logps/chosen": -3.65903639793396, "logps/rejected": -3.8885860443115234, "loss": 3.9616, "rewards/accuracies": 0.75, "rewards/chosen": -36.590362548828125, "rewards/margins": 2.29549503326416, "rewards/rejected": -38.88585662841797, "step": 3760 }, { "epoch": 0.5121187363834423, "grad_norm": 43.86773471165541, "learning_rate": 4.5279952405500844e-07, "logits/chosen": 12.736700057983398, "logits/rejected": 12.490022659301758, "logps/chosen": -3.7148923873901367, "logps/rejected": -3.5416626930236816, "loss": 4.3321, "rewards/accuracies": 0.0, "rewards/chosen": -37.148921966552734, "rewards/margins": -1.7322936058044434, "rewards/rejected": -35.416629791259766, "step": 3761 }, { "epoch": 0.5122549019607843, "grad_norm": 39.79905825232375, "learning_rate": 4.5261104157549567e-07, "logits/chosen": 12.342775344848633, "logits/rejected": 12.74665641784668, "logps/chosen": -3.740823745727539, "logps/rejected": -3.6579248905181885, "loss": 4.0753, "rewards/accuracies": 0.5, "rewards/chosen": -37.40823745727539, "rewards/margins": -0.8289899826049805, "rewards/rejected": -36.579246520996094, "step": 3762 }, { "epoch": 0.5123910675381264, "grad_norm": 42.01491734993727, "learning_rate": 4.5242254720808307e-07, "logits/chosen": 12.777761459350586, "logits/rejected": 14.134265899658203, "logps/chosen": -3.7152152061462402, "logps/rejected": -4.397899627685547, "loss": 3.662, "rewards/accuracies": 1.0, "rewards/chosen": -37.15214920043945, "rewards/margins": 6.826842308044434, "rewards/rejected": -43.9789924621582, "step": 3763 }, { "epoch": 0.5125272331154684, "grad_norm": 45.6144821218802, "learning_rate": 4.522340409953625e-07, "logits/chosen": 13.02484130859375, "logits/rejected": 13.077095031738281, "logps/chosen": -4.053188323974609, "logps/rejected": -3.9359219074249268, "loss": 3.8423, "rewards/accuracies": 0.25, "rewards/chosen": -40.53188705444336, "rewards/margins": -1.1726665496826172, "rewards/rejected": -39.359222412109375, "step": 3764 }, { "epoch": 0.5126633986928104, "grad_norm": 36.9609276311984, "learning_rate": 4.520455229799287e-07, "logits/chosen": 12.718235969543457, "logits/rejected": 13.520013809204102, "logps/chosen": -3.707120418548584, "logps/rejected": -4.100605010986328, "loss": 3.7831, "rewards/accuracies": 0.75, "rewards/chosen": -37.071205139160156, "rewards/margins": 3.9348459243774414, "rewards/rejected": -41.00605010986328, "step": 3765 }, { "epoch": 0.5127995642701525, "grad_norm": 42.98880128472337, "learning_rate": 4.518569932043787e-07, "logits/chosen": 13.369303703308105, "logits/rejected": 13.807219505310059, "logps/chosen": -4.040094375610352, "logps/rejected": -4.20306921005249, "loss": 3.5852, "rewards/accuracies": 0.75, "rewards/chosen": -40.40094757080078, "rewards/margins": 1.629744529724121, "rewards/rejected": -42.03069305419922, "step": 3766 }, { "epoch": 0.5129357298474946, "grad_norm": 38.16885784361096, "learning_rate": 4.516684517113126e-07, "logits/chosen": 12.885116577148438, "logits/rejected": 14.140100479125977, "logps/chosen": -4.071122169494629, "logps/rejected": -4.280599594116211, "loss": 3.8938, "rewards/accuracies": 1.0, "rewards/chosen": -40.711219787597656, "rewards/margins": 2.094771385192871, "rewards/rejected": -42.805992126464844, "step": 3767 }, { "epoch": 0.5130718954248366, "grad_norm": 40.16004350505088, "learning_rate": 4.514798985433326e-07, "logits/chosen": 13.35867977142334, "logits/rejected": 14.356884956359863, "logps/chosen": -3.931838274002075, "logps/rejected": -4.08843994140625, "loss": 3.5315, "rewards/accuracies": 0.5, "rewards/chosen": -39.318382263183594, "rewards/margins": 1.5660152435302734, "rewards/rejected": -40.8843994140625, "step": 3768 }, { "epoch": 0.5132080610021786, "grad_norm": 43.6045166843425, "learning_rate": 4.51291333743044e-07, "logits/chosen": 13.15158462524414, "logits/rejected": 13.602264404296875, "logps/chosen": -4.186158657073975, "logps/rejected": -4.278151035308838, "loss": 4.2072, "rewards/accuracies": 0.75, "rewards/chosen": -41.86158752441406, "rewards/margins": 0.9199247360229492, "rewards/rejected": -42.78150939941406, "step": 3769 }, { "epoch": 0.5133442265795207, "grad_norm": 46.045259132539904, "learning_rate": 4.5110275735305467e-07, "logits/chosen": 12.922981262207031, "logits/rejected": 13.332952499389648, "logps/chosen": -4.215512275695801, "logps/rejected": -4.182734489440918, "loss": 3.381, "rewards/accuracies": 0.5, "rewards/chosen": -42.15512466430664, "rewards/margins": -0.3277769088745117, "rewards/rejected": -41.82734680175781, "step": 3770 }, { "epoch": 0.5134803921568627, "grad_norm": 42.36306755033828, "learning_rate": 4.509141694159748e-07, "logits/chosen": 12.64642333984375, "logits/rejected": 13.721087455749512, "logps/chosen": -3.9189248085021973, "logps/rejected": -4.620860576629639, "loss": 4.5294, "rewards/accuracies": 1.0, "rewards/chosen": -39.18925094604492, "rewards/margins": 7.019356727600098, "rewards/rejected": -46.2086067199707, "step": 3771 }, { "epoch": 0.5136165577342048, "grad_norm": 44.535915052573195, "learning_rate": 4.507255699744175e-07, "logits/chosen": 13.041234016418457, "logits/rejected": 13.140849113464355, "logps/chosen": -4.179131507873535, "logps/rejected": -4.159507751464844, "loss": 3.7275, "rewards/accuracies": 0.75, "rewards/chosen": -41.79131317138672, "rewards/margins": -0.19623851776123047, "rewards/rejected": -41.59507751464844, "step": 3772 }, { "epoch": 0.5137527233115469, "grad_norm": 39.53884964504267, "learning_rate": 4.505369590709984e-07, "logits/chosen": 12.40401840209961, "logits/rejected": 12.810964584350586, "logps/chosen": -3.804750442504883, "logps/rejected": -3.973048686981201, "loss": 3.7924, "rewards/accuracies": 0.5, "rewards/chosen": -38.04750442504883, "rewards/margins": 1.6829824447631836, "rewards/rejected": -39.73048782348633, "step": 3773 }, { "epoch": 0.5138888888888888, "grad_norm": 46.783874063888916, "learning_rate": 4.5034833674833556e-07, "logits/chosen": 13.575517654418945, "logits/rejected": 13.453588485717773, "logps/chosen": -4.141910552978516, "logps/rejected": -4.435678958892822, "loss": 4.0669, "rewards/accuracies": 1.0, "rewards/chosen": -41.419105529785156, "rewards/margins": 2.937687873840332, "rewards/rejected": -44.356788635253906, "step": 3774 }, { "epoch": 0.5140250544662309, "grad_norm": 47.01042977565326, "learning_rate": 4.501597030490499e-07, "logits/chosen": 13.29189682006836, "logits/rejected": 12.751495361328125, "logps/chosen": -3.8952183723449707, "logps/rejected": -4.239529609680176, "loss": 4.1648, "rewards/accuracies": 1.0, "rewards/chosen": -38.95218276977539, "rewards/margins": 3.4431095123291016, "rewards/rejected": -42.395294189453125, "step": 3775 }, { "epoch": 0.514161220043573, "grad_norm": 50.821461136148635, "learning_rate": 4.4997105801576474e-07, "logits/chosen": 12.99436092376709, "logits/rejected": 13.212276458740234, "logps/chosen": -4.235283374786377, "logps/rejected": -4.173183441162109, "loss": 3.6603, "rewards/accuracies": 0.5, "rewards/chosen": -42.35283660888672, "rewards/margins": -0.6209964752197266, "rewards/rejected": -41.73183822631836, "step": 3776 }, { "epoch": 0.514297385620915, "grad_norm": 45.438417201964675, "learning_rate": 4.4978240169110596e-07, "logits/chosen": 12.550296783447266, "logits/rejected": 14.0412015914917, "logps/chosen": -4.131247043609619, "logps/rejected": -4.606905460357666, "loss": 3.7214, "rewards/accuracies": 0.75, "rewards/chosen": -41.312469482421875, "rewards/margins": 4.756586074829102, "rewards/rejected": -46.069053649902344, "step": 3777 }, { "epoch": 0.5144335511982571, "grad_norm": 53.4450190598305, "learning_rate": 4.4959373411770194e-07, "logits/chosen": 14.449674606323242, "logits/rejected": 13.245122909545898, "logps/chosen": -4.123111724853516, "logps/rejected": -4.2166290283203125, "loss": 3.5716, "rewards/accuracies": 0.5, "rewards/chosen": -41.23111343383789, "rewards/margins": 0.9351768493652344, "rewards/rejected": -42.166290283203125, "step": 3778 }, { "epoch": 0.5145697167755992, "grad_norm": 40.09628497105751, "learning_rate": 4.4940505533818384e-07, "logits/chosen": 13.175823211669922, "logits/rejected": 13.086321830749512, "logps/chosen": -3.8962936401367188, "logps/rejected": -4.052524089813232, "loss": 4.0585, "rewards/accuracies": 0.5, "rewards/chosen": -38.96293640136719, "rewards/margins": 1.562302589416504, "rewards/rejected": -40.525238037109375, "step": 3779 }, { "epoch": 0.5147058823529411, "grad_norm": 44.42202923206075, "learning_rate": 4.49216365395185e-07, "logits/chosen": 14.053265571594238, "logits/rejected": 14.260173797607422, "logps/chosen": -4.38753604888916, "logps/rejected": -4.346269607543945, "loss": 4.3737, "rewards/accuracies": 0.25, "rewards/chosen": -43.87535858154297, "rewards/margins": -0.41266727447509766, "rewards/rejected": -43.46269226074219, "step": 3780 }, { "epoch": 0.5148420479302832, "grad_norm": 40.992489810568955, "learning_rate": 4.490276643313417e-07, "logits/chosen": 13.292298316955566, "logits/rejected": 12.965940475463867, "logps/chosen": -4.198969841003418, "logps/rejected": -4.144989013671875, "loss": 3.4408, "rewards/accuracies": 0.5, "rewards/chosen": -41.98970413208008, "rewards/margins": -0.5398111343383789, "rewards/rejected": -41.44989013671875, "step": 3781 }, { "epoch": 0.5149782135076253, "grad_norm": 54.28953995519213, "learning_rate": 4.4883895218929233e-07, "logits/chosen": 12.91598892211914, "logits/rejected": 13.512495040893555, "logps/chosen": -3.9701545238494873, "logps/rejected": -4.318861484527588, "loss": 3.422, "rewards/accuracies": 0.75, "rewards/chosen": -39.70154571533203, "rewards/margins": 3.487070083618164, "rewards/rejected": -43.18861389160156, "step": 3782 }, { "epoch": 0.5151143790849673, "grad_norm": 42.703621315829224, "learning_rate": 4.486502290116779e-07, "logits/chosen": 13.09689712524414, "logits/rejected": 13.299765586853027, "logps/chosen": -3.9069440364837646, "logps/rejected": -4.203782081604004, "loss": 3.2871, "rewards/accuracies": 1.0, "rewards/chosen": -39.06944274902344, "rewards/margins": 2.9683828353881836, "rewards/rejected": -42.03782272338867, "step": 3783 }, { "epoch": 0.5152505446623094, "grad_norm": 58.576519884229846, "learning_rate": 4.4846149484114226e-07, "logits/chosen": 12.909085273742676, "logits/rejected": 13.539668083190918, "logps/chosen": -4.065731048583984, "logps/rejected": -4.380782127380371, "loss": 4.6185, "rewards/accuracies": 0.75, "rewards/chosen": -40.65731430053711, "rewards/margins": 3.1505050659179688, "rewards/rejected": -43.80781936645508, "step": 3784 }, { "epoch": 0.5153867102396514, "grad_norm": 45.070545325805895, "learning_rate": 4.4827274972033116e-07, "logits/chosen": 13.519208908081055, "logits/rejected": 13.680512428283691, "logps/chosen": -4.076902389526367, "logps/rejected": -4.27420711517334, "loss": 3.7856, "rewards/accuracies": 0.75, "rewards/chosen": -40.769020080566406, "rewards/margins": 1.9730491638183594, "rewards/rejected": -42.74207305908203, "step": 3785 }, { "epoch": 0.5155228758169934, "grad_norm": 51.44431906850331, "learning_rate": 4.480839936918932e-07, "logits/chosen": 13.94194507598877, "logits/rejected": 13.121659278869629, "logps/chosen": -4.385169506072998, "logps/rejected": -4.302711009979248, "loss": 4.5998, "rewards/accuracies": 0.5, "rewards/chosen": -43.8516960144043, "rewards/margins": -0.8245859146118164, "rewards/rejected": -43.02710723876953, "step": 3786 }, { "epoch": 0.5156590413943355, "grad_norm": 52.69716220314345, "learning_rate": 4.4789522679847946e-07, "logits/chosen": 13.073508262634277, "logits/rejected": 14.042500495910645, "logps/chosen": -3.745093584060669, "logps/rejected": -4.2850823402404785, "loss": 3.4955, "rewards/accuracies": 1.0, "rewards/chosen": -37.45093536376953, "rewards/margins": 5.399890899658203, "rewards/rejected": -42.850826263427734, "step": 3787 }, { "epoch": 0.5157952069716776, "grad_norm": 44.3047195565368, "learning_rate": 4.477064490827434e-07, "logits/chosen": 13.216508865356445, "logits/rejected": 13.237266540527344, "logps/chosen": -3.6648590564727783, "logps/rejected": -3.976754665374756, "loss": 3.9631, "rewards/accuracies": 1.0, "rewards/chosen": -36.648590087890625, "rewards/margins": 3.118955612182617, "rewards/rejected": -39.767547607421875, "step": 3788 }, { "epoch": 0.5159313725490197, "grad_norm": 46.11362658456779, "learning_rate": 4.4751766058734065e-07, "logits/chosen": 12.750123977661133, "logits/rejected": 13.171669960021973, "logps/chosen": -4.082740306854248, "logps/rejected": -4.314139366149902, "loss": 4.7494, "rewards/accuracies": 0.5, "rewards/chosen": -40.8274040222168, "rewards/margins": 2.313990592956543, "rewards/rejected": -43.141395568847656, "step": 3789 }, { "epoch": 0.5160675381263616, "grad_norm": 43.819105065831074, "learning_rate": 4.4732886135492985e-07, "logits/chosen": 13.179804801940918, "logits/rejected": 14.394011497497559, "logps/chosen": -3.833911418914795, "logps/rejected": -4.373479843139648, "loss": 3.6467, "rewards/accuracies": 1.0, "rewards/chosen": -38.339115142822266, "rewards/margins": 5.395681381225586, "rewards/rejected": -43.73479461669922, "step": 3790 }, { "epoch": 0.5162037037037037, "grad_norm": 40.4810490883462, "learning_rate": 4.4714005142817155e-07, "logits/chosen": 13.604898452758789, "logits/rejected": 12.663180351257324, "logps/chosen": -3.699673652648926, "logps/rejected": -3.916205883026123, "loss": 3.3263, "rewards/accuracies": 0.75, "rewards/chosen": -36.996734619140625, "rewards/margins": 2.165323257446289, "rewards/rejected": -39.16205978393555, "step": 3791 }, { "epoch": 0.5163398692810458, "grad_norm": 43.991820601981836, "learning_rate": 4.4695123084972887e-07, "logits/chosen": 14.384397506713867, "logits/rejected": 14.010063171386719, "logps/chosen": -4.400458335876465, "logps/rejected": -4.229036331176758, "loss": 4.3669, "rewards/accuracies": 0.5, "rewards/chosen": -44.00458526611328, "rewards/margins": -1.7142162322998047, "rewards/rejected": -42.290367126464844, "step": 3792 }, { "epoch": 0.5164760348583878, "grad_norm": 54.55350564359312, "learning_rate": 4.467623996622676e-07, "logits/chosen": 13.14078140258789, "logits/rejected": 13.45401382446289, "logps/chosen": -4.051392078399658, "logps/rejected": -4.114854335784912, "loss": 4.2652, "rewards/accuracies": 0.25, "rewards/chosen": -40.513919830322266, "rewards/margins": 0.6346206665039062, "rewards/rejected": -41.14854049682617, "step": 3793 }, { "epoch": 0.5166122004357299, "grad_norm": 43.481552027190816, "learning_rate": 4.4657355790845564e-07, "logits/chosen": 13.0953950881958, "logits/rejected": 13.768342971801758, "logps/chosen": -4.101412773132324, "logps/rejected": -4.4266676902771, "loss": 4.4824, "rewards/accuracies": 0.75, "rewards/chosen": -41.014129638671875, "rewards/margins": 3.2525482177734375, "rewards/rejected": -44.26667785644531, "step": 3794 }, { "epoch": 0.516748366013072, "grad_norm": 42.90690922986284, "learning_rate": 4.4638470563096307e-07, "logits/chosen": 13.464273452758789, "logits/rejected": 13.416711807250977, "logps/chosen": -4.147082328796387, "logps/rejected": -3.8925437927246094, "loss": 3.9804, "rewards/accuracies": 0.25, "rewards/chosen": -41.470821380615234, "rewards/margins": -2.545384407043457, "rewards/rejected": -38.925437927246094, "step": 3795 }, { "epoch": 0.5168845315904139, "grad_norm": 39.69526811789974, "learning_rate": 4.4619584287246306e-07, "logits/chosen": 13.725992202758789, "logits/rejected": 13.390386581420898, "logps/chosen": -4.282546520233154, "logps/rejected": -4.481115341186523, "loss": 3.9288, "rewards/accuracies": 0.75, "rewards/chosen": -42.82546615600586, "rewards/margins": 1.985687255859375, "rewards/rejected": -44.811153411865234, "step": 3796 }, { "epoch": 0.517020697167756, "grad_norm": 39.88135247460774, "learning_rate": 4.4600696967563046e-07, "logits/chosen": 13.154314994812012, "logits/rejected": 13.248926162719727, "logps/chosen": -3.8214669227600098, "logps/rejected": -4.00147819519043, "loss": 3.9499, "rewards/accuracies": 0.5, "rewards/chosen": -38.21466827392578, "rewards/margins": 1.8001117706298828, "rewards/rejected": -40.01477813720703, "step": 3797 }, { "epoch": 0.5171568627450981, "grad_norm": 37.344845407348714, "learning_rate": 4.458180860831426e-07, "logits/chosen": 13.325654983520508, "logits/rejected": 14.014592170715332, "logps/chosen": -3.879316806793213, "logps/rejected": -4.43717098236084, "loss": 3.8249, "rewards/accuracies": 1.0, "rewards/chosen": -38.79316711425781, "rewards/margins": 5.5785417556762695, "rewards/rejected": -44.37171173095703, "step": 3798 }, { "epoch": 0.5172930283224401, "grad_norm": 39.20155124226641, "learning_rate": 4.4562919213767963e-07, "logits/chosen": 12.778532028198242, "logits/rejected": 13.791744232177734, "logps/chosen": -4.072028160095215, "logps/rejected": -4.229917526245117, "loss": 3.6179, "rewards/accuracies": 0.5, "rewards/chosen": -40.72028350830078, "rewards/margins": 1.578892707824707, "rewards/rejected": -42.299171447753906, "step": 3799 }, { "epoch": 0.5174291938997821, "grad_norm": 47.59235496508154, "learning_rate": 4.454402878819235e-07, "logits/chosen": 14.360461235046387, "logits/rejected": 13.11257553100586, "logps/chosen": -4.496897220611572, "logps/rejected": -4.282479763031006, "loss": 4.2883, "rewards/accuracies": 0.25, "rewards/chosen": -44.968971252441406, "rewards/margins": -2.144176483154297, "rewards/rejected": -42.82479476928711, "step": 3800 }, { "epoch": 0.5175653594771242, "grad_norm": 54.753118732204264, "learning_rate": 4.4525137335855857e-07, "logits/chosen": 13.21434497833252, "logits/rejected": 13.699000358581543, "logps/chosen": -4.032183647155762, "logps/rejected": -4.133925914764404, "loss": 4.0157, "rewards/accuracies": 0.75, "rewards/chosen": -40.32183837890625, "rewards/margins": 1.0174226760864258, "rewards/rejected": -41.33926010131836, "step": 3801 }, { "epoch": 0.5177015250544662, "grad_norm": 37.28946715760469, "learning_rate": 4.450624486102719e-07, "logits/chosen": 13.646251678466797, "logits/rejected": 13.971881866455078, "logps/chosen": -4.018979072570801, "logps/rejected": -4.406349182128906, "loss": 3.9147, "rewards/accuracies": 1.0, "rewards/chosen": -40.189788818359375, "rewards/margins": 3.8737049102783203, "rewards/rejected": -44.06349563598633, "step": 3802 }, { "epoch": 0.5178376906318083, "grad_norm": 45.17869042552901, "learning_rate": 4.4487351367975254e-07, "logits/chosen": 12.454534530639648, "logits/rejected": 14.414131164550781, "logps/chosen": -3.763683319091797, "logps/rejected": -4.603055477142334, "loss": 3.5009, "rewards/accuracies": 1.0, "rewards/chosen": -37.63683319091797, "rewards/margins": 8.393721580505371, "rewards/rejected": -46.030555725097656, "step": 3803 }, { "epoch": 0.5179738562091504, "grad_norm": 42.40566640425566, "learning_rate": 4.4468456860969165e-07, "logits/chosen": 13.39052963256836, "logits/rejected": 13.538249969482422, "logps/chosen": -4.12137508392334, "logps/rejected": -4.340612411499023, "loss": 3.6601, "rewards/accuracies": 0.75, "rewards/chosen": -41.21375274658203, "rewards/margins": 2.1923770904541016, "rewards/rejected": -43.4061279296875, "step": 3804 }, { "epoch": 0.5181100217864923, "grad_norm": 41.40303957526052, "learning_rate": 4.4449561344278325e-07, "logits/chosen": 14.56973648071289, "logits/rejected": 14.30610466003418, "logps/chosen": -4.395744323730469, "logps/rejected": -4.316075801849365, "loss": 3.9992, "rewards/accuracies": 0.5, "rewards/chosen": -43.95744323730469, "rewards/margins": -0.7966842651367188, "rewards/rejected": -43.16075897216797, "step": 3805 }, { "epoch": 0.5182461873638344, "grad_norm": 41.561608177781885, "learning_rate": 4.443066482217232e-07, "logits/chosen": 14.241950035095215, "logits/rejected": 13.756375312805176, "logps/chosen": -4.424144268035889, "logps/rejected": -4.704000949859619, "loss": 4.1014, "rewards/accuracies": 0.75, "rewards/chosen": -44.24143981933594, "rewards/margins": 2.7985715866088867, "rewards/rejected": -47.04001235961914, "step": 3806 }, { "epoch": 0.5183823529411765, "grad_norm": 43.228974057506186, "learning_rate": 4.4411767298920966e-07, "logits/chosen": 13.011802673339844, "logits/rejected": 12.946737289428711, "logps/chosen": -3.717881679534912, "logps/rejected": -3.914163112640381, "loss": 4.3638, "rewards/accuracies": 0.75, "rewards/chosen": -37.17881774902344, "rewards/margins": 1.9628105163574219, "rewards/rejected": -39.14162826538086, "step": 3807 }, { "epoch": 0.5185185185185185, "grad_norm": 39.771488923393065, "learning_rate": 4.439286877879432e-07, "logits/chosen": 13.443742752075195, "logits/rejected": 13.469589233398438, "logps/chosen": -3.8182568550109863, "logps/rejected": -4.086793899536133, "loss": 3.8972, "rewards/accuracies": 1.0, "rewards/chosen": -38.18256759643555, "rewards/margins": 2.6853675842285156, "rewards/rejected": -40.86793518066406, "step": 3808 }, { "epoch": 0.5186546840958606, "grad_norm": 48.89687755092363, "learning_rate": 4.4373969266062675e-07, "logits/chosen": 13.154069900512695, "logits/rejected": 13.46762466430664, "logps/chosen": -4.206470489501953, "logps/rejected": -4.530213356018066, "loss": 4.3117, "rewards/accuracies": 1.0, "rewards/chosen": -42.06470489501953, "rewards/margins": 3.237429618835449, "rewards/rejected": -45.30213165283203, "step": 3809 }, { "epoch": 0.5187908496732027, "grad_norm": 39.38934605052708, "learning_rate": 4.4355068764996504e-07, "logits/chosen": 13.10594367980957, "logits/rejected": 13.32927131652832, "logps/chosen": -3.820756435394287, "logps/rejected": -3.9210665225982666, "loss": 3.8456, "rewards/accuracies": 0.75, "rewards/chosen": -38.20756530761719, "rewards/margins": 1.0031018257141113, "rewards/rejected": -39.210662841796875, "step": 3810 }, { "epoch": 0.5189270152505446, "grad_norm": 45.72093615695656, "learning_rate": 4.433616727986656e-07, "logits/chosen": 13.528539657592773, "logits/rejected": 13.70731258392334, "logps/chosen": -4.143021106719971, "logps/rejected": -4.469081878662109, "loss": 4.0923, "rewards/accuracies": 1.0, "rewards/chosen": -41.43021011352539, "rewards/margins": 3.2606077194213867, "rewards/rejected": -44.690818786621094, "step": 3811 }, { "epoch": 0.5190631808278867, "grad_norm": 39.16296932022912, "learning_rate": 4.431726481494376e-07, "logits/chosen": 12.449055671691895, "logits/rejected": 13.72098159790039, "logps/chosen": -3.738891124725342, "logps/rejected": -4.154815673828125, "loss": 3.9046, "rewards/accuracies": 0.75, "rewards/chosen": -37.388912200927734, "rewards/margins": 4.159246444702148, "rewards/rejected": -41.54815673828125, "step": 3812 }, { "epoch": 0.5191993464052288, "grad_norm": 64.26899732209945, "learning_rate": 4.4298361374499305e-07, "logits/chosen": 13.383878707885742, "logits/rejected": 13.790078163146973, "logps/chosen": -4.093935489654541, "logps/rejected": -4.232265472412109, "loss": 4.3658, "rewards/accuracies": 0.5, "rewards/chosen": -40.939353942871094, "rewards/margins": 1.3832998275756836, "rewards/rejected": -42.322654724121094, "step": 3813 }, { "epoch": 0.5193355119825708, "grad_norm": 41.08755648797882, "learning_rate": 4.4279456962804556e-07, "logits/chosen": 12.859027862548828, "logits/rejected": 14.014265060424805, "logps/chosen": -3.5646796226501465, "logps/rejected": -4.05479621887207, "loss": 3.6942, "rewards/accuracies": 1.0, "rewards/chosen": -35.64679718017578, "rewards/margins": 4.9011640548706055, "rewards/rejected": -40.54795837402344, "step": 3814 }, { "epoch": 0.5194716775599129, "grad_norm": 41.04623637991901, "learning_rate": 4.4260551584131135e-07, "logits/chosen": 12.745651245117188, "logits/rejected": 14.096680641174316, "logps/chosen": -3.734431266784668, "logps/rejected": -3.969613552093506, "loss": 4.4289, "rewards/accuracies": 0.5, "rewards/chosen": -37.34431457519531, "rewards/margins": 2.351825714111328, "rewards/rejected": -39.696136474609375, "step": 3815 }, { "epoch": 0.5196078431372549, "grad_norm": 42.59406124870039, "learning_rate": 4.4241645242750865e-07, "logits/chosen": 13.102021217346191, "logits/rejected": 13.498493194580078, "logps/chosen": -4.140216827392578, "logps/rejected": -4.138003349304199, "loss": 4.4376, "rewards/accuracies": 0.25, "rewards/chosen": -41.40216827392578, "rewards/margins": -0.02213764190673828, "rewards/rejected": -41.380027770996094, "step": 3816 }, { "epoch": 0.5197440087145969, "grad_norm": 43.79532860225276, "learning_rate": 4.422273794293579e-07, "logits/chosen": 14.082025527954102, "logits/rejected": 14.385720252990723, "logps/chosen": -4.2789154052734375, "logps/rejected": -4.353388786315918, "loss": 4.2031, "rewards/accuracies": 0.75, "rewards/chosen": -42.789154052734375, "rewards/margins": 0.7447328567504883, "rewards/rejected": -43.53388214111328, "step": 3817 }, { "epoch": 0.519880174291939, "grad_norm": 39.38922645642132, "learning_rate": 4.4203829688958176e-07, "logits/chosen": 13.326956748962402, "logits/rejected": 13.540942192077637, "logps/chosen": -4.197244644165039, "logps/rejected": -4.181717872619629, "loss": 4.1586, "rewards/accuracies": 0.5, "rewards/chosen": -41.97244644165039, "rewards/margins": -0.15526771545410156, "rewards/rejected": -41.817176818847656, "step": 3818 }, { "epoch": 0.5200163398692811, "grad_norm": 41.658084311765194, "learning_rate": 4.4184920485090487e-07, "logits/chosen": 13.18950080871582, "logits/rejected": 14.000717163085938, "logps/chosen": -4.415318489074707, "logps/rejected": -4.689370155334473, "loss": 3.8364, "rewards/accuracies": 0.5, "rewards/chosen": -44.15318298339844, "rewards/margins": 2.7405147552490234, "rewards/rejected": -46.893699645996094, "step": 3819 }, { "epoch": 0.5201525054466231, "grad_norm": 40.75228487796404, "learning_rate": 4.4166010335605427e-07, "logits/chosen": 14.149251937866211, "logits/rejected": 13.417847633361816, "logps/chosen": -4.181968688964844, "logps/rejected": -4.230401039123535, "loss": 4.2046, "rewards/accuracies": 0.5, "rewards/chosen": -41.81968688964844, "rewards/margins": 0.48432159423828125, "rewards/rejected": -42.30400848388672, "step": 3820 }, { "epoch": 0.5202886710239651, "grad_norm": 46.77602007103401, "learning_rate": 4.41470992447759e-07, "logits/chosen": 13.470300674438477, "logits/rejected": 14.178357124328613, "logps/chosen": -4.337602615356445, "logps/rejected": -4.531805992126465, "loss": 4.2764, "rewards/accuracies": 0.75, "rewards/chosen": -43.37602615356445, "rewards/margins": 1.9420299530029297, "rewards/rejected": -45.318058013916016, "step": 3821 }, { "epoch": 0.5204248366013072, "grad_norm": 43.1520784613241, "learning_rate": 4.4128187216875004e-07, "logits/chosen": 12.923532485961914, "logits/rejected": 13.247513771057129, "logps/chosen": -4.01602029800415, "logps/rejected": -4.11775541305542, "loss": 3.9315, "rewards/accuracies": 0.5, "rewards/chosen": -40.16020202636719, "rewards/margins": 1.0173511505126953, "rewards/rejected": -41.17755126953125, "step": 3822 }, { "epoch": 0.5205610021786492, "grad_norm": 41.657244017879094, "learning_rate": 4.4109274256176097e-07, "logits/chosen": 13.205077171325684, "logits/rejected": 13.230623245239258, "logps/chosen": -4.218900680541992, "logps/rejected": -4.304020881652832, "loss": 4.5174, "rewards/accuracies": 0.5, "rewards/chosen": -42.189002990722656, "rewards/margins": 0.8512020111083984, "rewards/rejected": -43.04020690917969, "step": 3823 }, { "epoch": 0.5206971677559913, "grad_norm": 36.80617268620116, "learning_rate": 4.40903603669527e-07, "logits/chosen": 12.671597480773926, "logits/rejected": 12.948724746704102, "logps/chosen": -4.035579204559326, "logps/rejected": -4.0651373863220215, "loss": 3.8327, "rewards/accuracies": 0.5, "rewards/chosen": -40.35578918457031, "rewards/margins": 0.2955818176269531, "rewards/rejected": -40.65137481689453, "step": 3824 }, { "epoch": 0.5208333333333334, "grad_norm": 38.863984141833, "learning_rate": 4.4071445553478563e-07, "logits/chosen": 13.993877410888672, "logits/rejected": 15.62828254699707, "logps/chosen": -4.163187026977539, "logps/rejected": -4.735767364501953, "loss": 3.9923, "rewards/accuracies": 1.0, "rewards/chosen": -41.63187026977539, "rewards/margins": 5.72580623626709, "rewards/rejected": -47.35767364501953, "step": 3825 }, { "epoch": 0.5209694989106753, "grad_norm": 41.427660528101626, "learning_rate": 4.405252982002765e-07, "logits/chosen": 13.298993110656738, "logits/rejected": 12.9598388671875, "logps/chosen": -4.125598907470703, "logps/rejected": -4.240118503570557, "loss": 4.2472, "rewards/accuracies": 0.75, "rewards/chosen": -41.25598907470703, "rewards/margins": 1.145193099975586, "rewards/rejected": -42.40118408203125, "step": 3826 }, { "epoch": 0.5211056644880174, "grad_norm": 43.98943367022279, "learning_rate": 4.4033613170874124e-07, "logits/chosen": 12.494102478027344, "logits/rejected": 13.40561294555664, "logps/chosen": -3.73922061920166, "logps/rejected": -4.227073669433594, "loss": 4.1679, "rewards/accuracies": 1.0, "rewards/chosen": -37.39220428466797, "rewards/margins": 4.878531455993652, "rewards/rejected": -42.27073669433594, "step": 3827 }, { "epoch": 0.5212418300653595, "grad_norm": 149.92517154728617, "learning_rate": 4.4014695610292356e-07, "logits/chosen": 13.988129615783691, "logits/rejected": 13.82347583770752, "logps/chosen": -3.909025192260742, "logps/rejected": -4.059689521789551, "loss": 3.3177, "rewards/accuracies": 0.75, "rewards/chosen": -39.09025573730469, "rewards/margins": 1.5066452026367188, "rewards/rejected": -40.59689712524414, "step": 3828 }, { "epoch": 0.5213779956427015, "grad_norm": 52.52466492114792, "learning_rate": 4.399577714255694e-07, "logits/chosen": 13.312694549560547, "logits/rejected": 13.213991165161133, "logps/chosen": -4.16176700592041, "logps/rejected": -4.431424140930176, "loss": 4.2332, "rewards/accuracies": 0.75, "rewards/chosen": -41.61766815185547, "rewards/margins": 2.6965770721435547, "rewards/rejected": -44.31424331665039, "step": 3829 }, { "epoch": 0.5215141612200436, "grad_norm": 38.804502112491356, "learning_rate": 4.3976857771942643e-07, "logits/chosen": 13.112200736999512, "logits/rejected": 13.735803604125977, "logps/chosen": -3.8772246837615967, "logps/rejected": -4.511684417724609, "loss": 4.0912, "rewards/accuracies": 0.75, "rewards/chosen": -38.772247314453125, "rewards/margins": 6.3445940017700195, "rewards/rejected": -45.11684036254883, "step": 3830 }, { "epoch": 0.5216503267973857, "grad_norm": 41.2405422618284, "learning_rate": 4.395793750272446e-07, "logits/chosen": 12.469400405883789, "logits/rejected": 13.528318405151367, "logps/chosen": -4.1882734298706055, "logps/rejected": -4.391130447387695, "loss": 4.1519, "rewards/accuracies": 0.75, "rewards/chosen": -41.88273620605469, "rewards/margins": 2.0285720825195312, "rewards/rejected": -43.91130828857422, "step": 3831 }, { "epoch": 0.5217864923747276, "grad_norm": 49.104419664948864, "learning_rate": 4.3939016339177585e-07, "logits/chosen": 12.694957733154297, "logits/rejected": 13.53604507446289, "logps/chosen": -3.845977783203125, "logps/rejected": -4.198003768920898, "loss": 3.8727, "rewards/accuracies": 0.75, "rewards/chosen": -38.45977783203125, "rewards/margins": 3.5202598571777344, "rewards/rejected": -41.980037689208984, "step": 3832 }, { "epoch": 0.5219226579520697, "grad_norm": 43.95177181656151, "learning_rate": 4.392009428557741e-07, "logits/chosen": 13.434873580932617, "logits/rejected": 14.171318054199219, "logps/chosen": -4.031428337097168, "logps/rejected": -4.272249221801758, "loss": 3.707, "rewards/accuracies": 1.0, "rewards/chosen": -40.31428527832031, "rewards/margins": 2.4082040786743164, "rewards/rejected": -42.72249221801758, "step": 3833 }, { "epoch": 0.5220588235294118, "grad_norm": 43.35804687441407, "learning_rate": 4.3901171346199515e-07, "logits/chosen": 13.147207260131836, "logits/rejected": 12.929065704345703, "logps/chosen": -4.162705421447754, "logps/rejected": -4.205809593200684, "loss": 4.0429, "rewards/accuracies": 0.5, "rewards/chosen": -41.627052307128906, "rewards/margins": 0.4310464859008789, "rewards/rejected": -42.05809783935547, "step": 3834 }, { "epoch": 0.5221949891067538, "grad_norm": 55.25748897779303, "learning_rate": 4.388224752531972e-07, "logits/chosen": 12.799386978149414, "logits/rejected": 12.68265151977539, "logps/chosen": -4.156352996826172, "logps/rejected": -4.2312846183776855, "loss": 3.7643, "rewards/accuracies": 0.5, "rewards/chosen": -41.56352996826172, "rewards/margins": 0.7493181228637695, "rewards/rejected": -42.31285095214844, "step": 3835 }, { "epoch": 0.5223311546840959, "grad_norm": 40.533309042037466, "learning_rate": 4.3863322827213995e-07, "logits/chosen": 12.744367599487305, "logits/rejected": 12.454330444335938, "logps/chosen": -3.81943416595459, "logps/rejected": -3.875617504119873, "loss": 3.9965, "rewards/accuracies": 0.5, "rewards/chosen": -38.19434356689453, "rewards/margins": 0.5618338584899902, "rewards/rejected": -38.75617599487305, "step": 3836 }, { "epoch": 0.5224673202614379, "grad_norm": 43.006320257489314, "learning_rate": 4.3844397256158545e-07, "logits/chosen": 12.864532470703125, "logits/rejected": 13.490606307983398, "logps/chosen": -3.9511523246765137, "logps/rejected": -4.309666633605957, "loss": 3.7678, "rewards/accuracies": 1.0, "rewards/chosen": -39.51152038574219, "rewards/margins": 3.585141181945801, "rewards/rejected": -43.09666442871094, "step": 3837 }, { "epoch": 0.5226034858387799, "grad_norm": 51.299576399925535, "learning_rate": 4.3825470816429763e-07, "logits/chosen": 12.53778076171875, "logits/rejected": 13.873086929321289, "logps/chosen": -4.0926713943481445, "logps/rejected": -4.393624305725098, "loss": 4.0683, "rewards/accuracies": 0.75, "rewards/chosen": -40.92671203613281, "rewards/margins": 3.0095348358154297, "rewards/rejected": -43.93624496459961, "step": 3838 }, { "epoch": 0.522739651416122, "grad_norm": 42.82951586210716, "learning_rate": 4.380654351230422e-07, "logits/chosen": 13.747320175170898, "logits/rejected": 13.756572723388672, "logps/chosen": -4.260375499725342, "logps/rejected": -4.47880744934082, "loss": 4.4956, "rewards/accuracies": 0.75, "rewards/chosen": -42.603759765625, "rewards/margins": 2.1843156814575195, "rewards/rejected": -44.78807067871094, "step": 3839 }, { "epoch": 0.5228758169934641, "grad_norm": 40.93622673345919, "learning_rate": 4.3787615348058714e-07, "logits/chosen": 12.785747528076172, "logits/rejected": 13.62590217590332, "logps/chosen": -4.124462127685547, "logps/rejected": -4.335911750793457, "loss": 3.8564, "rewards/accuracies": 0.75, "rewards/chosen": -41.2446174621582, "rewards/margins": 2.114499092102051, "rewards/rejected": -43.35911560058594, "step": 3840 }, { "epoch": 0.523011982570806, "grad_norm": 38.84106890774717, "learning_rate": 4.376868632797021e-07, "logits/chosen": 14.167257308959961, "logits/rejected": 13.708401679992676, "logps/chosen": -4.3866801261901855, "logps/rejected": -4.517099380493164, "loss": 3.9237, "rewards/accuracies": 0.75, "rewards/chosen": -43.866798400878906, "rewards/margins": 1.3041887283325195, "rewards/rejected": -45.170989990234375, "step": 3841 }, { "epoch": 0.5231481481481481, "grad_norm": 42.703636618272355, "learning_rate": 4.374975645631587e-07, "logits/chosen": 14.255066871643066, "logits/rejected": 13.311836242675781, "logps/chosen": -4.611518383026123, "logps/rejected": -4.262676239013672, "loss": 4.6739, "rewards/accuracies": 0.25, "rewards/chosen": -46.11518478393555, "rewards/margins": -3.4884185791015625, "rewards/rejected": -42.62676239013672, "step": 3842 }, { "epoch": 0.5232843137254902, "grad_norm": 41.17557130687451, "learning_rate": 4.3730825737373065e-07, "logits/chosen": 13.96391773223877, "logits/rejected": 13.488401412963867, "logps/chosen": -4.190059661865234, "logps/rejected": -4.091833114624023, "loss": 4.2473, "rewards/accuracies": 0.5, "rewards/chosen": -41.90060043334961, "rewards/margins": -0.9822635650634766, "rewards/rejected": -40.9183349609375, "step": 3843 }, { "epoch": 0.5234204793028322, "grad_norm": 38.45947857832333, "learning_rate": 4.3711894175419354e-07, "logits/chosen": 13.212162017822266, "logits/rejected": 13.98345947265625, "logps/chosen": -3.9618139266967773, "logps/rejected": -4.376592636108398, "loss": 3.7113, "rewards/accuracies": 1.0, "rewards/chosen": -39.618141174316406, "rewards/margins": 4.147785186767578, "rewards/rejected": -43.765926361083984, "step": 3844 }, { "epoch": 0.5235566448801743, "grad_norm": 40.14734935486581, "learning_rate": 4.369296177473247e-07, "logits/chosen": 12.86806869506836, "logits/rejected": 13.661108016967773, "logps/chosen": -3.8515985012054443, "logps/rejected": -4.275028228759766, "loss": 3.8227, "rewards/accuracies": 0.75, "rewards/chosen": -38.51598358154297, "rewards/margins": 4.2342939376831055, "rewards/rejected": -42.75027847290039, "step": 3845 }, { "epoch": 0.5236928104575164, "grad_norm": 37.922158436525955, "learning_rate": 4.367402853959033e-07, "logits/chosen": 14.128875732421875, "logits/rejected": 13.311803817749023, "logps/chosen": -4.218369483947754, "logps/rejected": -4.487570762634277, "loss": 3.7253, "rewards/accuracies": 0.75, "rewards/chosen": -42.183692932128906, "rewards/margins": 2.6920204162597656, "rewards/rejected": -44.87571334838867, "step": 3846 }, { "epoch": 0.5238289760348583, "grad_norm": 43.91397376267397, "learning_rate": 4.365509447427109e-07, "logits/chosen": 13.360877990722656, "logits/rejected": 13.184823989868164, "logps/chosen": -3.9198522567749023, "logps/rejected": -3.8490567207336426, "loss": 4.3565, "rewards/accuracies": 0.5, "rewards/chosen": -39.198524475097656, "rewards/margins": -0.7079534530639648, "rewards/rejected": -38.490570068359375, "step": 3847 }, { "epoch": 0.5239651416122004, "grad_norm": 40.52122737227924, "learning_rate": 4.3636159583053035e-07, "logits/chosen": 13.280264854431152, "logits/rejected": 12.871158599853516, "logps/chosen": -4.236330509185791, "logps/rejected": -4.042820453643799, "loss": 3.6298, "rewards/accuracies": 0.25, "rewards/chosen": -42.363304138183594, "rewards/margins": -1.9351005554199219, "rewards/rejected": -40.42820358276367, "step": 3848 }, { "epoch": 0.5241013071895425, "grad_norm": 44.003111389167636, "learning_rate": 4.361722387021467e-07, "logits/chosen": 12.886825561523438, "logits/rejected": 13.416232109069824, "logps/chosen": -3.929414749145508, "logps/rejected": -4.208605766296387, "loss": 3.5756, "rewards/accuracies": 0.75, "rewards/chosen": -39.29414749145508, "rewards/margins": 2.7919111251831055, "rewards/rejected": -42.0860595703125, "step": 3849 }, { "epoch": 0.5242374727668845, "grad_norm": 69.51530785696379, "learning_rate": 4.359828734003466e-07, "logits/chosen": 13.353970527648926, "logits/rejected": 14.426908493041992, "logps/chosen": -4.274123191833496, "logps/rejected": -4.260305404663086, "loss": 4.2714, "rewards/accuracies": 0.5, "rewards/chosen": -42.74123001098633, "rewards/margins": -0.13817691802978516, "rewards/rejected": -42.60305404663086, "step": 3850 }, { "epoch": 0.5243736383442266, "grad_norm": 46.37314581790208, "learning_rate": 4.357934999679189e-07, "logits/chosen": 12.615438461303711, "logits/rejected": 12.350015640258789, "logps/chosen": -3.9164440631866455, "logps/rejected": -4.265768051147461, "loss": 4.232, "rewards/accuracies": 0.75, "rewards/chosen": -39.16444396972656, "rewards/margins": 3.4932403564453125, "rewards/rejected": -42.65768051147461, "step": 3851 }, { "epoch": 0.5245098039215687, "grad_norm": 41.865412709626234, "learning_rate": 4.356041184476539e-07, "logits/chosen": 14.481939315795898, "logits/rejected": 13.605250358581543, "logps/chosen": -4.346222877502441, "logps/rejected": -4.2423577308654785, "loss": 4.0594, "rewards/accuracies": 0.5, "rewards/chosen": -43.46223068237305, "rewards/margins": -1.0386524200439453, "rewards/rejected": -42.42357635498047, "step": 3852 }, { "epoch": 0.5246459694989106, "grad_norm": 40.775562915137584, "learning_rate": 4.3541472888234417e-07, "logits/chosen": 12.877009391784668, "logits/rejected": 13.276434898376465, "logps/chosen": -3.887021064758301, "logps/rejected": -4.2188520431518555, "loss": 4.0747, "rewards/accuracies": 1.0, "rewards/chosen": -38.870208740234375, "rewards/margins": 3.318312644958496, "rewards/rejected": -42.18852233886719, "step": 3853 }, { "epoch": 0.5247821350762527, "grad_norm": 36.857341684810486, "learning_rate": 4.352253313147837e-07, "logits/chosen": 12.865901947021484, "logits/rejected": 13.470296859741211, "logps/chosen": -4.099100589752197, "logps/rejected": -4.285392761230469, "loss": 3.8786, "rewards/accuracies": 0.5, "rewards/chosen": -40.991004943847656, "rewards/margins": 1.8629264831542969, "rewards/rejected": -42.85392761230469, "step": 3854 }, { "epoch": 0.5249183006535948, "grad_norm": 36.5909233504947, "learning_rate": 4.350359257877684e-07, "logits/chosen": 13.563128471374512, "logits/rejected": 13.480648040771484, "logps/chosen": -4.030604839324951, "logps/rejected": -3.93574595451355, "loss": 4.1191, "rewards/accuracies": 0.5, "rewards/chosen": -40.30604553222656, "rewards/margins": -0.9485864639282227, "rewards/rejected": -39.357460021972656, "step": 3855 }, { "epoch": 0.5250544662309368, "grad_norm": 39.96221636949497, "learning_rate": 4.34846512344096e-07, "logits/chosen": 13.127824783325195, "logits/rejected": 14.877155303955078, "logps/chosen": -4.013589859008789, "logps/rejected": -4.544581413269043, "loss": 4.0885, "rewards/accuracies": 0.75, "rewards/chosen": -40.135902404785156, "rewards/margins": 5.30991268157959, "rewards/rejected": -45.44581604003906, "step": 3856 }, { "epoch": 0.5251906318082789, "grad_norm": 39.48651055141071, "learning_rate": 4.3465709102656606e-07, "logits/chosen": 12.87320327758789, "logits/rejected": 13.21727180480957, "logps/chosen": -4.332178115844727, "logps/rejected": -4.408575057983398, "loss": 3.7858, "rewards/accuracies": 0.75, "rewards/chosen": -43.32177734375, "rewards/margins": 0.76397705078125, "rewards/rejected": -44.08575439453125, "step": 3857 }, { "epoch": 0.5253267973856209, "grad_norm": 46.178095204649345, "learning_rate": 4.3446766187798013e-07, "logits/chosen": 12.626302719116211, "logits/rejected": 13.26519775390625, "logps/chosen": -4.087374210357666, "logps/rejected": -4.306872844696045, "loss": 4.2174, "rewards/accuracies": 0.75, "rewards/chosen": -40.873741149902344, "rewards/margins": 2.194988250732422, "rewards/rejected": -43.0687255859375, "step": 3858 }, { "epoch": 0.5254629629629629, "grad_norm": 98.0863750808061, "learning_rate": 4.342782249411409e-07, "logits/chosen": 13.06502914428711, "logits/rejected": 14.603422164916992, "logps/chosen": -3.901240587234497, "logps/rejected": -4.478604793548584, "loss": 3.8856, "rewards/accuracies": 1.0, "rewards/chosen": -39.01240539550781, "rewards/margins": 5.773642539978027, "rewards/rejected": -44.786048889160156, "step": 3859 }, { "epoch": 0.525599128540305, "grad_norm": 40.672629352586966, "learning_rate": 4.3408878025885344e-07, "logits/chosen": 13.985712051391602, "logits/rejected": 14.16814136505127, "logps/chosen": -3.895723819732666, "logps/rejected": -4.4021124839782715, "loss": 4.0832, "rewards/accuracies": 1.0, "rewards/chosen": -38.957237243652344, "rewards/margins": 5.063889503479004, "rewards/rejected": -44.02112579345703, "step": 3860 }, { "epoch": 0.5257352941176471, "grad_norm": 42.6523541986155, "learning_rate": 4.338993278739243e-07, "logits/chosen": 13.580533981323242, "logits/rejected": 13.80567741394043, "logps/chosen": -4.164052963256836, "logps/rejected": -4.424124717712402, "loss": 3.995, "rewards/accuracies": 0.75, "rewards/chosen": -41.640525817871094, "rewards/margins": 2.600719451904297, "rewards/rejected": -44.24124526977539, "step": 3861 }, { "epoch": 0.525871459694989, "grad_norm": 39.263096489902374, "learning_rate": 4.337098678291619e-07, "logits/chosen": 12.79548454284668, "logits/rejected": 13.312469482421875, "logps/chosen": -3.7903378009796143, "logps/rejected": -3.988813877105713, "loss": 4.1998, "rewards/accuracies": 0.75, "rewards/chosen": -37.903377532958984, "rewards/margins": 1.9847602844238281, "rewards/rejected": -39.88813781738281, "step": 3862 }, { "epoch": 0.5260076252723311, "grad_norm": 38.655901975082784, "learning_rate": 4.3352040016737615e-07, "logits/chosen": 13.419703483581543, "logits/rejected": 13.560663223266602, "logps/chosen": -4.279540538787842, "logps/rejected": -4.450663089752197, "loss": 4.2076, "rewards/accuracies": 0.5, "rewards/chosen": -42.795406341552734, "rewards/margins": 1.7112236022949219, "rewards/rejected": -44.506629943847656, "step": 3863 }, { "epoch": 0.5261437908496732, "grad_norm": 44.79656422481815, "learning_rate": 4.333309249313789e-07, "logits/chosen": 14.085941314697266, "logits/rejected": 12.97981071472168, "logps/chosen": -4.338457107543945, "logps/rejected": -3.9183216094970703, "loss": 3.8746, "rewards/accuracies": 0.25, "rewards/chosen": -43.38457489013672, "rewards/margins": -4.201355934143066, "rewards/rejected": -39.1832160949707, "step": 3864 }, { "epoch": 0.5262799564270153, "grad_norm": 38.19346041063992, "learning_rate": 4.3314144216398364e-07, "logits/chosen": 13.000957489013672, "logits/rejected": 14.218477249145508, "logps/chosen": -3.9906978607177734, "logps/rejected": -4.366263389587402, "loss": 4.0899, "rewards/accuracies": 0.75, "rewards/chosen": -39.90697479248047, "rewards/margins": 3.7556586265563965, "rewards/rejected": -43.662635803222656, "step": 3865 }, { "epoch": 0.5264161220043573, "grad_norm": 39.70360839741969, "learning_rate": 4.3295195190800556e-07, "logits/chosen": 13.048103332519531, "logits/rejected": 13.527284622192383, "logps/chosen": -4.0413498878479, "logps/rejected": -4.1915998458862305, "loss": 3.6045, "rewards/accuracies": 0.75, "rewards/chosen": -40.41349792480469, "rewards/margins": 1.5025014877319336, "rewards/rejected": -41.91600036621094, "step": 3866 }, { "epoch": 0.5265522875816994, "grad_norm": 44.255747682013244, "learning_rate": 4.327624542062615e-07, "logits/chosen": 13.510883331298828, "logits/rejected": 14.167261123657227, "logps/chosen": -3.777503490447998, "logps/rejected": -4.243040561676025, "loss": 4.3545, "rewards/accuracies": 1.0, "rewards/chosen": -37.7750358581543, "rewards/margins": 4.655369281768799, "rewards/rejected": -42.43040466308594, "step": 3867 }, { "epoch": 0.5266884531590414, "grad_norm": 37.947588147281344, "learning_rate": 4.3257294910157023e-07, "logits/chosen": 12.336614608764648, "logits/rejected": 12.37286376953125, "logps/chosen": -3.9913887977600098, "logps/rejected": -4.023218154907227, "loss": 3.8995, "rewards/accuracies": 0.5, "rewards/chosen": -39.91388702392578, "rewards/margins": 0.31829357147216797, "rewards/rejected": -40.232177734375, "step": 3868 }, { "epoch": 0.5268246187363834, "grad_norm": 40.56766299858916, "learning_rate": 4.323834366367519e-07, "logits/chosen": 13.287528038024902, "logits/rejected": 13.790733337402344, "logps/chosen": -4.331915855407715, "logps/rejected": -4.505583763122559, "loss": 3.6526, "rewards/accuracies": 1.0, "rewards/chosen": -43.319156646728516, "rewards/margins": 1.7366809844970703, "rewards/rejected": -45.05583953857422, "step": 3869 }, { "epoch": 0.5269607843137255, "grad_norm": 43.995835122973205, "learning_rate": 4.321939168546282e-07, "logits/chosen": 12.83715534210205, "logits/rejected": 12.476343154907227, "logps/chosen": -3.7752015590667725, "logps/rejected": -3.9617292881011963, "loss": 3.9547, "rewards/accuracies": 0.75, "rewards/chosen": -37.75201416015625, "rewards/margins": 1.8652772903442383, "rewards/rejected": -39.61729431152344, "step": 3870 }, { "epoch": 0.5270969498910676, "grad_norm": 41.798624253621675, "learning_rate": 4.32004389798023e-07, "logits/chosen": 12.657081604003906, "logits/rejected": 13.384597778320312, "logps/chosen": -3.8784401416778564, "logps/rejected": -3.9783689975738525, "loss": 3.7994, "rewards/accuracies": 0.5, "rewards/chosen": -38.784400939941406, "rewards/margins": 0.9992914199829102, "rewards/rejected": -39.78369140625, "step": 3871 }, { "epoch": 0.5272331154684096, "grad_norm": 38.42360369832949, "learning_rate": 4.318148555097613e-07, "logits/chosen": 12.340518951416016, "logits/rejected": 11.922954559326172, "logps/chosen": -3.7490901947021484, "logps/rejected": -3.567124843597412, "loss": 3.9758, "rewards/accuracies": 0.25, "rewards/chosen": -37.49090576171875, "rewards/margins": -1.819657802581787, "rewards/rejected": -35.67124557495117, "step": 3872 }, { "epoch": 0.5273692810457516, "grad_norm": 50.63853207087086, "learning_rate": 4.3162531403267e-07, "logits/chosen": 12.486541748046875, "logits/rejected": 12.623941421508789, "logps/chosen": -3.966423988342285, "logps/rejected": -3.5428638458251953, "loss": 4.328, "rewards/accuracies": 0.0, "rewards/chosen": -39.66423797607422, "rewards/margins": -4.235604286193848, "rewards/rejected": -35.42863464355469, "step": 3873 }, { "epoch": 0.5275054466230937, "grad_norm": 39.91747657981237, "learning_rate": 4.314357654095777e-07, "logits/chosen": 14.309476852416992, "logits/rejected": 13.70205020904541, "logps/chosen": -4.281893730163574, "logps/rejected": -4.150848865509033, "loss": 3.9593, "rewards/accuracies": 0.25, "rewards/chosen": -42.81893539428711, "rewards/margins": -1.3104467391967773, "rewards/rejected": -41.50849151611328, "step": 3874 }, { "epoch": 0.5276416122004357, "grad_norm": 39.0753432239958, "learning_rate": 4.312462096833142e-07, "logits/chosen": 13.129310607910156, "logits/rejected": 13.279989242553711, "logps/chosen": -3.9751477241516113, "logps/rejected": -4.10315465927124, "loss": 4.3008, "rewards/accuracies": 0.75, "rewards/chosen": -39.7514762878418, "rewards/margins": 1.2800712585449219, "rewards/rejected": -41.03154754638672, "step": 3875 }, { "epoch": 0.5277777777777778, "grad_norm": 39.79695395382048, "learning_rate": 4.3105664689671144e-07, "logits/chosen": 13.97504997253418, "logits/rejected": 13.989163398742676, "logps/chosen": -4.243556976318359, "logps/rejected": -4.601572036743164, "loss": 4.2726, "rewards/accuracies": 1.0, "rewards/chosen": -42.43556594848633, "rewards/margins": 3.580155372619629, "rewards/rejected": -46.01572036743164, "step": 3876 }, { "epoch": 0.5279139433551199, "grad_norm": 39.38752610358804, "learning_rate": 4.308670770926026e-07, "logits/chosen": 13.935432434082031, "logits/rejected": 13.612846374511719, "logps/chosen": -4.095527648925781, "logps/rejected": -4.164055347442627, "loss": 3.4531, "rewards/accuracies": 0.75, "rewards/chosen": -40.95527648925781, "rewards/margins": 0.6852750778198242, "rewards/rejected": -41.64055252075195, "step": 3877 }, { "epoch": 0.5280501089324618, "grad_norm": 37.70019606659482, "learning_rate": 4.3067750031382245e-07, "logits/chosen": 12.604427337646484, "logits/rejected": 13.774816513061523, "logps/chosen": -4.0374650955200195, "logps/rejected": -4.617179870605469, "loss": 3.8513, "rewards/accuracies": 1.0, "rewards/chosen": -40.37465286254883, "rewards/margins": 5.797144889831543, "rewards/rejected": -46.17179870605469, "step": 3878 }, { "epoch": 0.5281862745098039, "grad_norm": 40.22814705975892, "learning_rate": 4.3048791660320763e-07, "logits/chosen": 13.487405776977539, "logits/rejected": 13.704658508300781, "logps/chosen": -4.0552825927734375, "logps/rejected": -4.319828510284424, "loss": 3.9781, "rewards/accuracies": 1.0, "rewards/chosen": -40.55282211303711, "rewards/margins": 2.645462989807129, "rewards/rejected": -43.19828414916992, "step": 3879 }, { "epoch": 0.528322440087146, "grad_norm": 204.75316084660537, "learning_rate": 4.3029832600359597e-07, "logits/chosen": 13.774979591369629, "logits/rejected": 12.63127326965332, "logps/chosen": -4.109543323516846, "logps/rejected": -3.8424770832061768, "loss": 4.5731, "rewards/accuracies": 0.0, "rewards/chosen": -41.095436096191406, "rewards/margins": -2.6706647872924805, "rewards/rejected": -38.424774169921875, "step": 3880 }, { "epoch": 0.528458605664488, "grad_norm": 42.439591815017266, "learning_rate": 4.3010872855782707e-07, "logits/chosen": 12.423318862915039, "logits/rejected": 13.139900207519531, "logps/chosen": -4.043225288391113, "logps/rejected": -4.254429817199707, "loss": 3.772, "rewards/accuracies": 0.75, "rewards/chosen": -40.432254791259766, "rewards/margins": 2.112044334411621, "rewards/rejected": -42.5443000793457, "step": 3881 }, { "epoch": 0.5285947712418301, "grad_norm": 58.04775894308019, "learning_rate": 4.2991912430874216e-07, "logits/chosen": 13.789360046386719, "logits/rejected": 12.426531791687012, "logps/chosen": -4.2678422927856445, "logps/rejected": -4.085423469543457, "loss": 3.978, "rewards/accuracies": 0.25, "rewards/chosen": -42.67842102050781, "rewards/margins": -1.8241825103759766, "rewards/rejected": -40.85424041748047, "step": 3882 }, { "epoch": 0.5287309368191722, "grad_norm": 36.71869031794969, "learning_rate": 4.297295132991838e-07, "logits/chosen": 12.726431846618652, "logits/rejected": 12.776609420776367, "logps/chosen": -3.5290110111236572, "logps/rejected": -4.405493259429932, "loss": 3.9795, "rewards/accuracies": 1.0, "rewards/chosen": -35.29010772705078, "rewards/margins": 8.764824867248535, "rewards/rejected": -44.054935455322266, "step": 3883 }, { "epoch": 0.5288671023965141, "grad_norm": 36.7895569534179, "learning_rate": 4.29539895571996e-07, "logits/chosen": 13.971643447875977, "logits/rejected": 14.582881927490234, "logps/chosen": -3.992663621902466, "logps/rejected": -4.248846054077148, "loss": 3.8635, "rewards/accuracies": 0.75, "rewards/chosen": -39.9266357421875, "rewards/margins": 2.561825752258301, "rewards/rejected": -42.48846435546875, "step": 3884 }, { "epoch": 0.5290032679738562, "grad_norm": 39.0721558370666, "learning_rate": 4.293502711700249e-07, "logits/chosen": 12.595209121704102, "logits/rejected": 13.15416145324707, "logps/chosen": -4.179492950439453, "logps/rejected": -4.064004421234131, "loss": 3.6876, "rewards/accuracies": 0.75, "rewards/chosen": -41.79492950439453, "rewards/margins": -1.1548824310302734, "rewards/rejected": -40.640045166015625, "step": 3885 }, { "epoch": 0.5291394335511983, "grad_norm": 39.2335329289238, "learning_rate": 4.2916064013611725e-07, "logits/chosen": 12.06492805480957, "logits/rejected": 12.462221145629883, "logps/chosen": -3.7856359481811523, "logps/rejected": -3.904467821121216, "loss": 4.1017, "rewards/accuracies": 0.5, "rewards/chosen": -37.856361389160156, "rewards/margins": 1.1883163452148438, "rewards/rejected": -39.044677734375, "step": 3886 }, { "epoch": 0.5292755991285403, "grad_norm": 46.94202999762617, "learning_rate": 4.289710025131218e-07, "logits/chosen": 12.715343475341797, "logits/rejected": 12.72899055480957, "logps/chosen": -3.685957193374634, "logps/rejected": -4.024198055267334, "loss": 3.8666, "rewards/accuracies": 0.75, "rewards/chosen": -36.85957336425781, "rewards/margins": 3.3824071884155273, "rewards/rejected": -40.241981506347656, "step": 3887 }, { "epoch": 0.5294117647058824, "grad_norm": 42.60767790451731, "learning_rate": 4.287813583438891e-07, "logits/chosen": 13.402966499328613, "logits/rejected": 13.290225982666016, "logps/chosen": -3.863652229309082, "logps/rejected": -3.936215877532959, "loss": 3.6831, "rewards/accuracies": 0.75, "rewards/chosen": -38.63652038574219, "rewards/margins": 0.7256355285644531, "rewards/rejected": -39.362159729003906, "step": 3888 }, { "epoch": 0.5295479302832244, "grad_norm": 43.35686817437737, "learning_rate": 4.285917076712705e-07, "logits/chosen": 12.448480606079102, "logits/rejected": 13.018749237060547, "logps/chosen": -4.079597473144531, "logps/rejected": -4.220656394958496, "loss": 4.2288, "rewards/accuracies": 0.75, "rewards/chosen": -40.79597473144531, "rewards/margins": 1.410588264465332, "rewards/rejected": -42.206565856933594, "step": 3889 }, { "epoch": 0.5296840958605664, "grad_norm": 44.63338963973298, "learning_rate": 4.284020505381191e-07, "logits/chosen": 13.96759033203125, "logits/rejected": 14.090038299560547, "logps/chosen": -4.088461875915527, "logps/rejected": -4.294014930725098, "loss": 3.8012, "rewards/accuracies": 0.75, "rewards/chosen": -40.884620666503906, "rewards/margins": 2.0555267333984375, "rewards/rejected": -42.940147399902344, "step": 3890 }, { "epoch": 0.5298202614379085, "grad_norm": 38.88288049646412, "learning_rate": 4.2821238698728966e-07, "logits/chosen": 12.313003540039062, "logits/rejected": 12.648843765258789, "logps/chosen": -3.5654568672180176, "logps/rejected": -3.9841175079345703, "loss": 4.0061, "rewards/accuracies": 0.75, "rewards/chosen": -35.65456771850586, "rewards/margins": 4.186607837677002, "rewards/rejected": -39.84117889404297, "step": 3891 }, { "epoch": 0.5299564270152506, "grad_norm": 44.71766018599702, "learning_rate": 4.280227170616382e-07, "logits/chosen": 13.48879623413086, "logits/rejected": 12.08745002746582, "logps/chosen": -4.121411323547363, "logps/rejected": -3.639150857925415, "loss": 3.9638, "rewards/accuracies": 0.25, "rewards/chosen": -41.214111328125, "rewards/margins": -4.822601318359375, "rewards/rejected": -36.391510009765625, "step": 3892 }, { "epoch": 0.5300925925925926, "grad_norm": 38.602724606022704, "learning_rate": 4.2783304080402215e-07, "logits/chosen": 13.934529304504395, "logits/rejected": 13.047684669494629, "logps/chosen": -4.279645919799805, "logps/rejected": -4.087508201599121, "loss": 4.1878, "rewards/accuracies": 0.25, "rewards/chosen": -42.79645919799805, "rewards/margins": -1.9213800430297852, "rewards/rejected": -40.87508010864258, "step": 3893 }, { "epoch": 0.5302287581699346, "grad_norm": 40.35116014180542, "learning_rate": 4.276433582573005e-07, "logits/chosen": 14.284317016601562, "logits/rejected": 13.287843704223633, "logps/chosen": -3.7470293045043945, "logps/rejected": -3.9033203125, "loss": 4.5253, "rewards/accuracies": 0.5, "rewards/chosen": -37.47029495239258, "rewards/margins": 1.5629091262817383, "rewards/rejected": -39.033203125, "step": 3894 }, { "epoch": 0.5303649237472767, "grad_norm": 39.35499290615445, "learning_rate": 4.274536694643335e-07, "logits/chosen": 13.559253692626953, "logits/rejected": 13.888465881347656, "logps/chosen": -4.019980430603027, "logps/rejected": -4.096019744873047, "loss": 3.8491, "rewards/accuracies": 0.5, "rewards/chosen": -40.199806213378906, "rewards/margins": 0.7603960037231445, "rewards/rejected": -40.96019744873047, "step": 3895 }, { "epoch": 0.5305010893246187, "grad_norm": 40.537296757569926, "learning_rate": 4.2726397446798294e-07, "logits/chosen": 12.699917793273926, "logits/rejected": 13.591934204101562, "logps/chosen": -3.9611032009124756, "logps/rejected": -4.304771423339844, "loss": 3.8593, "rewards/accuracies": 0.75, "rewards/chosen": -39.61103057861328, "rewards/margins": 3.436678886413574, "rewards/rejected": -43.04771041870117, "step": 3896 }, { "epoch": 0.5306372549019608, "grad_norm": 39.78479772149117, "learning_rate": 4.2707427331111204e-07, "logits/chosen": 12.4671049118042, "logits/rejected": 12.927774429321289, "logps/chosen": -3.9798665046691895, "logps/rejected": -3.8452374935150146, "loss": 4.05, "rewards/accuracies": 0.5, "rewards/chosen": -39.79866409301758, "rewards/margins": -1.3462896347045898, "rewards/rejected": -38.45237350463867, "step": 3897 }, { "epoch": 0.5307734204793029, "grad_norm": 39.464737503651534, "learning_rate": 4.268845660365853e-07, "logits/chosen": 13.082780838012695, "logits/rejected": 13.483444213867188, "logps/chosen": -4.031461238861084, "logps/rejected": -4.298827171325684, "loss": 4.1925, "rewards/accuracies": 0.75, "rewards/chosen": -40.314613342285156, "rewards/margins": 2.6736583709716797, "rewards/rejected": -42.9882698059082, "step": 3898 }, { "epoch": 0.5309095860566448, "grad_norm": 40.88239682383809, "learning_rate": 4.266948526872685e-07, "logits/chosen": 13.89738655090332, "logits/rejected": 12.208304405212402, "logps/chosen": -3.8989269733428955, "logps/rejected": -3.8641767501831055, "loss": 4.3817, "rewards/accuracies": 0.5, "rewards/chosen": -38.98927307128906, "rewards/margins": -0.3475055694580078, "rewards/rejected": -38.641761779785156, "step": 3899 }, { "epoch": 0.5310457516339869, "grad_norm": 39.29079835896163, "learning_rate": 4.2650513330602924e-07, "logits/chosen": 12.86197280883789, "logits/rejected": 13.26616382598877, "logps/chosen": -3.6779980659484863, "logps/rejected": -4.224747657775879, "loss": 3.7507, "rewards/accuracies": 1.0, "rewards/chosen": -36.77998352050781, "rewards/margins": 5.467494010925293, "rewards/rejected": -42.247474670410156, "step": 3900 }, { "epoch": 0.531181917211329, "grad_norm": 64.16009156022177, "learning_rate": 4.2631540793573597e-07, "logits/chosen": 11.9798002243042, "logits/rejected": 12.499920845031738, "logps/chosen": -3.7630362510681152, "logps/rejected": -3.9610586166381836, "loss": 3.8813, "rewards/accuracies": 0.75, "rewards/chosen": -37.63036346435547, "rewards/margins": 1.9802255630493164, "rewards/rejected": -39.61058807373047, "step": 3901 }, { "epoch": 0.531318082788671, "grad_norm": 39.75112973115933, "learning_rate": 4.261256766192587e-07, "logits/chosen": 12.068885803222656, "logits/rejected": 13.79020881652832, "logps/chosen": -3.6586649417877197, "logps/rejected": -4.421557903289795, "loss": 3.8475, "rewards/accuracies": 1.0, "rewards/chosen": -36.58665084838867, "rewards/margins": 7.628931045532227, "rewards/rejected": -44.21558380126953, "step": 3902 }, { "epoch": 0.5314542483660131, "grad_norm": 59.35710285073052, "learning_rate": 4.259359393994689e-07, "logits/chosen": 12.951845169067383, "logits/rejected": 13.497000694274902, "logps/chosen": -3.7773513793945312, "logps/rejected": -3.896517038345337, "loss": 3.64, "rewards/accuracies": 0.5, "rewards/chosen": -37.77351379394531, "rewards/margins": 1.1916608810424805, "rewards/rejected": -38.965171813964844, "step": 3903 }, { "epoch": 0.5315904139433552, "grad_norm": 45.099681080634596, "learning_rate": 4.257461963192392e-07, "logits/chosen": 12.773797988891602, "logits/rejected": 13.567588806152344, "logps/chosen": -3.9251463413238525, "logps/rejected": -3.926030397415161, "loss": 4.7651, "rewards/accuracies": 0.5, "rewards/chosen": -39.25146484375, "rewards/margins": 0.008840560913085938, "rewards/rejected": -39.26029968261719, "step": 3904 }, { "epoch": 0.5317265795206971, "grad_norm": 61.42168646772319, "learning_rate": 4.255564474214435e-07, "logits/chosen": 12.592923164367676, "logits/rejected": 13.24270248413086, "logps/chosen": -3.652240037918091, "logps/rejected": -4.16516637802124, "loss": 4.5985, "rewards/accuracies": 1.0, "rewards/chosen": -36.52239990234375, "rewards/margins": 5.129263877868652, "rewards/rejected": -41.65166473388672, "step": 3905 }, { "epoch": 0.5318627450980392, "grad_norm": 39.33927820817611, "learning_rate": 4.2536669274895735e-07, "logits/chosen": 12.851436614990234, "logits/rejected": 12.887742042541504, "logps/chosen": -4.054277420043945, "logps/rejected": -4.101080894470215, "loss": 3.9232, "rewards/accuracies": 0.75, "rewards/chosen": -40.54277801513672, "rewards/margins": 0.4680337905883789, "rewards/rejected": -41.01081085205078, "step": 3906 }, { "epoch": 0.5319989106753813, "grad_norm": 50.164824409175935, "learning_rate": 4.251769323446574e-07, "logits/chosen": 13.482681274414062, "logits/rejected": 13.764604568481445, "logps/chosen": -4.048951148986816, "logps/rejected": -4.270756721496582, "loss": 4.0363, "rewards/accuracies": 0.75, "rewards/chosen": -40.4895133972168, "rewards/margins": 2.21805477142334, "rewards/rejected": -42.70756530761719, "step": 3907 }, { "epoch": 0.5321350762527233, "grad_norm": 42.342198455200624, "learning_rate": 4.249871662514213e-07, "logits/chosen": 13.365657806396484, "logits/rejected": 13.06773567199707, "logps/chosen": -3.8541622161865234, "logps/rejected": -4.004899024963379, "loss": 3.5865, "rewards/accuracies": 0.5, "rewards/chosen": -38.541622161865234, "rewards/margins": 1.5073719024658203, "rewards/rejected": -40.04899215698242, "step": 3908 }, { "epoch": 0.5322712418300654, "grad_norm": 40.150399120476365, "learning_rate": 4.247973945121285e-07, "logits/chosen": 12.482568740844727, "logits/rejected": 13.487237930297852, "logps/chosen": -3.936038017272949, "logps/rejected": -4.1940436363220215, "loss": 4.086, "rewards/accuracies": 1.0, "rewards/chosen": -39.360382080078125, "rewards/margins": 2.5800552368164062, "rewards/rejected": -41.940433502197266, "step": 3909 }, { "epoch": 0.5324074074074074, "grad_norm": 34.57378924518509, "learning_rate": 4.246076171696595e-07, "logits/chosen": 13.118558883666992, "logits/rejected": 12.855884552001953, "logps/chosen": -3.8265111446380615, "logps/rejected": -3.7900588512420654, "loss": 3.7763, "rewards/accuracies": 0.25, "rewards/chosen": -38.26511001586914, "rewards/margins": -0.36452388763427734, "rewards/rejected": -37.90058898925781, "step": 3910 }, { "epoch": 0.5325435729847494, "grad_norm": 55.718250601254006, "learning_rate": 4.2441783426689586e-07, "logits/chosen": 13.14785385131836, "logits/rejected": 13.1185941696167, "logps/chosen": -3.9514095783233643, "logps/rejected": -3.7757456302642822, "loss": 4.1946, "rewards/accuracies": 0.25, "rewards/chosen": -39.51409912109375, "rewards/margins": -1.7566423416137695, "rewards/rejected": -37.75745391845703, "step": 3911 }, { "epoch": 0.5326797385620915, "grad_norm": 46.35247528782598, "learning_rate": 4.242280458467208e-07, "logits/chosen": 13.052412033081055, "logits/rejected": 12.959434509277344, "logps/chosen": -3.8892884254455566, "logps/rejected": -4.174244403839111, "loss": 4.116, "rewards/accuracies": 0.75, "rewards/chosen": -38.89288330078125, "rewards/margins": 2.849559783935547, "rewards/rejected": -41.74244689941406, "step": 3912 }, { "epoch": 0.5328159041394336, "grad_norm": 41.27807395640876, "learning_rate": 4.2403825195201843e-07, "logits/chosen": 12.920892715454102, "logits/rejected": 12.883523941040039, "logps/chosen": -3.783494710922241, "logps/rejected": -4.030353546142578, "loss": 4.2404, "rewards/accuracies": 0.75, "rewards/chosen": -37.83494567871094, "rewards/margins": 2.468592643737793, "rewards/rejected": -40.30353546142578, "step": 3913 }, { "epoch": 0.5329520697167756, "grad_norm": 36.92773004187807, "learning_rate": 4.238484526256744e-07, "logits/chosen": 14.072525024414062, "logits/rejected": 13.882773399353027, "logps/chosen": -4.049704074859619, "logps/rejected": -4.330744743347168, "loss": 3.5409, "rewards/accuracies": 0.75, "rewards/chosen": -40.497039794921875, "rewards/margins": 2.810403823852539, "rewards/rejected": -43.30744552612305, "step": 3914 }, { "epoch": 0.5330882352941176, "grad_norm": 39.960724752412894, "learning_rate": 4.236586479105755e-07, "logits/chosen": 13.043304443359375, "logits/rejected": 12.863680839538574, "logps/chosen": -3.882830858230591, "logps/rejected": -3.998321294784546, "loss": 3.9176, "rewards/accuracies": 0.5, "rewards/chosen": -38.82830810546875, "rewards/margins": 1.154902458190918, "rewards/rejected": -39.98321533203125, "step": 3915 }, { "epoch": 0.5332244008714597, "grad_norm": 37.52091995472013, "learning_rate": 4.2346883784960934e-07, "logits/chosen": 12.462459564208984, "logits/rejected": 13.142666816711426, "logps/chosen": -3.7870144844055176, "logps/rejected": -4.137748718261719, "loss": 3.9197, "rewards/accuracies": 1.0, "rewards/chosen": -37.870147705078125, "rewards/margins": 3.507340431213379, "rewards/rejected": -41.37748718261719, "step": 3916 }, { "epoch": 0.5333605664488017, "grad_norm": 39.89943270883498, "learning_rate": 4.2327902248566536e-07, "logits/chosen": 13.653961181640625, "logits/rejected": 13.50220012664795, "logps/chosen": -4.037087917327881, "logps/rejected": -4.2563395500183105, "loss": 4.2726, "rewards/accuracies": 0.5, "rewards/chosen": -40.370880126953125, "rewards/margins": 2.192516326904297, "rewards/rejected": -42.56340026855469, "step": 3917 }, { "epoch": 0.5334967320261438, "grad_norm": 41.84789971882369, "learning_rate": 4.2308920186163376e-07, "logits/chosen": 12.657764434814453, "logits/rejected": 12.640585899353027, "logps/chosen": -4.06089973449707, "logps/rejected": -3.9956579208374023, "loss": 3.1778, "rewards/accuracies": 0.75, "rewards/chosen": -40.6089973449707, "rewards/margins": -0.6524200439453125, "rewards/rejected": -39.95657730102539, "step": 3918 }, { "epoch": 0.5336328976034859, "grad_norm": 56.91013748714607, "learning_rate": 4.228993760204062e-07, "logits/chosen": 13.086533546447754, "logits/rejected": 13.469573020935059, "logps/chosen": -3.9867236614227295, "logps/rejected": -4.261884689331055, "loss": 4.0329, "rewards/accuracies": 1.0, "rewards/chosen": -39.86723709106445, "rewards/margins": 2.75161075592041, "rewards/rejected": -42.61884689331055, "step": 3919 }, { "epoch": 0.5337690631808278, "grad_norm": 39.23580785188776, "learning_rate": 4.227095450048753e-07, "logits/chosen": 13.082540512084961, "logits/rejected": 12.274309158325195, "logps/chosen": -3.8238701820373535, "logps/rejected": -3.7148587703704834, "loss": 3.6797, "rewards/accuracies": 0.5, "rewards/chosen": -38.23870086669922, "rewards/margins": -1.0901117324829102, "rewards/rejected": -37.148590087890625, "step": 3920 }, { "epoch": 0.5339052287581699, "grad_norm": 41.73369761930993, "learning_rate": 4.2251970885793506e-07, "logits/chosen": 12.882505416870117, "logits/rejected": 13.090238571166992, "logps/chosen": -3.334301710128784, "logps/rejected": -3.76560115814209, "loss": 4.2175, "rewards/accuracies": 1.0, "rewards/chosen": -33.343017578125, "rewards/margins": 4.312994956970215, "rewards/rejected": -37.65601348876953, "step": 3921 }, { "epoch": 0.534041394335512, "grad_norm": 37.991060949130386, "learning_rate": 4.223298676224804e-07, "logits/chosen": 14.224331855773926, "logits/rejected": 13.99849796295166, "logps/chosen": -3.866548538208008, "logps/rejected": -4.068317413330078, "loss": 3.8834, "rewards/accuracies": 0.75, "rewards/chosen": -38.66548538208008, "rewards/margins": 2.017690658569336, "rewards/rejected": -40.68317794799805, "step": 3922 }, { "epoch": 0.534177559912854, "grad_norm": 42.26445399771706, "learning_rate": 4.2214002134140745e-07, "logits/chosen": 13.594888687133789, "logits/rejected": 13.484941482543945, "logps/chosen": -4.133055686950684, "logps/rejected": -3.9068970680236816, "loss": 4.4713, "rewards/accuracies": 0.5, "rewards/chosen": -41.3305549621582, "rewards/margins": -2.261580467224121, "rewards/rejected": -39.0689697265625, "step": 3923 }, { "epoch": 0.5343137254901961, "grad_norm": 47.564715596287606, "learning_rate": 4.2195017005761393e-07, "logits/chosen": 12.956010818481445, "logits/rejected": 13.94430923461914, "logps/chosen": -3.8094685077667236, "logps/rejected": -4.2877936363220215, "loss": 3.976, "rewards/accuracies": 1.0, "rewards/chosen": -38.094688415527344, "rewards/margins": 4.783249855041504, "rewards/rejected": -42.87793731689453, "step": 3924 }, { "epoch": 0.5344498910675382, "grad_norm": 38.53597007929104, "learning_rate": 4.217603138139979e-07, "logits/chosen": 13.448822975158691, "logits/rejected": 13.106245040893555, "logps/chosen": -3.689262866973877, "logps/rejected": -4.004045486450195, "loss": 3.6441, "rewards/accuracies": 0.75, "rewards/chosen": -36.89262771606445, "rewards/margins": 3.1478281021118164, "rewards/rejected": -40.04045867919922, "step": 3925 }, { "epoch": 0.5345860566448801, "grad_norm": 39.614861678703136, "learning_rate": 4.2157045265345906e-07, "logits/chosen": 12.55323600769043, "logits/rejected": 13.34450912475586, "logps/chosen": -3.8575055599212646, "logps/rejected": -4.143013000488281, "loss": 3.9427, "rewards/accuracies": 0.5, "rewards/chosen": -38.57505416870117, "rewards/margins": 2.8550758361816406, "rewards/rejected": -41.43013381958008, "step": 3926 }, { "epoch": 0.5347222222222222, "grad_norm": 42.44975073074504, "learning_rate": 4.2138058661889816e-07, "logits/chosen": 13.502918243408203, "logits/rejected": 14.149718284606934, "logps/chosen": -3.655247449874878, "logps/rejected": -4.001865386962891, "loss": 4.1843, "rewards/accuracies": 1.0, "rewards/chosen": -36.55247497558594, "rewards/margins": 3.466184139251709, "rewards/rejected": -40.01865768432617, "step": 3927 }, { "epoch": 0.5348583877995643, "grad_norm": 52.6172703332425, "learning_rate": 4.2119071575321704e-07, "logits/chosen": 14.548234939575195, "logits/rejected": 14.04300594329834, "logps/chosen": -4.365803241729736, "logps/rejected": -4.44137716293335, "loss": 4.3959, "rewards/accuracies": 0.75, "rewards/chosen": -43.65803527832031, "rewards/margins": 0.7557363510131836, "rewards/rejected": -44.41377258300781, "step": 3928 }, { "epoch": 0.5349945533769063, "grad_norm": 38.22367292175295, "learning_rate": 4.210008400993184e-07, "logits/chosen": 12.713188171386719, "logits/rejected": 13.818156242370605, "logps/chosen": -3.368594169616699, "logps/rejected": -3.897667646408081, "loss": 4.0217, "rewards/accuracies": 1.0, "rewards/chosen": -33.685943603515625, "rewards/margins": 5.290736198425293, "rewards/rejected": -38.97667694091797, "step": 3929 }, { "epoch": 0.5351307189542484, "grad_norm": 37.49881021781591, "learning_rate": 4.2081095970010646e-07, "logits/chosen": 12.874834060668945, "logits/rejected": 14.742680549621582, "logps/chosen": -3.943235397338867, "logps/rejected": -4.1380109786987305, "loss": 3.7058, "rewards/accuracies": 0.75, "rewards/chosen": -39.43235397338867, "rewards/margins": 1.9477558135986328, "rewards/rejected": -41.38011169433594, "step": 3930 }, { "epoch": 0.5352668845315904, "grad_norm": 42.44781526021363, "learning_rate": 4.2062107459848616e-07, "logits/chosen": 12.024454116821289, "logits/rejected": 12.72585391998291, "logps/chosen": -3.5425891876220703, "logps/rejected": -4.089840888977051, "loss": 4.084, "rewards/accuracies": 1.0, "rewards/chosen": -35.4258918762207, "rewards/margins": 5.472517967224121, "rewards/rejected": -40.89841079711914, "step": 3931 }, { "epoch": 0.5354030501089324, "grad_norm": 40.93614557497803, "learning_rate": 4.2043118483736356e-07, "logits/chosen": 13.01083755493164, "logits/rejected": 13.328714370727539, "logps/chosen": -4.156071662902832, "logps/rejected": -4.349568843841553, "loss": 4.1602, "rewards/accuracies": 0.75, "rewards/chosen": -41.56071472167969, "rewards/margins": 1.9349737167358398, "rewards/rejected": -43.495689392089844, "step": 3932 }, { "epoch": 0.5355392156862745, "grad_norm": 33.06688966707394, "learning_rate": 4.2024129045964585e-07, "logits/chosen": 12.574995994567871, "logits/rejected": 13.309318542480469, "logps/chosen": -3.6950368881225586, "logps/rejected": -3.9413421154022217, "loss": 3.5741, "rewards/accuracies": 0.75, "rewards/chosen": -36.95036697387695, "rewards/margins": 2.463052749633789, "rewards/rejected": -39.413421630859375, "step": 3933 }, { "epoch": 0.5356753812636166, "grad_norm": 46.752276650684095, "learning_rate": 4.2005139150824134e-07, "logits/chosen": 12.393614768981934, "logits/rejected": 13.112672805786133, "logps/chosen": -3.624640464782715, "logps/rejected": -3.667555332183838, "loss": 4.0491, "rewards/accuracies": 0.5, "rewards/chosen": -36.24640655517578, "rewards/margins": 0.4291501045227051, "rewards/rejected": -36.67555236816406, "step": 3934 }, { "epoch": 0.5358115468409586, "grad_norm": 39.10016970248348, "learning_rate": 4.198614880260591e-07, "logits/chosen": 12.26024341583252, "logits/rejected": 14.749425888061523, "logps/chosen": -3.6170785427093506, "logps/rejected": -4.332984447479248, "loss": 4.189, "rewards/accuracies": 1.0, "rewards/chosen": -36.17078399658203, "rewards/margins": 7.159058094024658, "rewards/rejected": -43.32984161376953, "step": 3935 }, { "epoch": 0.5359477124183006, "grad_norm": 40.82653859507334, "learning_rate": 4.196715800560094e-07, "logits/chosen": 13.748950958251953, "logits/rejected": 14.118481636047363, "logps/chosen": -3.5792746543884277, "logps/rejected": -4.073435306549072, "loss": 4.1643, "rewards/accuracies": 1.0, "rewards/chosen": -35.792747497558594, "rewards/margins": 4.941606521606445, "rewards/rejected": -40.734352111816406, "step": 3936 }, { "epoch": 0.5360838779956427, "grad_norm": 42.980344358521926, "learning_rate": 4.194816676410037e-07, "logits/chosen": 12.883138656616211, "logits/rejected": 12.653749465942383, "logps/chosen": -3.9799985885620117, "logps/rejected": -3.912353992462158, "loss": 4.5067, "rewards/accuracies": 0.25, "rewards/chosen": -39.79998779296875, "rewards/margins": -0.676447868347168, "rewards/rejected": -39.12354278564453, "step": 3937 }, { "epoch": 0.5362200435729847, "grad_norm": 50.94583181787112, "learning_rate": 4.1929175082395404e-07, "logits/chosen": 12.8768892288208, "logits/rejected": 12.949041366577148, "logps/chosen": -3.7743191719055176, "logps/rejected": -4.1648149490356445, "loss": 4.049, "rewards/accuracies": 1.0, "rewards/chosen": -37.743194580078125, "rewards/margins": 3.904956817626953, "rewards/rejected": -41.64815139770508, "step": 3938 }, { "epoch": 0.5363562091503268, "grad_norm": 77.09758012797687, "learning_rate": 4.1910182964777385e-07, "logits/chosen": 13.635429382324219, "logits/rejected": 14.265154838562012, "logps/chosen": -4.106657981872559, "logps/rejected": -4.146154403686523, "loss": 3.8419, "rewards/accuracies": 0.5, "rewards/chosen": -41.06657791137695, "rewards/margins": 0.39496517181396484, "rewards/rejected": -41.46154022216797, "step": 3939 }, { "epoch": 0.5364923747276689, "grad_norm": 38.181080824624, "learning_rate": 4.189119041553774e-07, "logits/chosen": 14.629852294921875, "logits/rejected": 13.803987503051758, "logps/chosen": -4.020971298217773, "logps/rejected": -4.305257797241211, "loss": 3.2426, "rewards/accuracies": 0.75, "rewards/chosen": -40.209712982177734, "rewards/margins": 2.8428702354431152, "rewards/rejected": -43.052581787109375, "step": 3940 }, { "epoch": 0.536628540305011, "grad_norm": 38.56519415011728, "learning_rate": 4.187219743896798e-07, "logits/chosen": 12.329512596130371, "logits/rejected": 14.469476699829102, "logps/chosen": -3.5427756309509277, "logps/rejected": -4.198929309844971, "loss": 3.6766, "rewards/accuracies": 1.0, "rewards/chosen": -35.42775344848633, "rewards/margins": 6.561537742614746, "rewards/rejected": -41.98929214477539, "step": 3941 }, { "epoch": 0.5367647058823529, "grad_norm": 42.27442976407323, "learning_rate": 4.1853204039359743e-07, "logits/chosen": 13.79971981048584, "logits/rejected": 13.951180458068848, "logps/chosen": -4.282898902893066, "logps/rejected": -4.3958539962768555, "loss": 4.1015, "rewards/accuracies": 0.25, "rewards/chosen": -42.8289909362793, "rewards/margins": 1.1295509338378906, "rewards/rejected": -43.95854187011719, "step": 3942 }, { "epoch": 0.536900871459695, "grad_norm": 49.20176581311867, "learning_rate": 4.183421022100473e-07, "logits/chosen": 13.548612594604492, "logits/rejected": 13.0633544921875, "logps/chosen": -3.980125904083252, "logps/rejected": -3.841547966003418, "loss": 4.4115, "rewards/accuracies": 0.25, "rewards/chosen": -39.80125427246094, "rewards/margins": -1.385777473449707, "rewards/rejected": -38.41548156738281, "step": 3943 }, { "epoch": 0.5370370370370371, "grad_norm": 44.32596747008153, "learning_rate": 4.1815215988194745e-07, "logits/chosen": 13.706608772277832, "logits/rejected": 14.146862983703613, "logps/chosen": -4.002933502197266, "logps/rejected": -4.475101470947266, "loss": 4.264, "rewards/accuracies": 1.0, "rewards/chosen": -40.029335021972656, "rewards/margins": 4.7216796875, "rewards/rejected": -44.751014709472656, "step": 3944 }, { "epoch": 0.5371732026143791, "grad_norm": 43.74121739906208, "learning_rate": 4.1796221345221723e-07, "logits/chosen": 13.38786506652832, "logits/rejected": 13.333390235900879, "logps/chosen": -4.002884864807129, "logps/rejected": -4.035097599029541, "loss": 3.5292, "rewards/accuracies": 0.5, "rewards/chosen": -40.028846740722656, "rewards/margins": 0.3221282958984375, "rewards/rejected": -40.350975036621094, "step": 3945 }, { "epoch": 0.5373093681917211, "grad_norm": 39.01172150224613, "learning_rate": 4.1777226296377634e-07, "logits/chosen": 12.844230651855469, "logits/rejected": 14.823981285095215, "logps/chosen": -4.045193672180176, "logps/rejected": -4.2129387855529785, "loss": 4.0627, "rewards/accuracies": 0.5, "rewards/chosen": -40.451934814453125, "rewards/margins": 1.6774511337280273, "rewards/rejected": -42.12938690185547, "step": 3946 }, { "epoch": 0.5374455337690632, "grad_norm": 40.4628451281431, "learning_rate": 4.175823084595456e-07, "logits/chosen": 13.707062721252441, "logits/rejected": 13.813709259033203, "logps/chosen": -4.096490859985352, "logps/rejected": -4.376606464385986, "loss": 4.0795, "rewards/accuracies": 0.5, "rewards/chosen": -40.96490478515625, "rewards/margins": 2.8011550903320312, "rewards/rejected": -43.76605987548828, "step": 3947 }, { "epoch": 0.5375816993464052, "grad_norm": 42.50871320285225, "learning_rate": 4.173923499824471e-07, "logits/chosen": 14.823174476623535, "logits/rejected": 14.399057388305664, "logps/chosen": -4.354358673095703, "logps/rejected": -4.362049102783203, "loss": 3.7693, "rewards/accuracies": 0.5, "rewards/chosen": -43.54358673095703, "rewards/margins": 0.0769052505493164, "rewards/rejected": -43.62049102783203, "step": 3948 }, { "epoch": 0.5377178649237473, "grad_norm": 45.563264504285804, "learning_rate": 4.172023875754033e-07, "logits/chosen": 13.536540985107422, "logits/rejected": 13.63758659362793, "logps/chosen": -4.229442596435547, "logps/rejected": -4.327672958374023, "loss": 4.3661, "rewards/accuracies": 0.75, "rewards/chosen": -42.294429779052734, "rewards/margins": 0.9822969436645508, "rewards/rejected": -43.27672576904297, "step": 3949 }, { "epoch": 0.5378540305010894, "grad_norm": 43.159377480966086, "learning_rate": 4.170124212813377e-07, "logits/chosen": 14.094701766967773, "logits/rejected": 13.516135215759277, "logps/chosen": -3.9406700134277344, "logps/rejected": -3.907845973968506, "loss": 3.9976, "rewards/accuracies": 0.5, "rewards/chosen": -39.40669631958008, "rewards/margins": -0.32823657989501953, "rewards/rejected": -39.078460693359375, "step": 3950 }, { "epoch": 0.5379901960784313, "grad_norm": 39.74862794166887, "learning_rate": 4.1682245114317503e-07, "logits/chosen": 13.218033790588379, "logits/rejected": 13.113439559936523, "logps/chosen": -3.7977561950683594, "logps/rejected": -3.8885231018066406, "loss": 3.8256, "rewards/accuracies": 0.5, "rewards/chosen": -37.977561950683594, "rewards/margins": 0.9076681137084961, "rewards/rejected": -38.885231018066406, "step": 3951 }, { "epoch": 0.5381263616557734, "grad_norm": 42.810263870284786, "learning_rate": 4.1663247720384047e-07, "logits/chosen": 14.005529403686523, "logits/rejected": 14.549776077270508, "logps/chosen": -4.312769889831543, "logps/rejected": -4.845755577087402, "loss": 3.7107, "rewards/accuracies": 0.75, "rewards/chosen": -43.12770080566406, "rewards/margins": 5.3298540115356445, "rewards/rejected": -48.45755386352539, "step": 3952 }, { "epoch": 0.5382625272331155, "grad_norm": 46.280327037756884, "learning_rate": 4.1644249950626016e-07, "logits/chosen": 12.373327255249023, "logits/rejected": 12.726079940795898, "logps/chosen": -3.7479374408721924, "logps/rejected": -3.930394172668457, "loss": 4.2559, "rewards/accuracies": 0.75, "rewards/chosen": -37.479373931884766, "rewards/margins": 1.8245677947998047, "rewards/rejected": -39.30393981933594, "step": 3953 }, { "epoch": 0.5383986928104575, "grad_norm": 67.73812962860812, "learning_rate": 4.1625251809336115e-07, "logits/chosen": 13.354833602905273, "logits/rejected": 14.076789855957031, "logps/chosen": -3.8014936447143555, "logps/rejected": -4.266733169555664, "loss": 3.9764, "rewards/accuracies": 0.75, "rewards/chosen": -38.01493835449219, "rewards/margins": 4.652390480041504, "rewards/rejected": -42.667327880859375, "step": 3954 }, { "epoch": 0.5385348583877996, "grad_norm": 54.01507664433019, "learning_rate": 4.1606253300807134e-07, "logits/chosen": 13.462711334228516, "logits/rejected": 13.438135147094727, "logps/chosen": -4.078380584716797, "logps/rejected": -4.036232948303223, "loss": 3.8626, "rewards/accuracies": 0.25, "rewards/chosen": -40.78380584716797, "rewards/margins": -0.4214763641357422, "rewards/rejected": -40.362327575683594, "step": 3955 }, { "epoch": 0.5386710239651417, "grad_norm": 38.99207998123072, "learning_rate": 4.1587254429331946e-07, "logits/chosen": 12.28652572631836, "logits/rejected": 13.208122253417969, "logps/chosen": -3.9802756309509277, "logps/rejected": -4.149044036865234, "loss": 3.8158, "rewards/accuracies": 0.75, "rewards/chosen": -39.802757263183594, "rewards/margins": 1.68768310546875, "rewards/rejected": -41.490440368652344, "step": 3956 }, { "epoch": 0.5388071895424836, "grad_norm": 43.478204514058255, "learning_rate": 4.1568255199203495e-07, "logits/chosen": 13.637746810913086, "logits/rejected": 14.184877395629883, "logps/chosen": -4.0079345703125, "logps/rejected": -4.433300018310547, "loss": 3.8585, "rewards/accuracies": 1.0, "rewards/chosen": -40.079345703125, "rewards/margins": 4.2536516189575195, "rewards/rejected": -44.33300018310547, "step": 3957 }, { "epoch": 0.5389433551198257, "grad_norm": 42.625102462202676, "learning_rate": 4.154925561471482e-07, "logits/chosen": 12.987112998962402, "logits/rejected": 13.706439971923828, "logps/chosen": -4.111141204833984, "logps/rejected": -4.513913154602051, "loss": 4.182, "rewards/accuracies": 1.0, "rewards/chosen": -41.111412048339844, "rewards/margins": 4.027717590332031, "rewards/rejected": -45.139129638671875, "step": 3958 }, { "epoch": 0.5390795206971678, "grad_norm": 40.443830497483205, "learning_rate": 4.153025568015903e-07, "logits/chosen": 14.142380714416504, "logits/rejected": 13.124650955200195, "logps/chosen": -4.19780158996582, "logps/rejected": -4.368656635284424, "loss": 3.9408, "rewards/accuracies": 0.75, "rewards/chosen": -41.97801208496094, "rewards/margins": 1.7085514068603516, "rewards/rejected": -43.68656539916992, "step": 3959 }, { "epoch": 0.5392156862745098, "grad_norm": 35.11236435068206, "learning_rate": 4.1511255399829324e-07, "logits/chosen": 12.47488021850586, "logits/rejected": 12.792724609375, "logps/chosen": -3.7265374660491943, "logps/rejected": -3.8733553886413574, "loss": 4.1702, "rewards/accuracies": 0.5, "rewards/chosen": -37.26537322998047, "rewards/margins": 1.468179702758789, "rewards/rejected": -38.733551025390625, "step": 3960 }, { "epoch": 0.5393518518518519, "grad_norm": 47.05134740501347, "learning_rate": 4.149225477801897e-07, "logits/chosen": 12.67272663116455, "logits/rejected": 13.814334869384766, "logps/chosen": -3.8128905296325684, "logps/rejected": -4.330898284912109, "loss": 4.2342, "rewards/accuracies": 0.75, "rewards/chosen": -38.128902435302734, "rewards/margins": 5.180078506469727, "rewards/rejected": -43.308982849121094, "step": 3961 }, { "epoch": 0.539488017429194, "grad_norm": 38.670383749985376, "learning_rate": 4.1473253819021306e-07, "logits/chosen": 12.852890014648438, "logits/rejected": 12.632152557373047, "logps/chosen": -3.9696896076202393, "logps/rejected": -4.005974292755127, "loss": 4.12, "rewards/accuracies": 0.75, "rewards/chosen": -39.696895599365234, "rewards/margins": 0.36284542083740234, "rewards/rejected": -40.05974197387695, "step": 3962 }, { "epoch": 0.5396241830065359, "grad_norm": 41.15488783259741, "learning_rate": 4.1454252527129767e-07, "logits/chosen": 13.262491226196289, "logits/rejected": 13.862070083618164, "logps/chosen": -4.279424667358398, "logps/rejected": -4.430659770965576, "loss": 4.1514, "rewards/accuracies": 0.75, "rewards/chosen": -42.794246673583984, "rewards/margins": 1.5123510360717773, "rewards/rejected": -44.30659484863281, "step": 3963 }, { "epoch": 0.539760348583878, "grad_norm": 37.59194258728894, "learning_rate": 4.143525090663784e-07, "logits/chosen": 13.38264274597168, "logits/rejected": 13.67825698852539, "logps/chosen": -3.722123622894287, "logps/rejected": -4.084212303161621, "loss": 3.5516, "rewards/accuracies": 1.0, "rewards/chosen": -37.22123718261719, "rewards/margins": 3.6208887100219727, "rewards/rejected": -40.842124938964844, "step": 3964 }, { "epoch": 0.5398965141612201, "grad_norm": 65.45926354911553, "learning_rate": 4.141624896183913e-07, "logits/chosen": 13.562349319458008, "logits/rejected": 14.398983001708984, "logps/chosen": -3.903047800064087, "logps/rejected": -4.152951240539551, "loss": 3.3227, "rewards/accuracies": 0.5, "rewards/chosen": -39.030479431152344, "rewards/margins": 2.499034881591797, "rewards/rejected": -41.529510498046875, "step": 3965 }, { "epoch": 0.5400326797385621, "grad_norm": 37.11751879620667, "learning_rate": 4.1397246697027237e-07, "logits/chosen": 12.585477828979492, "logits/rejected": 14.328006744384766, "logps/chosen": -3.6707022190093994, "logps/rejected": -4.1060309410095215, "loss": 3.7806, "rewards/accuracies": 1.0, "rewards/chosen": -36.70702362060547, "rewards/margins": 4.353287696838379, "rewards/rejected": -41.06031036376953, "step": 3966 }, { "epoch": 0.5401688453159041, "grad_norm": 41.21397583726758, "learning_rate": 4.137824411649592e-07, "logits/chosen": 13.671865463256836, "logits/rejected": 14.113882064819336, "logps/chosen": -4.0207719802856445, "logps/rejected": -4.267004013061523, "loss": 3.8904, "rewards/accuracies": 0.5, "rewards/chosen": -40.20771789550781, "rewards/margins": 2.462325096130371, "rewards/rejected": -42.670040130615234, "step": 3967 }, { "epoch": 0.5403050108932462, "grad_norm": 41.46025295148122, "learning_rate": 4.135924122453894e-07, "logits/chosen": 13.161111831665039, "logits/rejected": 14.284921646118164, "logps/chosen": -4.008277416229248, "logps/rejected": -4.32925271987915, "loss": 4.1274, "rewards/accuracies": 0.75, "rewards/chosen": -40.08277130126953, "rewards/margins": 3.20975399017334, "rewards/rejected": -43.29252624511719, "step": 3968 }, { "epoch": 0.5404411764705882, "grad_norm": 39.99423134417397, "learning_rate": 4.134023802545017e-07, "logits/chosen": 12.201994895935059, "logits/rejected": 12.954181671142578, "logps/chosen": -3.9767658710479736, "logps/rejected": -4.2992658615112305, "loss": 4.3866, "rewards/accuracies": 1.0, "rewards/chosen": -39.767662048339844, "rewards/margins": 3.2250003814697266, "rewards/rejected": -42.99266052246094, "step": 3969 }, { "epoch": 0.5405773420479303, "grad_norm": 50.498368678048166, "learning_rate": 4.1321234523523546e-07, "logits/chosen": 13.779993057250977, "logits/rejected": 14.129583358764648, "logps/chosen": -4.319197177886963, "logps/rejected": -4.182341575622559, "loss": 3.9768, "rewards/accuracies": 0.25, "rewards/chosen": -43.19197082519531, "rewards/margins": -1.3685541152954102, "rewards/rejected": -41.82341766357422, "step": 3970 }, { "epoch": 0.5407135076252724, "grad_norm": 38.555365327977675, "learning_rate": 4.1302230723053053e-07, "logits/chosen": 13.797410011291504, "logits/rejected": 15.621380805969238, "logps/chosen": -4.542912006378174, "logps/rejected": -4.7710466384887695, "loss": 3.8776, "rewards/accuracies": 0.5, "rewards/chosen": -45.42912292480469, "rewards/margins": 2.2813472747802734, "rewards/rejected": -47.71046829223633, "step": 3971 }, { "epoch": 0.5408496732026143, "grad_norm": 38.504751445678274, "learning_rate": 4.1283226628332774e-07, "logits/chosen": 12.417388916015625, "logits/rejected": 13.705854415893555, "logps/chosen": -3.8974955081939697, "logps/rejected": -4.437998294830322, "loss": 3.576, "rewards/accuracies": 0.75, "rewards/chosen": -38.974952697753906, "rewards/margins": 5.405027389526367, "rewards/rejected": -44.379981994628906, "step": 3972 }, { "epoch": 0.5409858387799564, "grad_norm": 45.27637108815095, "learning_rate": 4.126422224365683e-07, "logits/chosen": 14.62574291229248, "logits/rejected": 14.465181350708008, "logps/chosen": -4.2131500244140625, "logps/rejected": -4.43568754196167, "loss": 4.4563, "rewards/accuracies": 0.75, "rewards/chosen": -42.13149642944336, "rewards/margins": 2.22537899017334, "rewards/rejected": -44.35687255859375, "step": 3973 }, { "epoch": 0.5411220043572985, "grad_norm": 45.01105525321125, "learning_rate": 4.1245217573319407e-07, "logits/chosen": 13.504632949829102, "logits/rejected": 13.437125205993652, "logps/chosen": -4.015403747558594, "logps/rejected": -4.0402374267578125, "loss": 3.8882, "rewards/accuracies": 0.25, "rewards/chosen": -40.15403747558594, "rewards/margins": 0.2483367919921875, "rewards/rejected": -40.40237808227539, "step": 3974 }, { "epoch": 0.5412581699346405, "grad_norm": 37.089866831235604, "learning_rate": 4.1226212621614793e-07, "logits/chosen": 12.113395690917969, "logits/rejected": 14.191292762756348, "logps/chosen": -4.028306007385254, "logps/rejected": -4.519378185272217, "loss": 3.7291, "rewards/accuracies": 1.0, "rewards/chosen": -40.283058166503906, "rewards/margins": 4.910726547241211, "rewards/rejected": -45.19378662109375, "step": 3975 }, { "epoch": 0.5413943355119826, "grad_norm": 41.066668246816846, "learning_rate": 4.1207207392837306e-07, "logits/chosen": 13.967863082885742, "logits/rejected": 13.881942749023438, "logps/chosen": -3.9688026905059814, "logps/rejected": -4.397091865539551, "loss": 3.4018, "rewards/accuracies": 1.0, "rewards/chosen": -39.688026428222656, "rewards/margins": 4.282889366149902, "rewards/rejected": -43.970916748046875, "step": 3976 }, { "epoch": 0.5415305010893247, "grad_norm": 38.53503125238627, "learning_rate": 4.118820189128131e-07, "logits/chosen": 13.072702407836914, "logits/rejected": 15.12685775756836, "logps/chosen": -3.9312241077423096, "logps/rejected": -4.4216203689575195, "loss": 3.6913, "rewards/accuracies": 1.0, "rewards/chosen": -39.31224060058594, "rewards/margins": 4.903959274291992, "rewards/rejected": -44.21620178222656, "step": 3977 }, { "epoch": 0.5416666666666666, "grad_norm": 53.84937642312318, "learning_rate": 4.116919612124129e-07, "logits/chosen": 13.281122207641602, "logits/rejected": 13.722607612609863, "logps/chosen": -4.0579681396484375, "logps/rejected": -4.3180766105651855, "loss": 4.137, "rewards/accuracies": 0.75, "rewards/chosen": -40.579681396484375, "rewards/margins": 2.601088523864746, "rewards/rejected": -43.18076705932617, "step": 3978 }, { "epoch": 0.5418028322440087, "grad_norm": 42.02108021934043, "learning_rate": 4.115019008701174e-07, "logits/chosen": 13.937246322631836, "logits/rejected": 14.927652359008789, "logps/chosen": -4.228771209716797, "logps/rejected": -4.157874584197998, "loss": 4.0762, "rewards/accuracies": 0.25, "rewards/chosen": -42.28771209716797, "rewards/margins": -0.7089710235595703, "rewards/rejected": -41.57874298095703, "step": 3979 }, { "epoch": 0.5419389978213508, "grad_norm": 48.39593443301099, "learning_rate": 4.113118379288722e-07, "logits/chosen": 13.232918739318848, "logits/rejected": 14.412696838378906, "logps/chosen": -4.159663200378418, "logps/rejected": -4.39498233795166, "loss": 3.8236, "rewards/accuracies": 0.75, "rewards/chosen": -41.59662628173828, "rewards/margins": 2.3531932830810547, "rewards/rejected": -43.94982147216797, "step": 3980 }, { "epoch": 0.5420751633986928, "grad_norm": 41.07360456832664, "learning_rate": 4.1112177243162386e-07, "logits/chosen": 13.252120018005371, "logits/rejected": 13.755800247192383, "logps/chosen": -3.6918673515319824, "logps/rejected": -3.9848999977111816, "loss": 4.1517, "rewards/accuracies": 0.5, "rewards/chosen": -36.918670654296875, "rewards/margins": 2.930326461791992, "rewards/rejected": -39.8489990234375, "step": 3981 }, { "epoch": 0.5422113289760349, "grad_norm": 41.24156916327603, "learning_rate": 4.109317044213191e-07, "logits/chosen": 13.490283966064453, "logits/rejected": 14.47474479675293, "logps/chosen": -4.274342060089111, "logps/rejected": -4.5945892333984375, "loss": 3.7605, "rewards/accuracies": 0.75, "rewards/chosen": -42.7434196472168, "rewards/margins": 3.202467918395996, "rewards/rejected": -45.94588851928711, "step": 3982 }, { "epoch": 0.5423474945533769, "grad_norm": 40.69441766674396, "learning_rate": 4.1074163394090535e-07, "logits/chosen": 13.123648643493652, "logits/rejected": 13.406463623046875, "logps/chosen": -4.409494400024414, "logps/rejected": -4.530271053314209, "loss": 4.057, "rewards/accuracies": 0.5, "rewards/chosen": -44.094947814941406, "rewards/margins": 1.2077608108520508, "rewards/rejected": -45.30270767211914, "step": 3983 }, { "epoch": 0.5424836601307189, "grad_norm": 40.38110608215769, "learning_rate": 4.105515610333306e-07, "logits/chosen": 13.919778823852539, "logits/rejected": 13.467425346374512, "logps/chosen": -4.478675842285156, "logps/rejected": -4.498266220092773, "loss": 3.9915, "rewards/accuracies": 0.75, "rewards/chosen": -44.78675842285156, "rewards/margins": 0.19590377807617188, "rewards/rejected": -44.982662200927734, "step": 3984 }, { "epoch": 0.542619825708061, "grad_norm": 43.420968440512304, "learning_rate": 4.103614857415434e-07, "logits/chosen": 11.92628288269043, "logits/rejected": 13.074661254882812, "logps/chosen": -3.7322380542755127, "logps/rejected": -3.867755889892578, "loss": 4.3017, "rewards/accuracies": 0.5, "rewards/chosen": -37.32238006591797, "rewards/margins": 1.3551788330078125, "rewards/rejected": -38.67755889892578, "step": 3985 }, { "epoch": 0.5427559912854031, "grad_norm": 41.52026038348263, "learning_rate": 4.1017140810849285e-07, "logits/chosen": 13.470172882080078, "logits/rejected": 14.280742645263672, "logps/chosen": -4.0768232345581055, "logps/rejected": -4.60802698135376, "loss": 4.0066, "rewards/accuracies": 1.0, "rewards/chosen": -40.768226623535156, "rewards/margins": 5.312043190002441, "rewards/rejected": -46.08027267456055, "step": 3986 }, { "epoch": 0.5428921568627451, "grad_norm": 39.571056427087576, "learning_rate": 4.0998132817712853e-07, "logits/chosen": 13.488166809082031, "logits/rejected": 14.485198974609375, "logps/chosen": -4.12639045715332, "logps/rejected": -4.536922454833984, "loss": 3.6798, "rewards/accuracies": 0.75, "rewards/chosen": -41.2639045715332, "rewards/margins": 4.105320930480957, "rewards/rejected": -45.369224548339844, "step": 3987 }, { "epoch": 0.5430283224400871, "grad_norm": 53.30878782816014, "learning_rate": 4.097912459904007e-07, "logits/chosen": 12.736512184143066, "logits/rejected": 13.84950065612793, "logps/chosen": -4.059711933135986, "logps/rejected": -4.332139492034912, "loss": 4.7754, "rewards/accuracies": 0.5, "rewards/chosen": -40.59712219238281, "rewards/margins": 2.724275588989258, "rewards/rejected": -43.32139587402344, "step": 3988 }, { "epoch": 0.5431644880174292, "grad_norm": 39.32096671960042, "learning_rate": 4.096011615912598e-07, "logits/chosen": 12.780851364135742, "logits/rejected": 13.742314338684082, "logps/chosen": -3.8600106239318848, "logps/rejected": -4.160880088806152, "loss": 3.9046, "rewards/accuracies": 0.75, "rewards/chosen": -38.60010528564453, "rewards/margins": 3.0086917877197266, "rewards/rejected": -41.60879898071289, "step": 3989 }, { "epoch": 0.5433006535947712, "grad_norm": 43.16479726550539, "learning_rate": 4.094110750226571e-07, "logits/chosen": 14.392956733703613, "logits/rejected": 14.006645202636719, "logps/chosen": -4.31650447845459, "logps/rejected": -4.505996227264404, "loss": 4.3972, "rewards/accuracies": 1.0, "rewards/chosen": -43.16504669189453, "rewards/margins": 1.8949155807495117, "rewards/rejected": -45.059959411621094, "step": 3990 }, { "epoch": 0.5434368191721133, "grad_norm": 53.935923494616766, "learning_rate": 4.0922098632754424e-07, "logits/chosen": 13.835236549377441, "logits/rejected": 14.446317672729492, "logps/chosen": -4.262257099151611, "logps/rejected": -4.467970848083496, "loss": 3.7801, "rewards/accuracies": 0.75, "rewards/chosen": -42.6225700378418, "rewards/margins": 2.057140350341797, "rewards/rejected": -44.679710388183594, "step": 3991 }, { "epoch": 0.5435729847494554, "grad_norm": 44.19213474104512, "learning_rate": 4.0903089554887324e-07, "logits/chosen": 14.01333999633789, "logits/rejected": 14.888741493225098, "logps/chosen": -4.256174087524414, "logps/rejected": -4.4774298667907715, "loss": 3.9309, "rewards/accuracies": 0.75, "rewards/chosen": -42.561744689941406, "rewards/margins": 2.212554931640625, "rewards/rejected": -44.77429962158203, "step": 3992 }, { "epoch": 0.5437091503267973, "grad_norm": 40.840198072134314, "learning_rate": 4.088408027295968e-07, "logits/chosen": 13.386340141296387, "logits/rejected": 12.731051445007324, "logps/chosen": -4.297686576843262, "logps/rejected": -4.179710388183594, "loss": 4.1075, "rewards/accuracies": 0.5, "rewards/chosen": -42.976871490478516, "rewards/margins": -1.179769515991211, "rewards/rejected": -41.79710006713867, "step": 3993 }, { "epoch": 0.5438453159041394, "grad_norm": 44.42159970075908, "learning_rate": 4.0865070791266796e-07, "logits/chosen": 13.71432113647461, "logits/rejected": 14.004265785217285, "logps/chosen": -4.231881618499756, "logps/rejected": -4.493437767028809, "loss": 3.6267, "rewards/accuracies": 0.75, "rewards/chosen": -42.318817138671875, "rewards/margins": 2.615560531616211, "rewards/rejected": -44.93437957763672, "step": 3994 }, { "epoch": 0.5439814814814815, "grad_norm": 38.01784220603555, "learning_rate": 4.0846061114103997e-07, "logits/chosen": 13.635110855102539, "logits/rejected": 13.686494827270508, "logps/chosen": -4.067500114440918, "logps/rejected": -4.344271659851074, "loss": 3.6549, "rewards/accuracies": 0.75, "rewards/chosen": -40.67499923706055, "rewards/margins": 2.7677183151245117, "rewards/rejected": -43.442718505859375, "step": 3995 }, { "epoch": 0.5441176470588235, "grad_norm": 39.74707021396795, "learning_rate": 4.0827051245766714e-07, "logits/chosen": 13.559261322021484, "logits/rejected": 14.181262016296387, "logps/chosen": -3.8330721855163574, "logps/rejected": -4.213889122009277, "loss": 3.8463, "rewards/accuracies": 0.75, "rewards/chosen": -38.330718994140625, "rewards/margins": 3.8081750869750977, "rewards/rejected": -42.13889694213867, "step": 3996 }, { "epoch": 0.5442538126361656, "grad_norm": 41.036200528127594, "learning_rate": 4.080804119055036e-07, "logits/chosen": 12.433666229248047, "logits/rejected": 13.408987045288086, "logps/chosen": -4.069488525390625, "logps/rejected": -4.555174350738525, "loss": 3.8276, "rewards/accuracies": 1.0, "rewards/chosen": -40.694889068603516, "rewards/margins": 4.856855392456055, "rewards/rejected": -45.55174255371094, "step": 3997 }, { "epoch": 0.5443899782135077, "grad_norm": 43.161410793329026, "learning_rate": 4.0789030952750416e-07, "logits/chosen": 13.46710205078125, "logits/rejected": 13.855178833007812, "logps/chosen": -4.640745639801025, "logps/rejected": -4.3682050704956055, "loss": 4.5465, "rewards/accuracies": 0.5, "rewards/chosen": -46.40745544433594, "rewards/margins": -2.7254104614257812, "rewards/rejected": -43.682044982910156, "step": 3998 }, { "epoch": 0.5445261437908496, "grad_norm": 46.83820697625048, "learning_rate": 4.0770020536662406e-07, "logits/chosen": 13.224638938903809, "logits/rejected": 13.738781929016113, "logps/chosen": -4.1544694900512695, "logps/rejected": -4.4576568603515625, "loss": 4.2086, "rewards/accuracies": 0.75, "rewards/chosen": -41.54469680786133, "rewards/margins": 3.031869888305664, "rewards/rejected": -44.576568603515625, "step": 3999 }, { "epoch": 0.5446623093681917, "grad_norm": 55.36668145016897, "learning_rate": 4.0751009946581896e-07, "logits/chosen": 13.251179695129395, "logits/rejected": 13.389915466308594, "logps/chosen": -4.089744567871094, "logps/rejected": -4.170332431793213, "loss": 3.9071, "rewards/accuracies": 0.5, "rewards/chosen": -40.89744186401367, "rewards/margins": 0.8058805465698242, "rewards/rejected": -41.70332336425781, "step": 4000 }, { "epoch": 0.5447984749455338, "grad_norm": 43.65701970604205, "learning_rate": 4.0731999186804476e-07, "logits/chosen": 12.363313674926758, "logits/rejected": 13.901264190673828, "logps/chosen": -3.910581588745117, "logps/rejected": -4.286228179931641, "loss": 3.7532, "rewards/accuracies": 1.0, "rewards/chosen": -39.10581970214844, "rewards/margins": 3.756466865539551, "rewards/rejected": -42.862281799316406, "step": 4001 }, { "epoch": 0.5449346405228758, "grad_norm": 37.20098499422243, "learning_rate": 4.071298826162579e-07, "logits/chosen": 12.581464767456055, "logits/rejected": 14.203594207763672, "logps/chosen": -4.027983665466309, "logps/rejected": -4.410909652709961, "loss": 3.5588, "rewards/accuracies": 1.0, "rewards/chosen": -40.27983856201172, "rewards/margins": 3.829258918762207, "rewards/rejected": -44.109100341796875, "step": 4002 }, { "epoch": 0.5450708061002179, "grad_norm": 44.51106373098045, "learning_rate": 4.0693977175341514e-07, "logits/chosen": 12.902685165405273, "logits/rejected": 12.910966873168945, "logps/chosen": -3.848005533218384, "logps/rejected": -4.229918956756592, "loss": 3.739, "rewards/accuracies": 1.0, "rewards/chosen": -38.48005676269531, "rewards/margins": 3.819136619567871, "rewards/rejected": -42.2991943359375, "step": 4003 }, { "epoch": 0.5452069716775599, "grad_norm": 38.403391243143126, "learning_rate": 4.0674965932247354e-07, "logits/chosen": 12.69202709197998, "logits/rejected": 13.384075164794922, "logps/chosen": -3.75034761428833, "logps/rejected": -4.034526824951172, "loss": 3.9252, "rewards/accuracies": 0.5, "rewards/chosen": -37.50347900390625, "rewards/margins": 2.841792106628418, "rewards/rejected": -40.345272064208984, "step": 4004 }, { "epoch": 0.5453431372549019, "grad_norm": 41.49428258775634, "learning_rate": 4.065595453663907e-07, "logits/chosen": 13.922093391418457, "logits/rejected": 13.951628684997559, "logps/chosen": -4.119996070861816, "logps/rejected": -4.5496649742126465, "loss": 3.935, "rewards/accuracies": 1.0, "rewards/chosen": -41.19995880126953, "rewards/margins": 4.296690940856934, "rewards/rejected": -45.49665069580078, "step": 4005 }, { "epoch": 0.545479302832244, "grad_norm": 41.76315214433174, "learning_rate": 4.063694299281244e-07, "logits/chosen": 13.227700233459473, "logits/rejected": 13.421677589416504, "logps/chosen": -3.8757312297821045, "logps/rejected": -3.962325096130371, "loss": 3.9973, "rewards/accuracies": 0.5, "rewards/chosen": -38.75730895996094, "rewards/margins": 0.8659420013427734, "rewards/rejected": -39.623252868652344, "step": 4006 }, { "epoch": 0.5456154684095861, "grad_norm": 39.39335595057322, "learning_rate": 4.061793130506326e-07, "logits/chosen": 13.118064880371094, "logits/rejected": 12.498208999633789, "logps/chosen": -4.06903076171875, "logps/rejected": -4.228053092956543, "loss": 4.1767, "rewards/accuracies": 0.75, "rewards/chosen": -40.6903076171875, "rewards/margins": 1.5902271270751953, "rewards/rejected": -42.28053283691406, "step": 4007 }, { "epoch": 0.545751633986928, "grad_norm": 46.57926475119744, "learning_rate": 4.05989194776874e-07, "logits/chosen": 12.714574813842773, "logits/rejected": 13.415656089782715, "logps/chosen": -4.149848937988281, "logps/rejected": -4.325741767883301, "loss": 3.5626, "rewards/accuracies": 0.75, "rewards/chosen": -41.49849319458008, "rewards/margins": 1.7589263916015625, "rewards/rejected": -43.25741958618164, "step": 4008 }, { "epoch": 0.5458877995642701, "grad_norm": 37.66391348949662, "learning_rate": 4.0579907514980744e-07, "logits/chosen": 11.989877700805664, "logits/rejected": 14.43841552734375, "logps/chosen": -3.6462316513061523, "logps/rejected": -4.436426639556885, "loss": 4.0342, "rewards/accuracies": 1.0, "rewards/chosen": -36.46231460571289, "rewards/margins": 7.901952743530273, "rewards/rejected": -44.36426544189453, "step": 4009 }, { "epoch": 0.5460239651416122, "grad_norm": 38.9375379405906, "learning_rate": 4.056089542123917e-07, "logits/chosen": 12.770715713500977, "logits/rejected": 13.746759414672852, "logps/chosen": -3.9003145694732666, "logps/rejected": -4.411188125610352, "loss": 3.7609, "rewards/accuracies": 1.0, "rewards/chosen": -39.00314712524414, "rewards/margins": 5.108733177185059, "rewards/rejected": -44.11187744140625, "step": 4010 }, { "epoch": 0.5461601307189542, "grad_norm": 39.39796774668724, "learning_rate": 4.054188320075866e-07, "logits/chosen": 13.339700698852539, "logits/rejected": 13.761130332946777, "logps/chosen": -4.369213104248047, "logps/rejected": -4.1765594482421875, "loss": 3.9105, "rewards/accuracies": 0.75, "rewards/chosen": -43.69213104248047, "rewards/margins": -1.9265403747558594, "rewards/rejected": -41.765594482421875, "step": 4011 }, { "epoch": 0.5462962962962963, "grad_norm": 37.1674892943354, "learning_rate": 4.052287085783515e-07, "logits/chosen": 12.8147554397583, "logits/rejected": 13.6863431930542, "logps/chosen": -3.8113484382629395, "logps/rejected": -4.246033668518066, "loss": 3.1795, "rewards/accuracies": 0.75, "rewards/chosen": -38.11348342895508, "rewards/margins": 4.346850395202637, "rewards/rejected": -42.46033477783203, "step": 4012 }, { "epoch": 0.5464324618736384, "grad_norm": 45.48695898294945, "learning_rate": 4.0503858396764655e-07, "logits/chosen": 13.654877662658691, "logits/rejected": 14.852989196777344, "logps/chosen": -4.098884105682373, "logps/rejected": -4.580202579498291, "loss": 3.8259, "rewards/accuracies": 1.0, "rewards/chosen": -40.98884201049805, "rewards/margins": 4.81318473815918, "rewards/rejected": -45.802024841308594, "step": 4013 }, { "epoch": 0.5465686274509803, "grad_norm": 39.48359980523494, "learning_rate": 4.0484845821843184e-07, "logits/chosen": 14.825677871704102, "logits/rejected": 14.733652114868164, "logps/chosen": -4.673675537109375, "logps/rejected": -4.580453395843506, "loss": 3.9365, "rewards/accuracies": 0.5, "rewards/chosen": -46.73675537109375, "rewards/margins": -0.9322185516357422, "rewards/rejected": -45.804534912109375, "step": 4014 }, { "epoch": 0.5467047930283224, "grad_norm": 39.3786276199812, "learning_rate": 4.046583313736679e-07, "logits/chosen": 12.925481796264648, "logits/rejected": 13.030495643615723, "logps/chosen": -3.702942132949829, "logps/rejected": -3.9322822093963623, "loss": 3.063, "rewards/accuracies": 0.75, "rewards/chosen": -37.0294189453125, "rewards/margins": 2.293402671813965, "rewards/rejected": -39.32282257080078, "step": 4015 }, { "epoch": 0.5468409586056645, "grad_norm": 45.560272231436045, "learning_rate": 4.0446820347631555e-07, "logits/chosen": 13.530440330505371, "logits/rejected": 15.366791725158691, "logps/chosen": -4.106325626373291, "logps/rejected": -4.6761322021484375, "loss": 4.0868, "rewards/accuracies": 1.0, "rewards/chosen": -41.063255310058594, "rewards/margins": 5.698066711425781, "rewards/rejected": -46.761322021484375, "step": 4016 }, { "epoch": 0.5469771241830066, "grad_norm": 41.11364482149417, "learning_rate": 4.0427807456933565e-07, "logits/chosen": 13.292739868164062, "logits/rejected": 13.241762161254883, "logps/chosen": -3.9632887840270996, "logps/rejected": -4.368724822998047, "loss": 4.336, "rewards/accuracies": 0.75, "rewards/chosen": -39.63288879394531, "rewards/margins": 4.054357528686523, "rewards/rejected": -43.6872444152832, "step": 4017 }, { "epoch": 0.5471132897603486, "grad_norm": 42.17883179986673, "learning_rate": 4.0408794469568946e-07, "logits/chosen": 12.404197692871094, "logits/rejected": 13.444579124450684, "logps/chosen": -4.0170464515686035, "logps/rejected": -4.182412147521973, "loss": 3.8997, "rewards/accuracies": 0.75, "rewards/chosen": -40.170467376708984, "rewards/margins": 1.6536521911621094, "rewards/rejected": -41.824119567871094, "step": 4018 }, { "epoch": 0.5472494553376906, "grad_norm": 45.04718014696314, "learning_rate": 4.038978138983383e-07, "logits/chosen": 13.152289390563965, "logits/rejected": 13.564958572387695, "logps/chosen": -4.03183650970459, "logps/rejected": -4.062413215637207, "loss": 4.1469, "rewards/accuracies": 0.5, "rewards/chosen": -40.31836700439453, "rewards/margins": 0.3057689666748047, "rewards/rejected": -40.6241340637207, "step": 4019 }, { "epoch": 0.5473856209150327, "grad_norm": 44.58754587718231, "learning_rate": 4.0370768222024393e-07, "logits/chosen": 13.876047134399414, "logits/rejected": 14.118085861206055, "logps/chosen": -4.330323696136475, "logps/rejected": -4.301039695739746, "loss": 4.3513, "rewards/accuracies": 0.5, "rewards/chosen": -43.30323791503906, "rewards/margins": -0.29283809661865234, "rewards/rejected": -43.010398864746094, "step": 4020 }, { "epoch": 0.5475217864923747, "grad_norm": 39.3844012177636, "learning_rate": 4.0351754970436815e-07, "logits/chosen": 12.338096618652344, "logits/rejected": 13.717299461364746, "logps/chosen": -3.795046806335449, "logps/rejected": -4.390833377838135, "loss": 4.0927, "rewards/accuracies": 1.0, "rewards/chosen": -37.95046615600586, "rewards/margins": 5.957866668701172, "rewards/rejected": -43.90833282470703, "step": 4021 }, { "epoch": 0.5476579520697168, "grad_norm": 38.51651590723026, "learning_rate": 4.0332741639367285e-07, "logits/chosen": 13.342750549316406, "logits/rejected": 13.179201126098633, "logps/chosen": -4.020237922668457, "logps/rejected": -4.272456169128418, "loss": 3.5778, "rewards/accuracies": 0.75, "rewards/chosen": -40.20237731933594, "rewards/margins": 2.52217960357666, "rewards/rejected": -42.72455596923828, "step": 4022 }, { "epoch": 0.5477941176470589, "grad_norm": 40.73646608042097, "learning_rate": 4.031372823311204e-07, "logits/chosen": 13.338406562805176, "logits/rejected": 12.966423034667969, "logps/chosen": -4.018775939941406, "logps/rejected": -3.992692470550537, "loss": 4.1886, "rewards/accuracies": 0.25, "rewards/chosen": -40.1877555847168, "rewards/margins": -0.2608318328857422, "rewards/rejected": -39.92692565917969, "step": 4023 }, { "epoch": 0.5479302832244008, "grad_norm": 43.06239378272661, "learning_rate": 4.0294714755967307e-07, "logits/chosen": 13.714783668518066, "logits/rejected": 13.7046480178833, "logps/chosen": -4.222851753234863, "logps/rejected": -4.417914390563965, "loss": 3.8744, "rewards/accuracies": 0.75, "rewards/chosen": -42.228515625, "rewards/margins": 1.9506330490112305, "rewards/rejected": -44.17914962768555, "step": 4024 }, { "epoch": 0.5480664488017429, "grad_norm": 43.41358678925225, "learning_rate": 4.0275701212229335e-07, "logits/chosen": 14.157402038574219, "logits/rejected": 13.220904350280762, "logps/chosen": -4.424874782562256, "logps/rejected": -4.369536876678467, "loss": 3.9912, "rewards/accuracies": 0.25, "rewards/chosen": -44.24874496459961, "rewards/margins": -0.5533790588378906, "rewards/rejected": -43.695369720458984, "step": 4025 }, { "epoch": 0.548202614379085, "grad_norm": 40.258751068937, "learning_rate": 4.02566876061944e-07, "logits/chosen": 13.277074813842773, "logits/rejected": 13.765192031860352, "logps/chosen": -4.294135570526123, "logps/rejected": -4.33040714263916, "loss": 3.6018, "rewards/accuracies": 0.5, "rewards/chosen": -42.94136047363281, "rewards/margins": 0.3627138137817383, "rewards/rejected": -43.30406951904297, "step": 4026 }, { "epoch": 0.548338779956427, "grad_norm": 44.790017044894846, "learning_rate": 4.023767394215878e-07, "logits/chosen": 13.264036178588867, "logits/rejected": 13.214515686035156, "logps/chosen": -4.352449893951416, "logps/rejected": -4.211770057678223, "loss": 4.3046, "rewards/accuracies": 0.25, "rewards/chosen": -43.524497985839844, "rewards/margins": -1.4067935943603516, "rewards/rejected": -42.11770248413086, "step": 4027 }, { "epoch": 0.5484749455337691, "grad_norm": 49.57489061395909, "learning_rate": 4.021866022441875e-07, "logits/chosen": 13.84632682800293, "logits/rejected": 13.164730072021484, "logps/chosen": -4.2867937088012695, "logps/rejected": -4.100996494293213, "loss": 3.7949, "rewards/accuracies": 0.0, "rewards/chosen": -42.86793518066406, "rewards/margins": -1.85797119140625, "rewards/rejected": -41.00996398925781, "step": 4028 }, { "epoch": 0.5486111111111112, "grad_norm": 43.94327531911699, "learning_rate": 4.019964645727065e-07, "logits/chosen": 12.639469146728516, "logits/rejected": 12.952068328857422, "logps/chosen": -4.131218910217285, "logps/rejected": -4.095638751983643, "loss": 3.7569, "rewards/accuracies": 0.5, "rewards/chosen": -41.31218719482422, "rewards/margins": -0.3558025360107422, "rewards/rejected": -40.95638656616211, "step": 4029 }, { "epoch": 0.5487472766884531, "grad_norm": 41.31407088239805, "learning_rate": 4.0180632645010784e-07, "logits/chosen": 13.461576461791992, "logits/rejected": 14.119556427001953, "logps/chosen": -4.384284496307373, "logps/rejected": -4.7330780029296875, "loss": 4.0063, "rewards/accuracies": 1.0, "rewards/chosen": -43.84284210205078, "rewards/margins": 3.487931251525879, "rewards/rejected": -47.33077621459961, "step": 4030 }, { "epoch": 0.5488834422657952, "grad_norm": 40.63567511670206, "learning_rate": 4.0161618791935474e-07, "logits/chosen": 12.52675724029541, "logits/rejected": 12.779163360595703, "logps/chosen": -4.048211574554443, "logps/rejected": -4.1403021812438965, "loss": 4.2438, "rewards/accuracies": 0.75, "rewards/chosen": -40.48211669921875, "rewards/margins": 0.9209051132202148, "rewards/rejected": -41.40302276611328, "step": 4031 }, { "epoch": 0.5490196078431373, "grad_norm": 41.664298291677675, "learning_rate": 4.0142604902341064e-07, "logits/chosen": 13.035894393920898, "logits/rejected": 13.069928169250488, "logps/chosen": -3.8854355812072754, "logps/rejected": -4.127028465270996, "loss": 3.6943, "rewards/accuracies": 0.75, "rewards/chosen": -38.8543586730957, "rewards/margins": 2.41593074798584, "rewards/rejected": -41.270286560058594, "step": 4032 }, { "epoch": 0.5491557734204793, "grad_norm": 47.80505323124095, "learning_rate": 4.01235909805239e-07, "logits/chosen": 12.258615493774414, "logits/rejected": 13.166374206542969, "logps/chosen": -3.7930028438568115, "logps/rejected": -3.9381237030029297, "loss": 3.83, "rewards/accuracies": 0.75, "rewards/chosen": -37.93002700805664, "rewards/margins": 1.4512100219726562, "rewards/rejected": -39.3812370300293, "step": 4033 }, { "epoch": 0.5492919389978214, "grad_norm": 42.27937628207364, "learning_rate": 4.0104577030780316e-07, "logits/chosen": 12.510908126831055, "logits/rejected": 13.112000465393066, "logps/chosen": -4.058789253234863, "logps/rejected": -4.291640758514404, "loss": 4.1262, "rewards/accuracies": 1.0, "rewards/chosen": -40.587894439697266, "rewards/margins": 2.3285131454467773, "rewards/rejected": -42.91640853881836, "step": 4034 }, { "epoch": 0.5494281045751634, "grad_norm": 41.05031394902103, "learning_rate": 4.0085563057406714e-07, "logits/chosen": 12.22315788269043, "logits/rejected": 14.129680633544922, "logps/chosen": -4.028223514556885, "logps/rejected": -4.546592712402344, "loss": 3.3873, "rewards/accuracies": 0.75, "rewards/chosen": -40.28223419189453, "rewards/margins": 5.1836957931518555, "rewards/rejected": -45.4659309387207, "step": 4035 }, { "epoch": 0.5495642701525054, "grad_norm": 41.74914675958989, "learning_rate": 4.0066549064699415e-07, "logits/chosen": 14.03958797454834, "logits/rejected": 13.61282730102539, "logps/chosen": -3.9050192832946777, "logps/rejected": -3.8512067794799805, "loss": 4.1922, "rewards/accuracies": 0.5, "rewards/chosen": -39.050193786621094, "rewards/margins": -0.5381245613098145, "rewards/rejected": -38.51206970214844, "step": 4036 }, { "epoch": 0.5497004357298475, "grad_norm": 40.7080309139954, "learning_rate": 4.00475350569548e-07, "logits/chosen": 12.125373840332031, "logits/rejected": 12.838520050048828, "logps/chosen": -4.100594997406006, "logps/rejected": -4.246172904968262, "loss": 4.415, "rewards/accuracies": 0.75, "rewards/chosen": -41.00594711303711, "rewards/margins": 1.4557790756225586, "rewards/rejected": -42.46173095703125, "step": 4037 }, { "epoch": 0.5498366013071896, "grad_norm": 36.33521615390697, "learning_rate": 4.0028521038469265e-07, "logits/chosen": 13.033985137939453, "logits/rejected": 12.75755500793457, "logps/chosen": -4.332101821899414, "logps/rejected": -4.069838523864746, "loss": 3.8229, "rewards/accuracies": 0.5, "rewards/chosen": -43.321014404296875, "rewards/margins": -2.6226301193237305, "rewards/rejected": -40.698387145996094, "step": 4038 }, { "epoch": 0.5499727668845316, "grad_norm": 43.11592515117052, "learning_rate": 4.0009507013539155e-07, "logits/chosen": 13.218280792236328, "logits/rejected": 12.859346389770508, "logps/chosen": -4.087268829345703, "logps/rejected": -3.986767292022705, "loss": 3.8125, "rewards/accuracies": 0.25, "rewards/chosen": -40.8726921081543, "rewards/margins": -1.0050172805786133, "rewards/rejected": -39.867671966552734, "step": 4039 }, { "epoch": 0.5501089324618736, "grad_norm": 43.36289595874017, "learning_rate": 3.9990492986460847e-07, "logits/chosen": 12.915783882141113, "logits/rejected": 13.708295822143555, "logps/chosen": -4.207658767700195, "logps/rejected": -4.539768695831299, "loss": 4.6272, "rewards/accuracies": 1.0, "rewards/chosen": -42.07658386230469, "rewards/margins": 3.321099281311035, "rewards/rejected": -45.39768600463867, "step": 4040 }, { "epoch": 0.5502450980392157, "grad_norm": 44.32857261412355, "learning_rate": 3.997147896153073e-07, "logits/chosen": 13.017640113830566, "logits/rejected": 14.207365036010742, "logps/chosen": -3.9374494552612305, "logps/rejected": -4.463679313659668, "loss": 4.0097, "rewards/accuracies": 0.75, "rewards/chosen": -39.37449264526367, "rewards/margins": 5.262303352355957, "rewards/rejected": -44.63679504394531, "step": 4041 }, { "epoch": 0.5503812636165577, "grad_norm": 39.71567223978346, "learning_rate": 3.99524649430452e-07, "logits/chosen": 12.904763221740723, "logits/rejected": 13.079399108886719, "logps/chosen": -3.970517635345459, "logps/rejected": -4.239732265472412, "loss": 3.6402, "rewards/accuracies": 0.75, "rewards/chosen": -39.705177307128906, "rewards/margins": 2.692148208618164, "rewards/rejected": -42.39732360839844, "step": 4042 }, { "epoch": 0.5505174291938998, "grad_norm": 40.714383401212295, "learning_rate": 3.993345093530058e-07, "logits/chosen": 13.741521835327148, "logits/rejected": 12.992271423339844, "logps/chosen": -4.064642906188965, "logps/rejected": -4.353808403015137, "loss": 3.6586, "rewards/accuracies": 1.0, "rewards/chosen": -40.64643096923828, "rewards/margins": 2.8916501998901367, "rewards/rejected": -43.538082122802734, "step": 4043 }, { "epoch": 0.5506535947712419, "grad_norm": 37.42412702244293, "learning_rate": 3.991443694259328e-07, "logits/chosen": 13.677679061889648, "logits/rejected": 13.713357925415039, "logps/chosen": -4.301735877990723, "logps/rejected": -4.339692115783691, "loss": 3.6552, "rewards/accuracies": 0.5, "rewards/chosen": -43.017356872558594, "rewards/margins": 0.37956714630126953, "rewards/rejected": -43.39692306518555, "step": 4044 }, { "epoch": 0.5507897603485838, "grad_norm": 40.839200103819245, "learning_rate": 3.989542296921968e-07, "logits/chosen": 13.212936401367188, "logits/rejected": 14.062698364257812, "logps/chosen": -4.129645347595215, "logps/rejected": -4.533836364746094, "loss": 3.4945, "rewards/accuracies": 1.0, "rewards/chosen": -41.296451568603516, "rewards/margins": 4.041909217834473, "rewards/rejected": -45.33836364746094, "step": 4045 }, { "epoch": 0.5509259259259259, "grad_norm": 73.21662542280936, "learning_rate": 3.9876409019476106e-07, "logits/chosen": 13.537471771240234, "logits/rejected": 14.459829330444336, "logps/chosen": -4.65573263168335, "logps/rejected": -4.369513988494873, "loss": 4.3547, "rewards/accuracies": 0.25, "rewards/chosen": -46.55732727050781, "rewards/margins": -2.8621912002563477, "rewards/rejected": -43.69513702392578, "step": 4046 }, { "epoch": 0.551062091503268, "grad_norm": 42.97421583270831, "learning_rate": 3.985739509765893e-07, "logits/chosen": 13.111421585083008, "logits/rejected": 13.420598983764648, "logps/chosen": -4.026719093322754, "logps/rejected": -3.9651873111724854, "loss": 4.1188, "rewards/accuracies": 0.25, "rewards/chosen": -40.26719284057617, "rewards/margins": -0.6153182983398438, "rewards/rejected": -39.65187454223633, "step": 4047 }, { "epoch": 0.55119825708061, "grad_norm": 45.44273740842226, "learning_rate": 3.9838381208064533e-07, "logits/chosen": 13.444124221801758, "logits/rejected": 13.561288833618164, "logps/chosen": -4.1336822509765625, "logps/rejected": -4.429790496826172, "loss": 3.6627, "rewards/accuracies": 0.75, "rewards/chosen": -41.336822509765625, "rewards/margins": 2.9610815048217773, "rewards/rejected": -44.29790496826172, "step": 4048 }, { "epoch": 0.5513344226579521, "grad_norm": 40.87528604568762, "learning_rate": 3.981936735498922e-07, "logits/chosen": 12.887401580810547, "logits/rejected": 13.381196022033691, "logps/chosen": -4.0988922119140625, "logps/rejected": -4.1595330238342285, "loss": 3.8286, "rewards/accuracies": 0.75, "rewards/chosen": -40.988922119140625, "rewards/margins": 0.6064033508300781, "rewards/rejected": -41.59532928466797, "step": 4049 }, { "epoch": 0.5514705882352942, "grad_norm": 44.44345174921768, "learning_rate": 3.980035354272934e-07, "logits/chosen": 13.75981330871582, "logits/rejected": 14.278545379638672, "logps/chosen": -4.037844181060791, "logps/rejected": -4.484819412231445, "loss": 4.3004, "rewards/accuracies": 0.75, "rewards/chosen": -40.378440856933594, "rewards/margins": 4.46975040435791, "rewards/rejected": -44.84819030761719, "step": 4050 }, { "epoch": 0.5516067538126361, "grad_norm": 49.4648429565512, "learning_rate": 3.978133977558125e-07, "logits/chosen": 14.18083667755127, "logits/rejected": 13.805744171142578, "logps/chosen": -4.85865592956543, "logps/rejected": -4.830208778381348, "loss": 3.9819, "rewards/accuracies": 0.75, "rewards/chosen": -48.5865592956543, "rewards/margins": -0.2844715118408203, "rewards/rejected": -48.302085876464844, "step": 4051 }, { "epoch": 0.5517429193899782, "grad_norm": 247.62679828658963, "learning_rate": 3.976232605784123e-07, "logits/chosen": 13.15240478515625, "logits/rejected": 13.853071212768555, "logps/chosen": -4.316869735717773, "logps/rejected": -4.439633369445801, "loss": 3.4436, "rewards/accuracies": 0.75, "rewards/chosen": -43.16869354248047, "rewards/margins": 1.2276344299316406, "rewards/rejected": -44.396331787109375, "step": 4052 }, { "epoch": 0.5518790849673203, "grad_norm": 52.86557450691409, "learning_rate": 3.9743312393805593e-07, "logits/chosen": 13.883245468139648, "logits/rejected": 13.04852294921875, "logps/chosen": -3.9369332790374756, "logps/rejected": -4.307149410247803, "loss": 4.2161, "rewards/accuracies": 0.75, "rewards/chosen": -39.36933135986328, "rewards/margins": 3.7021636962890625, "rewards/rejected": -43.071495056152344, "step": 4053 }, { "epoch": 0.5520152505446623, "grad_norm": 47.80428888033015, "learning_rate": 3.9724298787770667e-07, "logits/chosen": 13.663186073303223, "logits/rejected": 13.439495086669922, "logps/chosen": -4.5553693771362305, "logps/rejected": -4.251424789428711, "loss": 3.8556, "rewards/accuracies": 0.25, "rewards/chosen": -45.55369567871094, "rewards/margins": -3.039443016052246, "rewards/rejected": -42.514251708984375, "step": 4054 }, { "epoch": 0.5521514161220044, "grad_norm": 44.6533449469361, "learning_rate": 3.9705285244032695e-07, "logits/chosen": 12.706238746643066, "logits/rejected": 13.899487495422363, "logps/chosen": -3.8909950256347656, "logps/rejected": -4.604714870452881, "loss": 3.8925, "rewards/accuracies": 1.0, "rewards/chosen": -38.909950256347656, "rewards/margins": 7.137198448181152, "rewards/rejected": -46.047149658203125, "step": 4055 }, { "epoch": 0.5522875816993464, "grad_norm": 49.316515601136516, "learning_rate": 3.968627176688795e-07, "logits/chosen": 12.376302719116211, "logits/rejected": 13.169609069824219, "logps/chosen": -4.484335899353027, "logps/rejected": -4.74119758605957, "loss": 3.9167, "rewards/accuracies": 1.0, "rewards/chosen": -44.843353271484375, "rewards/margins": 2.5686187744140625, "rewards/rejected": -47.4119758605957, "step": 4056 }, { "epoch": 0.5524237472766884, "grad_norm": 40.95717455901423, "learning_rate": 3.9667258360632716e-07, "logits/chosen": 13.15005874633789, "logits/rejected": 14.105527877807617, "logps/chosen": -4.066232681274414, "logps/rejected": -4.495397567749023, "loss": 3.9319, "rewards/accuracies": 0.75, "rewards/chosen": -40.662330627441406, "rewards/margins": 4.29164981842041, "rewards/rejected": -44.9539794921875, "step": 4057 }, { "epoch": 0.5525599128540305, "grad_norm": 39.92787668742916, "learning_rate": 3.964824502956318e-07, "logits/chosen": 13.417194366455078, "logits/rejected": 13.249317169189453, "logps/chosen": -4.33845853805542, "logps/rejected": -4.365434169769287, "loss": 3.4387, "rewards/accuracies": 0.5, "rewards/chosen": -43.38458251953125, "rewards/margins": 0.2697572708129883, "rewards/rejected": -43.65434265136719, "step": 4058 }, { "epoch": 0.5526960784313726, "grad_norm": 47.250051465464765, "learning_rate": 3.96292317779756e-07, "logits/chosen": 13.394269943237305, "logits/rejected": 13.842205047607422, "logps/chosen": -4.564058780670166, "logps/rejected": -4.412031173706055, "loss": 3.7068, "rewards/accuracies": 0.5, "rewards/chosen": -45.640586853027344, "rewards/margins": -1.520277976989746, "rewards/rejected": -44.12030792236328, "step": 4059 }, { "epoch": 0.5528322440087146, "grad_norm": 40.3452094077768, "learning_rate": 3.961021861016617e-07, "logits/chosen": 13.362241744995117, "logits/rejected": 12.954957962036133, "logps/chosen": -3.998452663421631, "logps/rejected": -3.9352357387542725, "loss": 3.9824, "rewards/accuracies": 0.5, "rewards/chosen": -39.984527587890625, "rewards/margins": -0.6321706771850586, "rewards/rejected": -39.35235595703125, "step": 4060 }, { "epoch": 0.5529684095860566, "grad_norm": 46.636807467364136, "learning_rate": 3.9591205530431056e-07, "logits/chosen": 13.045984268188477, "logits/rejected": 13.541746139526367, "logps/chosen": -4.1720051765441895, "logps/rejected": -4.548483848571777, "loss": 4.4291, "rewards/accuracies": 0.75, "rewards/chosen": -41.72005081176758, "rewards/margins": 3.7647829055786133, "rewards/rejected": -45.484832763671875, "step": 4061 }, { "epoch": 0.5531045751633987, "grad_norm": 40.298549518465215, "learning_rate": 3.957219254306643e-07, "logits/chosen": 13.288269996643066, "logits/rejected": 14.001171112060547, "logps/chosen": -4.029691219329834, "logps/rejected": -4.279782295227051, "loss": 4.174, "rewards/accuracies": 1.0, "rewards/chosen": -40.296913146972656, "rewards/margins": 2.50091552734375, "rewards/rejected": -42.797828674316406, "step": 4062 }, { "epoch": 0.5532407407407407, "grad_norm": 41.23960591000742, "learning_rate": 3.9553179652368447e-07, "logits/chosen": 13.657730102539062, "logits/rejected": 14.23020076751709, "logps/chosen": -3.955531120300293, "logps/rejected": -4.474410057067871, "loss": 3.9751, "rewards/accuracies": 1.0, "rewards/chosen": -39.5553092956543, "rewards/margins": 5.188791275024414, "rewards/rejected": -44.74409866333008, "step": 4063 }, { "epoch": 0.5533769063180828, "grad_norm": 44.45150691004098, "learning_rate": 3.953416686263321e-07, "logits/chosen": 12.173093795776367, "logits/rejected": 13.3256254196167, "logps/chosen": -4.053228855133057, "logps/rejected": -4.18391752243042, "loss": 3.8441, "rewards/accuracies": 0.25, "rewards/chosen": -40.53228759765625, "rewards/margins": 1.3068885803222656, "rewards/rejected": -41.839176177978516, "step": 4064 }, { "epoch": 0.5535130718954249, "grad_norm": 41.855374079907364, "learning_rate": 3.9515154178156817e-07, "logits/chosen": 13.526365280151367, "logits/rejected": 13.987624168395996, "logps/chosen": -4.185525894165039, "logps/rejected": -4.355297565460205, "loss": 3.7044, "rewards/accuracies": 0.5, "rewards/chosen": -41.855262756347656, "rewards/margins": 1.6977128982543945, "rewards/rejected": -43.552974700927734, "step": 4065 }, { "epoch": 0.5536492374727668, "grad_norm": 42.87257964556116, "learning_rate": 3.949614160323535e-07, "logits/chosen": 13.210681915283203, "logits/rejected": 14.210944175720215, "logps/chosen": -4.334374904632568, "logps/rejected": -4.553770542144775, "loss": 4.2888, "rewards/accuracies": 0.75, "rewards/chosen": -43.34375, "rewards/margins": 2.1939611434936523, "rewards/rejected": -45.5377082824707, "step": 4066 }, { "epoch": 0.5537854030501089, "grad_norm": 59.117566552208864, "learning_rate": 3.947712914216485e-07, "logits/chosen": 12.30027961730957, "logits/rejected": 13.138991355895996, "logps/chosen": -3.7729625701904297, "logps/rejected": -4.066470146179199, "loss": 3.6382, "rewards/accuracies": 1.0, "rewards/chosen": -37.7296257019043, "rewards/margins": 2.935074806213379, "rewards/rejected": -40.664703369140625, "step": 4067 }, { "epoch": 0.553921568627451, "grad_norm": 39.749208428958326, "learning_rate": 3.945811679924134e-07, "logits/chosen": 12.986087799072266, "logits/rejected": 13.155447959899902, "logps/chosen": -4.114497184753418, "logps/rejected": -4.313071250915527, "loss": 3.3711, "rewards/accuracies": 0.75, "rewards/chosen": -41.14497375488281, "rewards/margins": 1.9857378005981445, "rewards/rejected": -43.130714416503906, "step": 4068 }, { "epoch": 0.554057734204793, "grad_norm": 43.90042747817676, "learning_rate": 3.9439104578760824e-07, "logits/chosen": 13.356371879577637, "logits/rejected": 13.377521514892578, "logps/chosen": -3.9628546237945557, "logps/rejected": -4.1467390060424805, "loss": 4.1401, "rewards/accuracies": 0.75, "rewards/chosen": -39.62854766845703, "rewards/margins": 1.8388433456420898, "rewards/rejected": -41.46739196777344, "step": 4069 }, { "epoch": 0.5541938997821351, "grad_norm": 50.34387352071962, "learning_rate": 3.9420092485019263e-07, "logits/chosen": 13.987030982971191, "logits/rejected": 14.414389610290527, "logps/chosen": -4.0604400634765625, "logps/rejected": -4.360675811767578, "loss": 3.7509, "rewards/accuracies": 0.5, "rewards/chosen": -40.604408264160156, "rewards/margins": 3.002354621887207, "rewards/rejected": -43.60675811767578, "step": 4070 }, { "epoch": 0.5543300653594772, "grad_norm": 45.718744010712406, "learning_rate": 3.94010805223126e-07, "logits/chosen": 13.223588943481445, "logits/rejected": 12.91755485534668, "logps/chosen": -4.1841230392456055, "logps/rejected": -4.294625282287598, "loss": 3.9791, "rewards/accuracies": 0.25, "rewards/chosen": -41.84123229980469, "rewards/margins": 1.1050195693969727, "rewards/rejected": -42.946250915527344, "step": 4071 }, { "epoch": 0.5544662309368191, "grad_norm": 46.01636258010832, "learning_rate": 3.938206869493674e-07, "logits/chosen": 12.858642578125, "logits/rejected": 13.211822509765625, "logps/chosen": -4.341677188873291, "logps/rejected": -4.424574851989746, "loss": 4.0915, "rewards/accuracies": 0.75, "rewards/chosen": -43.416770935058594, "rewards/margins": 0.8289794921875, "rewards/rejected": -44.245750427246094, "step": 4072 }, { "epoch": 0.5546023965141612, "grad_norm": 46.2673555348186, "learning_rate": 3.9363057007187563e-07, "logits/chosen": 13.696688652038574, "logits/rejected": 13.273004531860352, "logps/chosen": -4.19424295425415, "logps/rejected": -4.335219383239746, "loss": 3.7425, "rewards/accuracies": 0.75, "rewards/chosen": -41.94242858886719, "rewards/margins": 1.4097652435302734, "rewards/rejected": -43.352195739746094, "step": 4073 }, { "epoch": 0.5547385620915033, "grad_norm": 41.90666178945609, "learning_rate": 3.934404546336093e-07, "logits/chosen": 12.819295883178711, "logits/rejected": 13.723461151123047, "logps/chosen": -4.109528541564941, "logps/rejected": -4.464162349700928, "loss": 4.08, "rewards/accuracies": 0.75, "rewards/chosen": -41.09528350830078, "rewards/margins": 3.5463390350341797, "rewards/rejected": -44.64162063598633, "step": 4074 }, { "epoch": 0.5548747276688453, "grad_norm": 42.90924167736755, "learning_rate": 3.9325034067752643e-07, "logits/chosen": 12.031826972961426, "logits/rejected": 12.797944068908691, "logps/chosen": -3.765871524810791, "logps/rejected": -4.185932636260986, "loss": 3.6256, "rewards/accuracies": 0.75, "rewards/chosen": -37.658714294433594, "rewards/margins": 4.200611114501953, "rewards/rejected": -41.85932540893555, "step": 4075 }, { "epoch": 0.5550108932461874, "grad_norm": 42.35843780693774, "learning_rate": 3.930602282465848e-07, "logits/chosen": 13.0501708984375, "logits/rejected": 13.282712936401367, "logps/chosen": -4.024604797363281, "logps/rejected": -3.8940672874450684, "loss": 4.4503, "rewards/accuracies": 0.75, "rewards/chosen": -40.24604797363281, "rewards/margins": -1.305375099182129, "rewards/rejected": -38.940673828125, "step": 4076 }, { "epoch": 0.5551470588235294, "grad_norm": 40.61133348600817, "learning_rate": 3.9287011738374203e-07, "logits/chosen": 12.399709701538086, "logits/rejected": 14.427884101867676, "logps/chosen": -3.8343863487243652, "logps/rejected": -4.380002021789551, "loss": 4.3667, "rewards/accuracies": 1.0, "rewards/chosen": -38.34386444091797, "rewards/margins": 5.4561614990234375, "rewards/rejected": -43.800025939941406, "step": 4077 }, { "epoch": 0.5552832244008714, "grad_norm": 39.61992513650164, "learning_rate": 3.9268000813195526e-07, "logits/chosen": 12.973550796508789, "logits/rejected": 12.871893882751465, "logps/chosen": -4.166586875915527, "logps/rejected": -3.9409213066101074, "loss": 3.5443, "rewards/accuracies": 0.5, "rewards/chosen": -41.665870666503906, "rewards/margins": -2.256657600402832, "rewards/rejected": -39.40921401977539, "step": 4078 }, { "epoch": 0.5554193899782135, "grad_norm": 43.826338814470624, "learning_rate": 3.92489900534181e-07, "logits/chosen": 13.194930076599121, "logits/rejected": 13.797597885131836, "logps/chosen": -4.122292518615723, "logps/rejected": -4.483952522277832, "loss": 4.377, "rewards/accuracies": 1.0, "rewards/chosen": -41.22292709350586, "rewards/margins": 3.6165990829467773, "rewards/rejected": -44.83952331542969, "step": 4079 }, { "epoch": 0.5555555555555556, "grad_norm": 40.79118382317317, "learning_rate": 3.9229979463337585e-07, "logits/chosen": 14.355508804321289, "logits/rejected": 14.562271118164062, "logps/chosen": -4.106937408447266, "logps/rejected": -4.399937629699707, "loss": 3.8683, "rewards/accuracies": 1.0, "rewards/chosen": -41.069374084472656, "rewards/margins": 2.9300050735473633, "rewards/rejected": -43.9993782043457, "step": 4080 }, { "epoch": 0.5556917211328976, "grad_norm": 43.305985227614066, "learning_rate": 3.9210969047249586e-07, "logits/chosen": 13.686446189880371, "logits/rejected": 14.059144973754883, "logps/chosen": -4.374867916107178, "logps/rejected": -4.528242111206055, "loss": 4.06, "rewards/accuracies": 0.75, "rewards/chosen": -43.748680114746094, "rewards/margins": 1.5337371826171875, "rewards/rejected": -45.28241729736328, "step": 4081 }, { "epoch": 0.5558278867102396, "grad_norm": 43.230176681960586, "learning_rate": 3.919195880944964e-07, "logits/chosen": 13.470376968383789, "logits/rejected": 13.94225025177002, "logps/chosen": -3.9939637184143066, "logps/rejected": -4.347400665283203, "loss": 3.7696, "rewards/accuracies": 0.5, "rewards/chosen": -39.939640045166016, "rewards/margins": 3.534369468688965, "rewards/rejected": -43.47400665283203, "step": 4082 }, { "epoch": 0.5559640522875817, "grad_norm": 38.06370938286042, "learning_rate": 3.9172948754233277e-07, "logits/chosen": 12.7166109085083, "logits/rejected": 12.662940979003906, "logps/chosen": -3.7786755561828613, "logps/rejected": -3.9513778686523438, "loss": 3.7402, "rewards/accuracies": 0.5, "rewards/chosen": -37.78675842285156, "rewards/margins": 1.7270212173461914, "rewards/rejected": -39.51377868652344, "step": 4083 }, { "epoch": 0.5561002178649237, "grad_norm": 46.24387337937892, "learning_rate": 3.9153938885896005e-07, "logits/chosen": 13.175602912902832, "logits/rejected": 13.05173110961914, "logps/chosen": -4.034217834472656, "logps/rejected": -4.137879371643066, "loss": 4.1539, "rewards/accuracies": 0.75, "rewards/chosen": -40.3421745300293, "rewards/margins": 1.0366220474243164, "rewards/rejected": -41.37879943847656, "step": 4084 }, { "epoch": 0.5562363834422658, "grad_norm": 43.835992306113994, "learning_rate": 3.9134929208733205e-07, "logits/chosen": 13.651876449584961, "logits/rejected": 14.253849029541016, "logps/chosen": -4.009878635406494, "logps/rejected": -4.633891582489014, "loss": 4.2421, "rewards/accuracies": 0.75, "rewards/chosen": -40.09878921508789, "rewards/margins": 6.2401275634765625, "rewards/rejected": -46.33891296386719, "step": 4085 }, { "epoch": 0.5563725490196079, "grad_norm": 49.301217354789095, "learning_rate": 3.911591972704031e-07, "logits/chosen": 13.83389663696289, "logits/rejected": 13.721277236938477, "logps/chosen": -4.6416778564453125, "logps/rejected": -4.575943470001221, "loss": 4.2903, "rewards/accuracies": 0.5, "rewards/chosen": -46.416778564453125, "rewards/margins": -0.6573457717895508, "rewards/rejected": -45.759429931640625, "step": 4086 }, { "epoch": 0.5565087145969498, "grad_norm": 38.69828143503446, "learning_rate": 3.909691044511268e-07, "logits/chosen": 13.353734970092773, "logits/rejected": 13.894428253173828, "logps/chosen": -4.360553741455078, "logps/rejected": -4.468593597412109, "loss": 3.7414, "rewards/accuracies": 0.5, "rewards/chosen": -43.60553741455078, "rewards/margins": 1.080397605895996, "rewards/rejected": -44.685935974121094, "step": 4087 }, { "epoch": 0.5566448801742919, "grad_norm": 52.50747235632797, "learning_rate": 3.907790136724558e-07, "logits/chosen": 14.118233680725098, "logits/rejected": 14.12924861907959, "logps/chosen": -4.489105224609375, "logps/rejected": -4.734336853027344, "loss": 4.1962, "rewards/accuracies": 1.0, "rewards/chosen": -44.891048431396484, "rewards/margins": 2.452317237854004, "rewards/rejected": -47.34336853027344, "step": 4088 }, { "epoch": 0.556781045751634, "grad_norm": 43.00670237575473, "learning_rate": 3.905889249773428e-07, "logits/chosen": 13.283487319946289, "logits/rejected": 13.588037490844727, "logps/chosen": -3.983412027359009, "logps/rejected": -4.486242294311523, "loss": 3.6213, "rewards/accuracies": 1.0, "rewards/chosen": -39.83412170410156, "rewards/margins": 5.028301239013672, "rewards/rejected": -44.86241912841797, "step": 4089 }, { "epoch": 0.556917211328976, "grad_norm": 41.80585898705441, "learning_rate": 3.9039883840874027e-07, "logits/chosen": 12.846986770629883, "logits/rejected": 13.456039428710938, "logps/chosen": -3.900266170501709, "logps/rejected": -3.8361563682556152, "loss": 4.2374, "rewards/accuracies": 0.25, "rewards/chosen": -39.002662658691406, "rewards/margins": -0.6410970687866211, "rewards/rejected": -38.36156463623047, "step": 4090 }, { "epoch": 0.5570533769063181, "grad_norm": 43.60012233563875, "learning_rate": 3.9020875400959935e-07, "logits/chosen": 14.030447006225586, "logits/rejected": 14.694185256958008, "logps/chosen": -4.231750965118408, "logps/rejected": -4.544497966766357, "loss": 3.5757, "rewards/accuracies": 0.75, "rewards/chosen": -42.3175048828125, "rewards/margins": 3.1274709701538086, "rewards/rejected": -45.444976806640625, "step": 4091 }, { "epoch": 0.5571895424836601, "grad_norm": 40.23808800302778, "learning_rate": 3.900186718228714e-07, "logits/chosen": 12.4219388961792, "logits/rejected": 12.260151863098145, "logps/chosen": -3.9824423789978027, "logps/rejected": -4.041709899902344, "loss": 3.7448, "rewards/accuracies": 0.75, "rewards/chosen": -39.824424743652344, "rewards/margins": 0.5926761627197266, "rewards/rejected": -40.41709899902344, "step": 4092 }, { "epoch": 0.5573257080610022, "grad_norm": 46.82918466320894, "learning_rate": 3.898285918915072e-07, "logits/chosen": 13.585367202758789, "logits/rejected": 14.180750846862793, "logps/chosen": -4.375302314758301, "logps/rejected": -4.70786190032959, "loss": 3.3961, "rewards/accuracies": 1.0, "rewards/chosen": -43.753021240234375, "rewards/margins": 3.3255958557128906, "rewards/rejected": -47.078617095947266, "step": 4093 }, { "epoch": 0.5574618736383442, "grad_norm": 44.88776259118256, "learning_rate": 3.8963851425845664e-07, "logits/chosen": 14.421102523803711, "logits/rejected": 13.666203498840332, "logps/chosen": -4.558629989624023, "logps/rejected": -4.755641937255859, "loss": 3.7635, "rewards/accuracies": 0.5, "rewards/chosen": -45.586299896240234, "rewards/margins": 1.9701223373413086, "rewards/rejected": -47.556419372558594, "step": 4094 }, { "epoch": 0.5575980392156863, "grad_norm": 39.38266884580565, "learning_rate": 3.8944843896666934e-07, "logits/chosen": 13.662874221801758, "logits/rejected": 13.687740325927734, "logps/chosen": -4.047863006591797, "logps/rejected": -4.342665672302246, "loss": 3.5494, "rewards/accuracies": 0.75, "rewards/chosen": -40.47863006591797, "rewards/margins": 2.9480247497558594, "rewards/rejected": -43.42665100097656, "step": 4095 }, { "epoch": 0.5577342047930284, "grad_norm": 40.40262605622538, "learning_rate": 3.892583660590947e-07, "logits/chosen": 14.489548683166504, "logits/rejected": 14.698890686035156, "logps/chosen": -4.655829429626465, "logps/rejected": -4.883530616760254, "loss": 3.5207, "rewards/accuracies": 0.75, "rewards/chosen": -46.55830001831055, "rewards/margins": 2.277009963989258, "rewards/rejected": -48.83530807495117, "step": 4096 }, { "epoch": 0.5578703703703703, "grad_norm": 42.79231550043617, "learning_rate": 3.8906829557868093e-07, "logits/chosen": 13.712644577026367, "logits/rejected": 14.760546684265137, "logps/chosen": -4.256289482116699, "logps/rejected": -4.578118801116943, "loss": 3.9744, "rewards/accuracies": 0.75, "rewards/chosen": -42.562896728515625, "rewards/margins": 3.2182931900024414, "rewards/rejected": -45.78118896484375, "step": 4097 }, { "epoch": 0.5580065359477124, "grad_norm": 43.33558355304489, "learning_rate": 3.8887822756837605e-07, "logits/chosen": 13.048759460449219, "logits/rejected": 13.840526580810547, "logps/chosen": -4.080127239227295, "logps/rejected": -4.8495683670043945, "loss": 4.156, "rewards/accuracies": 0.75, "rewards/chosen": -40.80126953125, "rewards/margins": 7.694415092468262, "rewards/rejected": -48.49568557739258, "step": 4098 }, { "epoch": 0.5581427015250545, "grad_norm": 44.09057990374464, "learning_rate": 3.8868816207112776e-07, "logits/chosen": 13.235179901123047, "logits/rejected": 13.533844947814941, "logps/chosen": -4.203699588775635, "logps/rejected": -4.490332126617432, "loss": 4.0265, "rewards/accuracies": 0.75, "rewards/chosen": -42.0369987487793, "rewards/margins": 2.8663225173950195, "rewards/rejected": -44.9033203125, "step": 4099 }, { "epoch": 0.5582788671023965, "grad_norm": 48.50548688361437, "learning_rate": 3.884980991298826e-07, "logits/chosen": 13.343864440917969, "logits/rejected": 13.611143112182617, "logps/chosen": -4.269805431365967, "logps/rejected": -4.3894453048706055, "loss": 3.9909, "rewards/accuracies": 0.75, "rewards/chosen": -42.69805145263672, "rewards/margins": 1.1963996887207031, "rewards/rejected": -43.89445495605469, "step": 4100 }, { "epoch": 0.5584150326797386, "grad_norm": 42.54265117802819, "learning_rate": 3.88308038787587e-07, "logits/chosen": 14.168719291687012, "logits/rejected": 13.76381778717041, "logps/chosen": -4.3887786865234375, "logps/rejected": -4.154858112335205, "loss": 4.0813, "rewards/accuracies": 0.25, "rewards/chosen": -43.88779067993164, "rewards/margins": -2.3392105102539062, "rewards/rejected": -41.548580169677734, "step": 4101 }, { "epoch": 0.5585511982570807, "grad_norm": 46.365227059540445, "learning_rate": 3.881179810871869e-07, "logits/chosen": 13.531698226928711, "logits/rejected": 14.151683807373047, "logps/chosen": -4.099544048309326, "logps/rejected": -4.430273056030273, "loss": 4.2828, "rewards/accuracies": 0.75, "rewards/chosen": -40.99544143676758, "rewards/margins": 3.3072900772094727, "rewards/rejected": -44.302734375, "step": 4102 }, { "epoch": 0.5586873638344226, "grad_norm": 46.87254549036202, "learning_rate": 3.87927926071627e-07, "logits/chosen": 14.033236503601074, "logits/rejected": 14.048830032348633, "logps/chosen": -4.847806453704834, "logps/rejected": -4.589620590209961, "loss": 4.3543, "rewards/accuracies": 0.0, "rewards/chosen": -48.478065490722656, "rewards/margins": -2.5818567276000977, "rewards/rejected": -45.896209716796875, "step": 4103 }, { "epoch": 0.5588235294117647, "grad_norm": 46.97668178336697, "learning_rate": 3.87737873783852e-07, "logits/chosen": 13.73085880279541, "logits/rejected": 13.349315643310547, "logps/chosen": -3.879457950592041, "logps/rejected": -4.15450382232666, "loss": 4.2822, "rewards/accuracies": 0.75, "rewards/chosen": -38.794578552246094, "rewards/margins": 2.7504568099975586, "rewards/rejected": -41.54503631591797, "step": 4104 }, { "epoch": 0.5589596949891068, "grad_norm": 40.717732902367, "learning_rate": 3.875478242668059e-07, "logits/chosen": 12.68482494354248, "logits/rejected": 14.001220703125, "logps/chosen": -3.784742593765259, "logps/rejected": -4.283284664154053, "loss": 3.7754, "rewards/accuracies": 1.0, "rewards/chosen": -37.84742736816406, "rewards/margins": 4.985421180725098, "rewards/rejected": -42.832847595214844, "step": 4105 }, { "epoch": 0.5590958605664488, "grad_norm": 43.17559416638821, "learning_rate": 3.873577775634317e-07, "logits/chosen": 13.312582015991211, "logits/rejected": 14.644906997680664, "logps/chosen": -4.151198387145996, "logps/rejected": -4.489535331726074, "loss": 4.3444, "rewards/accuracies": 0.75, "rewards/chosen": -41.51198196411133, "rewards/margins": 3.383373260498047, "rewards/rejected": -44.895355224609375, "step": 4106 }, { "epoch": 0.5592320261437909, "grad_norm": 39.7767631008499, "learning_rate": 3.871677337166722e-07, "logits/chosen": 13.823952674865723, "logits/rejected": 13.46108627319336, "logps/chosen": -4.175698280334473, "logps/rejected": -4.456851959228516, "loss": 3.913, "rewards/accuracies": 0.75, "rewards/chosen": -41.756988525390625, "rewards/margins": 2.8115367889404297, "rewards/rejected": -44.56852340698242, "step": 4107 }, { "epoch": 0.559368191721133, "grad_norm": 41.235500205543666, "learning_rate": 3.8697769276946943e-07, "logits/chosen": 14.026145935058594, "logits/rejected": 13.62002944946289, "logps/chosen": -4.421951770782471, "logps/rejected": -4.407746315002441, "loss": 3.9377, "rewards/accuracies": 0.25, "rewards/chosen": -44.219520568847656, "rewards/margins": -0.14205265045166016, "rewards/rejected": -44.07746505737305, "step": 4108 }, { "epoch": 0.5595043572984749, "grad_norm": 41.43862651913864, "learning_rate": 3.867876547647645e-07, "logits/chosen": 12.97953987121582, "logits/rejected": 13.668720245361328, "logps/chosen": -3.875530242919922, "logps/rejected": -4.244373321533203, "loss": 3.9426, "rewards/accuracies": 1.0, "rewards/chosen": -38.75530242919922, "rewards/margins": 3.688431739807129, "rewards/rejected": -42.44373321533203, "step": 4109 }, { "epoch": 0.559640522875817, "grad_norm": 42.1331415480435, "learning_rate": 3.865976197454982e-07, "logits/chosen": 14.155778884887695, "logits/rejected": 14.912009239196777, "logps/chosen": -4.013964653015137, "logps/rejected": -4.548797607421875, "loss": 4.2467, "rewards/accuracies": 0.75, "rewards/chosen": -40.139644622802734, "rewards/margins": 5.348328590393066, "rewards/rejected": -45.48797607421875, "step": 4110 }, { "epoch": 0.5597766884531591, "grad_norm": 42.41152709969621, "learning_rate": 3.864075877546106e-07, "logits/chosen": 13.45764446258545, "logits/rejected": 14.005655288696289, "logps/chosen": -4.202532768249512, "logps/rejected": -4.6588616371154785, "loss": 3.9172, "rewards/accuracies": 1.0, "rewards/chosen": -42.025325775146484, "rewards/margins": 4.563292503356934, "rewards/rejected": -46.58861541748047, "step": 4111 }, { "epoch": 0.5599128540305011, "grad_norm": 41.184901782819765, "learning_rate": 3.862175588350409e-07, "logits/chosen": 15.067432403564453, "logits/rejected": 14.988784790039062, "logps/chosen": -4.538416385650635, "logps/rejected": -4.758274555206299, "loss": 3.4891, "rewards/accuracies": 0.75, "rewards/chosen": -45.38416290283203, "rewards/margins": 2.198582649230957, "rewards/rejected": -47.58274841308594, "step": 4112 }, { "epoch": 0.5600490196078431, "grad_norm": 40.65040395841961, "learning_rate": 3.860275330297276e-07, "logits/chosen": 13.09720230102539, "logits/rejected": 14.721096992492676, "logps/chosen": -4.071280002593994, "logps/rejected": -4.343462944030762, "loss": 3.7542, "rewards/accuracies": 0.75, "rewards/chosen": -40.71280288696289, "rewards/margins": 2.7218236923217773, "rewards/rejected": -43.43462371826172, "step": 4113 }, { "epoch": 0.5601851851851852, "grad_norm": 43.93410811883883, "learning_rate": 3.8583751038160876e-07, "logits/chosen": 13.671571731567383, "logits/rejected": 14.778996467590332, "logps/chosen": -4.1172685623168945, "logps/rejected": -4.644259929656982, "loss": 3.7828, "rewards/accuracies": 1.0, "rewards/chosen": -41.17268371582031, "rewards/margins": 5.269915580749512, "rewards/rejected": -46.44260025024414, "step": 4114 }, { "epoch": 0.5603213507625272, "grad_norm": 39.205514878671, "learning_rate": 3.8564749093362154e-07, "logits/chosen": 13.206401824951172, "logits/rejected": 13.178461074829102, "logps/chosen": -4.036074638366699, "logps/rejected": -4.195987701416016, "loss": 4.0028, "rewards/accuracies": 0.75, "rewards/chosen": -40.360748291015625, "rewards/margins": 1.5991325378417969, "rewards/rejected": -41.959877014160156, "step": 4115 }, { "epoch": 0.5604575163398693, "grad_norm": 41.39113430995222, "learning_rate": 3.854574747287023e-07, "logits/chosen": 14.568740844726562, "logits/rejected": 15.335103988647461, "logps/chosen": -4.136035919189453, "logps/rejected": -4.683748245239258, "loss": 3.7232, "rewards/accuracies": 1.0, "rewards/chosen": -41.36035919189453, "rewards/margins": 5.477126121520996, "rewards/rejected": -46.837486267089844, "step": 4116 }, { "epoch": 0.5605936819172114, "grad_norm": 43.44455739365012, "learning_rate": 3.8526746180978696e-07, "logits/chosen": 12.809867858886719, "logits/rejected": 13.261615753173828, "logps/chosen": -4.034356594085693, "logps/rejected": -4.296792507171631, "loss": 4.3001, "rewards/accuracies": 1.0, "rewards/chosen": -40.343563079833984, "rewards/margins": 2.6243600845336914, "rewards/rejected": -42.967926025390625, "step": 4117 }, { "epoch": 0.5607298474945533, "grad_norm": 44.75141185887449, "learning_rate": 3.850774522198103e-07, "logits/chosen": 12.459421157836914, "logits/rejected": 13.522239685058594, "logps/chosen": -3.9684219360351562, "logps/rejected": -4.240994453430176, "loss": 3.6522, "rewards/accuracies": 0.75, "rewards/chosen": -39.68421936035156, "rewards/margins": 2.7257232666015625, "rewards/rejected": -42.409942626953125, "step": 4118 }, { "epoch": 0.5608660130718954, "grad_norm": 38.43453302034219, "learning_rate": 3.8488744600170677e-07, "logits/chosen": 13.80621337890625, "logits/rejected": 13.604835510253906, "logps/chosen": -4.496172904968262, "logps/rejected": -4.492763519287109, "loss": 3.7816, "rewards/accuracies": 0.5, "rewards/chosen": -44.961727142333984, "rewards/margins": -0.03409385681152344, "rewards/rejected": -44.927635192871094, "step": 4119 }, { "epoch": 0.5610021786492375, "grad_norm": 41.4601146924318, "learning_rate": 3.846974431984097e-07, "logits/chosen": 14.32328987121582, "logits/rejected": 13.779449462890625, "logps/chosen": -4.784786224365234, "logps/rejected": -4.78024435043335, "loss": 4.2628, "rewards/accuracies": 0.75, "rewards/chosen": -47.847862243652344, "rewards/margins": -0.045418739318847656, "rewards/rejected": -47.80244445800781, "step": 4120 }, { "epoch": 0.5611383442265795, "grad_norm": 43.879128261931136, "learning_rate": 3.8450744385285183e-07, "logits/chosen": 13.558085441589355, "logits/rejected": 13.605854034423828, "logps/chosen": -4.339563369750977, "logps/rejected": -4.286201477050781, "loss": 3.6928, "rewards/accuracies": 0.5, "rewards/chosen": -43.395633697509766, "rewards/margins": -0.5336208343505859, "rewards/rejected": -42.86201477050781, "step": 4121 }, { "epoch": 0.5612745098039216, "grad_norm": 44.61547211298638, "learning_rate": 3.8431744800796507e-07, "logits/chosen": 13.444548606872559, "logits/rejected": 13.792465209960938, "logps/chosen": -4.222315311431885, "logps/rejected": -4.433365821838379, "loss": 3.4206, "rewards/accuracies": 0.75, "rewards/chosen": -42.22315216064453, "rewards/margins": 2.110508918762207, "rewards/rejected": -44.33366012573242, "step": 4122 }, { "epoch": 0.5614106753812637, "grad_norm": 39.88165661791463, "learning_rate": 3.841274557066806e-07, "logits/chosen": 13.603355407714844, "logits/rejected": 14.050182342529297, "logps/chosen": -4.101210594177246, "logps/rejected": -4.225561618804932, "loss": 4.0994, "rewards/accuracies": 0.5, "rewards/chosen": -41.012107849121094, "rewards/margins": 1.2435073852539062, "rewards/rejected": -42.255615234375, "step": 4123 }, { "epoch": 0.5615468409586056, "grad_norm": 42.89609974051499, "learning_rate": 3.8393746699192863e-07, "logits/chosen": 13.548782348632812, "logits/rejected": 13.938631057739258, "logps/chosen": -4.420721054077148, "logps/rejected": -4.182555198669434, "loss": 3.7624, "rewards/accuracies": 0.25, "rewards/chosen": -44.20720672607422, "rewards/margins": -2.381655693054199, "rewards/rejected": -41.82555389404297, "step": 4124 }, { "epoch": 0.5616830065359477, "grad_norm": 43.195785914684194, "learning_rate": 3.837474819066388e-07, "logits/chosen": 14.280628204345703, "logits/rejected": 15.104968070983887, "logps/chosen": -4.538714408874512, "logps/rejected": -4.6819233894348145, "loss": 4.0429, "rewards/accuracies": 0.75, "rewards/chosen": -45.387142181396484, "rewards/margins": 1.4320898056030273, "rewards/rejected": -46.819236755371094, "step": 4125 }, { "epoch": 0.5618191721132898, "grad_norm": 43.66618400068416, "learning_rate": 3.835575004937399e-07, "logits/chosen": 13.321001052856445, "logits/rejected": 13.495115280151367, "logps/chosen": -4.219029426574707, "logps/rejected": -4.30518913269043, "loss": 3.4451, "rewards/accuracies": 0.5, "rewards/chosen": -42.1902961730957, "rewards/margins": 0.8615942001342773, "rewards/rejected": -43.0518913269043, "step": 4126 }, { "epoch": 0.5619553376906318, "grad_norm": 41.69567932965276, "learning_rate": 3.8336752279615955e-07, "logits/chosen": 13.450385093688965, "logits/rejected": 14.630050659179688, "logps/chosen": -4.1944780349731445, "logps/rejected": -4.622499942779541, "loss": 4.3155, "rewards/accuracies": 0.75, "rewards/chosen": -41.94477844238281, "rewards/margins": 4.280220031738281, "rewards/rejected": -46.224998474121094, "step": 4127 }, { "epoch": 0.5620915032679739, "grad_norm": 50.01087854807992, "learning_rate": 3.831775488568249e-07, "logits/chosen": 13.507266998291016, "logits/rejected": 12.50696849822998, "logps/chosen": -4.542088508605957, "logps/rejected": -4.108527183532715, "loss": 4.3133, "rewards/accuracies": 0.25, "rewards/chosen": -45.4208869934082, "rewards/margins": -4.3356170654296875, "rewards/rejected": -41.08526611328125, "step": 4128 }, { "epoch": 0.5622276688453159, "grad_norm": 47.5147155486935, "learning_rate": 3.8298757871866226e-07, "logits/chosen": 13.466936111450195, "logits/rejected": 13.99050521850586, "logps/chosen": -4.192102909088135, "logps/rejected": -4.5810441970825195, "loss": 4.1556, "rewards/accuracies": 0.75, "rewards/chosen": -41.92102813720703, "rewards/margins": 3.8894128799438477, "rewards/rejected": -45.81044006347656, "step": 4129 }, { "epoch": 0.5623638344226579, "grad_norm": 42.76107322251895, "learning_rate": 3.8279761242459674e-07, "logits/chosen": 12.381166458129883, "logits/rejected": 13.244543075561523, "logps/chosen": -4.07086181640625, "logps/rejected": -4.238445281982422, "loss": 3.5059, "rewards/accuracies": 0.75, "rewards/chosen": -40.7086181640625, "rewards/margins": 1.675832748413086, "rewards/rejected": -42.38445281982422, "step": 4130 }, { "epoch": 0.5625, "grad_norm": 40.49188817412243, "learning_rate": 3.8260765001755286e-07, "logits/chosen": 13.437686920166016, "logits/rejected": 13.91905403137207, "logps/chosen": -4.0818867683410645, "logps/rejected": -4.315606117248535, "loss": 4.0799, "rewards/accuracies": 1.0, "rewards/chosen": -40.818870544433594, "rewards/margins": 2.337190628051758, "rewards/rejected": -43.15605926513672, "step": 4131 }, { "epoch": 0.5626361655773421, "grad_norm": 45.34282906874828, "learning_rate": 3.824176915404544e-07, "logits/chosen": 13.53278923034668, "logits/rejected": 13.821176528930664, "logps/chosen": -4.199910640716553, "logps/rejected": -4.477353096008301, "loss": 3.7206, "rewards/accuracies": 0.5, "rewards/chosen": -41.999107360839844, "rewards/margins": 2.7744245529174805, "rewards/rejected": -44.773529052734375, "step": 4132 }, { "epoch": 0.5627723311546841, "grad_norm": 43.244878206934246, "learning_rate": 3.822277370362237e-07, "logits/chosen": 13.643519401550293, "logits/rejected": 14.235732078552246, "logps/chosen": -4.340081691741943, "logps/rejected": -4.263132095336914, "loss": 3.9999, "rewards/accuracies": 0.25, "rewards/chosen": -43.400814056396484, "rewards/margins": -0.7694911956787109, "rewards/rejected": -42.631324768066406, "step": 4133 }, { "epoch": 0.5629084967320261, "grad_norm": 91.60054803328914, "learning_rate": 3.8203778654778273e-07, "logits/chosen": 13.359197616577148, "logits/rejected": 13.872093200683594, "logps/chosen": -4.229556560516357, "logps/rejected": -4.715344429016113, "loss": 3.7773, "rewards/accuracies": 1.0, "rewards/chosen": -42.29556655883789, "rewards/margins": 4.857874870300293, "rewards/rejected": -47.1534423828125, "step": 4134 }, { "epoch": 0.5630446623093682, "grad_norm": 48.828944262151374, "learning_rate": 3.818478401180525e-07, "logits/chosen": 14.107598304748535, "logits/rejected": 13.416413307189941, "logps/chosen": -4.030264854431152, "logps/rejected": -4.0626935958862305, "loss": 4.1488, "rewards/accuracies": 0.5, "rewards/chosen": -40.302642822265625, "rewards/margins": 0.3242931365966797, "rewards/rejected": -40.62693786621094, "step": 4135 }, { "epoch": 0.5631808278867102, "grad_norm": 46.86284376044668, "learning_rate": 3.8165789778995265e-07, "logits/chosen": 12.367328643798828, "logits/rejected": 12.129314422607422, "logps/chosen": -4.2194318771362305, "logps/rejected": -4.344106674194336, "loss": 4.0271, "rewards/accuracies": 0.5, "rewards/chosen": -42.19432067871094, "rewards/margins": 1.2467479705810547, "rewards/rejected": -43.441070556640625, "step": 4136 }, { "epoch": 0.5633169934640523, "grad_norm": 48.821607623765665, "learning_rate": 3.814679596064025e-07, "logits/chosen": 13.421350479125977, "logits/rejected": 15.015260696411133, "logps/chosen": -4.211512565612793, "logps/rejected": -4.898706912994385, "loss": 3.7118, "rewards/accuracies": 1.0, "rewards/chosen": -42.11512756347656, "rewards/margins": 6.871941566467285, "rewards/rejected": -48.98706817626953, "step": 4137 }, { "epoch": 0.5634531590413944, "grad_norm": 43.53955913167843, "learning_rate": 3.812780256103202e-07, "logits/chosen": 13.366175651550293, "logits/rejected": 13.501493453979492, "logps/chosen": -4.179383277893066, "logps/rejected": -4.365309715270996, "loss": 3.7903, "rewards/accuracies": 0.5, "rewards/chosen": -41.79383087158203, "rewards/margins": 1.8592662811279297, "rewards/rejected": -43.65309524536133, "step": 4138 }, { "epoch": 0.5635893246187363, "grad_norm": 43.260750367906255, "learning_rate": 3.8108809584462256e-07, "logits/chosen": 13.933589935302734, "logits/rejected": 14.045402526855469, "logps/chosen": -4.347506523132324, "logps/rejected": -4.579896926879883, "loss": 3.6984, "rewards/accuracies": 0.75, "rewards/chosen": -43.475067138671875, "rewards/margins": 2.323902130126953, "rewards/rejected": -45.798973083496094, "step": 4139 }, { "epoch": 0.5637254901960784, "grad_norm": 44.28430474128798, "learning_rate": 3.8089817035222606e-07, "logits/chosen": 12.888842582702637, "logits/rejected": 13.237848281860352, "logps/chosen": -4.104589462280273, "logps/rejected": -4.255778789520264, "loss": 4.4878, "rewards/accuracies": 1.0, "rewards/chosen": -41.0458984375, "rewards/margins": 1.5118885040283203, "rewards/rejected": -42.55778503417969, "step": 4140 }, { "epoch": 0.5638616557734205, "grad_norm": 46.219847540091, "learning_rate": 3.80708249176046e-07, "logits/chosen": 13.691821098327637, "logits/rejected": 13.492183685302734, "logps/chosen": -4.548095703125, "logps/rejected": -4.180814743041992, "loss": 4.3919, "rewards/accuracies": 0.0, "rewards/chosen": -45.480960845947266, "rewards/margins": -3.6728134155273438, "rewards/rejected": -41.80814743041992, "step": 4141 }, { "epoch": 0.5639978213507625, "grad_norm": 47.53356106824502, "learning_rate": 3.8051833235899634e-07, "logits/chosen": 13.495214462280273, "logits/rejected": 14.259634017944336, "logps/chosen": -4.260472297668457, "logps/rejected": -4.776922225952148, "loss": 3.7999, "rewards/accuracies": 1.0, "rewards/chosen": -42.60472106933594, "rewards/margins": 5.164505958557129, "rewards/rejected": -47.76922607421875, "step": 4142 }, { "epoch": 0.5641339869281046, "grad_norm": 41.42413106997957, "learning_rate": 3.803284199439905e-07, "logits/chosen": 13.177474975585938, "logits/rejected": 13.780027389526367, "logps/chosen": -4.286181449890137, "logps/rejected": -4.55881404876709, "loss": 3.2485, "rewards/accuracies": 0.75, "rewards/chosen": -42.861812591552734, "rewards/margins": 2.726332664489746, "rewards/rejected": -45.58814239501953, "step": 4143 }, { "epoch": 0.5642701525054467, "grad_norm": 43.106273461309506, "learning_rate": 3.80138511973941e-07, "logits/chosen": 12.529155731201172, "logits/rejected": 13.960821151733398, "logps/chosen": -3.766582489013672, "logps/rejected": -4.162184238433838, "loss": 4.2018, "rewards/accuracies": 0.5, "rewards/chosen": -37.66582489013672, "rewards/margins": 3.9560165405273438, "rewards/rejected": -41.62184143066406, "step": 4144 }, { "epoch": 0.5644063180827886, "grad_norm": 41.05336127122527, "learning_rate": 3.799486084917587e-07, "logits/chosen": 13.30422306060791, "logits/rejected": 14.32424545288086, "logps/chosen": -3.7253756523132324, "logps/rejected": -4.332499980926514, "loss": 3.6665, "rewards/accuracies": 1.0, "rewards/chosen": -37.25375747680664, "rewards/margins": 6.07124137878418, "rewards/rejected": -43.32499694824219, "step": 4145 }, { "epoch": 0.5645424836601307, "grad_norm": 71.22196011246044, "learning_rate": 3.7975870954035406e-07, "logits/chosen": 13.048858642578125, "logits/rejected": 14.1180419921875, "logps/chosen": -3.987159252166748, "logps/rejected": -4.359001159667969, "loss": 4.5878, "rewards/accuracies": 0.75, "rewards/chosen": -39.8715934753418, "rewards/margins": 3.7184181213378906, "rewards/rejected": -43.59001159667969, "step": 4146 }, { "epoch": 0.5646786492374728, "grad_norm": 38.238244628941544, "learning_rate": 3.7956881516263646e-07, "logits/chosen": 13.589836120605469, "logits/rejected": 13.612041473388672, "logps/chosen": -3.933344841003418, "logps/rejected": -4.469259262084961, "loss": 3.6829, "rewards/accuracies": 0.75, "rewards/chosen": -39.33344650268555, "rewards/margins": 5.359145164489746, "rewards/rejected": -44.69259262084961, "step": 4147 }, { "epoch": 0.5648148148148148, "grad_norm": 42.75745533330591, "learning_rate": 3.793789254015138e-07, "logits/chosen": 13.631816864013672, "logits/rejected": 13.413360595703125, "logps/chosen": -4.18135404586792, "logps/rejected": -4.161431312561035, "loss": 4.0533, "rewards/accuracies": 0.75, "rewards/chosen": -41.81353759765625, "rewards/margins": -0.19922256469726562, "rewards/rejected": -41.614315032958984, "step": 4148 }, { "epoch": 0.5649509803921569, "grad_norm": 90.16200508954067, "learning_rate": 3.791890402998934e-07, "logits/chosen": 12.821952819824219, "logits/rejected": 12.998870849609375, "logps/chosen": -4.306568145751953, "logps/rejected": -4.153109550476074, "loss": 3.6423, "rewards/accuracies": 0.25, "rewards/chosen": -43.06568145751953, "rewards/margins": -1.534585952758789, "rewards/rejected": -41.53109359741211, "step": 4149 }, { "epoch": 0.5650871459694989, "grad_norm": 46.8048159620032, "learning_rate": 3.7899915990068154e-07, "logits/chosen": 13.274675369262695, "logits/rejected": 13.560012817382812, "logps/chosen": -4.384300231933594, "logps/rejected": -4.638352394104004, "loss": 4.0732, "rewards/accuracies": 0.5, "rewards/chosen": -43.84299850463867, "rewards/margins": 2.540524482727051, "rewards/rejected": -46.383522033691406, "step": 4150 }, { "epoch": 0.5652233115468409, "grad_norm": 40.954670437557375, "learning_rate": 3.7880928424678293e-07, "logits/chosen": 13.087499618530273, "logits/rejected": 12.499076843261719, "logps/chosen": -4.179955959320068, "logps/rejected": -4.226835250854492, "loss": 3.8907, "rewards/accuracies": 0.75, "rewards/chosen": -41.799560546875, "rewards/margins": 0.4687948226928711, "rewards/rejected": -42.26835632324219, "step": 4151 }, { "epoch": 0.565359477124183, "grad_norm": 44.15902708629603, "learning_rate": 3.7861941338110175e-07, "logits/chosen": 13.236888885498047, "logits/rejected": 14.428674697875977, "logps/chosen": -4.291715621948242, "logps/rejected": -4.357928276062012, "loss": 3.8849, "rewards/accuracies": 0.5, "rewards/chosen": -42.91715621948242, "rewards/margins": 0.6621236801147461, "rewards/rejected": -43.579280853271484, "step": 4152 }, { "epoch": 0.5654956427015251, "grad_norm": 48.288328680765474, "learning_rate": 3.7842954734654096e-07, "logits/chosen": 13.132225036621094, "logits/rejected": 13.188997268676758, "logps/chosen": -4.04102087020874, "logps/rejected": -4.303983688354492, "loss": 3.7948, "rewards/accuracies": 1.0, "rewards/chosen": -40.41020965576172, "rewards/margins": 2.6296281814575195, "rewards/rejected": -43.03983688354492, "step": 4153 }, { "epoch": 0.565631808278867, "grad_norm": 46.019357183988, "learning_rate": 3.7823968618600214e-07, "logits/chosen": 12.282383918762207, "logits/rejected": 13.576879501342773, "logps/chosen": -4.089216232299805, "logps/rejected": -4.483071327209473, "loss": 3.7461, "rewards/accuracies": 0.75, "rewards/chosen": -40.89216613769531, "rewards/margins": 3.9385457038879395, "rewards/rejected": -44.830711364746094, "step": 4154 }, { "epoch": 0.5657679738562091, "grad_norm": 40.004199322137616, "learning_rate": 3.7804982994238603e-07, "logits/chosen": 12.999866485595703, "logits/rejected": 14.347854614257812, "logps/chosen": -4.452103614807129, "logps/rejected": -4.849108695983887, "loss": 3.6938, "rewards/accuracies": 1.0, "rewards/chosen": -44.521034240722656, "rewards/margins": 3.970050811767578, "rewards/rejected": -48.491085052490234, "step": 4155 }, { "epoch": 0.5659041394335512, "grad_norm": 37.497241085040734, "learning_rate": 3.778599786585925e-07, "logits/chosen": 13.121360778808594, "logits/rejected": 13.522804260253906, "logps/chosen": -4.146257400512695, "logps/rejected": -4.519689559936523, "loss": 3.6474, "rewards/accuracies": 0.75, "rewards/chosen": -41.46257019042969, "rewards/margins": 3.734323501586914, "rewards/rejected": -45.1968994140625, "step": 4156 }, { "epoch": 0.5660403050108932, "grad_norm": 39.38774718211455, "learning_rate": 3.7767013237751964e-07, "logits/chosen": 13.992141723632812, "logits/rejected": 14.05661392211914, "logps/chosen": -4.0026445388793945, "logps/rejected": -3.9403188228607178, "loss": 4.1378, "rewards/accuracies": 0.5, "rewards/chosen": -40.02644729614258, "rewards/margins": -0.623260498046875, "rewards/rejected": -39.4031867980957, "step": 4157 }, { "epoch": 0.5661764705882353, "grad_norm": 45.86235014007426, "learning_rate": 3.774802911420649e-07, "logits/chosen": 13.22775936126709, "logits/rejected": 12.771156311035156, "logps/chosen": -3.9364209175109863, "logps/rejected": -4.145328521728516, "loss": 3.553, "rewards/accuracies": 0.5, "rewards/chosen": -39.36421203613281, "rewards/margins": 2.0890722274780273, "rewards/rejected": -41.45328140258789, "step": 4158 }, { "epoch": 0.5663126361655774, "grad_norm": 46.682221322876615, "learning_rate": 3.7729045499512476e-07, "logits/chosen": 13.264177322387695, "logits/rejected": 13.576196670532227, "logps/chosen": -4.043935775756836, "logps/rejected": -4.244396686553955, "loss": 3.8215, "rewards/accuracies": 0.75, "rewards/chosen": -40.43935775756836, "rewards/margins": 2.004608154296875, "rewards/rejected": -42.443965911865234, "step": 4159 }, { "epoch": 0.5664488017429193, "grad_norm": 41.142442858920774, "learning_rate": 3.7710062397959386e-07, "logits/chosen": 12.292224884033203, "logits/rejected": 13.210624694824219, "logps/chosen": -3.7189855575561523, "logps/rejected": -3.983167886734009, "loss": 3.8607, "rewards/accuracies": 0.75, "rewards/chosen": -37.18985366821289, "rewards/margins": 2.6418237686157227, "rewards/rejected": -39.83168029785156, "step": 4160 }, { "epoch": 0.5665849673202614, "grad_norm": 53.204056701350545, "learning_rate": 3.7691079813836615e-07, "logits/chosen": 13.196073532104492, "logits/rejected": 13.465720176696777, "logps/chosen": -4.245617389678955, "logps/rejected": -4.229074001312256, "loss": 3.9389, "rewards/accuracies": 0.5, "rewards/chosen": -42.4561767578125, "rewards/margins": -0.1654338836669922, "rewards/rejected": -42.290740966796875, "step": 4161 }, { "epoch": 0.5667211328976035, "grad_norm": 40.05241149367907, "learning_rate": 3.7672097751433466e-07, "logits/chosen": 12.911981582641602, "logits/rejected": 13.452134132385254, "logps/chosen": -4.417707443237305, "logps/rejected": -4.300261497497559, "loss": 4.2167, "rewards/accuracies": 0.25, "rewards/chosen": -44.17707061767578, "rewards/margins": -1.174454689025879, "rewards/rejected": -43.00261688232422, "step": 4162 }, { "epoch": 0.5668572984749455, "grad_norm": 38.348715655657216, "learning_rate": 3.765311621503907e-07, "logits/chosen": 13.658950805664062, "logits/rejected": 13.430822372436523, "logps/chosen": -3.863847255706787, "logps/rejected": -4.327841758728027, "loss": 3.8242, "rewards/accuracies": 0.75, "rewards/chosen": -38.63847351074219, "rewards/margins": 4.6399431228637695, "rewards/rejected": -43.27841567993164, "step": 4163 }, { "epoch": 0.5669934640522876, "grad_norm": 40.06113581728843, "learning_rate": 3.763413520894245e-07, "logits/chosen": 12.629659652709961, "logits/rejected": 13.142618179321289, "logps/chosen": -3.7085392475128174, "logps/rejected": -3.965744733810425, "loss": 3.9336, "rewards/accuracies": 0.75, "rewards/chosen": -37.08539581298828, "rewards/margins": 2.572056770324707, "rewards/rejected": -39.657447814941406, "step": 4164 }, { "epoch": 0.5671296296296297, "grad_norm": 42.74158283743615, "learning_rate": 3.7615154737432555e-07, "logits/chosen": 12.95785903930664, "logits/rejected": 13.373138427734375, "logps/chosen": -4.096837043762207, "logps/rejected": -4.476932525634766, "loss": 4.1658, "rewards/accuracies": 1.0, "rewards/chosen": -40.96836853027344, "rewards/margins": 3.8009538650512695, "rewards/rejected": -44.769325256347656, "step": 4165 }, { "epoch": 0.5672657952069716, "grad_norm": 43.15405532722409, "learning_rate": 3.7596174804798153e-07, "logits/chosen": 13.040792465209961, "logits/rejected": 13.760259628295898, "logps/chosen": -4.09909725189209, "logps/rejected": -4.325675010681152, "loss": 4.4329, "rewards/accuracies": 0.75, "rewards/chosen": -40.99097442626953, "rewards/margins": 2.2657785415649414, "rewards/rejected": -43.256752014160156, "step": 4166 }, { "epoch": 0.5674019607843137, "grad_norm": 64.40089599569703, "learning_rate": 3.757719541532792e-07, "logits/chosen": 13.405762672424316, "logits/rejected": 14.033869743347168, "logps/chosen": -4.058746814727783, "logps/rejected": -4.43396520614624, "loss": 4.2384, "rewards/accuracies": 0.75, "rewards/chosen": -40.58747100830078, "rewards/margins": 3.752181053161621, "rewards/rejected": -44.33965301513672, "step": 4167 }, { "epoch": 0.5675381263616558, "grad_norm": 57.549920061418234, "learning_rate": 3.7558216573310415e-07, "logits/chosen": 14.204202651977539, "logits/rejected": 13.394906997680664, "logps/chosen": -4.20865535736084, "logps/rejected": -3.978672742843628, "loss": 4.766, "rewards/accuracies": 0.5, "rewards/chosen": -42.08655548095703, "rewards/margins": -2.2998275756835938, "rewards/rejected": -39.78672790527344, "step": 4168 }, { "epoch": 0.5676742919389978, "grad_norm": 40.14956496742343, "learning_rate": 3.753923828303405e-07, "logits/chosen": 13.088346481323242, "logits/rejected": 13.419100761413574, "logps/chosen": -4.184435844421387, "logps/rejected": -4.287522792816162, "loss": 4.2481, "rewards/accuracies": 0.5, "rewards/chosen": -41.8443603515625, "rewards/margins": 1.030867576599121, "rewards/rejected": -42.87522888183594, "step": 4169 }, { "epoch": 0.5678104575163399, "grad_norm": 46.641149445108496, "learning_rate": 3.752026054878715e-07, "logits/chosen": 13.159382820129395, "logits/rejected": 13.731775283813477, "logps/chosen": -3.9911410808563232, "logps/rejected": -4.2165374755859375, "loss": 3.5985, "rewards/accuracies": 0.75, "rewards/chosen": -39.911415100097656, "rewards/margins": 2.2539634704589844, "rewards/rejected": -42.165374755859375, "step": 4170 }, { "epoch": 0.5679466230936819, "grad_norm": 46.163903326883656, "learning_rate": 3.7501283374857874e-07, "logits/chosen": 13.169759750366211, "logits/rejected": 13.281965255737305, "logps/chosen": -3.999187707901001, "logps/rejected": -4.202752590179443, "loss": 3.9037, "rewards/accuracies": 0.75, "rewards/chosen": -39.991878509521484, "rewards/margins": 2.0356483459472656, "rewards/rejected": -42.02752685546875, "step": 4171 }, { "epoch": 0.568082788671024, "grad_norm": 37.83390944438756, "learning_rate": 3.7482306765534264e-07, "logits/chosen": 13.834854125976562, "logits/rejected": 14.310749053955078, "logps/chosen": -4.476845741271973, "logps/rejected": -4.662642478942871, "loss": 3.7319, "rewards/accuracies": 0.75, "rewards/chosen": -44.768455505371094, "rewards/margins": 1.857966423034668, "rewards/rejected": -46.626426696777344, "step": 4172 }, { "epoch": 0.568218954248366, "grad_norm": 41.20806842083675, "learning_rate": 3.746333072510426e-07, "logits/chosen": 13.927583694458008, "logits/rejected": 13.111261367797852, "logps/chosen": -4.2459540367126465, "logps/rejected": -4.270877838134766, "loss": 4.3908, "rewards/accuracies": 0.5, "rewards/chosen": -42.45954132080078, "rewards/margins": 0.249237060546875, "rewards/rejected": -42.708778381347656, "step": 4173 }, { "epoch": 0.5683551198257081, "grad_norm": 40.43486934088399, "learning_rate": 3.7444355257855646e-07, "logits/chosen": 12.752837181091309, "logits/rejected": 13.489967346191406, "logps/chosen": -3.912095308303833, "logps/rejected": -4.133791923522949, "loss": 4.1369, "rewards/accuracies": 0.5, "rewards/chosen": -39.12095260620117, "rewards/margins": 2.216968536376953, "rewards/rejected": -41.337921142578125, "step": 4174 }, { "epoch": 0.5684912854030502, "grad_norm": 38.70587246701243, "learning_rate": 3.742538036807608e-07, "logits/chosen": 13.550472259521484, "logits/rejected": 14.592630386352539, "logps/chosen": -4.20008659362793, "logps/rejected": -4.495889663696289, "loss": 3.8564, "rewards/accuracies": 0.75, "rewards/chosen": -42.00086212158203, "rewards/margins": 2.958036422729492, "rewards/rejected": -44.958900451660156, "step": 4175 }, { "epoch": 0.5686274509803921, "grad_norm": 38.625448186495326, "learning_rate": 3.74064060600531e-07, "logits/chosen": 13.738615036010742, "logits/rejected": 13.370662689208984, "logps/chosen": -4.183722496032715, "logps/rejected": -4.199810981750488, "loss": 4.1753, "rewards/accuracies": 0.5, "rewards/chosen": -41.837223052978516, "rewards/margins": 0.16088581085205078, "rewards/rejected": -41.99810791015625, "step": 4176 }, { "epoch": 0.5687636165577342, "grad_norm": 43.402784001488236, "learning_rate": 3.738743233807413e-07, "logits/chosen": 12.132373809814453, "logits/rejected": 13.420822143554688, "logps/chosen": -3.918172836303711, "logps/rejected": -3.9660000801086426, "loss": 3.8236, "rewards/accuracies": 0.25, "rewards/chosen": -39.181732177734375, "rewards/margins": 0.4782724380493164, "rewards/rejected": -39.660003662109375, "step": 4177 }, { "epoch": 0.5688997821350763, "grad_norm": 45.66768949523513, "learning_rate": 3.7368459206426405e-07, "logits/chosen": 12.679706573486328, "logits/rejected": 13.31817626953125, "logps/chosen": -4.152126312255859, "logps/rejected": -4.402491092681885, "loss": 4.1768, "rewards/accuracies": 0.75, "rewards/chosen": -41.52125930786133, "rewards/margins": 2.5036516189575195, "rewards/rejected": -44.02490997314453, "step": 4178 }, { "epoch": 0.5690359477124183, "grad_norm": 41.922770524331185, "learning_rate": 3.7349486669397067e-07, "logits/chosen": 13.62684154510498, "logits/rejected": 13.69906234741211, "logps/chosen": -4.17536735534668, "logps/rejected": -4.437254905700684, "loss": 3.9237, "rewards/accuracies": 1.0, "rewards/chosen": -41.75367736816406, "rewards/margins": 2.6188735961914062, "rewards/rejected": -44.37255096435547, "step": 4179 }, { "epoch": 0.5691721132897604, "grad_norm": 39.83851438871956, "learning_rate": 3.7330514731273147e-07, "logits/chosen": 13.430912971496582, "logits/rejected": 13.269889831542969, "logps/chosen": -4.3231987953186035, "logps/rejected": -4.329730033874512, "loss": 3.7381, "rewards/accuracies": 0.5, "rewards/chosen": -43.23198699951172, "rewards/margins": 0.06531620025634766, "rewards/rejected": -43.29730224609375, "step": 4180 }, { "epoch": 0.5693082788671024, "grad_norm": 38.9511344749428, "learning_rate": 3.731154339634147e-07, "logits/chosen": 13.526031494140625, "logits/rejected": 13.291766166687012, "logps/chosen": -4.1146955490112305, "logps/rejected": -4.28672981262207, "loss": 3.637, "rewards/accuracies": 0.5, "rewards/chosen": -41.14695358276367, "rewards/margins": 1.7203474044799805, "rewards/rejected": -42.86730194091797, "step": 4181 }, { "epoch": 0.5694444444444444, "grad_norm": 38.04749448037624, "learning_rate": 3.7292572668888787e-07, "logits/chosen": 13.330438613891602, "logits/rejected": 13.243114471435547, "logps/chosen": -4.079063892364502, "logps/rejected": -3.9601378440856934, "loss": 3.6786, "rewards/accuracies": 0.5, "rewards/chosen": -40.79064178466797, "rewards/margins": -1.1892595291137695, "rewards/rejected": -39.60137939453125, "step": 4182 }, { "epoch": 0.5695806100217865, "grad_norm": 38.97183542978643, "learning_rate": 3.727360255320171e-07, "logits/chosen": 13.677240371704102, "logits/rejected": 14.284778594970703, "logps/chosen": -4.189186096191406, "logps/rejected": -4.277260780334473, "loss": 3.8906, "rewards/accuracies": 0.75, "rewards/chosen": -41.89186096191406, "rewards/margins": 0.8807506561279297, "rewards/rejected": -42.772613525390625, "step": 4183 }, { "epoch": 0.5697167755991286, "grad_norm": 37.365104206540366, "learning_rate": 3.725463305356665e-07, "logits/chosen": 11.960248947143555, "logits/rejected": 12.75381851196289, "logps/chosen": -3.802398920059204, "logps/rejected": -4.24951171875, "loss": 3.5378, "rewards/accuracies": 1.0, "rewards/chosen": -38.023990631103516, "rewards/margins": 4.471124649047852, "rewards/rejected": -42.4951171875, "step": 4184 }, { "epoch": 0.5698529411764706, "grad_norm": 36.40679477545192, "learning_rate": 3.723566417426995e-07, "logits/chosen": 14.15993881225586, "logits/rejected": 13.414950370788574, "logps/chosen": -4.569947242736816, "logps/rejected": -4.373488426208496, "loss": 3.9777, "rewards/accuracies": 0.5, "rewards/chosen": -45.69947052001953, "rewards/margins": -1.9645843505859375, "rewards/rejected": -43.734886169433594, "step": 4185 }, { "epoch": 0.5699891067538126, "grad_norm": 43.77963080714284, "learning_rate": 3.721669591959779e-07, "logits/chosen": 13.103925704956055, "logits/rejected": 13.409841537475586, "logps/chosen": -3.92195987701416, "logps/rejected": -4.20236349105835, "loss": 3.7045, "rewards/accuracies": 1.0, "rewards/chosen": -39.21959686279297, "rewards/margins": 2.8040380477905273, "rewards/rejected": -42.02363586425781, "step": 4186 }, { "epoch": 0.5701252723311547, "grad_norm": 42.095308081706754, "learning_rate": 3.719772829383618e-07, "logits/chosen": 13.678800582885742, "logits/rejected": 13.375360488891602, "logps/chosen": -4.393125534057617, "logps/rejected": -4.104672908782959, "loss": 4.0131, "rewards/accuracies": 0.25, "rewards/chosen": -43.931251525878906, "rewards/margins": -2.8845205307006836, "rewards/rejected": -41.046730041503906, "step": 4187 }, { "epoch": 0.5702614379084967, "grad_norm": 35.54099744999478, "learning_rate": 3.7178761301271025e-07, "logits/chosen": 13.011357307434082, "logits/rejected": 13.008697509765625, "logps/chosen": -4.108953475952148, "logps/rejected": -4.183966636657715, "loss": 4.0685, "rewards/accuracies": 0.75, "rewards/chosen": -41.089534759521484, "rewards/margins": 0.7501287460327148, "rewards/rejected": -41.839664459228516, "step": 4188 }, { "epoch": 0.5703976034858388, "grad_norm": 38.260209394498716, "learning_rate": 3.7159794946188097e-07, "logits/chosen": 12.293188095092773, "logits/rejected": 13.564038276672363, "logps/chosen": -3.956676721572876, "logps/rejected": -4.426759243011475, "loss": 3.9827, "rewards/accuracies": 1.0, "rewards/chosen": -39.56676483154297, "rewards/margins": 4.700826644897461, "rewards/rejected": -44.26759338378906, "step": 4189 }, { "epoch": 0.5705337690631809, "grad_norm": 38.353638805941, "learning_rate": 3.714082923287296e-07, "logits/chosen": 14.274744033813477, "logits/rejected": 13.561203002929688, "logps/chosen": -4.312934875488281, "logps/rejected": -4.137380599975586, "loss": 3.8402, "rewards/accuracies": 0.25, "rewards/chosen": -43.12934875488281, "rewards/margins": -1.7555370330810547, "rewards/rejected": -41.373809814453125, "step": 4190 }, { "epoch": 0.5706699346405228, "grad_norm": 43.558453647442605, "learning_rate": 3.712186416561109e-07, "logits/chosen": 13.871244430541992, "logits/rejected": 13.760530471801758, "logps/chosen": -4.418492317199707, "logps/rejected": -4.489149570465088, "loss": 4.3238, "rewards/accuracies": 0.5, "rewards/chosen": -44.18492889404297, "rewards/margins": 0.7065696716308594, "rewards/rejected": -44.89149475097656, "step": 4191 }, { "epoch": 0.5708061002178649, "grad_norm": 41.932616976100306, "learning_rate": 3.7102899748687816e-07, "logits/chosen": 13.334738731384277, "logits/rejected": 13.992528915405273, "logps/chosen": -3.9597275257110596, "logps/rejected": -4.488845348358154, "loss": 4.2834, "rewards/accuracies": 1.0, "rewards/chosen": -39.59727478027344, "rewards/margins": 5.291180610656738, "rewards/rejected": -44.88845443725586, "step": 4192 }, { "epoch": 0.570942265795207, "grad_norm": 36.4046931513559, "learning_rate": 3.7083935986388277e-07, "logits/chosen": 12.863216400146484, "logits/rejected": 13.11699104309082, "logps/chosen": -3.906750202178955, "logps/rejected": -4.0106072425842285, "loss": 3.5505, "rewards/accuracies": 0.5, "rewards/chosen": -39.067501068115234, "rewards/margins": 1.0385713577270508, "rewards/rejected": -40.106075286865234, "step": 4193 }, { "epoch": 0.571078431372549, "grad_norm": 44.484656852761184, "learning_rate": 3.7064972882997505e-07, "logits/chosen": 13.135098457336426, "logits/rejected": 12.824580192565918, "logps/chosen": -4.027646064758301, "logps/rejected": -4.227514743804932, "loss": 3.8577, "rewards/accuracies": 0.5, "rewards/chosen": -40.276458740234375, "rewards/margins": 1.9986867904663086, "rewards/rejected": -42.275146484375, "step": 4194 }, { "epoch": 0.5712145969498911, "grad_norm": 37.93904417869221, "learning_rate": 3.7046010442800395e-07, "logits/chosen": 13.683028221130371, "logits/rejected": 13.755199432373047, "logps/chosen": -4.352569580078125, "logps/rejected": -4.437796592712402, "loss": 3.8483, "rewards/accuracies": 0.75, "rewards/chosen": -43.525699615478516, "rewards/margins": 0.8522653579711914, "rewards/rejected": -44.37796401977539, "step": 4195 }, { "epoch": 0.5713507625272332, "grad_norm": 40.289813690611396, "learning_rate": 3.702704867008162e-07, "logits/chosen": 13.76089096069336, "logits/rejected": 13.294061660766602, "logps/chosen": -4.0416154861450195, "logps/rejected": -3.998455047607422, "loss": 4.2011, "rewards/accuracies": 0.5, "rewards/chosen": -40.41615295410156, "rewards/margins": -0.4316062927246094, "rewards/rejected": -39.98455047607422, "step": 4196 }, { "epoch": 0.5714869281045751, "grad_norm": 36.753883102839055, "learning_rate": 3.700808756912577e-07, "logits/chosen": 12.860578536987305, "logits/rejected": 13.511913299560547, "logps/chosen": -3.912411689758301, "logps/rejected": -4.393431186676025, "loss": 3.9549, "rewards/accuracies": 1.0, "rewards/chosen": -39.124114990234375, "rewards/margins": 4.810197830200195, "rewards/rejected": -43.93431091308594, "step": 4197 }, { "epoch": 0.5716230936819172, "grad_norm": 48.14749112814364, "learning_rate": 3.698912714421729e-07, "logits/chosen": 14.227328300476074, "logits/rejected": 14.224130630493164, "logps/chosen": -4.221869468688965, "logps/rejected": -4.437276840209961, "loss": 4.3482, "rewards/accuracies": 0.5, "rewards/chosen": -42.218692779541016, "rewards/margins": 2.1540746688842773, "rewards/rejected": -44.372764587402344, "step": 4198 }, { "epoch": 0.5717592592592593, "grad_norm": 42.64436477752704, "learning_rate": 3.69701673996404e-07, "logits/chosen": 13.491579055786133, "logits/rejected": 13.739852905273438, "logps/chosen": -4.099010467529297, "logps/rejected": -4.3582353591918945, "loss": 3.3601, "rewards/accuracies": 0.5, "rewards/chosen": -40.99010467529297, "rewards/margins": 2.5922470092773438, "rewards/rejected": -43.58235168457031, "step": 4199 }, { "epoch": 0.5718954248366013, "grad_norm": 35.94828078827058, "learning_rate": 3.6951208339679234e-07, "logits/chosen": 13.714363098144531, "logits/rejected": 14.03946304321289, "logps/chosen": -3.7229557037353516, "logps/rejected": -4.2824883460998535, "loss": 3.3992, "rewards/accuracies": 0.75, "rewards/chosen": -37.229557037353516, "rewards/margins": 5.595325469970703, "rewards/rejected": -42.82488250732422, "step": 4200 }, { "epoch": 0.5720315904139434, "grad_norm": 38.99712761440066, "learning_rate": 3.6932249968617757e-07, "logits/chosen": 12.207036972045898, "logits/rejected": 13.171170234680176, "logps/chosen": -4.161901473999023, "logps/rejected": -4.4148969650268555, "loss": 3.3302, "rewards/accuracies": 0.75, "rewards/chosen": -41.619014739990234, "rewards/margins": 2.529952049255371, "rewards/rejected": -44.148963928222656, "step": 4201 }, { "epoch": 0.5721677559912854, "grad_norm": 46.01141038969935, "learning_rate": 3.691329229073974e-07, "logits/chosen": 13.163419723510742, "logits/rejected": 13.512248992919922, "logps/chosen": -4.4545464515686035, "logps/rejected": -4.651607036590576, "loss": 4.0823, "rewards/accuracies": 0.5, "rewards/chosen": -44.54546356201172, "rewards/margins": 1.970606803894043, "rewards/rejected": -46.51607131958008, "step": 4202 }, { "epoch": 0.5723039215686274, "grad_norm": 47.72491037199811, "learning_rate": 3.689433531032885e-07, "logits/chosen": 14.20262336730957, "logits/rejected": 14.644006729125977, "logps/chosen": -3.89969539642334, "logps/rejected": -4.330185890197754, "loss": 4.4286, "rewards/accuracies": 0.75, "rewards/chosen": -38.996952056884766, "rewards/margins": 4.30490255355835, "rewards/rejected": -43.301856994628906, "step": 4203 }, { "epoch": 0.5724400871459695, "grad_norm": 42.16674526054648, "learning_rate": 3.687537903166858e-07, "logits/chosen": 13.2005615234375, "logits/rejected": 14.757318496704102, "logps/chosen": -4.041266441345215, "logps/rejected": -4.498251438140869, "loss": 3.9614, "rewards/accuracies": 0.75, "rewards/chosen": -40.412662506103516, "rewards/margins": 4.569850921630859, "rewards/rejected": -44.982513427734375, "step": 4204 }, { "epoch": 0.5725762527233116, "grad_norm": 41.130724852740684, "learning_rate": 3.685642345904223e-07, "logits/chosen": 13.55430793762207, "logits/rejected": 13.87777328491211, "logps/chosen": -4.276965141296387, "logps/rejected": -4.189436435699463, "loss": 4.1813, "rewards/accuracies": 0.5, "rewards/chosen": -42.7696533203125, "rewards/margins": -0.8752861022949219, "rewards/rejected": -41.89436721801758, "step": 4205 }, { "epoch": 0.5727124183006536, "grad_norm": 37.83225246356416, "learning_rate": 3.683746859673299e-07, "logits/chosen": 14.03455924987793, "logits/rejected": 13.704702377319336, "logps/chosen": -3.866177558898926, "logps/rejected": -4.426307201385498, "loss": 3.7333, "rewards/accuracies": 1.0, "rewards/chosen": -38.661773681640625, "rewards/margins": 5.60129451751709, "rewards/rejected": -44.26306915283203, "step": 4206 }, { "epoch": 0.5728485838779956, "grad_norm": 43.73679960688458, "learning_rate": 3.6818514449023877e-07, "logits/chosen": 11.972857475280762, "logits/rejected": 13.313398361206055, "logps/chosen": -3.697765350341797, "logps/rejected": -4.381402015686035, "loss": 3.7281, "rewards/accuracies": 1.0, "rewards/chosen": -36.9776496887207, "rewards/margins": 6.836367607116699, "rewards/rejected": -43.81401824951172, "step": 4207 }, { "epoch": 0.5729847494553377, "grad_norm": 43.562984418832876, "learning_rate": 3.6799561020197704e-07, "logits/chosen": 13.066164016723633, "logits/rejected": 12.854618072509766, "logps/chosen": -3.9886577129364014, "logps/rejected": -3.859421968460083, "loss": 4.1656, "rewards/accuracies": 0.5, "rewards/chosen": -39.88657760620117, "rewards/margins": -1.2923564910888672, "rewards/rejected": -38.59422302246094, "step": 4208 }, { "epoch": 0.5731209150326797, "grad_norm": 39.81523891662601, "learning_rate": 3.678060831453717e-07, "logits/chosen": 13.1433687210083, "logits/rejected": 12.740005493164062, "logps/chosen": -4.1743268966674805, "logps/rejected": -4.059807777404785, "loss": 4.3245, "rewards/accuracies": 0.25, "rewards/chosen": -41.743263244628906, "rewards/margins": -1.145186424255371, "rewards/rejected": -40.598079681396484, "step": 4209 }, { "epoch": 0.5732570806100218, "grad_norm": 43.639348649536956, "learning_rate": 3.6761656336324815e-07, "logits/chosen": 13.889669418334961, "logits/rejected": 14.750572204589844, "logps/chosen": -4.354536533355713, "logps/rejected": -4.510272026062012, "loss": 4.0938, "rewards/accuracies": 0.5, "rewards/chosen": -43.54536437988281, "rewards/margins": 1.5573577880859375, "rewards/rejected": -45.10272216796875, "step": 4210 }, { "epoch": 0.5733932461873639, "grad_norm": 46.8704104148004, "learning_rate": 3.6742705089842974e-07, "logits/chosen": 14.446884155273438, "logits/rejected": 13.933246612548828, "logps/chosen": -4.282950401306152, "logps/rejected": -4.551835060119629, "loss": 4.2635, "rewards/accuracies": 0.75, "rewards/chosen": -42.829505920410156, "rewards/margins": 2.688845634460449, "rewards/rejected": -45.518348693847656, "step": 4211 }, { "epoch": 0.5735294117647058, "grad_norm": 37.62412457854694, "learning_rate": 3.6723754579373836e-07, "logits/chosen": 13.706607818603516, "logits/rejected": 14.82432746887207, "logps/chosen": -3.85713791847229, "logps/rejected": -4.434576988220215, "loss": 3.7433, "rewards/accuracies": 1.0, "rewards/chosen": -38.57137680053711, "rewards/margins": 5.774388313293457, "rewards/rejected": -44.34576416015625, "step": 4212 }, { "epoch": 0.5736655773420479, "grad_norm": 44.37850754141261, "learning_rate": 3.670480480919944e-07, "logits/chosen": 13.751834869384766, "logits/rejected": 13.404592514038086, "logps/chosen": -3.854684829711914, "logps/rejected": -3.8331613540649414, "loss": 3.8224, "rewards/accuracies": 0.25, "rewards/chosen": -38.54684829711914, "rewards/margins": -0.21523284912109375, "rewards/rejected": -38.33161544799805, "step": 4213 }, { "epoch": 0.57380174291939, "grad_norm": 41.666295118788916, "learning_rate": 3.668585578360164e-07, "logits/chosen": 13.792763710021973, "logits/rejected": 13.947144508361816, "logps/chosen": -4.340998649597168, "logps/rejected": -4.455471515655518, "loss": 4.0183, "rewards/accuracies": 0.5, "rewards/chosen": -43.40998840332031, "rewards/margins": 1.1447296142578125, "rewards/rejected": -44.554718017578125, "step": 4214 }, { "epoch": 0.573937908496732, "grad_norm": 45.709181543190276, "learning_rate": 3.6666907506862107e-07, "logits/chosen": 13.835729598999023, "logits/rejected": 14.246805191040039, "logps/chosen": -4.152122974395752, "logps/rejected": -4.456643104553223, "loss": 3.1362, "rewards/accuracies": 1.0, "rewards/chosen": -41.52123260498047, "rewards/margins": 3.0452003479003906, "rewards/rejected": -44.566429138183594, "step": 4215 }, { "epoch": 0.5740740740740741, "grad_norm": 44.91977151172106, "learning_rate": 3.6647959983262387e-07, "logits/chosen": 13.665178298950195, "logits/rejected": 13.619688034057617, "logps/chosen": -4.104816436767578, "logps/rejected": -4.18475341796875, "loss": 4.5732, "rewards/accuracies": 0.5, "rewards/chosen": -41.04816436767578, "rewards/margins": 0.7993650436401367, "rewards/rejected": -41.847530364990234, "step": 4216 }, { "epoch": 0.5742102396514162, "grad_norm": 52.91103831406494, "learning_rate": 3.6629013217083806e-07, "logits/chosen": 13.44502067565918, "logits/rejected": 13.417949676513672, "logps/chosen": -4.257209777832031, "logps/rejected": -4.138378143310547, "loss": 3.7972, "rewards/accuracies": 0.25, "rewards/chosen": -42.57209777832031, "rewards/margins": -1.1883153915405273, "rewards/rejected": -41.38378143310547, "step": 4217 }, { "epoch": 0.5743464052287581, "grad_norm": 40.07076470533167, "learning_rate": 3.6610067212607564e-07, "logits/chosen": 13.185735702514648, "logits/rejected": 13.237608909606934, "logps/chosen": -4.045194625854492, "logps/rejected": -4.079104423522949, "loss": 3.7691, "rewards/accuracies": 0.75, "rewards/chosen": -40.45195007324219, "rewards/margins": 0.3390932083129883, "rewards/rejected": -40.79104232788086, "step": 4218 }, { "epoch": 0.5744825708061002, "grad_norm": 43.65059007091663, "learning_rate": 3.659112197411466e-07, "logits/chosen": 13.717597961425781, "logits/rejected": 12.899335861206055, "logps/chosen": -4.397050857543945, "logps/rejected": -4.27362060546875, "loss": 3.9562, "rewards/accuracies": 0.75, "rewards/chosen": -43.97050857543945, "rewards/margins": -1.2343096733093262, "rewards/rejected": -42.736202239990234, "step": 4219 }, { "epoch": 0.5746187363834423, "grad_norm": 50.09147563092705, "learning_rate": 3.6572177505885905e-07, "logits/chosen": 13.480485916137695, "logits/rejected": 13.754264831542969, "logps/chosen": -4.1212053298950195, "logps/rejected": -4.224150657653809, "loss": 3.8995, "rewards/accuracies": 0.75, "rewards/chosen": -41.21205139160156, "rewards/margins": 1.0294570922851562, "rewards/rejected": -42.24150848388672, "step": 4220 }, { "epoch": 0.5747549019607843, "grad_norm": 42.824222766101634, "learning_rate": 3.6553233812201994e-07, "logits/chosen": 12.745952606201172, "logits/rejected": 13.246191024780273, "logps/chosen": -4.105266571044922, "logps/rejected": -4.540899276733398, "loss": 3.4724, "rewards/accuracies": 1.0, "rewards/chosen": -41.05266571044922, "rewards/margins": 4.356326103210449, "rewards/rejected": -45.40899658203125, "step": 4221 }, { "epoch": 0.5748910675381264, "grad_norm": 43.84020444869947, "learning_rate": 3.653429089734339e-07, "logits/chosen": 14.235372543334961, "logits/rejected": 13.448892593383789, "logps/chosen": -4.427475929260254, "logps/rejected": -4.356734275817871, "loss": 4.7082, "rewards/accuracies": 0.5, "rewards/chosen": -44.274757385253906, "rewards/margins": -0.7074174880981445, "rewards/rejected": -43.56734085083008, "step": 4222 }, { "epoch": 0.5750272331154684, "grad_norm": 39.80440524867889, "learning_rate": 3.65153487655904e-07, "logits/chosen": 13.80500316619873, "logits/rejected": 13.795902252197266, "logps/chosen": -4.107477188110352, "logps/rejected": -4.339659690856934, "loss": 3.958, "rewards/accuracies": 0.75, "rewards/chosen": -41.07476806640625, "rewards/margins": 2.321828842163086, "rewards/rejected": -43.39659881591797, "step": 4223 }, { "epoch": 0.5751633986928104, "grad_norm": 41.57553262413576, "learning_rate": 3.649640742122316e-07, "logits/chosen": 13.35014820098877, "logits/rejected": 12.881269454956055, "logps/chosen": -4.137401580810547, "logps/rejected": -4.275880813598633, "loss": 4.1857, "rewards/accuracies": 0.25, "rewards/chosen": -41.37401580810547, "rewards/margins": 1.384791374206543, "rewards/rejected": -42.75880432128906, "step": 4224 }, { "epoch": 0.5752995642701525, "grad_norm": 41.661468745164754, "learning_rate": 3.647746686852164e-07, "logits/chosen": 13.812986373901367, "logits/rejected": 13.674278259277344, "logps/chosen": -4.008904457092285, "logps/rejected": -4.473676681518555, "loss": 4.1051, "rewards/accuracies": 0.75, "rewards/chosen": -40.08904266357422, "rewards/margins": 4.6477251052856445, "rewards/rejected": -44.73676681518555, "step": 4225 }, { "epoch": 0.5754357298474946, "grad_norm": 48.65964481911055, "learning_rate": 3.6458527111765585e-07, "logits/chosen": 14.288031578063965, "logits/rejected": 14.04417610168457, "logps/chosen": -4.502452373504639, "logps/rejected": -4.394274711608887, "loss": 4.096, "rewards/accuracies": 0.5, "rewards/chosen": -45.0245246887207, "rewards/margins": -1.081772804260254, "rewards/rejected": -43.9427490234375, "step": 4226 }, { "epoch": 0.5755718954248366, "grad_norm": 39.86668068676601, "learning_rate": 3.64395881552346e-07, "logits/chosen": 13.624687194824219, "logits/rejected": 13.270806312561035, "logps/chosen": -4.247397422790527, "logps/rejected": -4.187621116638184, "loss": 3.7399, "rewards/accuracies": 0.25, "rewards/chosen": -42.473976135253906, "rewards/margins": -0.5977640151977539, "rewards/rejected": -41.87621307373047, "step": 4227 }, { "epoch": 0.5757080610021786, "grad_norm": 39.99664419829429, "learning_rate": 3.642065000320812e-07, "logits/chosen": 13.344943046569824, "logits/rejected": 13.091693878173828, "logps/chosen": -4.043145179748535, "logps/rejected": -4.0242767333984375, "loss": 3.8574, "rewards/accuracies": 0.75, "rewards/chosen": -40.43144989013672, "rewards/margins": -0.18868255615234375, "rewards/rejected": -40.242767333984375, "step": 4228 }, { "epoch": 0.5758442265795207, "grad_norm": 41.9765564225359, "learning_rate": 3.640171265996534e-07, "logits/chosen": 13.680155754089355, "logits/rejected": 14.748698234558105, "logps/chosen": -4.539345741271973, "logps/rejected": -4.728672027587891, "loss": 4.1594, "rewards/accuracies": 0.75, "rewards/chosen": -45.39345932006836, "rewards/margins": 1.8932628631591797, "rewards/rejected": -47.286720275878906, "step": 4229 }, { "epoch": 0.5759803921568627, "grad_norm": 39.520238985343255, "learning_rate": 3.638277612978533e-07, "logits/chosen": 13.239095687866211, "logits/rejected": 13.577796936035156, "logps/chosen": -3.9331047534942627, "logps/rejected": -4.377872467041016, "loss": 3.9524, "rewards/accuracies": 0.75, "rewards/chosen": -39.33104705810547, "rewards/margins": 4.4476728439331055, "rewards/rejected": -43.77872085571289, "step": 4230 }, { "epoch": 0.5761165577342048, "grad_norm": 41.23248866336705, "learning_rate": 3.636384041694697e-07, "logits/chosen": 13.814075469970703, "logits/rejected": 13.810493469238281, "logps/chosen": -4.244964122772217, "logps/rejected": -4.091353893280029, "loss": 3.851, "rewards/accuracies": 0.5, "rewards/chosen": -42.449642181396484, "rewards/margins": -1.5361013412475586, "rewards/rejected": -40.91353988647461, "step": 4231 }, { "epoch": 0.5762527233115469, "grad_norm": 48.82297661148181, "learning_rate": 3.6344905525728907e-07, "logits/chosen": 12.660446166992188, "logits/rejected": 12.73331069946289, "logps/chosen": -3.9134926795959473, "logps/rejected": -4.05678653717041, "loss": 3.279, "rewards/accuracies": 0.75, "rewards/chosen": -39.134925842285156, "rewards/margins": 1.4329357147216797, "rewards/rejected": -40.56786346435547, "step": 4232 }, { "epoch": 0.5763888888888888, "grad_norm": 46.875879588753016, "learning_rate": 3.632597146040966e-07, "logits/chosen": 13.596672058105469, "logits/rejected": 13.25190258026123, "logps/chosen": -4.155887603759766, "logps/rejected": -4.085693359375, "loss": 4.0062, "rewards/accuracies": 0.25, "rewards/chosen": -41.558876037597656, "rewards/margins": -0.7019433975219727, "rewards/rejected": -40.856929779052734, "step": 4233 }, { "epoch": 0.5765250544662309, "grad_norm": 40.634366052240466, "learning_rate": 3.630703822526754e-07, "logits/chosen": 13.307933807373047, "logits/rejected": 13.441511154174805, "logps/chosen": -4.2752366065979, "logps/rejected": -4.733366012573242, "loss": 4.127, "rewards/accuracies": 1.0, "rewards/chosen": -42.75236511230469, "rewards/margins": 4.581297874450684, "rewards/rejected": -47.33366394042969, "step": 4234 }, { "epoch": 0.576661220043573, "grad_norm": 44.84137170398234, "learning_rate": 3.628810582458065e-07, "logits/chosen": 14.378080368041992, "logits/rejected": 14.656449317932129, "logps/chosen": -4.692425727844238, "logps/rejected": -4.849818229675293, "loss": 3.639, "rewards/accuracies": 0.5, "rewards/chosen": -46.92425537109375, "rewards/margins": 1.573927879333496, "rewards/rejected": -48.49818420410156, "step": 4235 }, { "epoch": 0.576797385620915, "grad_norm": 42.27276776523362, "learning_rate": 3.6269174262626926e-07, "logits/chosen": 13.650785446166992, "logits/rejected": 14.825057983398438, "logps/chosen": -4.136506080627441, "logps/rejected": -4.488968849182129, "loss": 3.8854, "rewards/accuracies": 0.5, "rewards/chosen": -41.36506652832031, "rewards/margins": 3.5246219635009766, "rewards/rejected": -44.889686584472656, "step": 4236 }, { "epoch": 0.5769335511982571, "grad_norm": 40.709784426644624, "learning_rate": 3.625024354368413e-07, "logits/chosen": 12.869714736938477, "logits/rejected": 12.800113677978516, "logps/chosen": -3.8601365089416504, "logps/rejected": -4.051058769226074, "loss": 4.0093, "rewards/accuracies": 0.75, "rewards/chosen": -38.60136032104492, "rewards/margins": 1.9092283248901367, "rewards/rejected": -40.510589599609375, "step": 4237 }, { "epoch": 0.5770697167755992, "grad_norm": 39.74135386959084, "learning_rate": 3.62313136720298e-07, "logits/chosen": 13.610032081604004, "logits/rejected": 14.775793075561523, "logps/chosen": -4.052250862121582, "logps/rejected": -4.324454307556152, "loss": 3.9086, "rewards/accuracies": 0.75, "rewards/chosen": -40.52251052856445, "rewards/margins": 2.7220335006713867, "rewards/rejected": -43.244544982910156, "step": 4238 }, { "epoch": 0.5772058823529411, "grad_norm": 46.60806000810972, "learning_rate": 3.621238465194128e-07, "logits/chosen": 14.336051940917969, "logits/rejected": 14.662513732910156, "logps/chosen": -4.319779396057129, "logps/rejected": -4.405025482177734, "loss": 4.2696, "rewards/accuracies": 0.5, "rewards/chosen": -43.197792053222656, "rewards/margins": 0.8524665832519531, "rewards/rejected": -44.050254821777344, "step": 4239 }, { "epoch": 0.5773420479302832, "grad_norm": 42.671380078441956, "learning_rate": 3.619345648769578e-07, "logits/chosen": 13.952579498291016, "logits/rejected": 14.243022918701172, "logps/chosen": -4.626232147216797, "logps/rejected": -4.6300554275512695, "loss": 4.249, "rewards/accuracies": 0.5, "rewards/chosen": -46.2623176574707, "rewards/margins": 0.038238525390625, "rewards/rejected": -46.30055618286133, "step": 4240 }, { "epoch": 0.5774782135076253, "grad_norm": 46.52794833218027, "learning_rate": 3.6174529183570244e-07, "logits/chosen": 14.070793151855469, "logits/rejected": 13.837339401245117, "logps/chosen": -4.217347621917725, "logps/rejected": -4.164697647094727, "loss": 4.856, "rewards/accuracies": 0.5, "rewards/chosen": -42.1734733581543, "rewards/margins": -0.5264978408813477, "rewards/rejected": -41.646976470947266, "step": 4241 }, { "epoch": 0.5776143790849673, "grad_norm": 40.69826571839542, "learning_rate": 3.615560274384145e-07, "logits/chosen": 13.85759449005127, "logits/rejected": 13.360027313232422, "logps/chosen": -4.260040283203125, "logps/rejected": -4.1373090744018555, "loss": 3.5103, "rewards/accuracies": 0.5, "rewards/chosen": -42.60040283203125, "rewards/margins": -1.227315902709961, "rewards/rejected": -41.373085021972656, "step": 4242 }, { "epoch": 0.5777505446623094, "grad_norm": 46.896370250535604, "learning_rate": 3.613667717278601e-07, "logits/chosen": 12.73523235321045, "logits/rejected": 13.563079833984375, "logps/chosen": -3.7046115398406982, "logps/rejected": -3.9233744144439697, "loss": 3.3119, "rewards/accuracies": 0.75, "rewards/chosen": -37.04611587524414, "rewards/margins": 2.1876282691955566, "rewards/rejected": -39.233741760253906, "step": 4243 }, { "epoch": 0.5778867102396514, "grad_norm": 45.25375323091178, "learning_rate": 3.611775247468029e-07, "logits/chosen": 13.327888488769531, "logits/rejected": 13.43499755859375, "logps/chosen": -4.216327667236328, "logps/rejected": -3.9744203090667725, "loss": 4.3167, "rewards/accuracies": 0.25, "rewards/chosen": -42.16327667236328, "rewards/margins": -2.4190731048583984, "rewards/rejected": -39.74420166015625, "step": 4244 }, { "epoch": 0.5780228758169934, "grad_norm": 42.02106071693809, "learning_rate": 3.609882865380048e-07, "logits/chosen": 14.69741439819336, "logits/rejected": 14.866537094116211, "logps/chosen": -4.453899383544922, "logps/rejected": -4.756933212280273, "loss": 4.3673, "rewards/accuracies": 0.75, "rewards/chosen": -44.53899383544922, "rewards/margins": 3.030336380004883, "rewards/rejected": -47.569332122802734, "step": 4245 }, { "epoch": 0.5781590413943355, "grad_norm": 39.076366473695714, "learning_rate": 3.6079905714422607e-07, "logits/chosen": 13.969467163085938, "logits/rejected": 13.355077743530273, "logps/chosen": -4.524688720703125, "logps/rejected": -4.431331634521484, "loss": 4.0583, "rewards/accuracies": 0.5, "rewards/chosen": -45.246891021728516, "rewards/margins": -0.933568000793457, "rewards/rejected": -44.31332015991211, "step": 4246 }, { "epoch": 0.5782952069716776, "grad_norm": 43.49681324943803, "learning_rate": 3.6060983660822417e-07, "logits/chosen": 13.557817459106445, "logits/rejected": 13.26611328125, "logps/chosen": -4.0675272941589355, "logps/rejected": -4.170807838439941, "loss": 3.5473, "rewards/accuracies": 0.75, "rewards/chosen": -40.675270080566406, "rewards/margins": 1.0328054428100586, "rewards/rejected": -41.70807647705078, "step": 4247 }, { "epoch": 0.5784313725490197, "grad_norm": 55.273292074653654, "learning_rate": 3.604206249727554e-07, "logits/chosen": 13.059619903564453, "logits/rejected": 13.945491790771484, "logps/chosen": -3.988175630569458, "logps/rejected": -4.509998798370361, "loss": 3.7376, "rewards/accuracies": 0.75, "rewards/chosen": -39.88175964355469, "rewards/margins": 5.218229293823242, "rewards/rejected": -45.09998321533203, "step": 4248 }, { "epoch": 0.5785675381263616, "grad_norm": 58.35776281242394, "learning_rate": 3.6023142228057364e-07, "logits/chosen": 12.657970428466797, "logits/rejected": 13.978740692138672, "logps/chosen": -3.946568489074707, "logps/rejected": -4.234624862670898, "loss": 4.2755, "rewards/accuracies": 0.75, "rewards/chosen": -39.46568298339844, "rewards/margins": 2.8805675506591797, "rewards/rejected": -42.34625244140625, "step": 4249 }, { "epoch": 0.5787037037037037, "grad_norm": 53.57260915619529, "learning_rate": 3.600422285744306e-07, "logits/chosen": 12.918559074401855, "logits/rejected": 13.311627388000488, "logps/chosen": -3.7386131286621094, "logps/rejected": -3.889876127243042, "loss": 4.2556, "rewards/accuracies": 0.5, "rewards/chosen": -37.386131286621094, "rewards/margins": 1.5126323699951172, "rewards/rejected": -38.89875793457031, "step": 4250 }, { "epoch": 0.5788398692810458, "grad_norm": 41.678088614727145, "learning_rate": 3.5985304389707635e-07, "logits/chosen": 13.499856948852539, "logits/rejected": 13.372068405151367, "logps/chosen": -4.253553867340088, "logps/rejected": -4.40671443939209, "loss": 4.2877, "rewards/accuracies": 0.75, "rewards/chosen": -42.53553771972656, "rewards/margins": 1.5316076278686523, "rewards/rejected": -44.06714630126953, "step": 4251 }, { "epoch": 0.5789760348583878, "grad_norm": 41.450219246790056, "learning_rate": 3.596638682912589e-07, "logits/chosen": 13.168876647949219, "logits/rejected": 14.114831924438477, "logps/chosen": -4.453625679016113, "logps/rejected": -4.90813684463501, "loss": 3.9932, "rewards/accuracies": 1.0, "rewards/chosen": -44.5362548828125, "rewards/margins": 4.545113563537598, "rewards/rejected": -49.08136749267578, "step": 4252 }, { "epoch": 0.5791122004357299, "grad_norm": 45.88052165479234, "learning_rate": 3.5947470179972355e-07, "logits/chosen": 13.690792083740234, "logits/rejected": 13.65417194366455, "logps/chosen": -3.863006114959717, "logps/rejected": -3.9878101348876953, "loss": 3.6738, "rewards/accuracies": 0.75, "rewards/chosen": -38.630062103271484, "rewards/margins": 1.248042106628418, "rewards/rejected": -39.87810516357422, "step": 4253 }, { "epoch": 0.579248366013072, "grad_norm": 40.5156254233833, "learning_rate": 3.5928554446521434e-07, "logits/chosen": 13.913599014282227, "logits/rejected": 14.102106094360352, "logps/chosen": -4.119211196899414, "logps/rejected": -4.277050971984863, "loss": 3.7246, "rewards/accuracies": 0.75, "rewards/chosen": -41.192108154296875, "rewards/margins": 1.5783987045288086, "rewards/rejected": -42.7705078125, "step": 4254 }, { "epoch": 0.5793845315904139, "grad_norm": 42.808063149996315, "learning_rate": 3.590963963304731e-07, "logits/chosen": 13.251762390136719, "logits/rejected": 14.016763687133789, "logps/chosen": -4.174586772918701, "logps/rejected": -4.396457195281982, "loss": 3.8991, "rewards/accuracies": 0.75, "rewards/chosen": -41.745872497558594, "rewards/margins": 2.218703269958496, "rewards/rejected": -43.96457290649414, "step": 4255 }, { "epoch": 0.579520697167756, "grad_norm": 48.79001885286538, "learning_rate": 3.5890725743823905e-07, "logits/chosen": 13.459249496459961, "logits/rejected": 13.618072509765625, "logps/chosen": -4.422886848449707, "logps/rejected": -4.5193023681640625, "loss": 4.5181, "rewards/accuracies": 0.5, "rewards/chosen": -44.2288703918457, "rewards/margins": 0.9641504287719727, "rewards/rejected": -45.19301986694336, "step": 4256 }, { "epoch": 0.5796568627450981, "grad_norm": 49.59425719694942, "learning_rate": 3.5871812783124987e-07, "logits/chosen": 14.015541076660156, "logits/rejected": 14.241264343261719, "logps/chosen": -4.403632164001465, "logps/rejected": -4.7047038078308105, "loss": 3.9121, "rewards/accuracies": 0.75, "rewards/chosen": -44.03632354736328, "rewards/margins": 3.010714530944824, "rewards/rejected": -47.04703903198242, "step": 4257 }, { "epoch": 0.5797930283224401, "grad_norm": 43.49660341104531, "learning_rate": 3.5852900755224115e-07, "logits/chosen": 12.462596893310547, "logits/rejected": 13.190258026123047, "logps/chosen": -4.024540901184082, "logps/rejected": -4.228730201721191, "loss": 4.411, "rewards/accuracies": 0.75, "rewards/chosen": -40.24541091918945, "rewards/margins": 2.0418930053710938, "rewards/rejected": -42.28730392456055, "step": 4258 }, { "epoch": 0.5799291938997821, "grad_norm": 52.85086422055568, "learning_rate": 3.5833989664394574e-07, "logits/chosen": 13.507102966308594, "logits/rejected": 13.776963233947754, "logps/chosen": -4.348287582397461, "logps/rejected": -4.376526355743408, "loss": 4.1384, "rewards/accuracies": 0.5, "rewards/chosen": -43.48287582397461, "rewards/margins": 0.28238582611083984, "rewards/rejected": -43.765262603759766, "step": 4259 }, { "epoch": 0.5800653594771242, "grad_norm": 52.35524723489769, "learning_rate": 3.5815079514909504e-07, "logits/chosen": 13.640264511108398, "logits/rejected": 13.35368537902832, "logps/chosen": -4.4682416915893555, "logps/rejected": -4.3212785720825195, "loss": 4.4338, "rewards/accuracies": 0.5, "rewards/chosen": -44.682411193847656, "rewards/margins": -1.4696292877197266, "rewards/rejected": -43.21278381347656, "step": 4260 }, { "epoch": 0.5802015250544662, "grad_norm": 37.49365533932159, "learning_rate": 3.5796170311041826e-07, "logits/chosen": 13.172245025634766, "logits/rejected": 13.207260131835938, "logps/chosen": -3.662092685699463, "logps/rejected": -4.179913520812988, "loss": 3.9219, "rewards/accuracies": 0.75, "rewards/chosen": -36.62092590332031, "rewards/margins": 5.1782121658325195, "rewards/rejected": -41.799137115478516, "step": 4261 }, { "epoch": 0.5803376906318083, "grad_norm": 44.490437407538366, "learning_rate": 3.577726205706421e-07, "logits/chosen": 13.611787796020508, "logits/rejected": 13.873043060302734, "logps/chosen": -4.051097869873047, "logps/rejected": -4.4047441482543945, "loss": 3.6691, "rewards/accuracies": 1.0, "rewards/chosen": -40.51097869873047, "rewards/margins": 3.5364646911621094, "rewards/rejected": -44.04744338989258, "step": 4262 }, { "epoch": 0.5804738562091504, "grad_norm": 38.78704623942908, "learning_rate": 3.575835475724913e-07, "logits/chosen": 13.4385986328125, "logits/rejected": 13.080373764038086, "logps/chosen": -4.121237277984619, "logps/rejected": -4.079686641693115, "loss": 3.7542, "rewards/accuracies": 0.5, "rewards/chosen": -41.212371826171875, "rewards/margins": -0.41550350189208984, "rewards/rejected": -40.79686737060547, "step": 4263 }, { "epoch": 0.5806100217864923, "grad_norm": 40.926100851770556, "learning_rate": 3.5739448415868867e-07, "logits/chosen": 13.130828857421875, "logits/rejected": 14.016490936279297, "logps/chosen": -4.089593410491943, "logps/rejected": -4.285428047180176, "loss": 3.5223, "rewards/accuracies": 0.75, "rewards/chosen": -40.89593505859375, "rewards/margins": 1.9583444595336914, "rewards/rejected": -42.85428237915039, "step": 4264 }, { "epoch": 0.5807461873638344, "grad_norm": 43.82968003645297, "learning_rate": 3.572054303719545e-07, "logits/chosen": 12.73399543762207, "logits/rejected": 13.33255672454834, "logps/chosen": -3.853632688522339, "logps/rejected": -3.9619603157043457, "loss": 3.7901, "rewards/accuracies": 0.5, "rewards/chosen": -38.53632354736328, "rewards/margins": 1.0832796096801758, "rewards/rejected": -39.619606018066406, "step": 4265 }, { "epoch": 0.5808823529411765, "grad_norm": 39.9971607324992, "learning_rate": 3.5701638625500697e-07, "logits/chosen": 13.006378173828125, "logits/rejected": 13.506486892700195, "logps/chosen": -3.7952566146850586, "logps/rejected": -3.9548299312591553, "loss": 3.8393, "rewards/accuracies": 0.5, "rewards/chosen": -37.95256423950195, "rewards/margins": 1.5957322120666504, "rewards/rejected": -39.54829788208008, "step": 4266 }, { "epoch": 0.5810185185185185, "grad_norm": 38.726003734072336, "learning_rate": 3.5682735185056235e-07, "logits/chosen": 13.418411254882812, "logits/rejected": 13.996809005737305, "logps/chosen": -3.7864553928375244, "logps/rejected": -4.175942420959473, "loss": 3.6182, "rewards/accuracies": 0.75, "rewards/chosen": -37.86455535888672, "rewards/margins": 3.8948726654052734, "rewards/rejected": -41.75942611694336, "step": 4267 }, { "epoch": 0.5811546840958606, "grad_norm": 43.95786660800785, "learning_rate": 3.566383272013344e-07, "logits/chosen": 12.383890151977539, "logits/rejected": 13.614974975585938, "logps/chosen": -3.683169364929199, "logps/rejected": -4.165233612060547, "loss": 4.3662, "rewards/accuracies": 0.75, "rewards/chosen": -36.831695556640625, "rewards/margins": 4.820642471313477, "rewards/rejected": -41.65233612060547, "step": 4268 }, { "epoch": 0.5812908496732027, "grad_norm": 45.27502095389069, "learning_rate": 3.56449312350035e-07, "logits/chosen": 12.253890991210938, "logits/rejected": 13.284482955932617, "logps/chosen": -3.8273351192474365, "logps/rejected": -4.0820088386535645, "loss": 3.789, "rewards/accuracies": 0.75, "rewards/chosen": -38.27334976196289, "rewards/margins": 2.546738624572754, "rewards/rejected": -40.82008743286133, "step": 4269 }, { "epoch": 0.5814270152505446, "grad_norm": 41.120022645183774, "learning_rate": 3.562603073393733e-07, "logits/chosen": 13.500970840454102, "logits/rejected": 13.757695198059082, "logps/chosen": -4.1418304443359375, "logps/rejected": -4.497396469116211, "loss": 4.4617, "rewards/accuracies": 0.75, "rewards/chosen": -41.418304443359375, "rewards/margins": 3.5556583404541016, "rewards/rejected": -44.973968505859375, "step": 4270 }, { "epoch": 0.5815631808278867, "grad_norm": 42.548384134769286, "learning_rate": 3.5607131221205674e-07, "logits/chosen": 13.359676361083984, "logits/rejected": 13.425394058227539, "logps/chosen": -4.18484354019165, "logps/rejected": -4.023409843444824, "loss": 4.061, "rewards/accuracies": 0.25, "rewards/chosen": -41.84843444824219, "rewards/margins": -1.6143369674682617, "rewards/rejected": -40.234100341796875, "step": 4271 }, { "epoch": 0.5816993464052288, "grad_norm": 43.609905016164525, "learning_rate": 3.558823270107904e-07, "logits/chosen": 12.98696517944336, "logits/rejected": 13.595407485961914, "logps/chosen": -4.199023246765137, "logps/rejected": -4.572011947631836, "loss": 3.8527, "rewards/accuracies": 1.0, "rewards/chosen": -41.990234375, "rewards/margins": 3.729886054992676, "rewards/rejected": -45.720123291015625, "step": 4272 }, { "epoch": 0.5818355119825708, "grad_norm": 43.82498694643821, "learning_rate": 3.556933517782769e-07, "logits/chosen": 12.84468936920166, "logits/rejected": 13.610940933227539, "logps/chosen": -3.887098789215088, "logps/rejected": -4.205389499664307, "loss": 3.6451, "rewards/accuracies": 1.0, "rewards/chosen": -38.87099075317383, "rewards/margins": 3.1829051971435547, "rewards/rejected": -42.05389404296875, "step": 4273 }, { "epoch": 0.5819716775599129, "grad_norm": 42.11178589102685, "learning_rate": 3.5550438655721676e-07, "logits/chosen": 12.547136306762695, "logits/rejected": 13.131826400756836, "logps/chosen": -3.9574332237243652, "logps/rejected": -4.268854141235352, "loss": 3.9977, "rewards/accuracies": 0.75, "rewards/chosen": -39.5743293762207, "rewards/margins": 3.1142091751098633, "rewards/rejected": -42.688541412353516, "step": 4274 }, { "epoch": 0.5821078431372549, "grad_norm": 41.595799430688466, "learning_rate": 3.5531543139030826e-07, "logits/chosen": 13.752508163452148, "logits/rejected": 14.191330909729004, "logps/chosen": -4.14561653137207, "logps/rejected": -4.177323341369629, "loss": 4.415, "rewards/accuracies": 0.5, "rewards/chosen": -41.45616149902344, "rewards/margins": 0.31707000732421875, "rewards/rejected": -41.773231506347656, "step": 4275 }, { "epoch": 0.5822440087145969, "grad_norm": 51.954698852295294, "learning_rate": 3.551264863202476e-07, "logits/chosen": 12.340909957885742, "logits/rejected": 12.654850006103516, "logps/chosen": -3.7292516231536865, "logps/rejected": -4.045270919799805, "loss": 4.0637, "rewards/accuracies": 0.5, "rewards/chosen": -37.29251480102539, "rewards/margins": 3.1601948738098145, "rewards/rejected": -40.45271301269531, "step": 4276 }, { "epoch": 0.582380174291939, "grad_norm": 39.58999666522039, "learning_rate": 3.549375513897281e-07, "logits/chosen": 12.626493453979492, "logits/rejected": 13.462646484375, "logps/chosen": -3.844521999359131, "logps/rejected": -4.057680606842041, "loss": 3.8728, "rewards/accuracies": 0.75, "rewards/chosen": -38.445220947265625, "rewards/margins": 2.131585121154785, "rewards/rejected": -40.576805114746094, "step": 4277 }, { "epoch": 0.5825163398692811, "grad_norm": 51.302452875041084, "learning_rate": 3.5474862664144134e-07, "logits/chosen": 13.249589920043945, "logits/rejected": 13.366691589355469, "logps/chosen": -4.369102478027344, "logps/rejected": -4.320023536682129, "loss": 3.9116, "rewards/accuracies": 0.75, "rewards/chosen": -43.69102096557617, "rewards/margins": -0.49078845977783203, "rewards/rejected": -43.200233459472656, "step": 4278 }, { "epoch": 0.5826525054466231, "grad_norm": 43.12771282307487, "learning_rate": 3.545597121180766e-07, "logits/chosen": 13.42630672454834, "logits/rejected": 13.744710922241211, "logps/chosen": -4.446748733520508, "logps/rejected": -4.699286937713623, "loss": 4.3182, "rewards/accuracies": 0.5, "rewards/chosen": -44.46748733520508, "rewards/margins": 2.525381088256836, "rewards/rejected": -46.99286651611328, "step": 4279 }, { "epoch": 0.5827886710239651, "grad_norm": 51.15601972023125, "learning_rate": 3.543708078623204e-07, "logits/chosen": 12.841596603393555, "logits/rejected": 14.055984497070312, "logps/chosen": -3.8954572677612305, "logps/rejected": -3.8622193336486816, "loss": 3.9155, "rewards/accuracies": 0.5, "rewards/chosen": -38.95457458496094, "rewards/margins": -0.3323793411254883, "rewards/rejected": -38.622196197509766, "step": 4280 }, { "epoch": 0.5829248366013072, "grad_norm": 43.66993975990983, "learning_rate": 3.541819139168573e-07, "logits/chosen": 12.857190132141113, "logits/rejected": 14.246658325195312, "logps/chosen": -3.8185434341430664, "logps/rejected": -4.156088829040527, "loss": 4.2244, "rewards/accuracies": 0.75, "rewards/chosen": -38.18543243408203, "rewards/margins": 3.375455856323242, "rewards/rejected": -41.560890197753906, "step": 4281 }, { "epoch": 0.5830610021786492, "grad_norm": 43.76898906961558, "learning_rate": 3.5399303032436967e-07, "logits/chosen": 12.400751113891602, "logits/rejected": 13.270633697509766, "logps/chosen": -3.9749679565429688, "logps/rejected": -4.347236633300781, "loss": 4.4578, "rewards/accuracies": 0.75, "rewards/chosen": -39.74968338012695, "rewards/margins": 3.722681999206543, "rewards/rejected": -43.47236633300781, "step": 4282 }, { "epoch": 0.5831971677559913, "grad_norm": 46.69990843436706, "learning_rate": 3.5380415712753695e-07, "logits/chosen": 14.02488899230957, "logits/rejected": 14.093461990356445, "logps/chosen": -4.271542549133301, "logps/rejected": -4.51140022277832, "loss": 4.0514, "rewards/accuracies": 1.0, "rewards/chosen": -42.715423583984375, "rewards/margins": 2.398578643798828, "rewards/rejected": -45.1140022277832, "step": 4283 }, { "epoch": 0.5833333333333334, "grad_norm": 72.1821343220536, "learning_rate": 3.536152943690368e-07, "logits/chosen": 14.047417640686035, "logits/rejected": 13.350400924682617, "logps/chosen": -4.351179599761963, "logps/rejected": -4.087592601776123, "loss": 3.5827, "rewards/accuracies": 0.5, "rewards/chosen": -43.51179504394531, "rewards/margins": -2.6358699798583984, "rewards/rejected": -40.87592315673828, "step": 4284 }, { "epoch": 0.5834694989106753, "grad_norm": 45.09693218972388, "learning_rate": 3.534264420915445e-07, "logits/chosen": 13.773500442504883, "logits/rejected": 13.555929183959961, "logps/chosen": -4.3986358642578125, "logps/rejected": -4.360444068908691, "loss": 4.497, "rewards/accuracies": 0.75, "rewards/chosen": -43.986358642578125, "rewards/margins": -0.38191986083984375, "rewards/rejected": -43.60443878173828, "step": 4285 }, { "epoch": 0.5836056644880174, "grad_norm": 42.03292004841415, "learning_rate": 3.532376003377324e-07, "logits/chosen": 13.396753311157227, "logits/rejected": 13.039261817932129, "logps/chosen": -3.766940116882324, "logps/rejected": -3.962550401687622, "loss": 4.3138, "rewards/accuracies": 0.75, "rewards/chosen": -37.66939926147461, "rewards/margins": 1.9561071395874023, "rewards/rejected": -39.62550354003906, "step": 4286 }, { "epoch": 0.5837418300653595, "grad_norm": 43.37376034644964, "learning_rate": 3.53048769150271e-07, "logits/chosen": 13.407553672790527, "logits/rejected": 12.985078811645508, "logps/chosen": -4.182225704193115, "logps/rejected": -4.203861236572266, "loss": 3.7856, "rewards/accuracies": 0.5, "rewards/chosen": -41.82225799560547, "rewards/margins": 0.2163553237915039, "rewards/rejected": -42.038612365722656, "step": 4287 }, { "epoch": 0.5838779956427015, "grad_norm": 44.339499158613584, "learning_rate": 3.528599485718285e-07, "logits/chosen": 13.857898712158203, "logits/rejected": 14.989082336425781, "logps/chosen": -4.276212692260742, "logps/rejected": -4.569656848907471, "loss": 4.0363, "rewards/accuracies": 1.0, "rewards/chosen": -42.76213073730469, "rewards/margins": 2.9344377517700195, "rewards/rejected": -45.69656753540039, "step": 4288 }, { "epoch": 0.5840141612200436, "grad_norm": 41.17507710834532, "learning_rate": 3.5267113864507016e-07, "logits/chosen": 12.810403823852539, "logits/rejected": 12.79454231262207, "logps/chosen": -3.658449172973633, "logps/rejected": -3.945359945297241, "loss": 3.8097, "rewards/accuracies": 0.75, "rewards/chosen": -36.58449172973633, "rewards/margins": 2.869107246398926, "rewards/rejected": -39.45359802246094, "step": 4289 }, { "epoch": 0.5841503267973857, "grad_norm": 48.86040625687893, "learning_rate": 3.5248233941265926e-07, "logits/chosen": 13.939366340637207, "logits/rejected": 13.119863510131836, "logps/chosen": -4.120084762573242, "logps/rejected": -3.832118511199951, "loss": 4.2489, "rewards/accuracies": 0.25, "rewards/chosen": -41.20085144042969, "rewards/margins": -2.879666328430176, "rewards/rejected": -38.32118225097656, "step": 4290 }, { "epoch": 0.5842864923747276, "grad_norm": 39.65089598128361, "learning_rate": 3.522935509172567e-07, "logits/chosen": 13.48887825012207, "logits/rejected": 13.889187812805176, "logps/chosen": -4.086514472961426, "logps/rejected": -4.184231758117676, "loss": 3.7356, "rewards/accuracies": 0.5, "rewards/chosen": -40.86514663696289, "rewards/margins": 0.9771757125854492, "rewards/rejected": -41.842323303222656, "step": 4291 }, { "epoch": 0.5844226579520697, "grad_norm": 38.622200242322464, "learning_rate": 3.521047732015205e-07, "logits/chosen": 13.4889497756958, "logits/rejected": 13.68838119506836, "logps/chosen": -4.041041374206543, "logps/rejected": -4.309422969818115, "loss": 3.9498, "rewards/accuracies": 0.75, "rewards/chosen": -40.4104118347168, "rewards/margins": 2.6838178634643555, "rewards/rejected": -43.09423065185547, "step": 4292 }, { "epoch": 0.5845588235294118, "grad_norm": 40.39950373140633, "learning_rate": 3.519160063081067e-07, "logits/chosen": 13.128811836242676, "logits/rejected": 13.182086944580078, "logps/chosen": -3.9896068572998047, "logps/rejected": -4.191348075866699, "loss": 4.3113, "rewards/accuracies": 0.75, "rewards/chosen": -39.89606857299805, "rewards/margins": 2.017411231994629, "rewards/rejected": -41.913482666015625, "step": 4293 }, { "epoch": 0.5846949891067538, "grad_norm": 42.90220448333397, "learning_rate": 3.517272502796689e-07, "logits/chosen": 14.119014739990234, "logits/rejected": 14.46517562866211, "logps/chosen": -4.259039402008057, "logps/rejected": -4.49937105178833, "loss": 4.1943, "rewards/accuracies": 0.75, "rewards/chosen": -42.59039306640625, "rewards/margins": 2.4033193588256836, "rewards/rejected": -44.993709564208984, "step": 4294 }, { "epoch": 0.5848311546840959, "grad_norm": 41.27879145318781, "learning_rate": 3.515385051588578e-07, "logits/chosen": 13.027134895324707, "logits/rejected": 13.765565872192383, "logps/chosen": -4.335943698883057, "logps/rejected": -4.7430219650268555, "loss": 4.4426, "rewards/accuracies": 0.75, "rewards/chosen": -43.35943603515625, "rewards/margins": 4.0707807540893555, "rewards/rejected": -47.43021774291992, "step": 4295 }, { "epoch": 0.5849673202614379, "grad_norm": 42.38984873578419, "learning_rate": 3.5134977098832195e-07, "logits/chosen": 12.092887878417969, "logits/rejected": 12.72728157043457, "logps/chosen": -4.16765022277832, "logps/rejected": -3.9744677543640137, "loss": 4.6018, "rewards/accuracies": 0.5, "rewards/chosen": -41.67650604248047, "rewards/margins": -1.9318275451660156, "rewards/rejected": -39.74467468261719, "step": 4296 }, { "epoch": 0.5851034858387799, "grad_norm": 41.7056235735312, "learning_rate": 3.5116104781070774e-07, "logits/chosen": 12.641155242919922, "logits/rejected": 12.28789234161377, "logps/chosen": -4.0031561851501465, "logps/rejected": -3.7711851596832275, "loss": 4.214, "rewards/accuracies": 0.25, "rewards/chosen": -40.03156280517578, "rewards/margins": -2.319713592529297, "rewards/rejected": -37.71185302734375, "step": 4297 }, { "epoch": 0.585239651416122, "grad_norm": 50.64373745300544, "learning_rate": 3.509723356686583e-07, "logits/chosen": 13.866331100463867, "logits/rejected": 13.207113265991211, "logps/chosen": -4.300279140472412, "logps/rejected": -4.272418975830078, "loss": 3.7238, "rewards/accuracies": 0.5, "rewards/chosen": -43.00279235839844, "rewards/margins": -0.2786064147949219, "rewards/rejected": -42.72418212890625, "step": 4298 }, { "epoch": 0.5853758169934641, "grad_norm": 40.57050391801842, "learning_rate": 3.507836346048149e-07, "logits/chosen": 13.669160842895508, "logits/rejected": 13.548855781555176, "logps/chosen": -4.298103332519531, "logps/rejected": -4.441295623779297, "loss": 4.122, "rewards/accuracies": 0.5, "rewards/chosen": -42.98103332519531, "rewards/margins": 1.4319238662719727, "rewards/rejected": -44.41295623779297, "step": 4299 }, { "epoch": 0.585511982570806, "grad_norm": 39.0381939056817, "learning_rate": 3.5059494466181623e-07, "logits/chosen": 14.335673332214355, "logits/rejected": 14.083944320678711, "logps/chosen": -4.494156837463379, "logps/rejected": -4.768950462341309, "loss": 3.7188, "rewards/accuracies": 0.75, "rewards/chosen": -44.94157028198242, "rewards/margins": 2.7479352951049805, "rewards/rejected": -47.68950653076172, "step": 4300 }, { "epoch": 0.5856481481481481, "grad_norm": 42.85917790408909, "learning_rate": 3.5040626588229803e-07, "logits/chosen": 13.832700729370117, "logits/rejected": 13.50514030456543, "logps/chosen": -4.486448287963867, "logps/rejected": -4.415960788726807, "loss": 3.9104, "rewards/accuracies": 0.25, "rewards/chosen": -44.864479064941406, "rewards/margins": -0.7048721313476562, "rewards/rejected": -44.15960693359375, "step": 4301 }, { "epoch": 0.5857843137254902, "grad_norm": 41.2298879905672, "learning_rate": 3.50217598308894e-07, "logits/chosen": 13.638866424560547, "logits/rejected": 13.97785472869873, "logps/chosen": -4.388130187988281, "logps/rejected": -4.360433578491211, "loss": 4.3121, "rewards/accuracies": 0.75, "rewards/chosen": -43.88130569458008, "rewards/margins": -0.27696800231933594, "rewards/rejected": -43.604339599609375, "step": 4302 }, { "epoch": 0.5859204793028322, "grad_norm": 39.16731184510021, "learning_rate": 3.5002894198423533e-07, "logits/chosen": 13.823366165161133, "logits/rejected": 13.665675163269043, "logps/chosen": -4.162510871887207, "logps/rejected": -4.260349750518799, "loss": 3.8531, "rewards/accuracies": 0.5, "rewards/chosen": -41.62510681152344, "rewards/margins": 0.9783916473388672, "rewards/rejected": -42.60350036621094, "step": 4303 }, { "epoch": 0.5860566448801743, "grad_norm": 41.172917647320276, "learning_rate": 3.498402969509501e-07, "logits/chosen": 13.920587539672852, "logits/rejected": 14.63228988647461, "logps/chosen": -4.243627548217773, "logps/rejected": -4.472935676574707, "loss": 4.179, "rewards/accuracies": 1.0, "rewards/chosen": -42.43627166748047, "rewards/margins": 2.2930870056152344, "rewards/rejected": -44.7293586730957, "step": 4304 }, { "epoch": 0.5861928104575164, "grad_norm": 40.55291553475119, "learning_rate": 3.496516632516644e-07, "logits/chosen": 13.645025253295898, "logits/rejected": 13.53955364227295, "logps/chosen": -4.260712623596191, "logps/rejected": -4.2229323387146, "loss": 4.1113, "rewards/accuracies": 0.25, "rewards/chosen": -42.60713195800781, "rewards/margins": -0.3778057098388672, "rewards/rejected": -42.22932434082031, "step": 4305 }, { "epoch": 0.5863289760348583, "grad_norm": 39.134925036574046, "learning_rate": 3.494630409290017e-07, "logits/chosen": 14.175517082214355, "logits/rejected": 14.043231010437012, "logps/chosen": -4.491948127746582, "logps/rejected": -4.444305419921875, "loss": 4.0117, "rewards/accuracies": 0.75, "rewards/chosen": -44.91947937011719, "rewards/margins": -0.47643089294433594, "rewards/rejected": -44.44305419921875, "step": 4306 }, { "epoch": 0.5864651416122004, "grad_norm": 189.24378797792926, "learning_rate": 3.4927443002558255e-07, "logits/chosen": 12.620954513549805, "logits/rejected": 13.263776779174805, "logps/chosen": -3.8361990451812744, "logps/rejected": -3.974544048309326, "loss": 3.8196, "rewards/accuracies": 0.75, "rewards/chosen": -38.36199188232422, "rewards/margins": 1.3834476470947266, "rewards/rejected": -39.74544143676758, "step": 4307 }, { "epoch": 0.5866013071895425, "grad_norm": 46.70551841084928, "learning_rate": 3.4908583058402517e-07, "logits/chosen": 13.979469299316406, "logits/rejected": 14.518855094909668, "logps/chosen": -4.7094573974609375, "logps/rejected": -4.82792329788208, "loss": 3.8269, "rewards/accuracies": 0.75, "rewards/chosen": -47.094573974609375, "rewards/margins": 1.1846599578857422, "rewards/rejected": -48.279232025146484, "step": 4308 }, { "epoch": 0.5867374727668845, "grad_norm": 47.48381034872391, "learning_rate": 3.488972426469454e-07, "logits/chosen": 12.044387817382812, "logits/rejected": 13.216536521911621, "logps/chosen": -3.8035449981689453, "logps/rejected": -4.230819225311279, "loss": 4.2257, "rewards/accuracies": 1.0, "rewards/chosen": -38.03545379638672, "rewards/margins": 4.272741317749023, "rewards/rejected": -42.30819320678711, "step": 4309 }, { "epoch": 0.5868736383442266, "grad_norm": 45.05346941845319, "learning_rate": 3.4870866625695595e-07, "logits/chosen": 14.03844928741455, "logits/rejected": 13.722061157226562, "logps/chosen": -4.260354042053223, "logps/rejected": -4.138250827789307, "loss": 3.3626, "rewards/accuracies": 0.5, "rewards/chosen": -42.603538513183594, "rewards/margins": -1.2210330963134766, "rewards/rejected": -41.38250732421875, "step": 4310 }, { "epoch": 0.5870098039215687, "grad_norm": 34.92788486482609, "learning_rate": 3.4852010145666733e-07, "logits/chosen": 13.742666244506836, "logits/rejected": 13.322813034057617, "logps/chosen": -4.276019096374512, "logps/rejected": -4.3853254318237305, "loss": 3.9662, "rewards/accuracies": 0.5, "rewards/chosen": -42.76019287109375, "rewards/margins": 1.0930652618408203, "rewards/rejected": -43.85325622558594, "step": 4311 }, { "epoch": 0.5871459694989106, "grad_norm": 42.803270556622834, "learning_rate": 3.483315482886874e-07, "logits/chosen": 13.939580917358398, "logits/rejected": 14.109336853027344, "logps/chosen": -4.526040077209473, "logps/rejected": -4.586982727050781, "loss": 4.1385, "rewards/accuracies": 0.5, "rewards/chosen": -45.26040267944336, "rewards/margins": 0.6094274520874023, "rewards/rejected": -45.86982727050781, "step": 4312 }, { "epoch": 0.5872821350762527, "grad_norm": 40.32058173493773, "learning_rate": 3.4814300679562127e-07, "logits/chosen": 13.729853630065918, "logits/rejected": 14.681896209716797, "logps/chosen": -4.379509449005127, "logps/rejected": -4.916311264038086, "loss": 3.5663, "rewards/accuracies": 0.75, "rewards/chosen": -43.79509735107422, "rewards/margins": 5.368021011352539, "rewards/rejected": -49.163116455078125, "step": 4313 }, { "epoch": 0.5874183006535948, "grad_norm": 38.70261748994582, "learning_rate": 3.4795447702007127e-07, "logits/chosen": 12.996185302734375, "logits/rejected": 13.62963581085205, "logps/chosen": -4.204122543334961, "logps/rejected": -4.40248441696167, "loss": 4.0582, "rewards/accuracies": 0.75, "rewards/chosen": -42.041221618652344, "rewards/margins": 1.9836196899414062, "rewards/rejected": -44.02484130859375, "step": 4314 }, { "epoch": 0.5875544662309368, "grad_norm": 56.66399918078792, "learning_rate": 3.4776595900463745e-07, "logits/chosen": 14.150341033935547, "logits/rejected": 14.520428657531738, "logps/chosen": -4.53103494644165, "logps/rejected": -4.6669721603393555, "loss": 3.9375, "rewards/accuracies": 0.75, "rewards/chosen": -45.31034851074219, "rewards/margins": 1.3593711853027344, "rewards/rejected": -46.66972351074219, "step": 4315 }, { "epoch": 0.5876906318082789, "grad_norm": 41.539551766815926, "learning_rate": 3.47577452791917e-07, "logits/chosen": 14.301251411437988, "logits/rejected": 14.093637466430664, "logps/chosen": -4.1526970863342285, "logps/rejected": -4.281662464141846, "loss": 4.3013, "rewards/accuracies": 0.75, "rewards/chosen": -41.52696990966797, "rewards/margins": 1.2896528244018555, "rewards/rejected": -42.81662368774414, "step": 4316 }, { "epoch": 0.5878267973856209, "grad_norm": 37.67945954154742, "learning_rate": 3.473889584245044e-07, "logits/chosen": 13.40877914428711, "logits/rejected": 14.026348114013672, "logps/chosen": -3.984506130218506, "logps/rejected": -4.426200866699219, "loss": 3.4715, "rewards/accuracies": 1.0, "rewards/chosen": -39.84505844116211, "rewards/margins": 4.4169511795043945, "rewards/rejected": -44.26200866699219, "step": 4317 }, { "epoch": 0.5879629629629629, "grad_norm": 40.056800714049295, "learning_rate": 3.472004759449916e-07, "logits/chosen": 14.49998664855957, "logits/rejected": 13.813831329345703, "logps/chosen": -4.579801082611084, "logps/rejected": -4.750051498413086, "loss": 4.1781, "rewards/accuracies": 0.75, "rewards/chosen": -45.79800796508789, "rewards/margins": 1.7025070190429688, "rewards/rejected": -47.500518798828125, "step": 4318 }, { "epoch": 0.588099128540305, "grad_norm": 42.38422843984808, "learning_rate": 3.470120053959675e-07, "logits/chosen": 13.705309867858887, "logits/rejected": 13.848670959472656, "logps/chosen": -4.345311164855957, "logps/rejected": -4.519989967346191, "loss": 4.0691, "rewards/accuracies": 0.75, "rewards/chosen": -43.45310974121094, "rewards/margins": 1.746790885925293, "rewards/rejected": -45.19990539550781, "step": 4319 }, { "epoch": 0.5882352941176471, "grad_norm": 39.85978168381945, "learning_rate": 3.46823546820019e-07, "logits/chosen": 13.680425643920898, "logits/rejected": 13.7301664352417, "logps/chosen": -4.1370744705200195, "logps/rejected": -4.267681121826172, "loss": 4.4635, "rewards/accuracies": 0.75, "rewards/chosen": -41.370750427246094, "rewards/margins": 1.3060665130615234, "rewards/rejected": -42.676815032958984, "step": 4320 }, { "epoch": 0.588371459694989, "grad_norm": 38.662196720839944, "learning_rate": 3.466351002597296e-07, "logits/chosen": 13.10025691986084, "logits/rejected": 14.159589767456055, "logps/chosen": -4.003345489501953, "logps/rejected": -4.437107086181641, "loss": 3.8238, "rewards/accuracies": 1.0, "rewards/chosen": -40.03345489501953, "rewards/margins": 4.337617874145508, "rewards/rejected": -44.371070861816406, "step": 4321 }, { "epoch": 0.5885076252723311, "grad_norm": 39.821093196799815, "learning_rate": 3.4644666575768035e-07, "logits/chosen": 12.59730339050293, "logits/rejected": 13.817813873291016, "logps/chosen": -3.8946940898895264, "logps/rejected": -4.189352989196777, "loss": 3.9341, "rewards/accuracies": 0.5, "rewards/chosen": -38.94694137573242, "rewards/margins": 2.946587562561035, "rewards/rejected": -41.893531799316406, "step": 4322 }, { "epoch": 0.5886437908496732, "grad_norm": 40.199820927614745, "learning_rate": 3.4625824335644963e-07, "logits/chosen": 12.75479507446289, "logits/rejected": 13.586759567260742, "logps/chosen": -4.211781978607178, "logps/rejected": -4.273665904998779, "loss": 4.1335, "rewards/accuracies": 0.25, "rewards/chosen": -42.117820739746094, "rewards/margins": 0.6188383102416992, "rewards/rejected": -42.736656188964844, "step": 4323 }, { "epoch": 0.5887799564270153, "grad_norm": 46.83616353440841, "learning_rate": 3.460698330986132e-07, "logits/chosen": 13.511344909667969, "logits/rejected": 13.452463150024414, "logps/chosen": -4.285112380981445, "logps/rejected": -4.529021739959717, "loss": 3.7081, "rewards/accuracies": 0.75, "rewards/chosen": -42.85111999511719, "rewards/margins": 2.4390945434570312, "rewards/rejected": -45.29021453857422, "step": 4324 }, { "epoch": 0.5889161220043573, "grad_norm": 35.66188622010509, "learning_rate": 3.458814350267437e-07, "logits/chosen": 13.659627914428711, "logits/rejected": 13.288667678833008, "logps/chosen": -4.617932319641113, "logps/rejected": -4.425488471984863, "loss": 3.9103, "rewards/accuracies": 0.25, "rewards/chosen": -46.17932891845703, "rewards/margins": -1.9244403839111328, "rewards/rejected": -44.2548828125, "step": 4325 }, { "epoch": 0.5890522875816994, "grad_norm": 39.948491584902, "learning_rate": 3.4569304918341124e-07, "logits/chosen": 13.092483520507812, "logits/rejected": 13.013954162597656, "logps/chosen": -3.825315475463867, "logps/rejected": -3.918311834335327, "loss": 3.8945, "rewards/accuracies": 0.75, "rewards/chosen": -38.253150939941406, "rewards/margins": 0.9299650192260742, "rewards/rejected": -39.18312072753906, "step": 4326 }, { "epoch": 0.5891884531590414, "grad_norm": 41.991113707073666, "learning_rate": 3.455046756111834e-07, "logits/chosen": 13.188711166381836, "logits/rejected": 13.764543533325195, "logps/chosen": -4.162691116333008, "logps/rejected": -4.631637096405029, "loss": 4.4511, "rewards/accuracies": 1.0, "rewards/chosen": -41.62691116333008, "rewards/margins": 4.689459800720215, "rewards/rejected": -46.316368103027344, "step": 4327 }, { "epoch": 0.5893246187363834, "grad_norm": 40.23998446130309, "learning_rate": 3.453163143526244e-07, "logits/chosen": 14.200606346130371, "logits/rejected": 13.997411727905273, "logps/chosen": -4.650093078613281, "logps/rejected": -4.511104106903076, "loss": 4.1067, "rewards/accuracies": 0.25, "rewards/chosen": -46.50093078613281, "rewards/margins": -1.3898906707763672, "rewards/rejected": -45.11104202270508, "step": 4328 }, { "epoch": 0.5894607843137255, "grad_norm": 41.26765203116648, "learning_rate": 3.4512796545029616e-07, "logits/chosen": 13.192590713500977, "logits/rejected": 13.695428848266602, "logps/chosen": -4.01795768737793, "logps/rejected": -4.666455268859863, "loss": 3.8742, "rewards/accuracies": 1.0, "rewards/chosen": -40.1795768737793, "rewards/margins": 6.484975814819336, "rewards/rejected": -46.66455078125, "step": 4329 }, { "epoch": 0.5895969498910676, "grad_norm": 39.987931932598535, "learning_rate": 3.4493962894675794e-07, "logits/chosen": 14.232585906982422, "logits/rejected": 13.41817855834961, "logps/chosen": -4.509221076965332, "logps/rejected": -4.65326452255249, "loss": 3.5807, "rewards/accuracies": 0.5, "rewards/chosen": -45.09220886230469, "rewards/margins": 1.4404354095458984, "rewards/rejected": -46.53264617919922, "step": 4330 }, { "epoch": 0.5897331154684096, "grad_norm": 43.95709050523301, "learning_rate": 3.4475130488456543e-07, "logits/chosen": 13.32349967956543, "logits/rejected": 13.847848892211914, "logps/chosen": -3.9757397174835205, "logps/rejected": -4.403846263885498, "loss": 3.984, "rewards/accuracies": 0.75, "rewards/chosen": -39.75740051269531, "rewards/margins": 4.281064033508301, "rewards/rejected": -44.03845977783203, "step": 4331 }, { "epoch": 0.5898692810457516, "grad_norm": 50.88259355599713, "learning_rate": 3.445629933062723e-07, "logits/chosen": 13.321462631225586, "logits/rejected": 13.95924186706543, "logps/chosen": -4.4813923835754395, "logps/rejected": -4.393791198730469, "loss": 3.7319, "rewards/accuracies": 0.25, "rewards/chosen": -44.81392288208008, "rewards/margins": -0.8760128021240234, "rewards/rejected": -43.93791198730469, "step": 4332 }, { "epoch": 0.5900054466230937, "grad_norm": 43.98984114110639, "learning_rate": 3.443746942544293e-07, "logits/chosen": 13.230945587158203, "logits/rejected": 13.282575607299805, "logps/chosen": -4.17078161239624, "logps/rejected": -4.014044284820557, "loss": 4.0966, "rewards/accuracies": 0.25, "rewards/chosen": -41.70781326293945, "rewards/margins": -1.5673704147338867, "rewards/rejected": -40.140445709228516, "step": 4333 }, { "epoch": 0.5901416122004357, "grad_norm": 45.071773075814235, "learning_rate": 3.441864077715838e-07, "logits/chosen": 13.393321990966797, "logits/rejected": 13.550792694091797, "logps/chosen": -4.361236572265625, "logps/rejected": -4.280847549438477, "loss": 4.4506, "rewards/accuracies": 0.5, "rewards/chosen": -43.612369537353516, "rewards/margins": -0.80389404296875, "rewards/rejected": -42.8084716796875, "step": 4334 }, { "epoch": 0.5902777777777778, "grad_norm": 42.108140139937056, "learning_rate": 3.4399813390028073e-07, "logits/chosen": 13.147708892822266, "logits/rejected": 13.152702331542969, "logps/chosen": -3.992717742919922, "logps/rejected": -4.19569206237793, "loss": 3.6498, "rewards/accuracies": 0.75, "rewards/chosen": -39.92717742919922, "rewards/margins": 2.029743194580078, "rewards/rejected": -41.95692443847656, "step": 4335 }, { "epoch": 0.5904139433551199, "grad_norm": 40.173209333552116, "learning_rate": 3.438098726830624e-07, "logits/chosen": 13.136293411254883, "logits/rejected": 13.370393753051758, "logps/chosen": -4.300230026245117, "logps/rejected": -4.6205034255981445, "loss": 3.5782, "rewards/accuracies": 0.75, "rewards/chosen": -43.00230407714844, "rewards/margins": 3.2027359008789062, "rewards/rejected": -46.20503616333008, "step": 4336 }, { "epoch": 0.5905501089324618, "grad_norm": 37.98965045658045, "learning_rate": 3.436216241624677e-07, "logits/chosen": 12.765145301818848, "logits/rejected": 13.279656410217285, "logps/chosen": -3.7151474952697754, "logps/rejected": -4.216002464294434, "loss": 3.9304, "rewards/accuracies": 1.0, "rewards/chosen": -37.15147399902344, "rewards/margins": 5.008545875549316, "rewards/rejected": -42.1600227355957, "step": 4337 }, { "epoch": 0.5906862745098039, "grad_norm": 41.42936579141186, "learning_rate": 3.43433388381033e-07, "logits/chosen": 13.508018493652344, "logits/rejected": 13.966939926147461, "logps/chosen": -4.375087738037109, "logps/rejected": -4.038622856140137, "loss": 4.2708, "rewards/accuracies": 0.25, "rewards/chosen": -43.750877380371094, "rewards/margins": -3.364649772644043, "rewards/rejected": -40.386226654052734, "step": 4338 }, { "epoch": 0.590822440087146, "grad_norm": 38.55258944600666, "learning_rate": 3.43245165381292e-07, "logits/chosen": 13.382080078125, "logits/rejected": 13.264945983886719, "logps/chosen": -4.237894058227539, "logps/rejected": -4.259644985198975, "loss": 4.0187, "rewards/accuracies": 0.5, "rewards/chosen": -42.378936767578125, "rewards/margins": 0.21751117706298828, "rewards/rejected": -42.59645080566406, "step": 4339 }, { "epoch": 0.590958605664488, "grad_norm": 39.50543193461904, "learning_rate": 3.430569552057748e-07, "logits/chosen": 13.763830184936523, "logits/rejected": 13.369348526000977, "logps/chosen": -3.902681350708008, "logps/rejected": -3.9462366104125977, "loss": 3.9715, "rewards/accuracies": 0.75, "rewards/chosen": -39.02681350708008, "rewards/margins": 0.43555450439453125, "rewards/rejected": -39.46236801147461, "step": 4340 }, { "epoch": 0.5910947712418301, "grad_norm": 42.70821356733408, "learning_rate": 3.4286875789700926e-07, "logits/chosen": 13.185810089111328, "logits/rejected": 13.923236846923828, "logps/chosen": -4.359104156494141, "logps/rejected": -4.551340103149414, "loss": 3.6265, "rewards/accuracies": 0.5, "rewards/chosen": -43.59103775024414, "rewards/margins": 1.9223594665527344, "rewards/rejected": -45.513397216796875, "step": 4341 }, { "epoch": 0.5912309368191722, "grad_norm": 40.69512586300813, "learning_rate": 3.426805734975203e-07, "logits/chosen": 13.974278450012207, "logits/rejected": 14.106708526611328, "logps/chosen": -4.338364601135254, "logps/rejected": -4.384223937988281, "loss": 4.3977, "rewards/accuracies": 0.25, "rewards/chosen": -43.38364791870117, "rewards/margins": 0.45859241485595703, "rewards/rejected": -43.84223937988281, "step": 4342 }, { "epoch": 0.5913671023965141, "grad_norm": 43.850721975364095, "learning_rate": 3.4249240204982944e-07, "logits/chosen": 12.505617141723633, "logits/rejected": 13.07974624633789, "logps/chosen": -4.172321319580078, "logps/rejected": -4.410087585449219, "loss": 4.6015, "rewards/accuracies": 0.5, "rewards/chosen": -41.72321319580078, "rewards/margins": 2.377659797668457, "rewards/rejected": -44.10087585449219, "step": 4343 }, { "epoch": 0.5915032679738562, "grad_norm": 46.06287203472854, "learning_rate": 3.423042435964557e-07, "logits/chosen": 13.76417350769043, "logits/rejected": 13.804916381835938, "logps/chosen": -4.098508834838867, "logps/rejected": -4.61700439453125, "loss": 4.4475, "rewards/accuracies": 0.75, "rewards/chosen": -40.98509216308594, "rewards/margins": 5.184954643249512, "rewards/rejected": -46.1700439453125, "step": 4344 }, { "epoch": 0.5916394335511983, "grad_norm": 41.529905109040136, "learning_rate": 3.421160981799152e-07, "logits/chosen": 14.027532577514648, "logits/rejected": 13.347895622253418, "logps/chosen": -4.645906925201416, "logps/rejected": -4.24305534362793, "loss": 4.5488, "rewards/accuracies": 0.25, "rewards/chosen": -46.459075927734375, "rewards/margins": -4.02851676940918, "rewards/rejected": -42.43055725097656, "step": 4345 }, { "epoch": 0.5917755991285403, "grad_norm": 43.627288418310016, "learning_rate": 3.4192796584272057e-07, "logits/chosen": 12.6951322555542, "logits/rejected": 13.245628356933594, "logps/chosen": -3.9508113861083984, "logps/rejected": -4.330628395080566, "loss": 3.9364, "rewards/accuracies": 1.0, "rewards/chosen": -39.50811767578125, "rewards/margins": 3.7981672286987305, "rewards/rejected": -43.30628204345703, "step": 4346 }, { "epoch": 0.5919117647058824, "grad_norm": 41.77270370505783, "learning_rate": 3.417398466273821e-07, "logits/chosen": 14.571434020996094, "logits/rejected": 13.586692810058594, "logps/chosen": -4.443883419036865, "logps/rejected": -4.159981727600098, "loss": 4.241, "rewards/accuracies": 0.0, "rewards/chosen": -44.43883514404297, "rewards/margins": -2.839022636413574, "rewards/rejected": -41.599815368652344, "step": 4347 }, { "epoch": 0.5920479302832244, "grad_norm": 39.116108112848, "learning_rate": 3.4155174057640703e-07, "logits/chosen": 13.296073913574219, "logits/rejected": 14.471161842346191, "logps/chosen": -4.118619918823242, "logps/rejected": -4.651468276977539, "loss": 3.9628, "rewards/accuracies": 0.75, "rewards/chosen": -41.18619918823242, "rewards/margins": 5.328485488891602, "rewards/rejected": -46.514686584472656, "step": 4348 }, { "epoch": 0.5921840958605664, "grad_norm": 42.40130077812464, "learning_rate": 3.413636477322992e-07, "logits/chosen": 13.603649139404297, "logits/rejected": 13.597070693969727, "logps/chosen": -4.22309684753418, "logps/rejected": -4.244063377380371, "loss": 4.2063, "rewards/accuracies": 0.75, "rewards/chosen": -42.23097229003906, "rewards/margins": 0.20966053009033203, "rewards/rejected": -42.44062805175781, "step": 4349 }, { "epoch": 0.5923202614379085, "grad_norm": 41.08492969645726, "learning_rate": 3.4117556813755985e-07, "logits/chosen": 13.1868257522583, "logits/rejected": 13.057145118713379, "logps/chosen": -3.9703850746154785, "logps/rejected": -4.106907367706299, "loss": 4.2639, "rewards/accuracies": 0.75, "rewards/chosen": -39.70384979248047, "rewards/margins": 1.3652238845825195, "rewards/rejected": -41.06907653808594, "step": 4350 }, { "epoch": 0.5924564270152506, "grad_norm": 38.76076831378057, "learning_rate": 3.4098750183468726e-07, "logits/chosen": 13.098566055297852, "logits/rejected": 13.794827461242676, "logps/chosen": -4.036380767822266, "logps/rejected": -4.25720739364624, "loss": 3.8011, "rewards/accuracies": 0.75, "rewards/chosen": -40.363807678222656, "rewards/margins": 2.208266258239746, "rewards/rejected": -42.57207107543945, "step": 4351 }, { "epoch": 0.5925925925925926, "grad_norm": 45.169668237846594, "learning_rate": 3.407994488661763e-07, "logits/chosen": 13.335983276367188, "logits/rejected": 13.523667335510254, "logps/chosen": -4.125065803527832, "logps/rejected": -4.3809309005737305, "loss": 4.1177, "rewards/accuracies": 1.0, "rewards/chosen": -41.25065994262695, "rewards/margins": 2.558648109436035, "rewards/rejected": -43.809303283691406, "step": 4352 }, { "epoch": 0.5927287581699346, "grad_norm": 44.54058365825299, "learning_rate": 3.4061140927451915e-07, "logits/chosen": 13.427595138549805, "logits/rejected": 13.384209632873535, "logps/chosen": -4.355703353881836, "logps/rejected": -4.32790470123291, "loss": 4.315, "rewards/accuracies": 0.5, "rewards/chosen": -43.55703353881836, "rewards/margins": -0.2779865264892578, "rewards/rejected": -43.27904510498047, "step": 4353 }, { "epoch": 0.5928649237472767, "grad_norm": 40.49983370089256, "learning_rate": 3.4042338310220524e-07, "logits/chosen": 13.254109382629395, "logits/rejected": 14.567268371582031, "logps/chosen": -4.011884689331055, "logps/rejected": -4.5171613693237305, "loss": 3.5058, "rewards/accuracies": 1.0, "rewards/chosen": -40.11885070800781, "rewards/margins": 5.052761077880859, "rewards/rejected": -45.17161178588867, "step": 4354 }, { "epoch": 0.5930010893246187, "grad_norm": 38.99937279470688, "learning_rate": 3.4023537039172015e-07, "logits/chosen": 12.955955505371094, "logits/rejected": 13.089639663696289, "logps/chosen": -4.3334503173828125, "logps/rejected": -4.314452171325684, "loss": 3.6915, "rewards/accuracies": 0.25, "rewards/chosen": -43.334503173828125, "rewards/margins": -0.18998241424560547, "rewards/rejected": -43.14452362060547, "step": 4355 }, { "epoch": 0.5931372549019608, "grad_norm": 45.550579822630255, "learning_rate": 3.400473711855472e-07, "logits/chosen": 13.625041961669922, "logits/rejected": 13.956258773803711, "logps/chosen": -4.309151649475098, "logps/rejected": -4.477687835693359, "loss": 3.7127, "rewards/accuracies": 0.5, "rewards/chosen": -43.09151840209961, "rewards/margins": 1.6853570938110352, "rewards/rejected": -44.77687454223633, "step": 4356 }, { "epoch": 0.5932734204793029, "grad_norm": 40.905582838531586, "learning_rate": 3.3985938552616646e-07, "logits/chosen": 13.404170989990234, "logits/rejected": 13.544118881225586, "logps/chosen": -4.09712553024292, "logps/rejected": -4.519315719604492, "loss": 3.8153, "rewards/accuracies": 1.0, "rewards/chosen": -40.97125244140625, "rewards/margins": 4.221900939941406, "rewards/rejected": -45.19315719604492, "step": 4357 }, { "epoch": 0.5934095860566448, "grad_norm": 40.64061366927126, "learning_rate": 3.396714134560545e-07, "logits/chosen": 13.130264282226562, "logits/rejected": 14.242759704589844, "logps/chosen": -4.165284156799316, "logps/rejected": -4.37272834777832, "loss": 3.8996, "rewards/accuracies": 0.5, "rewards/chosen": -41.65283966064453, "rewards/margins": 2.0744447708129883, "rewards/rejected": -43.72727966308594, "step": 4358 }, { "epoch": 0.5935457516339869, "grad_norm": 41.90236506484107, "learning_rate": 3.394834550176853e-07, "logits/chosen": 13.280539512634277, "logits/rejected": 13.11175537109375, "logps/chosen": -4.3438215255737305, "logps/rejected": -4.277554988861084, "loss": 3.9379, "rewards/accuracies": 0.25, "rewards/chosen": -43.43821334838867, "rewards/margins": -0.6626644134521484, "rewards/rejected": -42.775550842285156, "step": 4359 }, { "epoch": 0.593681917211329, "grad_norm": 46.95750986914637, "learning_rate": 3.3929551025352987e-07, "logits/chosen": 13.603727340698242, "logits/rejected": 12.960192680358887, "logps/chosen": -4.229073524475098, "logps/rejected": -3.944366216659546, "loss": 4.2561, "rewards/accuracies": 0.25, "rewards/chosen": -42.290733337402344, "rewards/margins": -2.847071647644043, "rewards/rejected": -39.443660736083984, "step": 4360 }, { "epoch": 0.593818082788671, "grad_norm": 47.272868125490895, "learning_rate": 3.391075792060556e-07, "logits/chosen": 13.567151069641113, "logits/rejected": 13.215681076049805, "logps/chosen": -4.10147762298584, "logps/rejected": -4.111930847167969, "loss": 4.4621, "rewards/accuracies": 0.5, "rewards/chosen": -41.01477813720703, "rewards/margins": 0.10453319549560547, "rewards/rejected": -41.11930847167969, "step": 4361 }, { "epoch": 0.5939542483660131, "grad_norm": 45.76038911121762, "learning_rate": 3.389196619177271e-07, "logits/chosen": 13.504724502563477, "logits/rejected": 14.344345092773438, "logps/chosen": -4.681843280792236, "logps/rejected": -4.69072151184082, "loss": 4.1014, "rewards/accuracies": 0.5, "rewards/chosen": -46.81843185424805, "rewards/margins": 0.08878326416015625, "rewards/rejected": -46.9072151184082, "step": 4362 }, { "epoch": 0.5940904139433552, "grad_norm": 44.946371253009204, "learning_rate": 3.38731758431006e-07, "logits/chosen": 13.804948806762695, "logits/rejected": 14.250370979309082, "logps/chosen": -4.4733452796936035, "logps/rejected": -4.56176233291626, "loss": 3.937, "rewards/accuracies": 0.5, "rewards/chosen": -44.73345184326172, "rewards/margins": 0.8841667175292969, "rewards/rejected": -45.61762237548828, "step": 4363 }, { "epoch": 0.5942265795206971, "grad_norm": 41.95492277956311, "learning_rate": 3.385438687883504e-07, "logits/chosen": 12.340608596801758, "logits/rejected": 13.382160186767578, "logps/chosen": -4.029430389404297, "logps/rejected": -4.347507476806641, "loss": 3.9323, "rewards/accuracies": 0.75, "rewards/chosen": -40.294307708740234, "rewards/margins": 3.1807689666748047, "rewards/rejected": -43.475074768066406, "step": 4364 }, { "epoch": 0.5943627450980392, "grad_norm": 36.60119581161633, "learning_rate": 3.3835599303221567e-07, "logits/chosen": 13.232592582702637, "logits/rejected": 14.364629745483398, "logps/chosen": -3.8560104370117188, "logps/rejected": -4.478695392608643, "loss": 3.9851, "rewards/accuracies": 1.0, "rewards/chosen": -38.56010437011719, "rewards/margins": 6.226852893829346, "rewards/rejected": -44.786956787109375, "step": 4365 }, { "epoch": 0.5944989106753813, "grad_norm": 51.64783797218401, "learning_rate": 3.38168131205054e-07, "logits/chosen": 13.44540023803711, "logits/rejected": 13.489677429199219, "logps/chosen": -4.070199966430664, "logps/rejected": -4.516508102416992, "loss": 3.8834, "rewards/accuracies": 1.0, "rewards/chosen": -40.70199966430664, "rewards/margins": 4.4630842208862305, "rewards/rejected": -45.16508483886719, "step": 4366 }, { "epoch": 0.5946350762527233, "grad_norm": 39.268127771381835, "learning_rate": 3.3798028334931404e-07, "logits/chosen": 12.880139350891113, "logits/rejected": 13.168149948120117, "logps/chosen": -4.205012798309326, "logps/rejected": -4.0449604988098145, "loss": 3.8678, "rewards/accuracies": 0.5, "rewards/chosen": -42.05012512207031, "rewards/margins": -1.6005229949951172, "rewards/rejected": -40.44960403442383, "step": 4367 }, { "epoch": 0.5947712418300654, "grad_norm": 42.373780855410466, "learning_rate": 3.3779244950744177e-07, "logits/chosen": 13.190476417541504, "logits/rejected": 13.45060920715332, "logps/chosen": -4.326820373535156, "logps/rejected": -4.079063415527344, "loss": 3.955, "rewards/accuracies": 0.0, "rewards/chosen": -43.2681999206543, "rewards/margins": -2.477566719055176, "rewards/rejected": -40.79063415527344, "step": 4368 }, { "epoch": 0.5949074074074074, "grad_norm": 39.71144009903103, "learning_rate": 3.376046297218798e-07, "logits/chosen": 14.48134994506836, "logits/rejected": 13.500408172607422, "logps/chosen": -4.210629463195801, "logps/rejected": -4.322282791137695, "loss": 3.4193, "rewards/accuracies": 0.5, "rewards/chosen": -42.106292724609375, "rewards/margins": 1.116537094116211, "rewards/rejected": -43.22283172607422, "step": 4369 }, { "epoch": 0.5950435729847494, "grad_norm": 43.939084552621026, "learning_rate": 3.3741682403506746e-07, "logits/chosen": 13.710749626159668, "logits/rejected": 13.741312026977539, "logps/chosen": -4.314040184020996, "logps/rejected": -4.323269367218018, "loss": 4.1733, "rewards/accuracies": 0.5, "rewards/chosen": -43.14040756225586, "rewards/margins": 0.09228706359863281, "rewards/rejected": -43.232696533203125, "step": 4370 }, { "epoch": 0.5951797385620915, "grad_norm": 41.98591219251124, "learning_rate": 3.372290324894411e-07, "logits/chosen": 12.698415756225586, "logits/rejected": 13.00912094116211, "logps/chosen": -4.049271583557129, "logps/rejected": -4.174753189086914, "loss": 3.8471, "rewards/accuracies": 0.5, "rewards/chosen": -40.492713928222656, "rewards/margins": 1.2548151016235352, "rewards/rejected": -41.747528076171875, "step": 4371 }, { "epoch": 0.5953159041394336, "grad_norm": 37.37386403847025, "learning_rate": 3.370412551274337e-07, "logits/chosen": 13.379053115844727, "logits/rejected": 13.71818733215332, "logps/chosen": -3.9641120433807373, "logps/rejected": -3.8773837089538574, "loss": 3.9, "rewards/accuracies": 0.25, "rewards/chosen": -39.64112091064453, "rewards/margins": -0.867283821105957, "rewards/rejected": -38.77383804321289, "step": 4372 }, { "epoch": 0.5954520697167756, "grad_norm": 39.81211303851716, "learning_rate": 3.36853491991475e-07, "logits/chosen": 13.132781982421875, "logits/rejected": 14.143999099731445, "logps/chosen": -3.7893564701080322, "logps/rejected": -4.346914291381836, "loss": 3.7923, "rewards/accuracies": 0.75, "rewards/chosen": -37.89356231689453, "rewards/margins": 5.575582027435303, "rewards/rejected": -43.469146728515625, "step": 4373 }, { "epoch": 0.5955882352941176, "grad_norm": 48.93012893139058, "learning_rate": 3.3666574312399183e-07, "logits/chosen": 12.644267082214355, "logits/rejected": 12.582433700561523, "logps/chosen": -4.192347526550293, "logps/rejected": -4.066512584686279, "loss": 4.2066, "rewards/accuracies": 0.5, "rewards/chosen": -41.92347717285156, "rewards/margins": -1.2583503723144531, "rewards/rejected": -40.66512680053711, "step": 4374 }, { "epoch": 0.5957244008714597, "grad_norm": 39.61200464124634, "learning_rate": 3.3647800856740766e-07, "logits/chosen": 13.520622253417969, "logits/rejected": 13.76885986328125, "logps/chosen": -3.9658737182617188, "logps/rejected": -4.353580951690674, "loss": 4.2372, "rewards/accuracies": 0.75, "rewards/chosen": -39.65874099731445, "rewards/margins": 3.877072334289551, "rewards/rejected": -43.53581237792969, "step": 4375 }, { "epoch": 0.5958605664488017, "grad_norm": 44.24890315231145, "learning_rate": 3.362902883641424e-07, "logits/chosen": 12.34902572631836, "logits/rejected": 13.154207229614258, "logps/chosen": -3.7689647674560547, "logps/rejected": -4.119637966156006, "loss": 4.0027, "rewards/accuracies": 1.0, "rewards/chosen": -37.68964767456055, "rewards/margins": 3.506732940673828, "rewards/rejected": -41.196380615234375, "step": 4376 }, { "epoch": 0.5959967320261438, "grad_norm": 42.73885664306293, "learning_rate": 3.3610258255661303e-07, "logits/chosen": 13.454456329345703, "logits/rejected": 13.064966201782227, "logps/chosen": -4.363742828369141, "logps/rejected": -4.476776123046875, "loss": 4.2472, "rewards/accuracies": 0.75, "rewards/chosen": -43.637428283691406, "rewards/margins": 1.130335807800293, "rewards/rejected": -44.76776123046875, "step": 4377 }, { "epoch": 0.5961328976034859, "grad_norm": 42.11189816284722, "learning_rate": 3.359148911872336e-07, "logits/chosen": 13.105194091796875, "logits/rejected": 13.269798278808594, "logps/chosen": -4.062442302703857, "logps/rejected": -4.2706451416015625, "loss": 3.7655, "rewards/accuracies": 0.75, "rewards/chosen": -40.624420166015625, "rewards/margins": 2.082028388977051, "rewards/rejected": -42.706451416015625, "step": 4378 }, { "epoch": 0.5962690631808278, "grad_norm": 41.02151364714461, "learning_rate": 3.35727214298414e-07, "logits/chosen": 12.83216381072998, "logits/rejected": 13.605918884277344, "logps/chosen": -3.780102252960205, "logps/rejected": -3.930642604827881, "loss": 4.3147, "rewards/accuracies": 0.75, "rewards/chosen": -37.801025390625, "rewards/margins": 1.5054025650024414, "rewards/rejected": -39.306427001953125, "step": 4379 }, { "epoch": 0.5964052287581699, "grad_norm": 42.23192475962844, "learning_rate": 3.355395519325616e-07, "logits/chosen": 13.706981658935547, "logits/rejected": 13.967737197875977, "logps/chosen": -4.404160499572754, "logps/rejected": -4.204089641571045, "loss": 3.9949, "rewards/accuracies": 0.25, "rewards/chosen": -44.041603088378906, "rewards/margins": -2.000706672668457, "rewards/rejected": -42.040897369384766, "step": 4380 }, { "epoch": 0.596541394335512, "grad_norm": 42.11934351280743, "learning_rate": 3.3535190413208046e-07, "logits/chosen": 12.669705390930176, "logits/rejected": 12.785787582397461, "logps/chosen": -4.117176532745361, "logps/rejected": -4.041377067565918, "loss": 3.9681, "rewards/accuracies": 0.75, "rewards/chosen": -41.1717643737793, "rewards/margins": -0.7579975128173828, "rewards/rejected": -40.41376876831055, "step": 4381 }, { "epoch": 0.596677559912854, "grad_norm": 40.06087154102159, "learning_rate": 3.351642709393708e-07, "logits/chosen": 14.220447540283203, "logits/rejected": 14.098512649536133, "logps/chosen": -4.099231719970703, "logps/rejected": -4.926141262054443, "loss": 3.8093, "rewards/accuracies": 1.0, "rewards/chosen": -40.992313385009766, "rewards/margins": 8.269097328186035, "rewards/rejected": -49.26141357421875, "step": 4382 }, { "epoch": 0.5968137254901961, "grad_norm": 44.88354133471371, "learning_rate": 3.349766523968301e-07, "logits/chosen": 12.91596794128418, "logits/rejected": 13.709022521972656, "logps/chosen": -4.056389331817627, "logps/rejected": -4.60330867767334, "loss": 3.6952, "rewards/accuracies": 0.75, "rewards/chosen": -40.56389617919922, "rewards/margins": 5.469191551208496, "rewards/rejected": -46.033084869384766, "step": 4383 }, { "epoch": 0.5969498910675382, "grad_norm": 49.66354354750118, "learning_rate": 3.347890485468524e-07, "logits/chosen": 13.813776016235352, "logits/rejected": 13.84716796875, "logps/chosen": -4.521148681640625, "logps/rejected": -4.647719860076904, "loss": 3.7671, "rewards/accuracies": 0.5, "rewards/chosen": -45.21148681640625, "rewards/margins": 1.2657136917114258, "rewards/rejected": -46.47719955444336, "step": 4384 }, { "epoch": 0.5970860566448801, "grad_norm": 44.225220465452765, "learning_rate": 3.346014594318281e-07, "logits/chosen": 13.176706314086914, "logits/rejected": 13.399150848388672, "logps/chosen": -4.009052276611328, "logps/rejected": -4.050729751586914, "loss": 4.0572, "rewards/accuracies": 0.5, "rewards/chosen": -40.09052276611328, "rewards/margins": 0.4167776107788086, "rewards/rejected": -40.507301330566406, "step": 4385 }, { "epoch": 0.5972222222222222, "grad_norm": 39.413505923038066, "learning_rate": 3.344138850941446e-07, "logits/chosen": 12.867704391479492, "logits/rejected": 13.014424324035645, "logps/chosen": -4.012912750244141, "logps/rejected": -4.261141777038574, "loss": 3.3634, "rewards/accuracies": 0.75, "rewards/chosen": -40.129127502441406, "rewards/margins": 2.4822864532470703, "rewards/rejected": -42.611412048339844, "step": 4386 }, { "epoch": 0.5973583877995643, "grad_norm": 44.272996210221365, "learning_rate": 3.34226325576186e-07, "logits/chosen": 12.552682876586914, "logits/rejected": 13.429861068725586, "logps/chosen": -4.0100016593933105, "logps/rejected": -4.420594692230225, "loss": 4.14, "rewards/accuracies": 1.0, "rewards/chosen": -40.10001754760742, "rewards/margins": 4.105926513671875, "rewards/rejected": -44.20594787597656, "step": 4387 }, { "epoch": 0.5974945533769063, "grad_norm": 43.12278840370183, "learning_rate": 3.3403878092033276e-07, "logits/chosen": 13.73193359375, "logits/rejected": 13.728071212768555, "logps/chosen": -4.208321571350098, "logps/rejected": -3.9611361026763916, "loss": 3.9864, "rewards/accuracies": 0.25, "rewards/chosen": -42.083213806152344, "rewards/margins": -2.471853256225586, "rewards/rejected": -39.611358642578125, "step": 4388 }, { "epoch": 0.5976307189542484, "grad_norm": 40.291037706528364, "learning_rate": 3.338512511689622e-07, "logits/chosen": 12.900199890136719, "logits/rejected": 13.285825729370117, "logps/chosen": -4.2420973777771, "logps/rejected": -4.3801069259643555, "loss": 4.1188, "rewards/accuracies": 0.5, "rewards/chosen": -42.42097473144531, "rewards/margins": 1.3800945281982422, "rewards/rejected": -43.80107116699219, "step": 4389 }, { "epoch": 0.5977668845315904, "grad_norm": 60.62003459469137, "learning_rate": 3.336637363644484e-07, "logits/chosen": 13.085269927978516, "logits/rejected": 13.963460922241211, "logps/chosen": -4.058804988861084, "logps/rejected": -4.440110206604004, "loss": 4.3095, "rewards/accuracies": 0.75, "rewards/chosen": -40.588050842285156, "rewards/margins": 3.813051223754883, "rewards/rejected": -44.401100158691406, "step": 4390 }, { "epoch": 0.5979030501089324, "grad_norm": 38.1113376686696, "learning_rate": 3.3347623654916147e-07, "logits/chosen": 12.378463745117188, "logits/rejected": 13.7179594039917, "logps/chosen": -4.05902624130249, "logps/rejected": -4.273432731628418, "loss": 3.8338, "rewards/accuracies": 0.5, "rewards/chosen": -40.59026336669922, "rewards/margins": 2.1440658569335938, "rewards/rejected": -42.73432922363281, "step": 4391 }, { "epoch": 0.5980392156862745, "grad_norm": 37.963991594996045, "learning_rate": 3.332887517654688e-07, "logits/chosen": 13.000011444091797, "logits/rejected": 14.341496467590332, "logps/chosen": -4.264659404754639, "logps/rejected": -4.621881484985352, "loss": 3.7031, "rewards/accuracies": 1.0, "rewards/chosen": -42.6465950012207, "rewards/margins": 3.572220802307129, "rewards/rejected": -46.21881866455078, "step": 4392 }, { "epoch": 0.5981753812636166, "grad_norm": 39.66319231655849, "learning_rate": 3.331012820557344e-07, "logits/chosen": 14.598369598388672, "logits/rejected": 13.712882995605469, "logps/chosen": -4.490025043487549, "logps/rejected": -4.278652191162109, "loss": 4.1076, "rewards/accuracies": 0.25, "rewards/chosen": -44.90024948120117, "rewards/margins": -2.1137237548828125, "rewards/rejected": -42.78652572631836, "step": 4393 }, { "epoch": 0.5983115468409586, "grad_norm": 41.123579602503845, "learning_rate": 3.32913827462318e-07, "logits/chosen": 13.953639030456543, "logits/rejected": 13.901325225830078, "logps/chosen": -4.181113243103027, "logps/rejected": -4.574759483337402, "loss": 3.5813, "rewards/accuracies": 1.0, "rewards/chosen": -41.811134338378906, "rewards/margins": 3.936459541320801, "rewards/rejected": -45.74759292602539, "step": 4394 }, { "epoch": 0.5984477124183006, "grad_norm": 39.92368096362, "learning_rate": 3.3272638802757687e-07, "logits/chosen": 13.321181297302246, "logits/rejected": 13.353458404541016, "logps/chosen": -4.048357963562012, "logps/rejected": -4.345236778259277, "loss": 3.6902, "rewards/accuracies": 0.75, "rewards/chosen": -40.483577728271484, "rewards/margins": 2.96878719329834, "rewards/rejected": -43.45236587524414, "step": 4395 }, { "epoch": 0.5985838779956427, "grad_norm": 41.24278261492888, "learning_rate": 3.325389637938646e-07, "logits/chosen": 13.039712905883789, "logits/rejected": 12.917983055114746, "logps/chosen": -4.09651517868042, "logps/rejected": -4.122245788574219, "loss": 4.0634, "rewards/accuracies": 0.75, "rewards/chosen": -40.965152740478516, "rewards/margins": 0.25730419158935547, "rewards/rejected": -41.22245788574219, "step": 4396 }, { "epoch": 0.5987200435729847, "grad_norm": 42.7579256652023, "learning_rate": 3.32351554803531e-07, "logits/chosen": 13.737619400024414, "logits/rejected": 14.238936424255371, "logps/chosen": -4.475340843200684, "logps/rejected": -4.633689880371094, "loss": 4.516, "rewards/accuracies": 0.5, "rewards/chosen": -44.75341033935547, "rewards/margins": 1.5834856033325195, "rewards/rejected": -46.33689498901367, "step": 4397 }, { "epoch": 0.5988562091503268, "grad_norm": 44.57728723847716, "learning_rate": 3.3216416109892274e-07, "logits/chosen": 13.906305313110352, "logits/rejected": 13.51131820678711, "logps/chosen": -4.085684776306152, "logps/rejected": -4.348931789398193, "loss": 4.0369, "rewards/accuracies": 1.0, "rewards/chosen": -40.856849670410156, "rewards/margins": 2.63247013092041, "rewards/rejected": -43.48931884765625, "step": 4398 }, { "epoch": 0.5989923747276689, "grad_norm": 44.424902379647314, "learning_rate": 3.3197678272238317e-07, "logits/chosen": 12.302209854125977, "logits/rejected": 13.700191497802734, "logps/chosen": -3.7092056274414062, "logps/rejected": -4.140208721160889, "loss": 3.3838, "rewards/accuracies": 0.75, "rewards/chosen": -37.09205627441406, "rewards/margins": 4.310031890869141, "rewards/rejected": -41.40209197998047, "step": 4399 }, { "epoch": 0.599128540305011, "grad_norm": 43.71840449639695, "learning_rate": 3.317894197162517e-07, "logits/chosen": 13.484434127807617, "logits/rejected": 13.532665252685547, "logps/chosen": -4.028494358062744, "logps/rejected": -4.391786575317383, "loss": 4.3234, "rewards/accuracies": 0.75, "rewards/chosen": -40.284942626953125, "rewards/margins": 3.6329240798950195, "rewards/rejected": -43.91786575317383, "step": 4400 }, { "epoch": 0.5992647058823529, "grad_norm": 37.65198629674747, "learning_rate": 3.3160207212286465e-07, "logits/chosen": 12.84950065612793, "logits/rejected": 13.128583908081055, "logps/chosen": -3.8642544746398926, "logps/rejected": -4.365689754486084, "loss": 3.7173, "rewards/accuracies": 0.75, "rewards/chosen": -38.64254379272461, "rewards/margins": 5.014351844787598, "rewards/rejected": -43.65689468383789, "step": 4401 }, { "epoch": 0.599400871459695, "grad_norm": 46.099793732183706, "learning_rate": 3.3141473998455495e-07, "logits/chosen": 13.227304458618164, "logits/rejected": 13.715352058410645, "logps/chosen": -4.270667552947998, "logps/rejected": -4.388982772827148, "loss": 4.7117, "rewards/accuracies": 0.75, "rewards/chosen": -42.7066764831543, "rewards/margins": 1.1831483840942383, "rewards/rejected": -43.88982391357422, "step": 4402 }, { "epoch": 0.5995370370370371, "grad_norm": 42.912012852820304, "learning_rate": 3.3122742334365154e-07, "logits/chosen": 13.139105796813965, "logits/rejected": 13.59679126739502, "logps/chosen": -4.314217567443848, "logps/rejected": -4.488376617431641, "loss": 3.8736, "rewards/accuracies": 0.75, "rewards/chosen": -43.142173767089844, "rewards/margins": 1.7415895462036133, "rewards/rejected": -44.883766174316406, "step": 4403 }, { "epoch": 0.5996732026143791, "grad_norm": 43.786000098858125, "learning_rate": 3.310401222424803e-07, "logits/chosen": 13.091115951538086, "logits/rejected": 13.491705894470215, "logps/chosen": -4.22799825668335, "logps/rejected": -4.402676582336426, "loss": 4.0725, "rewards/accuracies": 0.5, "rewards/chosen": -42.27997970581055, "rewards/margins": 1.7467842102050781, "rewards/rejected": -44.026763916015625, "step": 4404 }, { "epoch": 0.5998093681917211, "grad_norm": 41.48024518772849, "learning_rate": 3.3085283672336364e-07, "logits/chosen": 13.380149841308594, "logits/rejected": 14.17388916015625, "logps/chosen": -4.400642395019531, "logps/rejected": -4.408507823944092, "loss": 3.5783, "rewards/accuracies": 0.25, "rewards/chosen": -44.00642395019531, "rewards/margins": 0.07865428924560547, "rewards/rejected": -44.085079193115234, "step": 4405 }, { "epoch": 0.5999455337690632, "grad_norm": 290.2602566526825, "learning_rate": 3.3066556682861987e-07, "logits/chosen": 13.247509002685547, "logits/rejected": 13.827561378479004, "logps/chosen": -4.323358535766602, "logps/rejected": -4.528942108154297, "loss": 3.4245, "rewards/accuracies": 0.5, "rewards/chosen": -43.23358154296875, "rewards/margins": 2.0558385848999023, "rewards/rejected": -45.28942108154297, "step": 4406 }, { "epoch": 0.6000816993464052, "grad_norm": 46.66722783911424, "learning_rate": 3.3047831260056446e-07, "logits/chosen": 13.713138580322266, "logits/rejected": 13.503371238708496, "logps/chosen": -3.836207151412964, "logps/rejected": -3.9299674034118652, "loss": 3.6619, "rewards/accuracies": 0.75, "rewards/chosen": -38.3620719909668, "rewards/margins": 0.9376010894775391, "rewards/rejected": -39.29967498779297, "step": 4407 }, { "epoch": 0.6002178649237473, "grad_norm": 43.16032057953408, "learning_rate": 3.3029107408150903e-07, "logits/chosen": 13.213151931762695, "logits/rejected": 12.670795440673828, "logps/chosen": -4.2564167976379395, "logps/rejected": -4.417649745941162, "loss": 4.5739, "rewards/accuracies": 0.5, "rewards/chosen": -42.56416702270508, "rewards/margins": 1.612330436706543, "rewards/rejected": -44.17649841308594, "step": 4408 }, { "epoch": 0.6003540305010894, "grad_norm": 42.74709885952285, "learning_rate": 3.3010385131376167e-07, "logits/chosen": 14.34996223449707, "logits/rejected": 14.7095947265625, "logps/chosen": -4.462729454040527, "logps/rejected": -4.60036563873291, "loss": 3.6149, "rewards/accuracies": 0.75, "rewards/chosen": -44.627296447753906, "rewards/margins": 1.3763551712036133, "rewards/rejected": -46.00365447998047, "step": 4409 }, { "epoch": 0.6004901960784313, "grad_norm": 39.344886997047126, "learning_rate": 3.2991664433962674e-07, "logits/chosen": 12.936859130859375, "logits/rejected": 13.759733200073242, "logps/chosen": -4.006315231323242, "logps/rejected": -4.193790912628174, "loss": 3.807, "rewards/accuracies": 0.5, "rewards/chosen": -40.06315612792969, "rewards/margins": 1.8747568130493164, "rewards/rejected": -41.93791198730469, "step": 4410 }, { "epoch": 0.6006263616557734, "grad_norm": 44.00265409872798, "learning_rate": 3.297294532014055e-07, "logits/chosen": 13.46148681640625, "logits/rejected": 13.942364692687988, "logps/chosen": -4.464425086975098, "logps/rejected": -4.630335807800293, "loss": 4.1903, "rewards/accuracies": 0.75, "rewards/chosen": -44.644248962402344, "rewards/margins": 1.6591081619262695, "rewards/rejected": -46.30335998535156, "step": 4411 }, { "epoch": 0.6007625272331155, "grad_norm": 48.81007739589797, "learning_rate": 3.2954227794139514e-07, "logits/chosen": 13.70890998840332, "logits/rejected": 13.617593765258789, "logps/chosen": -4.236102104187012, "logps/rejected": -4.3364973068237305, "loss": 4.4716, "rewards/accuracies": 0.75, "rewards/chosen": -42.36102294921875, "rewards/margins": 1.0039491653442383, "rewards/rejected": -43.36497116088867, "step": 4412 }, { "epoch": 0.6008986928104575, "grad_norm": 36.260407643439855, "learning_rate": 3.293551186018894e-07, "logits/chosen": 13.020498275756836, "logits/rejected": 14.158844947814941, "logps/chosen": -3.7891690731048584, "logps/rejected": -4.523838520050049, "loss": 3.0804, "rewards/accuracies": 1.0, "rewards/chosen": -37.891693115234375, "rewards/margins": 7.34669303894043, "rewards/rejected": -45.23838806152344, "step": 4413 }, { "epoch": 0.6010348583877996, "grad_norm": 40.409670877041926, "learning_rate": 3.291679752251786e-07, "logits/chosen": 13.84980583190918, "logits/rejected": 14.008878707885742, "logps/chosen": -3.937310218811035, "logps/rejected": -4.409493923187256, "loss": 3.7149, "rewards/accuracies": 1.0, "rewards/chosen": -39.37310028076172, "rewards/margins": 4.721837997436523, "rewards/rejected": -44.094940185546875, "step": 4414 }, { "epoch": 0.6011710239651417, "grad_norm": 41.22567217061657, "learning_rate": 3.2898084785354925e-07, "logits/chosen": 13.415664672851562, "logits/rejected": 13.630868911743164, "logps/chosen": -4.293527126312256, "logps/rejected": -4.2859296798706055, "loss": 3.996, "rewards/accuracies": 0.25, "rewards/chosen": -42.935272216796875, "rewards/margins": -0.0759744644165039, "rewards/rejected": -42.85929870605469, "step": 4415 }, { "epoch": 0.6013071895424836, "grad_norm": 54.2921815203013, "learning_rate": 3.287937365292845e-07, "logits/chosen": 13.91871452331543, "logits/rejected": 13.988903045654297, "logps/chosen": -4.398123264312744, "logps/rejected": -3.8590989112854004, "loss": 3.788, "rewards/accuracies": 0.0, "rewards/chosen": -43.981231689453125, "rewards/margins": -5.390244483947754, "rewards/rejected": -38.59098815917969, "step": 4416 }, { "epoch": 0.6014433551198257, "grad_norm": 42.60243376623992, "learning_rate": 3.2860664129466357e-07, "logits/chosen": 12.468812942504883, "logits/rejected": 12.623080253601074, "logps/chosen": -4.253442764282227, "logps/rejected": -3.931156635284424, "loss": 3.9888, "rewards/accuracies": 0.5, "rewards/chosen": -42.534423828125, "rewards/margins": -3.2228622436523438, "rewards/rejected": -39.311561584472656, "step": 4417 }, { "epoch": 0.6015795206971678, "grad_norm": 42.799890888042796, "learning_rate": 3.284195621919621e-07, "logits/chosen": 14.214593887329102, "logits/rejected": 14.149023056030273, "logps/chosen": -4.298591613769531, "logps/rejected": -4.591928005218506, "loss": 4.3147, "rewards/accuracies": 0.5, "rewards/chosen": -42.98591232299805, "rewards/margins": 2.9333667755126953, "rewards/rejected": -45.919281005859375, "step": 4418 }, { "epoch": 0.6017156862745098, "grad_norm": 39.486030294854295, "learning_rate": 3.2823249926345227e-07, "logits/chosen": 12.851875305175781, "logits/rejected": 13.708795547485352, "logps/chosen": -3.976017951965332, "logps/rejected": -4.282196044921875, "loss": 4.3466, "rewards/accuracies": 0.5, "rewards/chosen": -39.76017761230469, "rewards/margins": 3.061784267425537, "rewards/rejected": -42.821964263916016, "step": 4419 }, { "epoch": 0.6018518518518519, "grad_norm": 41.43449402878763, "learning_rate": 3.280454525514025e-07, "logits/chosen": 14.383609771728516, "logits/rejected": 13.454151153564453, "logps/chosen": -4.293017387390137, "logps/rejected": -4.248995780944824, "loss": 3.9483, "rewards/accuracies": 0.5, "rewards/chosen": -42.93017578125, "rewards/margins": -0.440216064453125, "rewards/rejected": -42.489959716796875, "step": 4420 }, { "epoch": 0.601988017429194, "grad_norm": 47.78265617242797, "learning_rate": 3.2785842209807743e-07, "logits/chosen": 13.142511367797852, "logits/rejected": 13.576944351196289, "logps/chosen": -4.184843063354492, "logps/rejected": -4.341699600219727, "loss": 4.1211, "rewards/accuracies": 0.75, "rewards/chosen": -41.84843444824219, "rewards/margins": 1.5685644149780273, "rewards/rejected": -43.41699981689453, "step": 4421 }, { "epoch": 0.6021241830065359, "grad_norm": 42.646711902304396, "learning_rate": 3.276714079457383e-07, "logits/chosen": 13.602672576904297, "logits/rejected": 12.946409225463867, "logps/chosen": -4.1943511962890625, "logps/rejected": -4.256297588348389, "loss": 4.0639, "rewards/accuracies": 0.5, "rewards/chosen": -41.943511962890625, "rewards/margins": 0.6194629669189453, "rewards/rejected": -42.56297302246094, "step": 4422 }, { "epoch": 0.602260348583878, "grad_norm": 39.849490716437174, "learning_rate": 3.2748441013664243e-07, "logits/chosen": 14.328660011291504, "logits/rejected": 14.381111145019531, "logps/chosen": -4.191455841064453, "logps/rejected": -4.514561653137207, "loss": 3.9152, "rewards/accuracies": 0.75, "rewards/chosen": -41.9145622253418, "rewards/margins": 3.231050491333008, "rewards/rejected": -45.14561080932617, "step": 4423 }, { "epoch": 0.6023965141612201, "grad_norm": 37.80353740253754, "learning_rate": 3.2729742871304347e-07, "logits/chosen": 14.054158210754395, "logits/rejected": 15.03070068359375, "logps/chosen": -4.2056779861450195, "logps/rejected": -4.556090831756592, "loss": 3.1961, "rewards/accuracies": 1.0, "rewards/chosen": -42.056785583496094, "rewards/margins": 3.5041236877441406, "rewards/rejected": -45.56090545654297, "step": 4424 }, { "epoch": 0.6025326797385621, "grad_norm": 39.450106796410836, "learning_rate": 3.271104637171914e-07, "logits/chosen": 12.234726905822754, "logits/rejected": 13.755374908447266, "logps/chosen": -3.8193111419677734, "logps/rejected": -4.414381980895996, "loss": 3.784, "rewards/accuracies": 1.0, "rewards/chosen": -38.193111419677734, "rewards/margins": 5.950709342956543, "rewards/rejected": -44.143821716308594, "step": 4425 }, { "epoch": 0.6026688453159041, "grad_norm": 40.01876130330005, "learning_rate": 3.2692351519133274e-07, "logits/chosen": 13.59712028503418, "logits/rejected": 14.228586196899414, "logps/chosen": -4.184844017028809, "logps/rejected": -4.444391250610352, "loss": 3.5974, "rewards/accuracies": 1.0, "rewards/chosen": -41.84844207763672, "rewards/margins": 2.595468521118164, "rewards/rejected": -44.44390869140625, "step": 4426 }, { "epoch": 0.6028050108932462, "grad_norm": 37.62254534910373, "learning_rate": 3.2673658317770965e-07, "logits/chosen": 13.134815216064453, "logits/rejected": 13.294851303100586, "logps/chosen": -3.9948318004608154, "logps/rejected": -4.299127101898193, "loss": 3.4399, "rewards/accuracies": 1.0, "rewards/chosen": -39.94831848144531, "rewards/margins": 3.0429515838623047, "rewards/rejected": -42.99127197265625, "step": 4427 }, { "epoch": 0.6029411764705882, "grad_norm": 43.365323164296925, "learning_rate": 3.2654966771856127e-07, "logits/chosen": 12.83702278137207, "logits/rejected": 13.69446086883545, "logps/chosen": -4.029417991638184, "logps/rejected": -4.095278263092041, "loss": 4.2597, "rewards/accuracies": 0.5, "rewards/chosen": -40.294185638427734, "rewards/margins": 0.6585960388183594, "rewards/rejected": -40.952781677246094, "step": 4428 }, { "epoch": 0.6030773420479303, "grad_norm": 39.858376086485876, "learning_rate": 3.263627688561227e-07, "logits/chosen": 14.431286811828613, "logits/rejected": 14.374119758605957, "logps/chosen": -4.526402473449707, "logps/rejected": -4.974910736083984, "loss": 4.5259, "rewards/accuracies": 0.75, "rewards/chosen": -45.2640266418457, "rewards/margins": 4.485076904296875, "rewards/rejected": -49.74910354614258, "step": 4429 }, { "epoch": 0.6032135076252724, "grad_norm": 41.00666715287975, "learning_rate": 3.261758866326251e-07, "logits/chosen": 12.993325233459473, "logits/rejected": 13.40816879272461, "logps/chosen": -4.260241985321045, "logps/rejected": -4.317904472351074, "loss": 4.1356, "rewards/accuracies": 0.5, "rewards/chosen": -42.602420806884766, "rewards/margins": 0.5766277313232422, "rewards/rejected": -43.179046630859375, "step": 4430 }, { "epoch": 0.6033496732026143, "grad_norm": 36.06046335991057, "learning_rate": 3.259890210902962e-07, "logits/chosen": 12.70272159576416, "logits/rejected": 13.124338150024414, "logps/chosen": -3.7413182258605957, "logps/rejected": -4.07257080078125, "loss": 3.6992, "rewards/accuracies": 0.75, "rewards/chosen": -37.41318130493164, "rewards/margins": 3.312525749206543, "rewards/rejected": -40.7257080078125, "step": 4431 }, { "epoch": 0.6034858387799564, "grad_norm": 39.98608072468495, "learning_rate": 3.258021722713599e-07, "logits/chosen": 13.738250732421875, "logits/rejected": 13.380874633789062, "logps/chosen": -4.2131147384643555, "logps/rejected": -4.26033353805542, "loss": 3.8577, "rewards/accuracies": 0.75, "rewards/chosen": -42.13114547729492, "rewards/margins": 0.4721870422363281, "rewards/rejected": -42.60333251953125, "step": 4432 }, { "epoch": 0.6036220043572985, "grad_norm": 45.163711973786334, "learning_rate": 3.2561534021803587e-07, "logits/chosen": 13.245377540588379, "logits/rejected": 13.795076370239258, "logps/chosen": -4.402240753173828, "logps/rejected": -4.5772881507873535, "loss": 4.2752, "rewards/accuracies": 0.5, "rewards/chosen": -44.02240753173828, "rewards/margins": 1.7504749298095703, "rewards/rejected": -45.77288055419922, "step": 4433 }, { "epoch": 0.6037581699346405, "grad_norm": 40.48873228322243, "learning_rate": 3.254285249725407e-07, "logits/chosen": 12.612390518188477, "logits/rejected": 13.168878555297852, "logps/chosen": -4.049871444702148, "logps/rejected": -4.208430290222168, "loss": 4.0632, "rewards/accuracies": 0.75, "rewards/chosen": -40.498714447021484, "rewards/margins": 1.5855894088745117, "rewards/rejected": -42.08430480957031, "step": 4434 }, { "epoch": 0.6038943355119826, "grad_norm": 39.48283879631169, "learning_rate": 3.2524172657708676e-07, "logits/chosen": 13.784823417663574, "logits/rejected": 13.720624923706055, "logps/chosen": -4.4954986572265625, "logps/rejected": -4.267634391784668, "loss": 3.816, "rewards/accuracies": 0.5, "rewards/chosen": -44.954986572265625, "rewards/margins": -2.2786436080932617, "rewards/rejected": -42.67634201049805, "step": 4435 }, { "epoch": 0.6040305010893247, "grad_norm": 43.98930228303894, "learning_rate": 3.2505494507388256e-07, "logits/chosen": 12.934530258178711, "logits/rejected": 13.856632232666016, "logps/chosen": -4.381162643432617, "logps/rejected": -4.5992631912231445, "loss": 4.009, "rewards/accuracies": 0.75, "rewards/chosen": -43.81162643432617, "rewards/margins": 2.18100643157959, "rewards/rejected": -45.99263000488281, "step": 4436 }, { "epoch": 0.6041666666666666, "grad_norm": 44.27193921488791, "learning_rate": 3.24868180505133e-07, "logits/chosen": 13.263456344604492, "logits/rejected": 13.845014572143555, "logps/chosen": -4.06606388092041, "logps/rejected": -4.5383524894714355, "loss": 3.6756, "rewards/accuracies": 1.0, "rewards/chosen": -40.660640716552734, "rewards/margins": 4.7228851318359375, "rewards/rejected": -45.38352584838867, "step": 4437 }, { "epoch": 0.6043028322440087, "grad_norm": 41.21478871897567, "learning_rate": 3.246814329130393e-07, "logits/chosen": 13.993390083312988, "logits/rejected": 14.648811340332031, "logps/chosen": -4.35002326965332, "logps/rejected": -4.493925094604492, "loss": 3.3463, "rewards/accuracies": 1.0, "rewards/chosen": -43.50023651123047, "rewards/margins": 1.439011573791504, "rewards/rejected": -44.939247131347656, "step": 4438 }, { "epoch": 0.6044389978213508, "grad_norm": 45.64209040806763, "learning_rate": 3.2449470233979825e-07, "logits/chosen": 13.382835388183594, "logits/rejected": 13.947924613952637, "logps/chosen": -4.252742767333984, "logps/rejected": -4.509152412414551, "loss": 3.9581, "rewards/accuracies": 0.75, "rewards/chosen": -42.52743148803711, "rewards/margins": 2.5640974044799805, "rewards/rejected": -45.09152603149414, "step": 4439 }, { "epoch": 0.6045751633986928, "grad_norm": 40.860135954882495, "learning_rate": 3.243079888276033e-07, "logits/chosen": 13.653514862060547, "logits/rejected": 13.71280288696289, "logps/chosen": -4.298657417297363, "logps/rejected": -4.145569801330566, "loss": 3.3573, "rewards/accuracies": 0.5, "rewards/chosen": -42.986572265625, "rewards/margins": -1.5308785438537598, "rewards/rejected": -41.45569610595703, "step": 4440 }, { "epoch": 0.6047113289760349, "grad_norm": 40.006779907798595, "learning_rate": 3.241212924186442e-07, "logits/chosen": 13.089503288269043, "logits/rejected": 13.34708023071289, "logps/chosen": -4.047407150268555, "logps/rejected": -4.252841949462891, "loss": 3.5232, "rewards/accuracies": 0.75, "rewards/chosen": -40.47406768798828, "rewards/margins": 2.0543508529663086, "rewards/rejected": -42.528419494628906, "step": 4441 }, { "epoch": 0.6048474945533769, "grad_norm": 43.75903465764806, "learning_rate": 3.2393461315510605e-07, "logits/chosen": 12.872673034667969, "logits/rejected": 13.29294490814209, "logps/chosen": -3.9084603786468506, "logps/rejected": -4.435527801513672, "loss": 3.6801, "rewards/accuracies": 1.0, "rewards/chosen": -39.08460235595703, "rewards/margins": 5.270678520202637, "rewards/rejected": -44.355281829833984, "step": 4442 }, { "epoch": 0.6049836601307189, "grad_norm": 44.79060201513074, "learning_rate": 3.2374795107917085e-07, "logits/chosen": 12.693740844726562, "logits/rejected": 12.628992080688477, "logps/chosen": -4.242382049560547, "logps/rejected": -4.331639289855957, "loss": 4.0738, "rewards/accuracies": 0.75, "rewards/chosen": -42.42382049560547, "rewards/margins": 0.8925714492797852, "rewards/rejected": -43.31639099121094, "step": 4443 }, { "epoch": 0.605119825708061, "grad_norm": 42.29876759008325, "learning_rate": 3.235613062330166e-07, "logits/chosen": 14.173225402832031, "logits/rejected": 14.132488250732422, "logps/chosen": -4.564563751220703, "logps/rejected": -4.476068496704102, "loss": 3.675, "rewards/accuracies": 0.5, "rewards/chosen": -45.64563751220703, "rewards/margins": -0.8849544525146484, "rewards/rejected": -44.760684967041016, "step": 4444 }, { "epoch": 0.6052559912854031, "grad_norm": 49.80493574078045, "learning_rate": 3.233746786588168e-07, "logits/chosen": 12.794839859008789, "logits/rejected": 13.853103637695312, "logps/chosen": -3.7807791233062744, "logps/rejected": -4.25778341293335, "loss": 3.8508, "rewards/accuracies": 0.75, "rewards/chosen": -37.80778884887695, "rewards/margins": 4.77004337310791, "rewards/rejected": -42.57783508300781, "step": 4445 }, { "epoch": 0.6053921568627451, "grad_norm": 45.38523163849402, "learning_rate": 3.231880683987418e-07, "logits/chosen": 13.318674087524414, "logits/rejected": 13.690248489379883, "logps/chosen": -4.499749183654785, "logps/rejected": -4.664973258972168, "loss": 3.7303, "rewards/accuracies": 0.75, "rewards/chosen": -44.99748992919922, "rewards/margins": 1.6522445678710938, "rewards/rejected": -46.64973449707031, "step": 4446 }, { "epoch": 0.6055283224400871, "grad_norm": 52.56815336482922, "learning_rate": 3.230014754949579e-07, "logits/chosen": 12.566071510314941, "logits/rejected": 13.488393783569336, "logps/chosen": -4.002447128295898, "logps/rejected": -4.121997833251953, "loss": 4.1865, "rewards/accuracies": 0.5, "rewards/chosen": -40.02446746826172, "rewards/margins": 1.195509910583496, "rewards/rejected": -41.2199821472168, "step": 4447 }, { "epoch": 0.6056644880174292, "grad_norm": 43.60293117444654, "learning_rate": 3.2281489998962687e-07, "logits/chosen": 12.795852661132812, "logits/rejected": 12.643974304199219, "logps/chosen": -4.174964904785156, "logps/rejected": -4.140752792358398, "loss": 4.4021, "rewards/accuracies": 0.25, "rewards/chosen": -41.74965286254883, "rewards/margins": -0.34212303161621094, "rewards/rejected": -41.40753173828125, "step": 4448 }, { "epoch": 0.6058006535947712, "grad_norm": 42.51784727868809, "learning_rate": 3.2262834192490724e-07, "logits/chosen": 13.489799499511719, "logits/rejected": 13.735466003417969, "logps/chosen": -3.771897792816162, "logps/rejected": -3.954929828643799, "loss": 4.1609, "rewards/accuracies": 0.75, "rewards/chosen": -37.71897888183594, "rewards/margins": 1.8303217887878418, "rewards/rejected": -39.54930114746094, "step": 4449 }, { "epoch": 0.6059368191721133, "grad_norm": 43.44987762294053, "learning_rate": 3.2244180134295347e-07, "logits/chosen": 12.711750030517578, "logits/rejected": 12.024795532226562, "logps/chosen": -3.8225555419921875, "logps/rejected": -4.070565700531006, "loss": 3.7507, "rewards/accuracies": 0.75, "rewards/chosen": -38.225555419921875, "rewards/margins": 2.4801034927368164, "rewards/rejected": -40.705657958984375, "step": 4450 }, { "epoch": 0.6060729847494554, "grad_norm": 44.41567650314296, "learning_rate": 3.222552782859156e-07, "logits/chosen": 12.114930152893066, "logits/rejected": 13.147297859191895, "logps/chosen": -3.9644367694854736, "logps/rejected": -4.306753158569336, "loss": 4.2422, "rewards/accuracies": 1.0, "rewards/chosen": -39.64436721801758, "rewards/margins": 3.4231653213500977, "rewards/rejected": -43.06753158569336, "step": 4451 }, { "epoch": 0.6062091503267973, "grad_norm": 41.83409843686618, "learning_rate": 3.220687727959402e-07, "logits/chosen": 13.765779495239258, "logits/rejected": 13.90296459197998, "logps/chosen": -4.422876358032227, "logps/rejected": -4.657918930053711, "loss": 3.8536, "rewards/accuracies": 0.75, "rewards/chosen": -44.228763580322266, "rewards/margins": 2.3504247665405273, "rewards/rejected": -46.57918930053711, "step": 4452 }, { "epoch": 0.6063453159041394, "grad_norm": 42.106977749879725, "learning_rate": 3.2188228491517e-07, "logits/chosen": 12.94815444946289, "logits/rejected": 12.851792335510254, "logps/chosen": -4.177230358123779, "logps/rejected": -4.041354656219482, "loss": 3.6219, "rewards/accuracies": 0.25, "rewards/chosen": -41.77230453491211, "rewards/margins": -1.358755111694336, "rewards/rejected": -40.41354751586914, "step": 4453 }, { "epoch": 0.6064814814814815, "grad_norm": 43.58055478769851, "learning_rate": 3.216958146857431e-07, "logits/chosen": 13.638202667236328, "logits/rejected": 14.123756408691406, "logps/chosen": -4.044538497924805, "logps/rejected": -4.441625595092773, "loss": 3.4173, "rewards/accuracies": 0.75, "rewards/chosen": -40.44538497924805, "rewards/margins": 3.970874786376953, "rewards/rejected": -44.416259765625, "step": 4454 }, { "epoch": 0.6066176470588235, "grad_norm": 40.9967759203702, "learning_rate": 3.2150936214979416e-07, "logits/chosen": 12.914710998535156, "logits/rejected": 13.857895851135254, "logps/chosen": -4.063236713409424, "logps/rejected": -4.397277355194092, "loss": 3.8366, "rewards/accuracies": 1.0, "rewards/chosen": -40.63236999511719, "rewards/margins": 3.340407371520996, "rewards/rejected": -43.972774505615234, "step": 4455 }, { "epoch": 0.6067538126361656, "grad_norm": 39.68433723468923, "learning_rate": 3.213229273494537e-07, "logits/chosen": 13.93376350402832, "logits/rejected": 13.860008239746094, "logps/chosen": -4.385244369506836, "logps/rejected": -4.532927513122559, "loss": 3.6075, "rewards/accuracies": 0.5, "rewards/chosen": -43.85244369506836, "rewards/margins": 1.4768304824829102, "rewards/rejected": -45.32927322387695, "step": 4456 }, { "epoch": 0.6068899782135077, "grad_norm": 38.80145178900138, "learning_rate": 3.211365103268481e-07, "logits/chosen": 13.030845642089844, "logits/rejected": 13.535097122192383, "logps/chosen": -4.337796211242676, "logps/rejected": -4.359496116638184, "loss": 3.4164, "rewards/accuracies": 0.5, "rewards/chosen": -43.377960205078125, "rewards/margins": 0.21700191497802734, "rewards/rejected": -43.59496307373047, "step": 4457 }, { "epoch": 0.6070261437908496, "grad_norm": 41.4773472861059, "learning_rate": 3.2095011112409986e-07, "logits/chosen": 13.821247100830078, "logits/rejected": 13.507954597473145, "logps/chosen": -4.351849555969238, "logps/rejected": -4.270557403564453, "loss": 4.0697, "rewards/accuracies": 0.5, "rewards/chosen": -43.518497467041016, "rewards/margins": -0.8129253387451172, "rewards/rejected": -42.70557403564453, "step": 4458 }, { "epoch": 0.6071623093681917, "grad_norm": 62.0412114448156, "learning_rate": 3.2076372978332754e-07, "logits/chosen": 13.616750717163086, "logits/rejected": 14.386787414550781, "logps/chosen": -4.193258285522461, "logps/rejected": -4.468304634094238, "loss": 4.1284, "rewards/accuracies": 0.75, "rewards/chosen": -41.932579040527344, "rewards/margins": 2.7504634857177734, "rewards/rejected": -44.68304443359375, "step": 4459 }, { "epoch": 0.6072984749455338, "grad_norm": 40.341558855396926, "learning_rate": 3.205773663466454e-07, "logits/chosen": 12.764204025268555, "logits/rejected": 13.878157615661621, "logps/chosen": -4.003196716308594, "logps/rejected": -4.312209129333496, "loss": 4.0131, "rewards/accuracies": 1.0, "rewards/chosen": -40.03196716308594, "rewards/margins": 3.090120315551758, "rewards/rejected": -43.12208938598633, "step": 4460 }, { "epoch": 0.6074346405228758, "grad_norm": 41.35966360719303, "learning_rate": 3.203910208561638e-07, "logits/chosen": 13.174164772033691, "logits/rejected": 13.873445510864258, "logps/chosen": -4.271914958953857, "logps/rejected": -4.478360176086426, "loss": 3.776, "rewards/accuracies": 0.5, "rewards/chosen": -42.71915054321289, "rewards/margins": 2.064450263977051, "rewards/rejected": -44.783599853515625, "step": 4461 }, { "epoch": 0.6075708061002179, "grad_norm": 43.93398367243269, "learning_rate": 3.2020469335398915e-07, "logits/chosen": 12.537544250488281, "logits/rejected": 12.654033660888672, "logps/chosen": -3.8647756576538086, "logps/rejected": -4.011700630187988, "loss": 4.1534, "rewards/accuracies": 0.5, "rewards/chosen": -38.64775466918945, "rewards/margins": 1.4692487716674805, "rewards/rejected": -40.11700439453125, "step": 4462 }, { "epoch": 0.6077069716775599, "grad_norm": 45.476604108596625, "learning_rate": 3.2001838388222366e-07, "logits/chosen": 13.301446914672852, "logits/rejected": 13.19051742553711, "logps/chosen": -4.363455772399902, "logps/rejected": -4.429478168487549, "loss": 3.9824, "rewards/accuracies": 0.75, "rewards/chosen": -43.634552001953125, "rewards/margins": 0.6602249145507812, "rewards/rejected": -44.294776916503906, "step": 4463 }, { "epoch": 0.6078431372549019, "grad_norm": 44.60210973109612, "learning_rate": 3.1983209248296537e-07, "logits/chosen": 12.325750350952148, "logits/rejected": 13.482340812683105, "logps/chosen": -3.8400654792785645, "logps/rejected": -4.435340404510498, "loss": 3.2431, "rewards/accuracies": 1.0, "rewards/chosen": -38.40065002441406, "rewards/margins": 5.952752113342285, "rewards/rejected": -44.3534049987793, "step": 4464 }, { "epoch": 0.607979302832244, "grad_norm": 42.73918357764844, "learning_rate": 3.196458191983086e-07, "logits/chosen": 11.955345153808594, "logits/rejected": 12.285209655761719, "logps/chosen": -3.8496952056884766, "logps/rejected": -4.200588226318359, "loss": 3.5854, "rewards/accuracies": 1.0, "rewards/chosen": -38.496952056884766, "rewards/margins": 3.5089282989501953, "rewards/rejected": -42.00587844848633, "step": 4465 }, { "epoch": 0.6081154684095861, "grad_norm": 41.79804859030462, "learning_rate": 3.19459564070343e-07, "logits/chosen": 12.942468643188477, "logits/rejected": 13.643472671508789, "logps/chosen": -3.963988780975342, "logps/rejected": -4.289875030517578, "loss": 3.6568, "rewards/accuracies": 0.75, "rewards/chosen": -39.639888763427734, "rewards/margins": 3.2588624954223633, "rewards/rejected": -42.89875030517578, "step": 4466 }, { "epoch": 0.608251633986928, "grad_norm": 49.92729457644361, "learning_rate": 3.192733271411548e-07, "logits/chosen": 12.471311569213867, "logits/rejected": 12.44960880279541, "logps/chosen": -4.065085411071777, "logps/rejected": -4.291389465332031, "loss": 4.2455, "rewards/accuracies": 0.5, "rewards/chosen": -40.650856018066406, "rewards/margins": 2.2630414962768555, "rewards/rejected": -42.91389465332031, "step": 4467 }, { "epoch": 0.6083877995642701, "grad_norm": 43.77340532429972, "learning_rate": 3.1908710845282564e-07, "logits/chosen": 13.483007431030273, "logits/rejected": 13.277578353881836, "logps/chosen": -4.409566402435303, "logps/rejected": -4.381841659545898, "loss": 3.8453, "rewards/accuracies": 0.25, "rewards/chosen": -44.09566879272461, "rewards/margins": -0.27724647521972656, "rewards/rejected": -43.81842041015625, "step": 4468 }, { "epoch": 0.6085239651416122, "grad_norm": 50.539183401829234, "learning_rate": 3.1890090804743304e-07, "logits/chosen": 12.258594512939453, "logits/rejected": 13.030406951904297, "logps/chosen": -3.8227949142456055, "logps/rejected": -4.222484588623047, "loss": 3.7435, "rewards/accuracies": 0.75, "rewards/chosen": -38.22794723510742, "rewards/margins": 3.996896266937256, "rewards/rejected": -42.22484588623047, "step": 4469 }, { "epoch": 0.6086601307189542, "grad_norm": 41.016241552176695, "learning_rate": 3.187147259670507e-07, "logits/chosen": 12.562177658081055, "logits/rejected": 13.638968467712402, "logps/chosen": -4.014969825744629, "logps/rejected": -4.32881498336792, "loss": 3.5905, "rewards/accuracies": 0.5, "rewards/chosen": -40.149696350097656, "rewards/margins": 3.138455390930176, "rewards/rejected": -43.28815460205078, "step": 4470 }, { "epoch": 0.6087962962962963, "grad_norm": 38.193727458612024, "learning_rate": 3.18528562253748e-07, "logits/chosen": 13.666805267333984, "logits/rejected": 13.941740036010742, "logps/chosen": -4.250380516052246, "logps/rejected": -4.343306064605713, "loss": 4.127, "rewards/accuracies": 0.75, "rewards/chosen": -42.503807067871094, "rewards/margins": 0.929255485534668, "rewards/rejected": -43.43305969238281, "step": 4471 }, { "epoch": 0.6089324618736384, "grad_norm": 43.066160849148275, "learning_rate": 3.1834241694959e-07, "logits/chosen": 12.333372116088867, "logits/rejected": 13.1194429397583, "logps/chosen": -3.775393486022949, "logps/rejected": -4.0019211769104, "loss": 4.2774, "rewards/accuracies": 0.75, "rewards/chosen": -37.75393295288086, "rewards/margins": 2.2652788162231445, "rewards/rejected": -40.01921081542969, "step": 4472 }, { "epoch": 0.6090686274509803, "grad_norm": 40.306903787569354, "learning_rate": 3.1815629009663786e-07, "logits/chosen": 12.188948631286621, "logits/rejected": 13.939342498779297, "logps/chosen": -4.300424575805664, "logps/rejected": -4.721233367919922, "loss": 3.7066, "rewards/accuracies": 0.75, "rewards/chosen": -43.00424575805664, "rewards/margins": 4.208089828491211, "rewards/rejected": -47.212337493896484, "step": 4473 }, { "epoch": 0.6092047930283224, "grad_norm": 40.87189429196207, "learning_rate": 3.1797018173694874e-07, "logits/chosen": 14.459254264831543, "logits/rejected": 14.131407737731934, "logps/chosen": -4.165921688079834, "logps/rejected": -4.487959384918213, "loss": 3.375, "rewards/accuracies": 0.75, "rewards/chosen": -41.65921401977539, "rewards/margins": 3.2203760147094727, "rewards/rejected": -44.87959289550781, "step": 4474 }, { "epoch": 0.6093409586056645, "grad_norm": 49.36013943849826, "learning_rate": 3.1778409191257487e-07, "logits/chosen": 13.104720115661621, "logits/rejected": 13.424362182617188, "logps/chosen": -4.0168304443359375, "logps/rejected": -4.579770565032959, "loss": 4.6415, "rewards/accuracies": 1.0, "rewards/chosen": -40.168304443359375, "rewards/margins": 5.6294050216674805, "rewards/rejected": -45.797706604003906, "step": 4475 }, { "epoch": 0.6094771241830066, "grad_norm": 46.47520594880493, "learning_rate": 3.175980206655651e-07, "logits/chosen": 13.935657501220703, "logits/rejected": 13.766790390014648, "logps/chosen": -4.570621490478516, "logps/rejected": -4.55456018447876, "loss": 4.1067, "rewards/accuracies": 0.5, "rewards/chosen": -45.706207275390625, "rewards/margins": -0.16060733795166016, "rewards/rejected": -45.54560089111328, "step": 4476 }, { "epoch": 0.6096132897603486, "grad_norm": 46.95488870659224, "learning_rate": 3.174119680379638e-07, "logits/chosen": 13.459938049316406, "logits/rejected": 13.146638870239258, "logps/chosen": -4.189329147338867, "logps/rejected": -4.20350456237793, "loss": 4.6018, "rewards/accuracies": 0.5, "rewards/chosen": -41.893287658691406, "rewards/margins": 0.14175796508789062, "rewards/rejected": -42.0350456237793, "step": 4477 }, { "epoch": 0.6097494553376906, "grad_norm": 39.80105970544427, "learning_rate": 3.172259340718109e-07, "logits/chosen": 13.335077285766602, "logits/rejected": 13.567014694213867, "logps/chosen": -4.18565034866333, "logps/rejected": -4.532437324523926, "loss": 4.0221, "rewards/accuracies": 0.75, "rewards/chosen": -41.85650634765625, "rewards/margins": 3.467867851257324, "rewards/rejected": -45.324371337890625, "step": 4478 }, { "epoch": 0.6098856209150327, "grad_norm": 42.03102931889123, "learning_rate": 3.1703991880914236e-07, "logits/chosen": 13.367050170898438, "logits/rejected": 13.522584915161133, "logps/chosen": -4.133530139923096, "logps/rejected": -4.545451641082764, "loss": 3.4883, "rewards/accuracies": 1.0, "rewards/chosen": -41.335304260253906, "rewards/margins": 4.11921501159668, "rewards/rejected": -45.45451736450195, "step": 4479 }, { "epoch": 0.6100217864923747, "grad_norm": 42.86455901752851, "learning_rate": 3.168539222919901e-07, "logits/chosen": 13.705127716064453, "logits/rejected": 13.954834938049316, "logps/chosen": -4.330096244812012, "logps/rejected": -4.346035957336426, "loss": 4.0612, "rewards/accuracies": 0.25, "rewards/chosen": -43.30096435546875, "rewards/margins": 0.15939617156982422, "rewards/rejected": -43.46036148071289, "step": 4480 }, { "epoch": 0.6101579520697168, "grad_norm": 51.428843531148104, "learning_rate": 3.166679445623812e-07, "logits/chosen": 13.897168159484863, "logits/rejected": 13.199491500854492, "logps/chosen": -4.192820072174072, "logps/rejected": -4.066386699676514, "loss": 3.9617, "rewards/accuracies": 0.5, "rewards/chosen": -41.928199768066406, "rewards/margins": -1.2643318176269531, "rewards/rejected": -40.66386795043945, "step": 4481 }, { "epoch": 0.6102941176470589, "grad_norm": 38.36791970631354, "learning_rate": 3.1648198566233915e-07, "logits/chosen": 13.72045612335205, "logits/rejected": 14.29355239868164, "logps/chosen": -4.302959442138672, "logps/rejected": -4.574965476989746, "loss": 4.067, "rewards/accuracies": 0.75, "rewards/chosen": -43.02959442138672, "rewards/margins": 2.720057487487793, "rewards/rejected": -45.74965286254883, "step": 4482 }, { "epoch": 0.6104302832244008, "grad_norm": 41.4199912803134, "learning_rate": 3.1629604563388287e-07, "logits/chosen": 12.645227432250977, "logits/rejected": 13.43806266784668, "logps/chosen": -4.593490123748779, "logps/rejected": -4.411327362060547, "loss": 3.9182, "rewards/accuracies": 0.25, "rewards/chosen": -45.934906005859375, "rewards/margins": -1.8216323852539062, "rewards/rejected": -44.11327362060547, "step": 4483 }, { "epoch": 0.6105664488017429, "grad_norm": 51.36035259666919, "learning_rate": 3.161101245190268e-07, "logits/chosen": 13.361043930053711, "logits/rejected": 13.47777271270752, "logps/chosen": -4.461987495422363, "logps/rejected": -4.650941848754883, "loss": 3.7485, "rewards/accuracies": 0.75, "rewards/chosen": -44.619873046875, "rewards/margins": 1.889542579650879, "rewards/rejected": -46.50941848754883, "step": 4484 }, { "epoch": 0.610702614379085, "grad_norm": 37.22579193792436, "learning_rate": 3.1592422235978164e-07, "logits/chosen": 12.099196434020996, "logits/rejected": 12.59562873840332, "logps/chosen": -3.976314067840576, "logps/rejected": -4.11882209777832, "loss": 3.7822, "rewards/accuracies": 0.5, "rewards/chosen": -39.763145446777344, "rewards/margins": 1.425079345703125, "rewards/rejected": -41.18822479248047, "step": 4485 }, { "epoch": 0.610838779956427, "grad_norm": 42.27730243831666, "learning_rate": 3.157383391981535e-07, "logits/chosen": 13.003885269165039, "logits/rejected": 13.3604736328125, "logps/chosen": -4.161611557006836, "logps/rejected": -4.322887420654297, "loss": 4.4317, "rewards/accuracies": 0.75, "rewards/chosen": -41.61611557006836, "rewards/margins": 1.612762451171875, "rewards/rejected": -43.228878021240234, "step": 4486 }, { "epoch": 0.6109749455337691, "grad_norm": 46.42815791619361, "learning_rate": 3.15552475076144e-07, "logits/chosen": 12.104635238647461, "logits/rejected": 12.87106990814209, "logps/chosen": -4.134222030639648, "logps/rejected": -4.244841575622559, "loss": 3.9162, "rewards/accuracies": 0.5, "rewards/chosen": -41.34221649169922, "rewards/margins": 1.1061973571777344, "rewards/rejected": -42.44841766357422, "step": 4487 }, { "epoch": 0.6111111111111112, "grad_norm": 44.33502878212157, "learning_rate": 3.1536663003575083e-07, "logits/chosen": 12.562124252319336, "logits/rejected": 13.666679382324219, "logps/chosen": -4.0254058837890625, "logps/rejected": -4.3704047203063965, "loss": 4.2448, "rewards/accuracies": 0.75, "rewards/chosen": -40.254058837890625, "rewards/margins": 3.44998836517334, "rewards/rejected": -43.70404815673828, "step": 4488 }, { "epoch": 0.6112472766884531, "grad_norm": 38.49606971549718, "learning_rate": 3.1518080411896736e-07, "logits/chosen": 13.564177513122559, "logits/rejected": 13.577205657958984, "logps/chosen": -4.689050674438477, "logps/rejected": -4.733961582183838, "loss": 3.7306, "rewards/accuracies": 0.5, "rewards/chosen": -46.8905029296875, "rewards/margins": 0.4491100311279297, "rewards/rejected": -47.33961486816406, "step": 4489 }, { "epoch": 0.6113834422657952, "grad_norm": 41.019635702833064, "learning_rate": 3.1499499736778214e-07, "logits/chosen": 12.836226463317871, "logits/rejected": 13.478262901306152, "logps/chosen": -3.9901740550994873, "logps/rejected": -4.139059543609619, "loss": 3.7344, "rewards/accuracies": 0.5, "rewards/chosen": -39.90174102783203, "rewards/margins": 1.4888544082641602, "rewards/rejected": -41.390594482421875, "step": 4490 }, { "epoch": 0.6115196078431373, "grad_norm": 43.806702899711595, "learning_rate": 3.1480920982417993e-07, "logits/chosen": 13.262312889099121, "logits/rejected": 13.302590370178223, "logps/chosen": -4.570640563964844, "logps/rejected": -4.300273895263672, "loss": 3.741, "rewards/accuracies": 0.25, "rewards/chosen": -45.70640563964844, "rewards/margins": -2.7036657333374023, "rewards/rejected": -43.00273895263672, "step": 4491 }, { "epoch": 0.6116557734204793, "grad_norm": 49.63635587876847, "learning_rate": 3.1462344153014107e-07, "logits/chosen": 13.98061752319336, "logits/rejected": 13.263821601867676, "logps/chosen": -4.234650611877441, "logps/rejected": -4.372533798217773, "loss": 3.8904, "rewards/accuracies": 0.75, "rewards/chosen": -42.34650421142578, "rewards/margins": 1.3788366317749023, "rewards/rejected": -43.725337982177734, "step": 4492 }, { "epoch": 0.6117919389978214, "grad_norm": 41.33621930988605, "learning_rate": 3.144376925276412e-07, "logits/chosen": 12.770349502563477, "logits/rejected": 13.21287727355957, "logps/chosen": -4.306401252746582, "logps/rejected": -4.586677074432373, "loss": 4.004, "rewards/accuracies": 0.75, "rewards/chosen": -43.06401062011719, "rewards/margins": 2.8027591705322266, "rewards/rejected": -45.86677169799805, "step": 4493 }, { "epoch": 0.6119281045751634, "grad_norm": 39.28003411032791, "learning_rate": 3.14251962858652e-07, "logits/chosen": 13.924797058105469, "logits/rejected": 13.631207466125488, "logps/chosen": -4.125723361968994, "logps/rejected": -4.4002203941345215, "loss": 4.1699, "rewards/accuracies": 0.75, "rewards/chosen": -41.257232666015625, "rewards/margins": 2.7449703216552734, "rewards/rejected": -44.00220489501953, "step": 4494 }, { "epoch": 0.6120642701525054, "grad_norm": 41.361243949737094, "learning_rate": 3.140662525651407e-07, "logits/chosen": 13.052680969238281, "logits/rejected": 14.101323127746582, "logps/chosen": -3.9253110885620117, "logps/rejected": -4.378569602966309, "loss": 4.0013, "rewards/accuracies": 0.75, "rewards/chosen": -39.25311279296875, "rewards/margins": 4.532581329345703, "rewards/rejected": -43.78569412231445, "step": 4495 }, { "epoch": 0.6122004357298475, "grad_norm": 39.998394921689304, "learning_rate": 3.138805616890698e-07, "logits/chosen": 12.902069091796875, "logits/rejected": 13.595866203308105, "logps/chosen": -4.138573169708252, "logps/rejected": -4.435769081115723, "loss": 3.8409, "rewards/accuracies": 1.0, "rewards/chosen": -41.38572692871094, "rewards/margins": 2.971963882446289, "rewards/rejected": -44.35769271850586, "step": 4496 }, { "epoch": 0.6123366013071896, "grad_norm": 38.49975456947322, "learning_rate": 3.1369489027239786e-07, "logits/chosen": 13.432856559753418, "logits/rejected": 12.781673431396484, "logps/chosen": -4.267275810241699, "logps/rejected": -4.081519603729248, "loss": 4.0335, "rewards/accuracies": 0.0, "rewards/chosen": -42.672752380371094, "rewards/margins": -1.857558250427246, "rewards/rejected": -40.8151969909668, "step": 4497 }, { "epoch": 0.6124727668845316, "grad_norm": 48.932688104451245, "learning_rate": 3.1350923835707907e-07, "logits/chosen": 12.762928009033203, "logits/rejected": 13.76871109008789, "logps/chosen": -4.21694278717041, "logps/rejected": -4.627527236938477, "loss": 3.6387, "rewards/accuracies": 0.75, "rewards/chosen": -42.16942596435547, "rewards/margins": 4.105847358703613, "rewards/rejected": -46.27527618408203, "step": 4498 }, { "epoch": 0.6126089324618736, "grad_norm": 39.663929114579446, "learning_rate": 3.133236059850627e-07, "logits/chosen": 12.489922523498535, "logits/rejected": 13.932031631469727, "logps/chosen": -4.238863945007324, "logps/rejected": -4.778890609741211, "loss": 4.1964, "rewards/accuracies": 1.0, "rewards/chosen": -42.388641357421875, "rewards/margins": 5.400266647338867, "rewards/rejected": -47.788909912109375, "step": 4499 }, { "epoch": 0.6127450980392157, "grad_norm": 42.74432929088592, "learning_rate": 3.131379931982939e-07, "logits/chosen": 13.115856170654297, "logits/rejected": 12.744461059570312, "logps/chosen": -4.537423133850098, "logps/rejected": -4.257923126220703, "loss": 4.5811, "rewards/accuracies": 0.0, "rewards/chosen": -45.37423324584961, "rewards/margins": -2.795003890991211, "rewards/rejected": -42.57923126220703, "step": 4500 }, { "epoch": 0.6128812636165577, "grad_norm": 42.5317806587879, "learning_rate": 3.129524000387138e-07, "logits/chosen": 12.521238327026367, "logits/rejected": 13.46856689453125, "logps/chosen": -3.8477885723114014, "logps/rejected": -4.294086456298828, "loss": 4.2256, "rewards/accuracies": 1.0, "rewards/chosen": -38.47788619995117, "rewards/margins": 4.462977409362793, "rewards/rejected": -42.94086456298828, "step": 4501 }, { "epoch": 0.6130174291938998, "grad_norm": 39.45649978331262, "learning_rate": 3.127668265482582e-07, "logits/chosen": 12.952905654907227, "logits/rejected": 13.255720138549805, "logps/chosen": -4.168927192687988, "logps/rejected": -4.2860212326049805, "loss": 3.9033, "rewards/accuracies": 0.5, "rewards/chosen": -41.68927001953125, "rewards/margins": 1.1709423065185547, "rewards/rejected": -42.86021041870117, "step": 4502 }, { "epoch": 0.6131535947712419, "grad_norm": 45.18765278450634, "learning_rate": 3.1258127276885934e-07, "logits/chosen": 12.639288902282715, "logits/rejected": 13.532949447631836, "logps/chosen": -4.45331335067749, "logps/rejected": -4.497527122497559, "loss": 3.9247, "rewards/accuracies": 0.5, "rewards/chosen": -44.53313446044922, "rewards/margins": 0.4421396255493164, "rewards/rejected": -44.97527313232422, "step": 4503 }, { "epoch": 0.6132897603485838, "grad_norm": 42.38418774602391, "learning_rate": 3.123957387424446e-07, "logits/chosen": 13.196898460388184, "logits/rejected": 13.213708877563477, "logps/chosen": -4.005841255187988, "logps/rejected": -4.404378890991211, "loss": 3.6032, "rewards/accuracies": 0.75, "rewards/chosen": -40.05841827392578, "rewards/margins": 3.9853715896606445, "rewards/rejected": -44.04378890991211, "step": 4504 }, { "epoch": 0.6134259259259259, "grad_norm": 49.85294132830311, "learning_rate": 3.1221022451093666e-07, "logits/chosen": 12.901399612426758, "logits/rejected": 14.343494415283203, "logps/chosen": -3.990934371948242, "logps/rejected": -4.551177024841309, "loss": 3.8052, "rewards/accuracies": 1.0, "rewards/chosen": -39.90934753417969, "rewards/margins": 5.602421760559082, "rewards/rejected": -45.51176834106445, "step": 4505 }, { "epoch": 0.613562091503268, "grad_norm": 40.294549178327294, "learning_rate": 3.1202473011625423e-07, "logits/chosen": 12.947154998779297, "logits/rejected": 13.698081970214844, "logps/chosen": -4.141323566436768, "logps/rejected": -4.421200275421143, "loss": 3.8347, "rewards/accuracies": 0.75, "rewards/chosen": -41.413238525390625, "rewards/margins": 2.7987680435180664, "rewards/rejected": -44.212005615234375, "step": 4506 }, { "epoch": 0.61369825708061, "grad_norm": 43.34419603570044, "learning_rate": 3.118392556003114e-07, "logits/chosen": 12.507181167602539, "logits/rejected": 13.467449188232422, "logps/chosen": -3.859440326690674, "logps/rejected": -4.1762213706970215, "loss": 4.6945, "rewards/accuracies": 1.0, "rewards/chosen": -38.59440231323242, "rewards/margins": 3.1678104400634766, "rewards/rejected": -41.76221466064453, "step": 4507 }, { "epoch": 0.6138344226579521, "grad_norm": 40.30416410908267, "learning_rate": 3.116538010050173e-07, "logits/chosen": 13.181721687316895, "logits/rejected": 13.629054069519043, "logps/chosen": -4.197445869445801, "logps/rejected": -4.443292617797852, "loss": 4.3247, "rewards/accuracies": 0.75, "rewards/chosen": -41.97446060180664, "rewards/margins": 2.458462715148926, "rewards/rejected": -44.43292236328125, "step": 4508 }, { "epoch": 0.6139705882352942, "grad_norm": 44.297205633693636, "learning_rate": 3.114683663722771e-07, "logits/chosen": 12.91697883605957, "logits/rejected": 13.312527656555176, "logps/chosen": -3.9642655849456787, "logps/rejected": -4.078103065490723, "loss": 4.1821, "rewards/accuracies": 1.0, "rewards/chosen": -39.64265441894531, "rewards/margins": 1.1383752822875977, "rewards/rejected": -40.781028747558594, "step": 4509 }, { "epoch": 0.6141067538126361, "grad_norm": 37.63826213146126, "learning_rate": 3.112829517439915e-07, "logits/chosen": 13.377202033996582, "logits/rejected": 13.289436340332031, "logps/chosen": -4.129576683044434, "logps/rejected": -4.158900260925293, "loss": 3.6792, "rewards/accuracies": 0.5, "rewards/chosen": -41.295772552490234, "rewards/margins": 0.2932310104370117, "rewards/rejected": -41.58900451660156, "step": 4510 }, { "epoch": 0.6142429193899782, "grad_norm": 39.444099138143905, "learning_rate": 3.1109755716205625e-07, "logits/chosen": 13.473237991333008, "logits/rejected": 13.269752502441406, "logps/chosen": -4.155534267425537, "logps/rejected": -4.249970436096191, "loss": 3.9774, "rewards/accuracies": 0.5, "rewards/chosen": -41.55534362792969, "rewards/margins": 0.9443655014038086, "rewards/rejected": -42.49971008300781, "step": 4511 }, { "epoch": 0.6143790849673203, "grad_norm": 42.849070797335166, "learning_rate": 3.1091218266836283e-07, "logits/chosen": 12.310863494873047, "logits/rejected": 12.517988204956055, "logps/chosen": -3.9222307205200195, "logps/rejected": -4.275448799133301, "loss": 3.6867, "rewards/accuracies": 0.75, "rewards/chosen": -39.22230529785156, "rewards/margins": 3.5321807861328125, "rewards/rejected": -42.754486083984375, "step": 4512 }, { "epoch": 0.6145152505446623, "grad_norm": 38.953555373437794, "learning_rate": 3.1072682830479815e-07, "logits/chosen": 12.793718338012695, "logits/rejected": 13.286697387695312, "logps/chosen": -4.205414772033691, "logps/rejected": -4.522219657897949, "loss": 3.9644, "rewards/accuracies": 0.5, "rewards/chosen": -42.05414962768555, "rewards/margins": 3.168048858642578, "rewards/rejected": -45.222198486328125, "step": 4513 }, { "epoch": 0.6146514161220044, "grad_norm": 38.99185959565876, "learning_rate": 3.1054149411324454e-07, "logits/chosen": 13.147924423217773, "logits/rejected": 13.339397430419922, "logps/chosen": -3.880570411682129, "logps/rejected": -4.125095367431641, "loss": 3.8201, "rewards/accuracies": 1.0, "rewards/chosen": -38.805702209472656, "rewards/margins": 2.4452505111694336, "rewards/rejected": -41.250953674316406, "step": 4514 }, { "epoch": 0.6147875816993464, "grad_norm": 47.3406576289892, "learning_rate": 3.1035618013557974e-07, "logits/chosen": 12.968786239624023, "logits/rejected": 13.317481994628906, "logps/chosen": -4.301675796508789, "logps/rejected": -4.463013172149658, "loss": 3.913, "rewards/accuracies": 0.5, "rewards/chosen": -43.016754150390625, "rewards/margins": 1.613377571105957, "rewards/rejected": -44.63013458251953, "step": 4515 }, { "epoch": 0.6149237472766884, "grad_norm": 33.295226084407936, "learning_rate": 3.1017088641367714e-07, "logits/chosen": 12.716585159301758, "logits/rejected": 12.48448371887207, "logps/chosen": -4.128905296325684, "logps/rejected": -4.17646598815918, "loss": 3.4206, "rewards/accuracies": 0.5, "rewards/chosen": -41.2890510559082, "rewards/margins": 0.47560977935791016, "rewards/rejected": -41.76466369628906, "step": 4516 }, { "epoch": 0.6150599128540305, "grad_norm": 38.73020229486628, "learning_rate": 3.0998561298940516e-07, "logits/chosen": 13.727371215820312, "logits/rejected": 14.058517456054688, "logps/chosen": -4.370518207550049, "logps/rejected": -4.402713298797607, "loss": 4.0755, "rewards/accuracies": 0.25, "rewards/chosen": -43.70518112182617, "rewards/margins": 0.32195281982421875, "rewards/rejected": -44.02713394165039, "step": 4517 }, { "epoch": 0.6151960784313726, "grad_norm": 42.71301042500994, "learning_rate": 3.098003599046282e-07, "logits/chosen": 12.563386917114258, "logits/rejected": 13.376260757446289, "logps/chosen": -3.8931894302368164, "logps/rejected": -4.235909461975098, "loss": 3.956, "rewards/accuracies": 0.75, "rewards/chosen": -38.93189239501953, "rewards/margins": 3.4272050857543945, "rewards/rejected": -42.35909652709961, "step": 4518 }, { "epoch": 0.6153322440087146, "grad_norm": 39.5533352309363, "learning_rate": 3.096151272012054e-07, "logits/chosen": 13.838142395019531, "logits/rejected": 12.859318733215332, "logps/chosen": -4.124834060668945, "logps/rejected": -4.066714763641357, "loss": 3.5454, "rewards/accuracies": 0.25, "rewards/chosen": -41.24833679199219, "rewards/margins": -0.5811929702758789, "rewards/rejected": -40.667144775390625, "step": 4519 }, { "epoch": 0.6154684095860566, "grad_norm": 40.750838761756526, "learning_rate": 3.0942991492099167e-07, "logits/chosen": 13.624275207519531, "logits/rejected": 13.472190856933594, "logps/chosen": -3.8462655544281006, "logps/rejected": -4.554521083831787, "loss": 4.1536, "rewards/accuracies": 0.75, "rewards/chosen": -38.46265411376953, "rewards/margins": 7.08255672454834, "rewards/rejected": -45.54521179199219, "step": 4520 }, { "epoch": 0.6156045751633987, "grad_norm": 45.08487952860512, "learning_rate": 3.092447231058374e-07, "logits/chosen": 13.522254943847656, "logits/rejected": 13.152225494384766, "logps/chosen": -4.374932289123535, "logps/rejected": -4.197378158569336, "loss": 4.3233, "rewards/accuracies": 0.5, "rewards/chosen": -43.74932098388672, "rewards/margins": -1.775538444519043, "rewards/rejected": -41.97378158569336, "step": 4521 }, { "epoch": 0.6157407407407407, "grad_norm": 36.177901058688725, "learning_rate": 3.090595517975882e-07, "logits/chosen": 12.727630615234375, "logits/rejected": 12.842846870422363, "logps/chosen": -3.9865896701812744, "logps/rejected": -4.241236209869385, "loss": 3.3993, "rewards/accuracies": 0.75, "rewards/chosen": -39.86589813232422, "rewards/margins": 2.546466827392578, "rewards/rejected": -42.41236114501953, "step": 4522 }, { "epoch": 0.6158769063180828, "grad_norm": 46.557806443831446, "learning_rate": 3.0887440103808484e-07, "logits/chosen": 14.489728927612305, "logits/rejected": 14.37419319152832, "logps/chosen": -4.806239128112793, "logps/rejected": -4.884284019470215, "loss": 4.2683, "rewards/accuracies": 0.5, "rewards/chosen": -48.06238555908203, "rewards/margins": 0.7804479598999023, "rewards/rejected": -48.84283447265625, "step": 4523 }, { "epoch": 0.6160130718954249, "grad_norm": 48.2458141578026, "learning_rate": 3.0868927086916385e-07, "logits/chosen": 12.455368041992188, "logits/rejected": 13.184033393859863, "logps/chosen": -4.073274612426758, "logps/rejected": -4.456325054168701, "loss": 3.5369, "rewards/accuracies": 1.0, "rewards/chosen": -40.73274612426758, "rewards/margins": 3.83050537109375, "rewards/rejected": -44.563255310058594, "step": 4524 }, { "epoch": 0.6161492374727668, "grad_norm": 40.97734868534115, "learning_rate": 3.0850416133265705e-07, "logits/chosen": 13.942483901977539, "logits/rejected": 14.517534255981445, "logps/chosen": -4.378303527832031, "logps/rejected": -4.3416547775268555, "loss": 3.9696, "rewards/accuracies": 0.5, "rewards/chosen": -43.78303527832031, "rewards/margins": -0.3664865493774414, "rewards/rejected": -43.41654968261719, "step": 4525 }, { "epoch": 0.6162854030501089, "grad_norm": 41.765769462323945, "learning_rate": 3.0831907247039114e-07, "logits/chosen": 14.024713516235352, "logits/rejected": 14.33622932434082, "logps/chosen": -4.574826240539551, "logps/rejected": -4.533293724060059, "loss": 4.1545, "rewards/accuracies": 0.5, "rewards/chosen": -45.748260498046875, "rewards/margins": -0.41531944274902344, "rewards/rejected": -45.33293914794922, "step": 4526 }, { "epoch": 0.616421568627451, "grad_norm": 39.78069580950027, "learning_rate": 3.081340043241887e-07, "logits/chosen": 12.616666793823242, "logits/rejected": 12.861129760742188, "logps/chosen": -4.123479843139648, "logps/rejected": -4.217928409576416, "loss": 4.2427, "rewards/accuracies": 0.5, "rewards/chosen": -41.234798431396484, "rewards/margins": 0.944488525390625, "rewards/rejected": -42.17928695678711, "step": 4527 }, { "epoch": 0.616557734204793, "grad_norm": 52.664535480748455, "learning_rate": 3.0794895693586746e-07, "logits/chosen": 13.141671180725098, "logits/rejected": 13.111674308776855, "logps/chosen": -4.310919284820557, "logps/rejected": -4.167087078094482, "loss": 4.2881, "rewards/accuracies": 0.25, "rewards/chosen": -43.10919189453125, "rewards/margins": -1.4383211135864258, "rewards/rejected": -41.670867919921875, "step": 4528 }, { "epoch": 0.6166938997821351, "grad_norm": 41.28671517185391, "learning_rate": 3.077639303472401e-07, "logits/chosen": 12.918027877807617, "logits/rejected": 13.714234352111816, "logps/chosen": -4.226449012756348, "logps/rejected": -4.400866508483887, "loss": 3.9438, "rewards/accuracies": 1.0, "rewards/chosen": -42.264495849609375, "rewards/margins": 1.7441749572753906, "rewards/rejected": -44.0086669921875, "step": 4529 }, { "epoch": 0.6168300653594772, "grad_norm": 52.29180287445699, "learning_rate": 3.075789246001152e-07, "logits/chosen": 13.593635559082031, "logits/rejected": 13.86163330078125, "logps/chosen": -4.1516523361206055, "logps/rejected": -4.442801475524902, "loss": 4.1059, "rewards/accuracies": 1.0, "rewards/chosen": -41.51652908325195, "rewards/margins": 2.911484718322754, "rewards/rejected": -44.428009033203125, "step": 4530 }, { "epoch": 0.6169662309368191, "grad_norm": 39.798640687786545, "learning_rate": 3.0739393973629636e-07, "logits/chosen": 13.217000961303711, "logits/rejected": 14.335121154785156, "logps/chosen": -4.35847282409668, "logps/rejected": -4.7217912673950195, "loss": 4.0216, "rewards/accuracies": 1.0, "rewards/chosen": -43.58472442626953, "rewards/margins": 3.633183479309082, "rewards/rejected": -47.21791076660156, "step": 4531 }, { "epoch": 0.6171023965141612, "grad_norm": 41.235606512494655, "learning_rate": 3.0720897579758215e-07, "logits/chosen": 13.83007526397705, "logits/rejected": 14.414606094360352, "logps/chosen": -4.214031219482422, "logps/rejected": -4.6930060386657715, "loss": 4.3039, "rewards/accuracies": 1.0, "rewards/chosen": -42.140316009521484, "rewards/margins": 4.789742469787598, "rewards/rejected": -46.930057525634766, "step": 4532 }, { "epoch": 0.6172385620915033, "grad_norm": 40.51649112004493, "learning_rate": 3.070240328257669e-07, "logits/chosen": 12.952786445617676, "logits/rejected": 13.721441268920898, "logps/chosen": -3.8966431617736816, "logps/rejected": -4.394521236419678, "loss": 3.8139, "rewards/accuracies": 1.0, "rewards/chosen": -38.9664306640625, "rewards/margins": 4.97878360748291, "rewards/rejected": -43.945213317871094, "step": 4533 }, { "epoch": 0.6173747276688453, "grad_norm": 41.38368625730351, "learning_rate": 3.068391108626402e-07, "logits/chosen": 13.505167007446289, "logits/rejected": 13.163366317749023, "logps/chosen": -4.074654579162598, "logps/rejected": -4.250264644622803, "loss": 3.6985, "rewards/accuracies": 0.75, "rewards/chosen": -40.746551513671875, "rewards/margins": 1.7560977935791016, "rewards/rejected": -42.502647399902344, "step": 4534 }, { "epoch": 0.6175108932461874, "grad_norm": 45.380401754033116, "learning_rate": 3.0665420994998623e-07, "logits/chosen": 14.696016311645508, "logits/rejected": 14.352411270141602, "logps/chosen": -4.202053546905518, "logps/rejected": -4.478570938110352, "loss": 3.837, "rewards/accuracies": 0.75, "rewards/chosen": -42.020538330078125, "rewards/margins": 2.7651710510253906, "rewards/rejected": -44.78570556640625, "step": 4535 }, { "epoch": 0.6176470588235294, "grad_norm": 56.03633722607573, "learning_rate": 3.0646933012958516e-07, "logits/chosen": 14.413436889648438, "logits/rejected": 14.342742919921875, "logps/chosen": -4.394229412078857, "logps/rejected": -4.626007080078125, "loss": 3.547, "rewards/accuracies": 1.0, "rewards/chosen": -43.942291259765625, "rewards/margins": 2.3177833557128906, "rewards/rejected": -46.260074615478516, "step": 4536 }, { "epoch": 0.6177832244008714, "grad_norm": 43.85975302485332, "learning_rate": 3.0628447144321225e-07, "logits/chosen": 13.131189346313477, "logits/rejected": 12.875898361206055, "logps/chosen": -4.175464630126953, "logps/rejected": -4.237058639526367, "loss": 4.2064, "rewards/accuracies": 0.25, "rewards/chosen": -41.75464630126953, "rewards/margins": 0.6159391403198242, "rewards/rejected": -42.370582580566406, "step": 4537 }, { "epoch": 0.6179193899782135, "grad_norm": 126.23201455491962, "learning_rate": 3.0609963393263745e-07, "logits/chosen": 14.165068626403809, "logits/rejected": 14.250577926635742, "logps/chosen": -4.190665245056152, "logps/rejected": -4.4228034019470215, "loss": 4.3774, "rewards/accuracies": 1.0, "rewards/chosen": -41.90665054321289, "rewards/margins": 2.3213815689086914, "rewards/rejected": -44.22803497314453, "step": 4538 }, { "epoch": 0.6180555555555556, "grad_norm": 45.35225884841833, "learning_rate": 3.059148176396266e-07, "logits/chosen": 12.562686920166016, "logits/rejected": 14.717710494995117, "logps/chosen": -4.025393962860107, "logps/rejected": -4.696887969970703, "loss": 3.6882, "rewards/accuracies": 1.0, "rewards/chosen": -40.25394058227539, "rewards/margins": 6.71494197845459, "rewards/rejected": -46.9688835144043, "step": 4539 }, { "epoch": 0.6181917211328976, "grad_norm": 39.01304364027122, "learning_rate": 3.0573002260594064e-07, "logits/chosen": 14.781440734863281, "logits/rejected": 13.667888641357422, "logps/chosen": -4.466398239135742, "logps/rejected": -4.402790069580078, "loss": 3.4837, "rewards/accuracies": 0.75, "rewards/chosen": -44.66398239135742, "rewards/margins": -0.6360855102539062, "rewards/rejected": -44.02790069580078, "step": 4540 }, { "epoch": 0.6183278867102396, "grad_norm": 43.029726956037905, "learning_rate": 3.05545248873335e-07, "logits/chosen": 14.18695068359375, "logits/rejected": 14.758962631225586, "logps/chosen": -4.195359230041504, "logps/rejected": -4.412389278411865, "loss": 4.3795, "rewards/accuracies": 0.75, "rewards/chosen": -41.953590393066406, "rewards/margins": 2.170302391052246, "rewards/rejected": -44.12389373779297, "step": 4541 }, { "epoch": 0.6184640522875817, "grad_norm": 42.6575370904836, "learning_rate": 3.053604964835613e-07, "logits/chosen": 14.18813419342041, "logits/rejected": 13.757402420043945, "logps/chosen": -4.638258457183838, "logps/rejected": -4.688112258911133, "loss": 3.4872, "rewards/accuracies": 0.75, "rewards/chosen": -46.38258361816406, "rewards/margins": 0.4985361099243164, "rewards/rejected": -46.88111877441406, "step": 4542 }, { "epoch": 0.6186002178649237, "grad_norm": 48.1249913819188, "learning_rate": 3.0517576547836585e-07, "logits/chosen": 13.083793640136719, "logits/rejected": 13.948683738708496, "logps/chosen": -4.2851457595825195, "logps/rejected": -4.382940292358398, "loss": 3.4689, "rewards/accuracies": 0.5, "rewards/chosen": -42.85145950317383, "rewards/margins": 0.9779443740844727, "rewards/rejected": -43.829402923583984, "step": 4543 }, { "epoch": 0.6187363834422658, "grad_norm": 49.70038257503077, "learning_rate": 3.049910558994898e-07, "logits/chosen": 14.043943405151367, "logits/rejected": 13.668243408203125, "logps/chosen": -4.336822509765625, "logps/rejected": -4.369132995605469, "loss": 3.7, "rewards/accuracies": 0.5, "rewards/chosen": -43.36822509765625, "rewards/margins": 0.3231067657470703, "rewards/rejected": -43.69132995605469, "step": 4544 }, { "epoch": 0.6188725490196079, "grad_norm": 49.17401770454355, "learning_rate": 3.0480636778867004e-07, "logits/chosen": 13.000235557556152, "logits/rejected": 13.272431373596191, "logps/chosen": -4.44596004486084, "logps/rejected": -4.545830249786377, "loss": 4.0344, "rewards/accuracies": 0.75, "rewards/chosen": -44.4595947265625, "rewards/margins": 0.9987058639526367, "rewards/rejected": -45.45830154418945, "step": 4545 }, { "epoch": 0.6190087145969498, "grad_norm": 42.72591485597096, "learning_rate": 3.0462170118763856e-07, "logits/chosen": 12.630931854248047, "logits/rejected": 13.626358032226562, "logps/chosen": -3.8131580352783203, "logps/rejected": -4.505654811859131, "loss": 4.1287, "rewards/accuracies": 1.0, "rewards/chosen": -38.1315803527832, "rewards/margins": 6.924968719482422, "rewards/rejected": -45.056549072265625, "step": 4546 }, { "epoch": 0.6191448801742919, "grad_norm": 45.215564079218645, "learning_rate": 3.044370561381219e-07, "logits/chosen": 13.387615203857422, "logits/rejected": 13.910839080810547, "logps/chosen": -4.133088111877441, "logps/rejected": -4.328741073608398, "loss": 3.8057, "rewards/accuracies": 0.75, "rewards/chosen": -41.33088684082031, "rewards/margins": 1.9565277099609375, "rewards/rejected": -43.28741455078125, "step": 4547 }, { "epoch": 0.619281045751634, "grad_norm": 45.14754435159912, "learning_rate": 3.0425243268184233e-07, "logits/chosen": 12.99046516418457, "logits/rejected": 13.8699951171875, "logps/chosen": -3.992737293243408, "logps/rejected": -4.2103166580200195, "loss": 4.4395, "rewards/accuracies": 0.75, "rewards/chosen": -39.927371978759766, "rewards/margins": 2.1757936477661133, "rewards/rejected": -42.10316467285156, "step": 4548 }, { "epoch": 0.619417211328976, "grad_norm": 49.31023433211634, "learning_rate": 3.040678308605172e-07, "logits/chosen": 12.67111873626709, "logits/rejected": 12.710270881652832, "logps/chosen": -4.0607404708862305, "logps/rejected": -4.237339973449707, "loss": 3.9909, "rewards/accuracies": 0.75, "rewards/chosen": -40.60740661621094, "rewards/margins": 1.7659931182861328, "rewards/rejected": -42.37339782714844, "step": 4549 }, { "epoch": 0.6195533769063181, "grad_norm": 38.68287540374026, "learning_rate": 3.038832507158586e-07, "logits/chosen": 11.589237213134766, "logits/rejected": 13.0179443359375, "logps/chosen": -3.7839481830596924, "logps/rejected": -4.243496417999268, "loss": 4.0718, "rewards/accuracies": 1.0, "rewards/chosen": -37.83948516845703, "rewards/margins": 4.595481872558594, "rewards/rejected": -42.434967041015625, "step": 4550 }, { "epoch": 0.6196895424836601, "grad_norm": 42.850690252397975, "learning_rate": 3.036986922895739e-07, "logits/chosen": 14.498363494873047, "logits/rejected": 13.71798324584961, "logps/chosen": -4.08338737487793, "logps/rejected": -3.950812339782715, "loss": 4.3103, "rewards/accuracies": 0.25, "rewards/chosen": -40.83386993408203, "rewards/margins": -1.325749397277832, "rewards/rejected": -39.508121490478516, "step": 4551 }, { "epoch": 0.6198257080610022, "grad_norm": 39.753823671321435, "learning_rate": 3.0351415562336594e-07, "logits/chosen": 14.064201354980469, "logits/rejected": 14.180336952209473, "logps/chosen": -4.393895626068115, "logps/rejected": -4.563088417053223, "loss": 3.6471, "rewards/accuracies": 0.75, "rewards/chosen": -43.93895721435547, "rewards/margins": 1.6919288635253906, "rewards/rejected": -45.63088607788086, "step": 4552 }, { "epoch": 0.6199618736383442, "grad_norm": 45.49154869393731, "learning_rate": 3.033296407589319e-07, "logits/chosen": 12.792787551879883, "logits/rejected": 14.203038215637207, "logps/chosen": -3.9295425415039062, "logps/rejected": -4.708971977233887, "loss": 3.432, "rewards/accuracies": 1.0, "rewards/chosen": -39.29542541503906, "rewards/margins": 7.794295310974121, "rewards/rejected": -47.0897216796875, "step": 4553 }, { "epoch": 0.6200980392156863, "grad_norm": 39.93950888354021, "learning_rate": 3.0314514773796463e-07, "logits/chosen": 14.147247314453125, "logits/rejected": 13.883044242858887, "logps/chosen": -4.301568031311035, "logps/rejected": -4.095634460449219, "loss": 4.1123, "rewards/accuracies": 0.0, "rewards/chosen": -43.015682220458984, "rewards/margins": -2.0593338012695312, "rewards/rejected": -40.95634841918945, "step": 4554 }, { "epoch": 0.6202342047930284, "grad_norm": 46.27896777033473, "learning_rate": 3.02960676602152e-07, "logits/chosen": 13.525094985961914, "logits/rejected": 13.992979049682617, "logps/chosen": -4.288723468780518, "logps/rejected": -4.596303462982178, "loss": 3.9108, "rewards/accuracies": 0.75, "rewards/chosen": -42.887237548828125, "rewards/margins": 3.075800895690918, "rewards/rejected": -45.963035583496094, "step": 4555 }, { "epoch": 0.6203703703703703, "grad_norm": 49.880022156109945, "learning_rate": 3.0277622739317643e-07, "logits/chosen": 13.101096153259277, "logits/rejected": 13.693328857421875, "logps/chosen": -4.065694808959961, "logps/rejected": -4.170597076416016, "loss": 3.6872, "rewards/accuracies": 0.5, "rewards/chosen": -40.656951904296875, "rewards/margins": 1.0490217208862305, "rewards/rejected": -41.705970764160156, "step": 4556 }, { "epoch": 0.6205065359477124, "grad_norm": 42.316240630710055, "learning_rate": 3.0259180015271594e-07, "logits/chosen": 13.405267715454102, "logits/rejected": 13.36451530456543, "logps/chosen": -4.132025241851807, "logps/rejected": -4.298854351043701, "loss": 3.9347, "rewards/accuracies": 0.75, "rewards/chosen": -41.32025146484375, "rewards/margins": 1.6682910919189453, "rewards/rejected": -42.98854446411133, "step": 4557 }, { "epoch": 0.6206427015250545, "grad_norm": 38.97576518766847, "learning_rate": 3.024073949224435e-07, "logits/chosen": 13.269119262695312, "logits/rejected": 13.873676300048828, "logps/chosen": -3.8043787479400635, "logps/rejected": -4.1327056884765625, "loss": 3.5783, "rewards/accuracies": 0.75, "rewards/chosen": -38.043785095214844, "rewards/margins": 3.2832746505737305, "rewards/rejected": -41.327064514160156, "step": 4558 }, { "epoch": 0.6207788671023965, "grad_norm": 50.95827798789583, "learning_rate": 3.0222301174402684e-07, "logits/chosen": 13.942625999450684, "logits/rejected": 14.455266952514648, "logps/chosen": -4.236623764038086, "logps/rejected": -4.443784236907959, "loss": 4.7503, "rewards/accuracies": 0.5, "rewards/chosen": -42.366233825683594, "rewards/margins": 2.0716094970703125, "rewards/rejected": -44.437843322753906, "step": 4559 }, { "epoch": 0.6209150326797386, "grad_norm": 48.04239631091611, "learning_rate": 3.020386506591289e-07, "logits/chosen": 12.763290405273438, "logits/rejected": 12.767891883850098, "logps/chosen": -3.955575942993164, "logps/rejected": -4.176649570465088, "loss": 3.9299, "rewards/accuracies": 0.5, "rewards/chosen": -39.555755615234375, "rewards/margins": 2.2107372283935547, "rewards/rejected": -41.76649475097656, "step": 4560 }, { "epoch": 0.6210511982570807, "grad_norm": 39.732901559731125, "learning_rate": 3.018543117094076e-07, "logits/chosen": 13.471481323242188, "logits/rejected": 13.145967483520508, "logps/chosen": -4.41960334777832, "logps/rejected": -4.288081645965576, "loss": 3.9122, "rewards/accuracies": 0.25, "rewards/chosen": -44.19603729248047, "rewards/margins": -1.3152236938476562, "rewards/rejected": -42.88081359863281, "step": 4561 }, { "epoch": 0.6211873638344226, "grad_norm": 39.84921124071916, "learning_rate": 3.0166999493651595e-07, "logits/chosen": 13.417144775390625, "logits/rejected": 14.822365760803223, "logps/chosen": -4.320559501647949, "logps/rejected": -4.92108154296875, "loss": 3.5392, "rewards/accuracies": 1.0, "rewards/chosen": -43.205596923828125, "rewards/margins": 6.005214691162109, "rewards/rejected": -49.2108154296875, "step": 4562 }, { "epoch": 0.6213235294117647, "grad_norm": 41.33359457287873, "learning_rate": 3.014857003821016e-07, "logits/chosen": 13.661203384399414, "logits/rejected": 13.862977981567383, "logps/chosen": -4.173215389251709, "logps/rejected": -4.2397613525390625, "loss": 3.7194, "rewards/accuracies": 0.5, "rewards/chosen": -41.732154846191406, "rewards/margins": 0.6654567718505859, "rewards/rejected": -42.397613525390625, "step": 4563 }, { "epoch": 0.6214596949891068, "grad_norm": 40.9334199604802, "learning_rate": 3.0130142808780764e-07, "logits/chosen": 14.346710205078125, "logits/rejected": 14.441411972045898, "logps/chosen": -4.117650985717773, "logps/rejected": -4.2342424392700195, "loss": 4.3248, "rewards/accuracies": 0.75, "rewards/chosen": -41.176513671875, "rewards/margins": 1.1659154891967773, "rewards/rejected": -42.342430114746094, "step": 4564 }, { "epoch": 0.6215958605664488, "grad_norm": 36.660194217720054, "learning_rate": 3.0111717809527185e-07, "logits/chosen": 13.115219116210938, "logits/rejected": 12.613615036010742, "logps/chosen": -3.899257183074951, "logps/rejected": -4.086677074432373, "loss": 3.7108, "rewards/accuracies": 0.75, "rewards/chosen": -38.99257278442383, "rewards/margins": 1.8741989135742188, "rewards/rejected": -40.86677169799805, "step": 4565 }, { "epoch": 0.6217320261437909, "grad_norm": 50.75505323648714, "learning_rate": 3.0093295044612705e-07, "logits/chosen": 13.288257598876953, "logits/rejected": 13.38557243347168, "logps/chosen": -3.957815647125244, "logps/rejected": -4.103161334991455, "loss": 4.0809, "rewards/accuracies": 0.75, "rewards/chosen": -39.578155517578125, "rewards/margins": 1.453455924987793, "rewards/rejected": -41.03160858154297, "step": 4566 }, { "epoch": 0.621868191721133, "grad_norm": 39.85215430130793, "learning_rate": 3.00748745182001e-07, "logits/chosen": 12.877119064331055, "logits/rejected": 13.161758422851562, "logps/chosen": -4.041321277618408, "logps/rejected": -4.116364479064941, "loss": 3.7723, "rewards/accuracies": 0.5, "rewards/chosen": -40.41321563720703, "rewards/margins": 0.750432014465332, "rewards/rejected": -41.16364669799805, "step": 4567 }, { "epoch": 0.6220043572984749, "grad_norm": 42.91823032129704, "learning_rate": 3.005645623445163e-07, "logits/chosen": 12.90435791015625, "logits/rejected": 13.515029907226562, "logps/chosen": -4.174020290374756, "logps/rejected": -4.298937797546387, "loss": 4.7657, "rewards/accuracies": 0.5, "rewards/chosen": -41.740203857421875, "rewards/margins": 1.2491741180419922, "rewards/rejected": -42.989376068115234, "step": 4568 }, { "epoch": 0.622140522875817, "grad_norm": 40.08530567895543, "learning_rate": 3.003804019752908e-07, "logits/chosen": 12.68813705444336, "logits/rejected": 13.07583236694336, "logps/chosen": -4.442376136779785, "logps/rejected": -4.32235050201416, "loss": 3.3393, "rewards/accuracies": 0.5, "rewards/chosen": -44.42376708984375, "rewards/margins": -1.2002582550048828, "rewards/rejected": -43.22350311279297, "step": 4569 }, { "epoch": 0.6222766884531591, "grad_norm": 42.27918978535861, "learning_rate": 3.0019626411593695e-07, "logits/chosen": 13.7310791015625, "logits/rejected": 13.706413269042969, "logps/chosen": -4.7874298095703125, "logps/rejected": -4.597304821014404, "loss": 3.6101, "rewards/accuracies": 0.25, "rewards/chosen": -47.874298095703125, "rewards/margins": -1.901249885559082, "rewards/rejected": -45.97304916381836, "step": 4570 }, { "epoch": 0.6224128540305011, "grad_norm": 37.72979061853482, "learning_rate": 3.0001214880806213e-07, "logits/chosen": 13.111491203308105, "logits/rejected": 14.461650848388672, "logps/chosen": -4.146638870239258, "logps/rejected": -4.829623222351074, "loss": 3.7071, "rewards/accuracies": 1.0, "rewards/chosen": -41.46638870239258, "rewards/margins": 6.82984733581543, "rewards/rejected": -48.296234130859375, "step": 4571 }, { "epoch": 0.6225490196078431, "grad_norm": 39.13353051930783, "learning_rate": 2.998280560932688e-07, "logits/chosen": 13.29202938079834, "logits/rejected": 13.399259567260742, "logps/chosen": -4.237035751342773, "logps/rejected": -4.379823684692383, "loss": 4.0349, "rewards/accuracies": 0.5, "rewards/chosen": -42.370361328125, "rewards/margins": 1.427872657775879, "rewards/rejected": -43.79823303222656, "step": 4572 }, { "epoch": 0.6226851851851852, "grad_norm": 41.887332785378085, "learning_rate": 2.996439860131543e-07, "logits/chosen": 13.220853805541992, "logits/rejected": 13.454447746276855, "logps/chosen": -3.6404669284820557, "logps/rejected": -4.016380310058594, "loss": 3.9558, "rewards/accuracies": 1.0, "rewards/chosen": -36.40467071533203, "rewards/margins": 3.759134292602539, "rewards/rejected": -40.16380310058594, "step": 4573 }, { "epoch": 0.6228213507625272, "grad_norm": 56.09470030997365, "learning_rate": 2.9945993860931066e-07, "logits/chosen": 13.648900985717773, "logits/rejected": 12.522040367126465, "logps/chosen": -4.449136734008789, "logps/rejected": -4.247781276702881, "loss": 4.6408, "rewards/accuracies": 0.0, "rewards/chosen": -44.491363525390625, "rewards/margins": -2.0135536193847656, "rewards/rejected": -42.477813720703125, "step": 4574 }, { "epoch": 0.6229575163398693, "grad_norm": 37.1723136514466, "learning_rate": 2.992759139233249e-07, "logits/chosen": 12.672149658203125, "logits/rejected": 13.311455726623535, "logps/chosen": -4.070901870727539, "logps/rejected": -4.201735496520996, "loss": 3.4372, "rewards/accuracies": 0.75, "rewards/chosen": -40.709014892578125, "rewards/margins": 1.308335304260254, "rewards/rejected": -42.01735305786133, "step": 4575 }, { "epoch": 0.6230936819172114, "grad_norm": 40.148553413679615, "learning_rate": 2.9909191199677917e-07, "logits/chosen": 11.800442695617676, "logits/rejected": 13.215736389160156, "logps/chosen": -3.8653316497802734, "logps/rejected": -4.145711898803711, "loss": 3.4295, "rewards/accuracies": 0.5, "rewards/chosen": -38.6533203125, "rewards/margins": 2.8037986755371094, "rewards/rejected": -41.45711898803711, "step": 4576 }, { "epoch": 0.6232298474945533, "grad_norm": 38.63467850019677, "learning_rate": 2.9890793287124987e-07, "logits/chosen": 13.125340461730957, "logits/rejected": 12.769064903259277, "logps/chosen": -4.307644844055176, "logps/rejected": -4.48028039932251, "loss": 3.8442, "rewards/accuracies": 0.75, "rewards/chosen": -43.07645034790039, "rewards/margins": 1.7263565063476562, "rewards/rejected": -44.80280303955078, "step": 4577 }, { "epoch": 0.6233660130718954, "grad_norm": 50.58417372112148, "learning_rate": 2.987239765883088e-07, "logits/chosen": 13.973089218139648, "logits/rejected": 14.366111755371094, "logps/chosen": -4.358530044555664, "logps/rejected": -4.464717864990234, "loss": 4.1257, "rewards/accuracies": 0.5, "rewards/chosen": -43.585304260253906, "rewards/margins": 1.061873435974121, "rewards/rejected": -44.647178649902344, "step": 4578 }, { "epoch": 0.6235021786492375, "grad_norm": 40.66906438362738, "learning_rate": 2.985400431895225e-07, "logits/chosen": 14.31035041809082, "logits/rejected": 14.421670913696289, "logps/chosen": -4.6402130126953125, "logps/rejected": -4.683359622955322, "loss": 4.0995, "rewards/accuracies": 0.75, "rewards/chosen": -46.402130126953125, "rewards/margins": 0.43146419525146484, "rewards/rejected": -46.833595275878906, "step": 4579 }, { "epoch": 0.6236383442265795, "grad_norm": 40.42152139233585, "learning_rate": 2.9835613271645194e-07, "logits/chosen": 12.1109619140625, "logits/rejected": 12.460308074951172, "logps/chosen": -3.9636871814727783, "logps/rejected": -4.101672649383545, "loss": 3.859, "rewards/accuracies": 0.5, "rewards/chosen": -39.636871337890625, "rewards/margins": 1.379857063293457, "rewards/rejected": -41.016727447509766, "step": 4580 }, { "epoch": 0.6237745098039216, "grad_norm": 39.00671559472146, "learning_rate": 2.981722452106534e-07, "logits/chosen": 13.265815734863281, "logits/rejected": 12.829702377319336, "logps/chosen": -4.235137939453125, "logps/rejected": -4.760109901428223, "loss": 3.437, "rewards/accuracies": 0.75, "rewards/chosen": -42.35137939453125, "rewards/margins": 5.249719619750977, "rewards/rejected": -47.601097106933594, "step": 4581 }, { "epoch": 0.6239106753812637, "grad_norm": 63.02860809092436, "learning_rate": 2.9798838071367797e-07, "logits/chosen": 13.049032211303711, "logits/rejected": 13.072196960449219, "logps/chosen": -4.335027694702148, "logps/rejected": -4.3771562576293945, "loss": 4.2455, "rewards/accuracies": 0.25, "rewards/chosen": -43.35027313232422, "rewards/margins": 0.42128753662109375, "rewards/rejected": -43.77156066894531, "step": 4582 }, { "epoch": 0.6240468409586056, "grad_norm": 40.415035572440665, "learning_rate": 2.97804539267071e-07, "logits/chosen": 13.399829864501953, "logits/rejected": 13.164268493652344, "logps/chosen": -4.223502159118652, "logps/rejected": -4.389063358306885, "loss": 3.9338, "rewards/accuracies": 0.75, "rewards/chosen": -42.235023498535156, "rewards/margins": 1.6556119918823242, "rewards/rejected": -43.89063262939453, "step": 4583 }, { "epoch": 0.6241830065359477, "grad_norm": 42.53966951759187, "learning_rate": 2.976207209123731e-07, "logits/chosen": 12.594058990478516, "logits/rejected": 12.518780708312988, "logps/chosen": -4.188467979431152, "logps/rejected": -4.420585632324219, "loss": 4.0925, "rewards/accuracies": 1.0, "rewards/chosen": -41.88467788696289, "rewards/margins": 2.3211803436279297, "rewards/rejected": -44.20586013793945, "step": 4584 }, { "epoch": 0.6243191721132898, "grad_norm": 41.672989974742606, "learning_rate": 2.974369256911197e-07, "logits/chosen": 12.682876586914062, "logits/rejected": 13.5658597946167, "logps/chosen": -3.9758100509643555, "logps/rejected": -4.424308776855469, "loss": 4.5485, "rewards/accuracies": 1.0, "rewards/chosen": -39.75810241699219, "rewards/margins": 4.484983444213867, "rewards/rejected": -44.24308776855469, "step": 4585 }, { "epoch": 0.6244553376906318, "grad_norm": 39.55076393156527, "learning_rate": 2.9725315364484067e-07, "logits/chosen": 13.732383728027344, "logits/rejected": 13.845784187316895, "logps/chosen": -4.518496990203857, "logps/rejected": -4.69598388671875, "loss": 4.1347, "rewards/accuracies": 0.75, "rewards/chosen": -45.184967041015625, "rewards/margins": 1.774871826171875, "rewards/rejected": -46.9598388671875, "step": 4586 }, { "epoch": 0.6245915032679739, "grad_norm": 41.223185853610886, "learning_rate": 2.9706940481506085e-07, "logits/chosen": 12.563451766967773, "logits/rejected": 13.919158935546875, "logps/chosen": -3.661226749420166, "logps/rejected": -4.021398544311523, "loss": 3.7183, "rewards/accuracies": 1.0, "rewards/chosen": -36.612266540527344, "rewards/margins": 3.60172176361084, "rewards/rejected": -40.213985443115234, "step": 4587 }, { "epoch": 0.6247276688453159, "grad_norm": 39.788944090453334, "learning_rate": 2.9688567924329995e-07, "logits/chosen": 12.208337783813477, "logits/rejected": 13.062738418579102, "logps/chosen": -3.8801755905151367, "logps/rejected": -4.210504055023193, "loss": 4.1478, "rewards/accuracies": 0.75, "rewards/chosen": -38.8017578125, "rewards/margins": 3.30328369140625, "rewards/rejected": -42.10504150390625, "step": 4588 }, { "epoch": 0.6248638344226579, "grad_norm": 48.92430100410325, "learning_rate": 2.967019769710721e-07, "logits/chosen": 12.685657501220703, "logits/rejected": 12.894229888916016, "logps/chosen": -4.055685997009277, "logps/rejected": -4.071512222290039, "loss": 4.4461, "rewards/accuracies": 0.5, "rewards/chosen": -40.556861877441406, "rewards/margins": 0.15826034545898438, "rewards/rejected": -40.715118408203125, "step": 4589 }, { "epoch": 0.625, "grad_norm": 36.47293633471604, "learning_rate": 2.965182980398864e-07, "logits/chosen": 12.488090515136719, "logits/rejected": 12.4446382522583, "logps/chosen": -4.078544616699219, "logps/rejected": -4.208847999572754, "loss": 3.859, "rewards/accuracies": 0.75, "rewards/chosen": -40.78544998168945, "rewards/margins": 1.3030328750610352, "rewards/rejected": -42.08848190307617, "step": 4590 }, { "epoch": 0.6251361655773421, "grad_norm": 37.60848619439634, "learning_rate": 2.9633464249124683e-07, "logits/chosen": 12.492300987243652, "logits/rejected": 12.432109832763672, "logps/chosen": -4.12699031829834, "logps/rejected": -4.298313140869141, "loss": 3.9488, "rewards/accuracies": 0.75, "rewards/chosen": -41.26990509033203, "rewards/margins": 1.7132272720336914, "rewards/rejected": -42.983131408691406, "step": 4591 }, { "epoch": 0.6252723311546841, "grad_norm": 38.734226998366545, "learning_rate": 2.9615101036665147e-07, "logits/chosen": 13.6128568649292, "logits/rejected": 13.307770729064941, "logps/chosen": -4.560559272766113, "logps/rejected": -4.450259208679199, "loss": 4.3856, "rewards/accuracies": 0.25, "rewards/chosen": -45.605594635009766, "rewards/margins": -1.1029996871948242, "rewards/rejected": -44.502593994140625, "step": 4592 }, { "epoch": 0.6254084967320261, "grad_norm": 41.220667001011805, "learning_rate": 2.9596740170759377e-07, "logits/chosen": 13.330316543579102, "logits/rejected": 13.87962532043457, "logps/chosen": -4.193595886230469, "logps/rejected": -4.391194820404053, "loss": 3.7946, "rewards/accuracies": 0.75, "rewards/chosen": -41.93595504760742, "rewards/margins": 1.975992202758789, "rewards/rejected": -43.911949157714844, "step": 4593 }, { "epoch": 0.6255446623093682, "grad_norm": 43.33592664821984, "learning_rate": 2.9578381655556175e-07, "logits/chosen": 13.587822914123535, "logits/rejected": 14.008387565612793, "logps/chosen": -3.9682347774505615, "logps/rejected": -4.34977912902832, "loss": 3.4399, "rewards/accuracies": 1.0, "rewards/chosen": -39.68234634399414, "rewards/margins": 3.8154468536376953, "rewards/rejected": -43.49779510498047, "step": 4594 }, { "epoch": 0.6256808278867102, "grad_norm": 37.49235823208774, "learning_rate": 2.956002549520377e-07, "logits/chosen": 12.39212417602539, "logits/rejected": 11.768895149230957, "logps/chosen": -3.967318534851074, "logps/rejected": -3.9292831420898438, "loss": 3.9154, "rewards/accuracies": 0.5, "rewards/chosen": -39.67318344116211, "rewards/margins": -0.38035106658935547, "rewards/rejected": -39.29283142089844, "step": 4595 }, { "epoch": 0.6258169934640523, "grad_norm": 40.641672300713374, "learning_rate": 2.9541671693849904e-07, "logits/chosen": 12.537468910217285, "logits/rejected": 13.833234786987305, "logps/chosen": -3.96793794631958, "logps/rejected": -4.4482221603393555, "loss": 4.0874, "rewards/accuracies": 0.75, "rewards/chosen": -39.679378509521484, "rewards/margins": 4.80284309387207, "rewards/rejected": -44.48222351074219, "step": 4596 }, { "epoch": 0.6259531590413944, "grad_norm": 38.758032294263785, "learning_rate": 2.9523320255641785e-07, "logits/chosen": 12.919328689575195, "logits/rejected": 13.459888458251953, "logps/chosen": -3.8822460174560547, "logps/rejected": -4.173414707183838, "loss": 3.7759, "rewards/accuracies": 1.0, "rewards/chosen": -38.82246017456055, "rewards/margins": 2.9116883277893066, "rewards/rejected": -41.73414611816406, "step": 4597 }, { "epoch": 0.6260893246187363, "grad_norm": 42.233464450290406, "learning_rate": 2.9504971184726037e-07, "logits/chosen": 13.067584991455078, "logits/rejected": 13.289039611816406, "logps/chosen": -4.2080583572387695, "logps/rejected": -4.683659553527832, "loss": 4.1493, "rewards/accuracies": 0.75, "rewards/chosen": -42.080589294433594, "rewards/margins": 4.756004333496094, "rewards/rejected": -46.83659362792969, "step": 4598 }, { "epoch": 0.6262254901960784, "grad_norm": 38.761917821474995, "learning_rate": 2.9486624485248797e-07, "logits/chosen": 13.009611129760742, "logits/rejected": 13.108582496643066, "logps/chosen": -4.018526077270508, "logps/rejected": -4.332587242126465, "loss": 3.7956, "rewards/accuracies": 0.75, "rewards/chosen": -40.18525695800781, "rewards/margins": 3.1406173706054688, "rewards/rejected": -43.32587432861328, "step": 4599 }, { "epoch": 0.6263616557734205, "grad_norm": 37.735132663584515, "learning_rate": 2.9468280161355677e-07, "logits/chosen": 12.757232666015625, "logits/rejected": 12.96303939819336, "logps/chosen": -4.201175689697266, "logps/rejected": -4.210051536560059, "loss": 3.657, "rewards/accuracies": 0.25, "rewards/chosen": -42.011756896972656, "rewards/margins": 0.08875560760498047, "rewards/rejected": -42.10051345825195, "step": 4600 }, { "epoch": 0.6264978213507625, "grad_norm": 40.98607010476861, "learning_rate": 2.94499382171917e-07, "logits/chosen": 11.864910125732422, "logits/rejected": 12.755430221557617, "logps/chosen": -3.9080328941345215, "logps/rejected": -4.208491325378418, "loss": 4.126, "rewards/accuracies": 0.5, "rewards/chosen": -39.08032989501953, "rewards/margins": 3.0045838356018066, "rewards/rejected": -42.08491516113281, "step": 4601 }, { "epoch": 0.6266339869281046, "grad_norm": 43.05645298982052, "learning_rate": 2.9431598656901387e-07, "logits/chosen": 13.065498352050781, "logits/rejected": 12.885862350463867, "logps/chosen": -4.353594779968262, "logps/rejected": -4.249629974365234, "loss": 3.8531, "rewards/accuracies": 0.25, "rewards/chosen": -43.53594970703125, "rewards/margins": -1.0396490097045898, "rewards/rejected": -42.496299743652344, "step": 4602 }, { "epoch": 0.6267701525054467, "grad_norm": 37.15639670291099, "learning_rate": 2.941326148462873e-07, "logits/chosen": 12.285690307617188, "logits/rejected": 13.812997817993164, "logps/chosen": -4.0342583656311035, "logps/rejected": -4.325927257537842, "loss": 3.1724, "rewards/accuracies": 1.0, "rewards/chosen": -40.34258270263672, "rewards/margins": 2.9166908264160156, "rewards/rejected": -43.259273529052734, "step": 4603 }, { "epoch": 0.6269063180827886, "grad_norm": 39.32170446332116, "learning_rate": 2.939492670451714e-07, "logits/chosen": 12.608760833740234, "logits/rejected": 12.074251174926758, "logps/chosen": -4.071338653564453, "logps/rejected": -4.027249813079834, "loss": 4.055, "rewards/accuracies": 0.5, "rewards/chosen": -40.71338653564453, "rewards/margins": -0.440887451171875, "rewards/rejected": -40.272499084472656, "step": 4604 }, { "epoch": 0.6270424836601307, "grad_norm": 40.164549574367456, "learning_rate": 2.9376594320709523e-07, "logits/chosen": 12.557754516601562, "logits/rejected": 13.282196044921875, "logps/chosen": -4.0157976150512695, "logps/rejected": -4.396856307983398, "loss": 4.0579, "rewards/accuracies": 1.0, "rewards/chosen": -40.15797424316406, "rewards/margins": 3.8105878829956055, "rewards/rejected": -43.968563079833984, "step": 4605 }, { "epoch": 0.6271786492374728, "grad_norm": 45.246227944542554, "learning_rate": 2.935826433734825e-07, "logits/chosen": 13.241622924804688, "logits/rejected": 13.5346040725708, "logps/chosen": -4.010739803314209, "logps/rejected": -4.409542083740234, "loss": 4.3387, "rewards/accuracies": 0.75, "rewards/chosen": -40.107398986816406, "rewards/margins": 3.9880247116088867, "rewards/rejected": -44.095420837402344, "step": 4606 }, { "epoch": 0.6273148148148148, "grad_norm": 41.467028771612874, "learning_rate": 2.9339936758575097e-07, "logits/chosen": 10.983911514282227, "logits/rejected": 11.623908996582031, "logps/chosen": -3.990983009338379, "logps/rejected": -4.210329055786133, "loss": 3.9194, "rewards/accuracies": 1.0, "rewards/chosen": -39.909828186035156, "rewards/margins": 2.1934585571289062, "rewards/rejected": -42.10328674316406, "step": 4607 }, { "epoch": 0.6274509803921569, "grad_norm": 35.64754328368885, "learning_rate": 2.932161158853135e-07, "logits/chosen": 13.209814071655273, "logits/rejected": 13.247129440307617, "logps/chosen": -4.122064590454102, "logps/rejected": -4.081399917602539, "loss": 3.7894, "rewards/accuracies": 0.25, "rewards/chosen": -41.220645904541016, "rewards/margins": -0.406646728515625, "rewards/rejected": -40.81399917602539, "step": 4608 }, { "epoch": 0.6275871459694989, "grad_norm": 43.98458130997989, "learning_rate": 2.9303288831357744e-07, "logits/chosen": 14.073576927185059, "logits/rejected": 13.346351623535156, "logps/chosen": -4.630218505859375, "logps/rejected": -4.265361309051514, "loss": 3.9129, "rewards/accuracies": 0.25, "rewards/chosen": -46.30218505859375, "rewards/margins": -3.648569107055664, "rewards/rejected": -42.65361022949219, "step": 4609 }, { "epoch": 0.6277233115468409, "grad_norm": 42.32785460086409, "learning_rate": 2.9284968491194447e-07, "logits/chosen": 11.67546272277832, "logits/rejected": 12.848664283752441, "logps/chosen": -3.626063346862793, "logps/rejected": -4.252347946166992, "loss": 4.2911, "rewards/accuracies": 1.0, "rewards/chosen": -36.2606315612793, "rewards/margins": 6.262848854064941, "rewards/rejected": -42.52348327636719, "step": 4610 }, { "epoch": 0.627859477124183, "grad_norm": 40.683180912108746, "learning_rate": 2.9266650572181084e-07, "logits/chosen": 13.265443801879883, "logits/rejected": 12.25173568725586, "logps/chosen": -4.263212203979492, "logps/rejected": -4.36839485168457, "loss": 3.7301, "rewards/accuracies": 0.5, "rewards/chosen": -42.63212203979492, "rewards/margins": 1.0518293380737305, "rewards/rejected": -43.68395233154297, "step": 4611 }, { "epoch": 0.6279956427015251, "grad_norm": 40.119767794420696, "learning_rate": 2.9248335078456746e-07, "logits/chosen": 12.740851402282715, "logits/rejected": 11.667924880981445, "logps/chosen": -3.988961696624756, "logps/rejected": -4.061520576477051, "loss": 4.0901, "rewards/accuracies": 1.0, "rewards/chosen": -39.889617919921875, "rewards/margins": 0.7255878448486328, "rewards/rejected": -40.61520767211914, "step": 4612 }, { "epoch": 0.628131808278867, "grad_norm": 42.560299384137934, "learning_rate": 2.9230022014159976e-07, "logits/chosen": 12.00117301940918, "logits/rejected": 12.424922943115234, "logps/chosen": -3.861316204071045, "logps/rejected": -4.097886085510254, "loss": 3.7825, "rewards/accuracies": 0.75, "rewards/chosen": -38.613162994384766, "rewards/margins": 2.365694046020508, "rewards/rejected": -40.978858947753906, "step": 4613 }, { "epoch": 0.6282679738562091, "grad_norm": 39.694476070164136, "learning_rate": 2.921171138342875e-07, "logits/chosen": 12.974782943725586, "logits/rejected": 13.318021774291992, "logps/chosen": -4.297294616699219, "logps/rejected": -4.029307842254639, "loss": 3.5377, "rewards/accuracies": 0.25, "rewards/chosen": -42.97294616699219, "rewards/margins": -2.679865837097168, "rewards/rejected": -40.2930793762207, "step": 4614 }, { "epoch": 0.6284041394335512, "grad_norm": 41.31216716594083, "learning_rate": 2.9193403190400524e-07, "logits/chosen": 13.363273620605469, "logits/rejected": 13.822803497314453, "logps/chosen": -4.406280517578125, "logps/rejected": -4.6559648513793945, "loss": 4.1294, "rewards/accuracies": 0.75, "rewards/chosen": -44.062801361083984, "rewards/margins": 2.4968433380126953, "rewards/rejected": -46.55964660644531, "step": 4615 }, { "epoch": 0.6285403050108932, "grad_norm": 37.363669695160276, "learning_rate": 2.9175097439212166e-07, "logits/chosen": 13.34450912475586, "logits/rejected": 13.4810791015625, "logps/chosen": -4.320072650909424, "logps/rejected": -4.529815673828125, "loss": 3.7374, "rewards/accuracies": 0.75, "rewards/chosen": -43.20072937011719, "rewards/margins": 2.0974302291870117, "rewards/rejected": -45.29815673828125, "step": 4616 }, { "epoch": 0.6286764705882353, "grad_norm": 42.536942310170666, "learning_rate": 2.915679413400003e-07, "logits/chosen": 12.087125778198242, "logits/rejected": 12.661297798156738, "logps/chosen": -4.135611534118652, "logps/rejected": -4.150684833526611, "loss": 4.0631, "rewards/accuracies": 0.5, "rewards/chosen": -41.356117248535156, "rewards/margins": 0.15073585510253906, "rewards/rejected": -41.50685119628906, "step": 4617 }, { "epoch": 0.6288126361655774, "grad_norm": 42.848421611140346, "learning_rate": 2.9138493278899886e-07, "logits/chosen": 13.542940139770508, "logits/rejected": 13.421335220336914, "logps/chosen": -4.223449230194092, "logps/rejected": -4.621118068695068, "loss": 3.6177, "rewards/accuracies": 1.0, "rewards/chosen": -42.23448944091797, "rewards/margins": 3.976687431335449, "rewards/rejected": -46.211181640625, "step": 4618 }, { "epoch": 0.6289488017429193, "grad_norm": 36.013351326679135, "learning_rate": 2.9120194878046964e-07, "logits/chosen": 12.348172187805176, "logits/rejected": 12.897384643554688, "logps/chosen": -3.8388466835021973, "logps/rejected": -4.2681779861450195, "loss": 3.5662, "rewards/accuracies": 0.5, "rewards/chosen": -38.388465881347656, "rewards/margins": 4.293316841125488, "rewards/rejected": -42.68178176879883, "step": 4619 }, { "epoch": 0.6290849673202614, "grad_norm": 41.03063028741322, "learning_rate": 2.9101898935575946e-07, "logits/chosen": 13.930426597595215, "logits/rejected": 13.358041763305664, "logps/chosen": -4.282376289367676, "logps/rejected": -4.151662826538086, "loss": 3.958, "rewards/accuracies": 0.5, "rewards/chosen": -42.823760986328125, "rewards/margins": -1.307133674621582, "rewards/rejected": -41.516624450683594, "step": 4620 }, { "epoch": 0.6292211328976035, "grad_norm": 43.63872022736027, "learning_rate": 2.9083605455620954e-07, "logits/chosen": 13.045326232910156, "logits/rejected": 12.307695388793945, "logps/chosen": -4.150367259979248, "logps/rejected": -4.18220329284668, "loss": 4.5331, "rewards/accuracies": 0.5, "rewards/chosen": -41.5036735534668, "rewards/margins": 0.3183584213256836, "rewards/rejected": -41.82202911376953, "step": 4621 }, { "epoch": 0.6293572984749455, "grad_norm": 42.80927650415764, "learning_rate": 2.906531444231553e-07, "logits/chosen": 13.4148588180542, "logits/rejected": 13.057003021240234, "logps/chosen": -4.533062934875488, "logps/rejected": -4.664704322814941, "loss": 4.1677, "rewards/accuracies": 0.5, "rewards/chosen": -45.33062744140625, "rewards/margins": 1.316415786743164, "rewards/rejected": -46.64704132080078, "step": 4622 }, { "epoch": 0.6294934640522876, "grad_norm": 39.88692700215323, "learning_rate": 2.90470258997927e-07, "logits/chosen": 11.956827163696289, "logits/rejected": 12.875078201293945, "logps/chosen": -3.7091362476348877, "logps/rejected": -3.9552011489868164, "loss": 4.3845, "rewards/accuracies": 0.75, "rewards/chosen": -37.09136199951172, "rewards/margins": 2.4606494903564453, "rewards/rejected": -39.5520133972168, "step": 4623 }, { "epoch": 0.6296296296296297, "grad_norm": 42.18202199216601, "learning_rate": 2.9028739832184925e-07, "logits/chosen": 13.317697525024414, "logits/rejected": 13.239450454711914, "logps/chosen": -4.22853422164917, "logps/rejected": -4.341470241546631, "loss": 4.2486, "rewards/accuracies": 0.75, "rewards/chosen": -42.285343170166016, "rewards/margins": 1.1293601989746094, "rewards/rejected": -43.414703369140625, "step": 4624 }, { "epoch": 0.6297657952069716, "grad_norm": 45.02950450411145, "learning_rate": 2.9010456243624056e-07, "logits/chosen": 13.41500186920166, "logits/rejected": 13.766057968139648, "logps/chosen": -3.8705859184265137, "logps/rejected": -4.388163089752197, "loss": 4.3102, "rewards/accuracies": 1.0, "rewards/chosen": -38.70586013793945, "rewards/margins": 5.175771713256836, "rewards/rejected": -43.881629943847656, "step": 4625 }, { "epoch": 0.6299019607843137, "grad_norm": 38.75020206459592, "learning_rate": 2.8992175138241435e-07, "logits/chosen": 12.810768127441406, "logits/rejected": 13.174576759338379, "logps/chosen": -4.016701698303223, "logps/rejected": -4.22223424911499, "loss": 3.7306, "rewards/accuracies": 0.75, "rewards/chosen": -40.167015075683594, "rewards/margins": 2.055325508117676, "rewards/rejected": -42.22234344482422, "step": 4626 }, { "epoch": 0.6300381263616558, "grad_norm": 48.0522101681407, "learning_rate": 2.897389652016786e-07, "logits/chosen": 13.348651885986328, "logits/rejected": 13.586368560791016, "logps/chosen": -4.516407489776611, "logps/rejected": -4.496671676635742, "loss": 3.8332, "rewards/accuracies": 0.25, "rewards/chosen": -45.1640739440918, "rewards/margins": -0.197357177734375, "rewards/rejected": -44.96671676635742, "step": 4627 }, { "epoch": 0.6301742919389978, "grad_norm": 45.73325209113746, "learning_rate": 2.895562039353348e-07, "logits/chosen": 13.414627075195312, "logits/rejected": 13.058995246887207, "logps/chosen": -4.3524580001831055, "logps/rejected": -3.8842270374298096, "loss": 4.1602, "rewards/accuracies": 0.25, "rewards/chosen": -43.52458190917969, "rewards/margins": -4.682313919067383, "rewards/rejected": -38.84226989746094, "step": 4628 }, { "epoch": 0.6303104575163399, "grad_norm": 44.09943059513487, "learning_rate": 2.8937346762467974e-07, "logits/chosen": 13.255168914794922, "logits/rejected": 13.532715797424316, "logps/chosen": -4.316469192504883, "logps/rejected": -4.491421699523926, "loss": 3.7174, "rewards/accuracies": 0.5, "rewards/chosen": -43.16469192504883, "rewards/margins": 1.7495250701904297, "rewards/rejected": -44.914215087890625, "step": 4629 }, { "epoch": 0.6304466230936819, "grad_norm": 39.38762659360331, "learning_rate": 2.8919075631100424e-07, "logits/chosen": 13.501065254211426, "logits/rejected": 13.994819641113281, "logps/chosen": -4.1797075271606445, "logps/rejected": -4.213271141052246, "loss": 3.8077, "rewards/accuracies": 0.5, "rewards/chosen": -41.79707336425781, "rewards/margins": 0.3356342315673828, "rewards/rejected": -42.13270568847656, "step": 4630 }, { "epoch": 0.630582788671024, "grad_norm": 39.08458432984308, "learning_rate": 2.890080700355932e-07, "logits/chosen": 13.04690170288086, "logits/rejected": 13.932092666625977, "logps/chosen": -4.541916847229004, "logps/rejected": -4.558737754821777, "loss": 3.9334, "rewards/accuracies": 0.75, "rewards/chosen": -45.41917037963867, "rewards/margins": 0.16820621490478516, "rewards/rejected": -45.58737564086914, "step": 4631 }, { "epoch": 0.630718954248366, "grad_norm": 50.99303559033821, "learning_rate": 2.8882540883972606e-07, "logits/chosen": 12.874368667602539, "logits/rejected": 13.57237720489502, "logps/chosen": -3.975482940673828, "logps/rejected": -4.470827102661133, "loss": 3.4937, "rewards/accuracies": 0.75, "rewards/chosen": -39.75482940673828, "rewards/margins": 4.953437805175781, "rewards/rejected": -44.70826721191406, "step": 4632 }, { "epoch": 0.6308551198257081, "grad_norm": 44.6059232051763, "learning_rate": 2.8864277276467706e-07, "logits/chosen": 14.034842491149902, "logits/rejected": 14.062088012695312, "logps/chosen": -4.644495487213135, "logps/rejected": -4.529216766357422, "loss": 4.0818, "rewards/accuracies": 0.75, "rewards/chosen": -46.44495391845703, "rewards/margins": -1.1527833938598633, "rewards/rejected": -45.292171478271484, "step": 4633 }, { "epoch": 0.6309912854030502, "grad_norm": 40.24874640075751, "learning_rate": 2.8846016185171384e-07, "logits/chosen": 13.278581619262695, "logits/rejected": 13.627399444580078, "logps/chosen": -4.472741603851318, "logps/rejected": -4.665042400360107, "loss": 3.5908, "rewards/accuracies": 0.75, "rewards/chosen": -44.7274169921875, "rewards/margins": 1.9230060577392578, "rewards/rejected": -46.650421142578125, "step": 4634 }, { "epoch": 0.6311274509803921, "grad_norm": 61.38724368022111, "learning_rate": 2.882775761420991e-07, "logits/chosen": 12.691873550415039, "logits/rejected": 13.584662437438965, "logps/chosen": -3.90997052192688, "logps/rejected": -4.189899444580078, "loss": 3.4338, "rewards/accuracies": 1.0, "rewards/chosen": -39.09970474243164, "rewards/margins": 2.799290657043457, "rewards/rejected": -41.89899444580078, "step": 4635 }, { "epoch": 0.6312636165577342, "grad_norm": 40.66990338547733, "learning_rate": 2.8809501567708967e-07, "logits/chosen": 12.810243606567383, "logits/rejected": 13.143600463867188, "logps/chosen": -4.349493026733398, "logps/rejected": -4.4566450119018555, "loss": 4.1265, "rewards/accuracies": 0.75, "rewards/chosen": -43.49492645263672, "rewards/margins": 1.0715179443359375, "rewards/rejected": -44.566444396972656, "step": 4636 }, { "epoch": 0.6313997821350763, "grad_norm": 41.46424745031541, "learning_rate": 2.8791248049793624e-07, "logits/chosen": 12.213715553283691, "logits/rejected": 13.639228820800781, "logps/chosen": -4.19637393951416, "logps/rejected": -4.522967338562012, "loss": 3.5983, "rewards/accuracies": 1.0, "rewards/chosen": -41.96373748779297, "rewards/margins": 3.265932083129883, "rewards/rejected": -45.229671478271484, "step": 4637 }, { "epoch": 0.6315359477124183, "grad_norm": 47.4603414919189, "learning_rate": 2.8772997064588443e-07, "logits/chosen": 13.503667831420898, "logits/rejected": 13.343284606933594, "logps/chosen": -4.222685813903809, "logps/rejected": -4.213901519775391, "loss": 4.3599, "rewards/accuracies": 0.5, "rewards/chosen": -42.22686004638672, "rewards/margins": -0.08784770965576172, "rewards/rejected": -42.139015197753906, "step": 4638 }, { "epoch": 0.6316721132897604, "grad_norm": 38.114127712146825, "learning_rate": 2.87547486162174e-07, "logits/chosen": 13.645036697387695, "logits/rejected": 14.10855484008789, "logps/chosen": -4.3562774658203125, "logps/rejected": -4.597541332244873, "loss": 3.6248, "rewards/accuracies": 0.75, "rewards/chosen": -43.56277084350586, "rewards/margins": 2.412639617919922, "rewards/rejected": -45.97541046142578, "step": 4639 }, { "epoch": 0.6318082788671024, "grad_norm": 40.822438036009785, "learning_rate": 2.8736502708803835e-07, "logits/chosen": 12.328392028808594, "logits/rejected": 13.286041259765625, "logps/chosen": -4.2789459228515625, "logps/rejected": -4.488971710205078, "loss": 3.7934, "rewards/accuracies": 0.75, "rewards/chosen": -42.789459228515625, "rewards/margins": 2.1002578735351562, "rewards/rejected": -44.88971710205078, "step": 4640 }, { "epoch": 0.6319444444444444, "grad_norm": 39.645903520557404, "learning_rate": 2.8718259346470593e-07, "logits/chosen": 12.987239837646484, "logits/rejected": 13.543500900268555, "logps/chosen": -4.367839813232422, "logps/rejected": -4.48923921585083, "loss": 3.447, "rewards/accuracies": 0.5, "rewards/chosen": -43.678401947021484, "rewards/margins": 1.2139902114868164, "rewards/rejected": -44.892391204833984, "step": 4641 }, { "epoch": 0.6320806100217865, "grad_norm": 43.89492315267001, "learning_rate": 2.870001853333992e-07, "logits/chosen": 13.46803092956543, "logits/rejected": 13.761427879333496, "logps/chosen": -4.346829414367676, "logps/rejected": -4.4063401222229, "loss": 4.0741, "rewards/accuracies": 0.75, "rewards/chosen": -43.468292236328125, "rewards/margins": 0.5951061248779297, "rewards/rejected": -44.06340026855469, "step": 4642 }, { "epoch": 0.6322167755991286, "grad_norm": 38.59396781652251, "learning_rate": 2.8681780273533454e-07, "logits/chosen": 13.379682540893555, "logits/rejected": 13.647698402404785, "logps/chosen": -4.1420392990112305, "logps/rejected": -4.492623805999756, "loss": 3.8294, "rewards/accuracies": 0.75, "rewards/chosen": -41.42039108276367, "rewards/margins": 3.505843162536621, "rewards/rejected": -44.926239013671875, "step": 4643 }, { "epoch": 0.6323529411764706, "grad_norm": 50.15451967378349, "learning_rate": 2.866354457117229e-07, "logits/chosen": 13.847126007080078, "logits/rejected": 13.524101257324219, "logps/chosen": -4.025486946105957, "logps/rejected": -4.134030818939209, "loss": 4.0711, "rewards/accuracies": 0.75, "rewards/chosen": -40.25486755371094, "rewards/margins": 1.0854425430297852, "rewards/rejected": -41.340309143066406, "step": 4644 }, { "epoch": 0.6324891067538126, "grad_norm": 39.170529435496746, "learning_rate": 2.8645311430376957e-07, "logits/chosen": 12.82529067993164, "logits/rejected": 13.636978149414062, "logps/chosen": -4.260564804077148, "logps/rejected": -4.583115577697754, "loss": 4.2273, "rewards/accuracies": 0.75, "rewards/chosen": -42.60565185546875, "rewards/margins": 3.225505828857422, "rewards/rejected": -45.831153869628906, "step": 4645 }, { "epoch": 0.6326252723311547, "grad_norm": 42.543279778110346, "learning_rate": 2.8627080855267344e-07, "logits/chosen": 13.08897876739502, "logits/rejected": 13.267303466796875, "logps/chosen": -4.4062910079956055, "logps/rejected": -4.234984397888184, "loss": 4.1067, "rewards/accuracies": 0.5, "rewards/chosen": -44.06290817260742, "rewards/margins": -1.7130632400512695, "rewards/rejected": -42.34984588623047, "step": 4646 }, { "epoch": 0.6327614379084967, "grad_norm": 38.948403850834325, "learning_rate": 2.8608852849962826e-07, "logits/chosen": 13.102476119995117, "logits/rejected": 13.138245582580566, "logps/chosen": -4.3017168045043945, "logps/rejected": -4.272950649261475, "loss": 3.7522, "rewards/accuracies": 0.25, "rewards/chosen": -43.01716232299805, "rewards/margins": -0.2876567840576172, "rewards/rejected": -42.72950744628906, "step": 4647 }, { "epoch": 0.6328976034858388, "grad_norm": 42.156635930906965, "learning_rate": 2.859062741858218e-07, "logits/chosen": 12.872594833374023, "logits/rejected": 13.013381958007812, "logps/chosen": -4.497645854949951, "logps/rejected": -4.42665958404541, "loss": 4.0371, "rewards/accuracies": 0.5, "rewards/chosen": -44.97645950317383, "rewards/margins": -0.709864616394043, "rewards/rejected": -44.26659393310547, "step": 4648 }, { "epoch": 0.6330337690631809, "grad_norm": 40.313869452161754, "learning_rate": 2.857240456524357e-07, "logits/chosen": 13.304298400878906, "logits/rejected": 12.964414596557617, "logps/chosen": -4.212189197540283, "logps/rejected": -4.275764465332031, "loss": 3.6845, "rewards/accuracies": 0.5, "rewards/chosen": -42.121891021728516, "rewards/margins": 0.6357555389404297, "rewards/rejected": -42.75764846801758, "step": 4649 }, { "epoch": 0.6331699346405228, "grad_norm": 40.1936428207797, "learning_rate": 2.85541842940646e-07, "logits/chosen": 13.771961212158203, "logits/rejected": 13.969847679138184, "logps/chosen": -4.3435516357421875, "logps/rejected": -4.616367340087891, "loss": 3.7288, "rewards/accuracies": 0.5, "rewards/chosen": -43.435516357421875, "rewards/margins": 2.728156089782715, "rewards/rejected": -46.163673400878906, "step": 4650 }, { "epoch": 0.6333061002178649, "grad_norm": 45.35063144948242, "learning_rate": 2.8535966609162325e-07, "logits/chosen": 13.012463569641113, "logits/rejected": 12.839971542358398, "logps/chosen": -4.52754020690918, "logps/rejected": -4.496879577636719, "loss": 3.909, "rewards/accuracies": 0.5, "rewards/chosen": -45.27539825439453, "rewards/margins": -0.30660152435302734, "rewards/rejected": -44.96879959106445, "step": 4651 }, { "epoch": 0.633442265795207, "grad_norm": 41.45221109082947, "learning_rate": 2.851775151465314e-07, "logits/chosen": 12.737869262695312, "logits/rejected": 13.649127006530762, "logps/chosen": -4.558117866516113, "logps/rejected": -4.4226393699646, "loss": 3.7427, "rewards/accuracies": 0.5, "rewards/chosen": -45.5811767578125, "rewards/margins": -1.3547821044921875, "rewards/rejected": -44.22639465332031, "step": 4652 }, { "epoch": 0.633578431372549, "grad_norm": 41.692342133235904, "learning_rate": 2.849953901465291e-07, "logits/chosen": 12.661920547485352, "logits/rejected": 14.169814109802246, "logps/chosen": -4.459616184234619, "logps/rejected": -5.236484050750732, "loss": 3.951, "rewards/accuracies": 0.75, "rewards/chosen": -44.596160888671875, "rewards/margins": 7.768679618835449, "rewards/rejected": -52.364837646484375, "step": 4653 }, { "epoch": 0.6337145969498911, "grad_norm": 40.36735404565343, "learning_rate": 2.848132911327692e-07, "logits/chosen": 12.94271469116211, "logits/rejected": 13.341136932373047, "logps/chosen": -4.279346942901611, "logps/rejected": -4.50931453704834, "loss": 3.6784, "rewards/accuracies": 0.75, "rewards/chosen": -42.79347229003906, "rewards/margins": 2.2996749877929688, "rewards/rejected": -45.09314727783203, "step": 4654 }, { "epoch": 0.6338507625272332, "grad_norm": 40.227808715563164, "learning_rate": 2.8463121814639816e-07, "logits/chosen": 12.95893669128418, "logits/rejected": 13.71358871459961, "logps/chosen": -4.1394782066345215, "logps/rejected": -4.491900444030762, "loss": 4.4697, "rewards/accuracies": 0.75, "rewards/chosen": -41.39478302001953, "rewards/margins": 3.5242204666137695, "rewards/rejected": -44.919002532958984, "step": 4655 }, { "epoch": 0.6339869281045751, "grad_norm": 41.65760077540811, "learning_rate": 2.84449171228557e-07, "logits/chosen": 13.192559242248535, "logits/rejected": 13.019485473632812, "logps/chosen": -4.415341377258301, "logps/rejected": -4.338165760040283, "loss": 3.7606, "rewards/accuracies": 0.25, "rewards/chosen": -44.15341567993164, "rewards/margins": -0.7717599868774414, "rewards/rejected": -43.38165283203125, "step": 4656 }, { "epoch": 0.6341230936819172, "grad_norm": 49.979540827863424, "learning_rate": 2.8426715042038084e-07, "logits/chosen": 12.527030944824219, "logits/rejected": 12.679520606994629, "logps/chosen": -4.089168071746826, "logps/rejected": -4.287298202514648, "loss": 4.1495, "rewards/accuracies": 0.75, "rewards/chosen": -40.89167785644531, "rewards/margins": 1.9813060760498047, "rewards/rejected": -42.87298583984375, "step": 4657 }, { "epoch": 0.6342592592592593, "grad_norm": 45.70812873896673, "learning_rate": 2.8408515576299875e-07, "logits/chosen": 13.269054412841797, "logits/rejected": 13.225397109985352, "logps/chosen": -3.951871156692505, "logps/rejected": -4.426582336425781, "loss": 3.6755, "rewards/accuracies": 1.0, "rewards/chosen": -39.51871109008789, "rewards/margins": 4.747114181518555, "rewards/rejected": -44.26582336425781, "step": 4658 }, { "epoch": 0.6343954248366013, "grad_norm": 44.07405659912438, "learning_rate": 2.8390318729753373e-07, "logits/chosen": 12.89373779296875, "logits/rejected": 13.674443244934082, "logps/chosen": -3.9490466117858887, "logps/rejected": -4.581280708312988, "loss": 3.9497, "rewards/accuracies": 1.0, "rewards/chosen": -39.4904670715332, "rewards/margins": 6.322342395782471, "rewards/rejected": -45.812808990478516, "step": 4659 }, { "epoch": 0.6345315904139434, "grad_norm": 41.4998370542298, "learning_rate": 2.837212450651034e-07, "logits/chosen": 13.148948669433594, "logits/rejected": 12.794282913208008, "logps/chosen": -4.402338981628418, "logps/rejected": -4.100533485412598, "loss": 4.0255, "rewards/accuracies": 0.25, "rewards/chosen": -44.02338790893555, "rewards/margins": -3.018054962158203, "rewards/rejected": -41.005332946777344, "step": 4660 }, { "epoch": 0.6346677559912854, "grad_norm": 41.92685568910259, "learning_rate": 2.835393291068188e-07, "logits/chosen": 14.455867767333984, "logits/rejected": 13.96581745147705, "logps/chosen": -4.354536056518555, "logps/rejected": -4.879886627197266, "loss": 4.5396, "rewards/accuracies": 0.5, "rewards/chosen": -43.54535675048828, "rewards/margins": 5.253510475158691, "rewards/rejected": -48.79887008666992, "step": 4661 }, { "epoch": 0.6348039215686274, "grad_norm": 43.2712153073023, "learning_rate": 2.833574394637854e-07, "logits/chosen": 13.104537963867188, "logits/rejected": 13.537313461303711, "logps/chosen": -3.8437225818634033, "logps/rejected": -4.276055335998535, "loss": 4.3946, "rewards/accuracies": 0.5, "rewards/chosen": -38.437225341796875, "rewards/margins": 4.323328971862793, "rewards/rejected": -42.76055145263672, "step": 4662 }, { "epoch": 0.6349400871459695, "grad_norm": 40.52035077569823, "learning_rate": 2.8317557617710285e-07, "logits/chosen": 12.81814193725586, "logits/rejected": 13.19282341003418, "logps/chosen": -3.8208136558532715, "logps/rejected": -4.3954854011535645, "loss": 3.9228, "rewards/accuracies": 1.0, "rewards/chosen": -38.20813751220703, "rewards/margins": 5.746716499328613, "rewards/rejected": -43.954856872558594, "step": 4663 }, { "epoch": 0.6350762527233116, "grad_norm": 48.788405946621914, "learning_rate": 2.829937392878645e-07, "logits/chosen": 12.890079498291016, "logits/rejected": 13.377586364746094, "logps/chosen": -4.232650279998779, "logps/rejected": -4.373390197753906, "loss": 3.9621, "rewards/accuracies": 0.5, "rewards/chosen": -42.32650375366211, "rewards/margins": 1.4073963165283203, "rewards/rejected": -43.73390197753906, "step": 4664 }, { "epoch": 0.6352124183006536, "grad_norm": 43.54454463248062, "learning_rate": 2.8281192883715795e-07, "logits/chosen": 12.592997550964355, "logits/rejected": 13.60614013671875, "logps/chosen": -4.323171615600586, "logps/rejected": -4.607845783233643, "loss": 4.1432, "rewards/accuracies": 0.5, "rewards/chosen": -43.23171615600586, "rewards/margins": 2.8467397689819336, "rewards/rejected": -46.07845687866211, "step": 4665 }, { "epoch": 0.6353485838779956, "grad_norm": 45.93251475879178, "learning_rate": 2.826301448660648e-07, "logits/chosen": 13.674980163574219, "logits/rejected": 13.598411560058594, "logps/chosen": -4.484118938446045, "logps/rejected": -4.408622741699219, "loss": 4.3564, "rewards/accuracies": 0.25, "rewards/chosen": -44.841190338134766, "rewards/margins": -0.7549629211425781, "rewards/rejected": -44.08622741699219, "step": 4666 }, { "epoch": 0.6354847494553377, "grad_norm": 40.81544489658825, "learning_rate": 2.824483874156605e-07, "logits/chosen": 11.573867797851562, "logits/rejected": 12.35387134552002, "logps/chosen": -3.953059673309326, "logps/rejected": -4.249695777893066, "loss": 3.9552, "rewards/accuracies": 0.75, "rewards/chosen": -39.53059387207031, "rewards/margins": 2.9663591384887695, "rewards/rejected": -42.49695587158203, "step": 4667 }, { "epoch": 0.6356209150326797, "grad_norm": 46.121710018876236, "learning_rate": 2.822666565270149e-07, "logits/chosen": 12.78728199005127, "logits/rejected": 13.780241012573242, "logps/chosen": -4.382721424102783, "logps/rejected": -4.605737686157227, "loss": 3.6399, "rewards/accuracies": 0.5, "rewards/chosen": -43.82721710205078, "rewards/margins": 2.2301559448242188, "rewards/rejected": -46.057373046875, "step": 4668 }, { "epoch": 0.6357570806100218, "grad_norm": 44.167759331342516, "learning_rate": 2.8208495224119137e-07, "logits/chosen": 12.530437469482422, "logits/rejected": 12.635627746582031, "logps/chosen": -4.290813446044922, "logps/rejected": -4.474750518798828, "loss": 3.8954, "rewards/accuracies": 0.75, "rewards/chosen": -42.90813446044922, "rewards/margins": 1.8393726348876953, "rewards/rejected": -44.74750518798828, "step": 4669 }, { "epoch": 0.6358932461873639, "grad_norm": 50.77260882822025, "learning_rate": 2.8190327459924746e-07, "logits/chosen": 12.262328147888184, "logits/rejected": 12.685918807983398, "logps/chosen": -4.125449180603027, "logps/rejected": -4.208632469177246, "loss": 3.3525, "rewards/accuracies": 0.75, "rewards/chosen": -41.254493713378906, "rewards/margins": 0.8318319320678711, "rewards/rejected": -42.08632278442383, "step": 4670 }, { "epoch": 0.6360294117647058, "grad_norm": 42.76442331125781, "learning_rate": 2.817216236422349e-07, "logits/chosen": 12.677070617675781, "logits/rejected": 12.351460456848145, "logps/chosen": -4.363602161407471, "logps/rejected": -4.396340370178223, "loss": 3.2814, "rewards/accuracies": 0.75, "rewards/chosen": -43.636024475097656, "rewards/margins": 0.3273811340332031, "rewards/rejected": -43.963401794433594, "step": 4671 }, { "epoch": 0.6361655773420479, "grad_norm": 50.018754748264485, "learning_rate": 2.815399994111994e-07, "logits/chosen": 13.533931732177734, "logits/rejected": 12.506016731262207, "logps/chosen": -4.2205376625061035, "logps/rejected": -4.225855350494385, "loss": 3.8175, "rewards/accuracies": 0.25, "rewards/chosen": -42.20537567138672, "rewards/margins": 0.05317401885986328, "rewards/rejected": -42.25855255126953, "step": 4672 }, { "epoch": 0.63630174291939, "grad_norm": 40.69649995733967, "learning_rate": 2.813584019471801e-07, "logits/chosen": 13.369564056396484, "logits/rejected": 12.165908813476562, "logps/chosen": -4.388054847717285, "logps/rejected": -4.046882629394531, "loss": 4.236, "rewards/accuracies": 0.0, "rewards/chosen": -43.88054656982422, "rewards/margins": -3.4117212295532227, "rewards/rejected": -40.46882629394531, "step": 4673 }, { "epoch": 0.636437908496732, "grad_norm": 39.9819881462436, "learning_rate": 2.8117683129121043e-07, "logits/chosen": 14.025032043457031, "logits/rejected": 13.931220054626465, "logps/chosen": -4.558286190032959, "logps/rejected": -4.637369632720947, "loss": 4.1132, "rewards/accuracies": 0.75, "rewards/chosen": -45.582862854003906, "rewards/margins": 0.7908344268798828, "rewards/rejected": -46.373695373535156, "step": 4674 }, { "epoch": 0.6365740740740741, "grad_norm": 44.16864921536962, "learning_rate": 2.809952874843182e-07, "logits/chosen": 14.179695129394531, "logits/rejected": 14.40831184387207, "logps/chosen": -4.3602399826049805, "logps/rejected": -4.666605472564697, "loss": 4.3126, "rewards/accuracies": 1.0, "rewards/chosen": -43.602394104003906, "rewards/margins": 3.06365966796875, "rewards/rejected": -46.666053771972656, "step": 4675 }, { "epoch": 0.6367102396514162, "grad_norm": 41.94363657047811, "learning_rate": 2.808137705675243e-07, "logits/chosen": 14.177458763122559, "logits/rejected": 14.169559478759766, "logps/chosen": -4.269052028656006, "logps/rejected": -4.452582359313965, "loss": 3.7465, "rewards/accuracies": 0.75, "rewards/chosen": -42.690521240234375, "rewards/margins": 1.8353052139282227, "rewards/rejected": -44.52582550048828, "step": 4676 }, { "epoch": 0.6368464052287581, "grad_norm": 45.73506974962746, "learning_rate": 2.806322805818441e-07, "logits/chosen": 13.231419563293457, "logits/rejected": 13.953929901123047, "logps/chosen": -4.131048202514648, "logps/rejected": -4.512669086456299, "loss": 3.5838, "rewards/accuracies": 1.0, "rewards/chosen": -41.31047821044922, "rewards/margins": 3.816211700439453, "rewards/rejected": -45.12668991088867, "step": 4677 }, { "epoch": 0.6369825708061002, "grad_norm": 42.55088005443943, "learning_rate": 2.8045081756828695e-07, "logits/chosen": 12.549922943115234, "logits/rejected": 13.674659729003906, "logps/chosen": -4.335395812988281, "logps/rejected": -4.552125930786133, "loss": 4.2713, "rewards/accuracies": 0.5, "rewards/chosen": -43.35395812988281, "rewards/margins": 2.167304039001465, "rewards/rejected": -45.521263122558594, "step": 4678 }, { "epoch": 0.6371187363834423, "grad_norm": 44.921698830581995, "learning_rate": 2.802693815678557e-07, "logits/chosen": 12.936737060546875, "logits/rejected": 14.420181274414062, "logps/chosen": -4.0437164306640625, "logps/rejected": -4.415131092071533, "loss": 4.2092, "rewards/accuracies": 0.75, "rewards/chosen": -40.43716812133789, "rewards/margins": 3.714143753051758, "rewards/rejected": -44.15131378173828, "step": 4679 }, { "epoch": 0.6372549019607843, "grad_norm": 47.01401726595827, "learning_rate": 2.800879726215473e-07, "logits/chosen": 13.800830841064453, "logits/rejected": 14.447673797607422, "logps/chosen": -4.326495170593262, "logps/rejected": -4.517127990722656, "loss": 3.7533, "rewards/accuracies": 0.5, "rewards/chosen": -43.26495361328125, "rewards/margins": 1.906327247619629, "rewards/rejected": -45.17127990722656, "step": 4680 }, { "epoch": 0.6373910675381264, "grad_norm": 46.299934769159016, "learning_rate": 2.799065907703529e-07, "logits/chosen": 12.896034240722656, "logits/rejected": 13.616415023803711, "logps/chosen": -4.062148094177246, "logps/rejected": -4.327269077301025, "loss": 3.6677, "rewards/accuracies": 0.75, "rewards/chosen": -40.621482849121094, "rewards/margins": 2.651211738586426, "rewards/rejected": -43.27268981933594, "step": 4681 }, { "epoch": 0.6375272331154684, "grad_norm": 42.21622311630215, "learning_rate": 2.7972523605525684e-07, "logits/chosen": 13.040111541748047, "logits/rejected": 13.656250953674316, "logps/chosen": -4.2909698486328125, "logps/rejected": -4.166712760925293, "loss": 4.1051, "rewards/accuracies": 0.5, "rewards/chosen": -42.909698486328125, "rewards/margins": -1.242568016052246, "rewards/rejected": -41.66712951660156, "step": 4682 }, { "epoch": 0.6376633986928104, "grad_norm": 45.82210373545754, "learning_rate": 2.7954390851723793e-07, "logits/chosen": 13.494738578796387, "logits/rejected": 14.242769241333008, "logps/chosen": -4.343788146972656, "logps/rejected": -4.591200828552246, "loss": 4.2406, "rewards/accuracies": 0.75, "rewards/chosen": -43.4378776550293, "rewards/margins": 2.4741296768188477, "rewards/rejected": -45.912010192871094, "step": 4683 }, { "epoch": 0.6377995642701525, "grad_norm": 39.61359625901673, "learning_rate": 2.793626081972687e-07, "logits/chosen": 13.770206451416016, "logits/rejected": 13.974543571472168, "logps/chosen": -4.304096698760986, "logps/rejected": -4.1625895500183105, "loss": 3.9326, "rewards/accuracies": 0.5, "rewards/chosen": -43.04096603393555, "rewards/margins": -1.4150714874267578, "rewards/rejected": -41.625892639160156, "step": 4684 }, { "epoch": 0.6379357298474946, "grad_norm": 40.996728165997325, "learning_rate": 2.791813351363152e-07, "logits/chosen": 14.074588775634766, "logits/rejected": 14.496889114379883, "logps/chosen": -4.819014549255371, "logps/rejected": -4.801166534423828, "loss": 4.1645, "rewards/accuracies": 0.25, "rewards/chosen": -48.19014358520508, "rewards/margins": -0.1784820556640625, "rewards/rejected": -48.01166534423828, "step": 4685 }, { "epoch": 0.6380718954248366, "grad_norm": 43.38313705048143, "learning_rate": 2.790000893753377e-07, "logits/chosen": 12.990724563598633, "logits/rejected": 12.637754440307617, "logps/chosen": -4.488508224487305, "logps/rejected": -4.392801761627197, "loss": 4.1323, "rewards/accuracies": 0.25, "rewards/chosen": -44.88508605957031, "rewards/margins": -0.957066535949707, "rewards/rejected": -43.928016662597656, "step": 4686 }, { "epoch": 0.6382080610021786, "grad_norm": 40.933441227327485, "learning_rate": 2.788188709552904e-07, "logits/chosen": 14.092622756958008, "logits/rejected": 13.986333847045898, "logps/chosen": -4.472311019897461, "logps/rejected": -5.005949020385742, "loss": 3.6393, "rewards/accuracies": 1.0, "rewards/chosen": -44.723114013671875, "rewards/margins": 5.336377143859863, "rewards/rejected": -50.059486389160156, "step": 4687 }, { "epoch": 0.6383442265795207, "grad_norm": 40.60644971644538, "learning_rate": 2.7863767991712075e-07, "logits/chosen": 13.195683479309082, "logits/rejected": 13.464807510375977, "logps/chosen": -4.129647254943848, "logps/rejected": -4.466382026672363, "loss": 3.717, "rewards/accuracies": 1.0, "rewards/chosen": -41.29647445678711, "rewards/margins": 3.3673477172851562, "rewards/rejected": -44.663818359375, "step": 4688 }, { "epoch": 0.6384803921568627, "grad_norm": 46.72339850991418, "learning_rate": 2.7845651630177045e-07, "logits/chosen": 13.10793685913086, "logits/rejected": 13.203100204467773, "logps/chosen": -3.956033229827881, "logps/rejected": -4.322492599487305, "loss": 4.5165, "rewards/accuracies": 0.75, "rewards/chosen": -39.560333251953125, "rewards/margins": 3.6645936965942383, "rewards/rejected": -43.22492599487305, "step": 4689 }, { "epoch": 0.6386165577342048, "grad_norm": 45.6622566613954, "learning_rate": 2.7827538015017523e-07, "logits/chosen": 13.537822723388672, "logits/rejected": 13.75039291381836, "logps/chosen": -4.037372589111328, "logps/rejected": -4.294404983520508, "loss": 3.8274, "rewards/accuracies": 0.5, "rewards/chosen": -40.37372589111328, "rewards/margins": 2.5703258514404297, "rewards/rejected": -42.944053649902344, "step": 4690 }, { "epoch": 0.6387527233115469, "grad_norm": 39.144597567842254, "learning_rate": 2.780942715032639e-07, "logits/chosen": 13.359068870544434, "logits/rejected": 14.046095848083496, "logps/chosen": -4.069341659545898, "logps/rejected": -4.1505961418151855, "loss": 3.8229, "rewards/accuracies": 0.5, "rewards/chosen": -40.69342041015625, "rewards/margins": 0.8125419616699219, "rewards/rejected": -41.505958557128906, "step": 4691 }, { "epoch": 0.6388888888888888, "grad_norm": 43.487479833162894, "learning_rate": 2.779131904019595e-07, "logits/chosen": 13.24417781829834, "logits/rejected": 13.275091171264648, "logps/chosen": -4.3844709396362305, "logps/rejected": -4.65487813949585, "loss": 3.8978, "rewards/accuracies": 0.75, "rewards/chosen": -43.84471130371094, "rewards/margins": 2.7040719985961914, "rewards/rejected": -46.54878234863281, "step": 4692 }, { "epoch": 0.6390250544662309, "grad_norm": 42.66398899221486, "learning_rate": 2.7773213688717914e-07, "logits/chosen": 13.390571594238281, "logits/rejected": 12.805953025817871, "logps/chosen": -4.1719584465026855, "logps/rejected": -4.404808044433594, "loss": 3.7956, "rewards/accuracies": 0.75, "rewards/chosen": -41.71958923339844, "rewards/margins": 2.3284902572631836, "rewards/rejected": -44.04808044433594, "step": 4693 }, { "epoch": 0.639161220043573, "grad_norm": 49.07161443978716, "learning_rate": 2.775511109998329e-07, "logits/chosen": 13.089483261108398, "logits/rejected": 13.120582580566406, "logps/chosen": -4.104957580566406, "logps/rejected": -4.205874443054199, "loss": 3.7333, "rewards/accuracies": 0.75, "rewards/chosen": -41.04957580566406, "rewards/margins": 1.0091676712036133, "rewards/rejected": -42.058746337890625, "step": 4694 }, { "epoch": 0.639297385620915, "grad_norm": 47.21154459825051, "learning_rate": 2.773701127808254e-07, "logits/chosen": 13.297563552856445, "logits/rejected": 14.815496444702148, "logps/chosen": -4.464484214782715, "logps/rejected": -4.934582710266113, "loss": 3.8999, "rewards/accuracies": 0.75, "rewards/chosen": -44.64484405517578, "rewards/margins": 4.700979232788086, "rewards/rejected": -49.3458251953125, "step": 4695 }, { "epoch": 0.6394335511982571, "grad_norm": 64.76620545053967, "learning_rate": 2.771891422710547e-07, "logits/chosen": 14.056982040405273, "logits/rejected": 14.129666328430176, "logps/chosen": -4.757636547088623, "logps/rejected": -4.463468551635742, "loss": 4.9305, "rewards/accuracies": 0.25, "rewards/chosen": -47.57636642456055, "rewards/margins": -2.941678047180176, "rewards/rejected": -44.63468933105469, "step": 4696 }, { "epoch": 0.6395697167755992, "grad_norm": 46.752326030648106, "learning_rate": 2.770081995114123e-07, "logits/chosen": 13.027548789978027, "logits/rejected": 13.449493408203125, "logps/chosen": -4.329408645629883, "logps/rejected": -4.518057823181152, "loss": 3.7439, "rewards/accuracies": 0.5, "rewards/chosen": -43.294090270996094, "rewards/margins": 1.886490821838379, "rewards/rejected": -45.180580139160156, "step": 4697 }, { "epoch": 0.6397058823529411, "grad_norm": 41.53893803196288, "learning_rate": 2.768272845427839e-07, "logits/chosen": 13.368144989013672, "logits/rejected": 14.744102478027344, "logps/chosen": -4.360235214233398, "logps/rejected": -4.719097137451172, "loss": 3.943, "rewards/accuracies": 1.0, "rewards/chosen": -43.60234832763672, "rewards/margins": 3.5886240005493164, "rewards/rejected": -47.19097137451172, "step": 4698 }, { "epoch": 0.6398420479302832, "grad_norm": 48.4745887599619, "learning_rate": 2.766463974060489e-07, "logits/chosen": 13.938652038574219, "logits/rejected": 13.797897338867188, "logps/chosen": -4.6146697998046875, "logps/rejected": -4.797577381134033, "loss": 3.9694, "rewards/accuracies": 0.75, "rewards/chosen": -46.14670181274414, "rewards/margins": 1.8290729522705078, "rewards/rejected": -47.975772857666016, "step": 4699 }, { "epoch": 0.6399782135076253, "grad_norm": 40.34535242559996, "learning_rate": 2.764655381420798e-07, "logits/chosen": 13.147404670715332, "logits/rejected": 12.963396072387695, "logps/chosen": -4.1838250160217285, "logps/rejected": -4.445048809051514, "loss": 3.9448, "rewards/accuracies": 0.75, "rewards/chosen": -41.83824920654297, "rewards/margins": 2.612236976623535, "rewards/rejected": -44.45048904418945, "step": 4700 }, { "epoch": 0.6401143790849673, "grad_norm": 40.47478657918756, "learning_rate": 2.7628470679174357e-07, "logits/chosen": 12.118423461914062, "logits/rejected": 13.122194290161133, "logps/chosen": -4.228668212890625, "logps/rejected": -4.3759050369262695, "loss": 3.9615, "rewards/accuracies": 0.75, "rewards/chosen": -42.28668212890625, "rewards/margins": 1.472365379333496, "rewards/rejected": -43.75904846191406, "step": 4701 }, { "epoch": 0.6402505446623094, "grad_norm": 40.67603901651111, "learning_rate": 2.761039033959006e-07, "logits/chosen": 12.382936477661133, "logits/rejected": 13.124387741088867, "logps/chosen": -4.158754348754883, "logps/rejected": -4.530854225158691, "loss": 3.7618, "rewards/accuracies": 1.0, "rewards/chosen": -41.58754348754883, "rewards/margins": 3.720996856689453, "rewards/rejected": -45.30854034423828, "step": 4702 }, { "epoch": 0.6403867102396514, "grad_norm": 42.63773033994935, "learning_rate": 2.759231279954047e-07, "logits/chosen": 13.536701202392578, "logits/rejected": 12.952827453613281, "logps/chosen": -4.407306671142578, "logps/rejected": -4.175138473510742, "loss": 4.4274, "rewards/accuracies": 0.25, "rewards/chosen": -44.07306671142578, "rewards/margins": -2.3216819763183594, "rewards/rejected": -41.75138854980469, "step": 4703 }, { "epoch": 0.6405228758169934, "grad_norm": 43.531364089498744, "learning_rate": 2.757423806311036e-07, "logits/chosen": 13.478255271911621, "logits/rejected": 13.521478652954102, "logps/chosen": -4.634931564331055, "logps/rejected": -4.606825828552246, "loss": 3.7295, "rewards/accuracies": 0.5, "rewards/chosen": -46.34931182861328, "rewards/margins": -0.28105735778808594, "rewards/rejected": -46.06825256347656, "step": 4704 }, { "epoch": 0.6406590413943355, "grad_norm": 39.17392352202418, "learning_rate": 2.7556166134383895e-07, "logits/chosen": 12.890493392944336, "logits/rejected": 14.355928421020508, "logps/chosen": -4.231895923614502, "logps/rejected": -4.700024604797363, "loss": 3.779, "rewards/accuracies": 1.0, "rewards/chosen": -42.31895446777344, "rewards/margins": 4.681291580200195, "rewards/rejected": -47.00025177001953, "step": 4705 }, { "epoch": 0.6407952069716776, "grad_norm": 40.33278092319453, "learning_rate": 2.753809701744453e-07, "logits/chosen": 12.548696517944336, "logits/rejected": 13.021499633789062, "logps/chosen": -4.046252250671387, "logps/rejected": -4.342411041259766, "loss": 3.2994, "rewards/accuracies": 1.0, "rewards/chosen": -40.462520599365234, "rewards/margins": 2.9615936279296875, "rewards/rejected": -43.424110412597656, "step": 4706 }, { "epoch": 0.6409313725490197, "grad_norm": 41.19221352976534, "learning_rate": 2.752003071637516e-07, "logits/chosen": 12.898857116699219, "logits/rejected": 12.615089416503906, "logps/chosen": -4.114603519439697, "logps/rejected": -3.82615327835083, "loss": 4.36, "rewards/accuracies": 0.25, "rewards/chosen": -41.146034240722656, "rewards/margins": -2.8844995498657227, "rewards/rejected": -38.261531829833984, "step": 4707 }, { "epoch": 0.6410675381263616, "grad_norm": 42.01277539696705, "learning_rate": 2.750196723525802e-07, "logits/chosen": 12.467676162719727, "logits/rejected": 13.402769088745117, "logps/chosen": -4.3906965255737305, "logps/rejected": -4.663601875305176, "loss": 3.9094, "rewards/accuracies": 1.0, "rewards/chosen": -43.90696716308594, "rewards/margins": 2.729053497314453, "rewards/rejected": -46.636016845703125, "step": 4708 }, { "epoch": 0.6412037037037037, "grad_norm": 41.7414355772806, "learning_rate": 2.7483906578174686e-07, "logits/chosen": 12.603584289550781, "logits/rejected": 13.528142929077148, "logps/chosen": -4.302859306335449, "logps/rejected": -4.611328125, "loss": 3.8656, "rewards/accuracies": 0.75, "rewards/chosen": -43.028594970703125, "rewards/margins": 3.084688186645508, "rewards/rejected": -46.11328125, "step": 4709 }, { "epoch": 0.6413398692810458, "grad_norm": 40.86399631338512, "learning_rate": 2.7465848749206115e-07, "logits/chosen": 12.818900108337402, "logits/rejected": 13.143194198608398, "logps/chosen": -3.9533450603485107, "logps/rejected": -4.290837287902832, "loss": 3.5522, "rewards/accuracies": 0.75, "rewards/chosen": -39.533447265625, "rewards/margins": 3.3749237060546875, "rewards/rejected": -42.90837097167969, "step": 4710 }, { "epoch": 0.6414760348583878, "grad_norm": 41.829941445801396, "learning_rate": 2.7447793752432635e-07, "logits/chosen": 12.668876647949219, "logits/rejected": 13.193102836608887, "logps/chosen": -4.206582546234131, "logps/rejected": -4.358611583709717, "loss": 3.9843, "rewards/accuracies": 0.75, "rewards/chosen": -42.06582260131836, "rewards/margins": 1.5202903747558594, "rewards/rejected": -43.58611297607422, "step": 4711 }, { "epoch": 0.6416122004357299, "grad_norm": 37.41017648298227, "learning_rate": 2.742974159193392e-07, "logits/chosen": 14.188496589660645, "logits/rejected": 13.84554672241211, "logps/chosen": -4.57977294921875, "logps/rejected": -4.585589408874512, "loss": 4.1203, "rewards/accuracies": 0.75, "rewards/chosen": -45.79772186279297, "rewards/margins": 0.058165550231933594, "rewards/rejected": -45.85588836669922, "step": 4712 }, { "epoch": 0.641748366013072, "grad_norm": 44.40380347539337, "learning_rate": 2.741169227178898e-07, "logits/chosen": 14.500856399536133, "logits/rejected": 14.355569839477539, "logps/chosen": -4.684639930725098, "logps/rejected": -4.391756534576416, "loss": 4.2895, "rewards/accuracies": 0.0, "rewards/chosen": -46.846397399902344, "rewards/margins": -2.9288320541381836, "rewards/rejected": -43.917564392089844, "step": 4713 }, { "epoch": 0.6418845315904139, "grad_norm": 41.050968407545156, "learning_rate": 2.739364579607624e-07, "logits/chosen": 12.627756118774414, "logits/rejected": 12.226875305175781, "logps/chosen": -4.46773624420166, "logps/rejected": -4.350167274475098, "loss": 4.2431, "rewards/accuracies": 0.5, "rewards/chosen": -44.67736053466797, "rewards/margins": -1.1756877899169922, "rewards/rejected": -43.50167465209961, "step": 4714 }, { "epoch": 0.642020697167756, "grad_norm": 52.85452737533247, "learning_rate": 2.7375602168873435e-07, "logits/chosen": 11.703792572021484, "logits/rejected": 12.937559127807617, "logps/chosen": -4.009719371795654, "logps/rejected": -4.23861026763916, "loss": 3.6769, "rewards/accuracies": 0.5, "rewards/chosen": -40.09719467163086, "rewards/margins": 2.288909912109375, "rewards/rejected": -42.386104583740234, "step": 4715 }, { "epoch": 0.6421568627450981, "grad_norm": 45.22699646568002, "learning_rate": 2.735756139425768e-07, "logits/chosen": 12.708314895629883, "logits/rejected": 12.832359313964844, "logps/chosen": -3.807894706726074, "logps/rejected": -4.260650634765625, "loss": 3.574, "rewards/accuracies": 1.0, "rewards/chosen": -38.078948974609375, "rewards/margins": 4.527555465698242, "rewards/rejected": -42.60650634765625, "step": 4716 }, { "epoch": 0.6422930283224401, "grad_norm": 40.74024640855229, "learning_rate": 2.7339523476305426e-07, "logits/chosen": 13.399520874023438, "logits/rejected": 13.807276725769043, "logps/chosen": -4.233159065246582, "logps/rejected": -4.279148578643799, "loss": 3.6893, "rewards/accuracies": 0.5, "rewards/chosen": -42.33158874511719, "rewards/margins": 0.45989418029785156, "rewards/rejected": -42.79148483276367, "step": 4717 }, { "epoch": 0.6424291938997821, "grad_norm": 44.437394314708584, "learning_rate": 2.732148841909249e-07, "logits/chosen": 12.364266395568848, "logits/rejected": 12.807859420776367, "logps/chosen": -3.934809923171997, "logps/rejected": -4.330726146697998, "loss": 3.9936, "rewards/accuracies": 1.0, "rewards/chosen": -39.34809875488281, "rewards/margins": 3.959160804748535, "rewards/rejected": -43.3072624206543, "step": 4718 }, { "epoch": 0.6425653594771242, "grad_norm": 38.74683740406977, "learning_rate": 2.7303456226694056e-07, "logits/chosen": 13.479301452636719, "logits/rejected": 13.391149520874023, "logps/chosen": -4.5149335861206055, "logps/rejected": -4.467728137969971, "loss": 3.7258, "rewards/accuracies": 0.5, "rewards/chosen": -45.14933395385742, "rewards/margins": -0.47205162048339844, "rewards/rejected": -44.67728042602539, "step": 4719 }, { "epoch": 0.6427015250544662, "grad_norm": 41.76151155741219, "learning_rate": 2.7285426903184636e-07, "logits/chosen": 12.455954551696777, "logits/rejected": 13.434505462646484, "logps/chosen": -4.018771648406982, "logps/rejected": -4.31789493560791, "loss": 4.2101, "rewards/accuracies": 0.75, "rewards/chosen": -40.18771743774414, "rewards/margins": 2.9912309646606445, "rewards/rejected": -43.17894744873047, "step": 4720 }, { "epoch": 0.6428376906318083, "grad_norm": 46.63469819288124, "learning_rate": 2.726740045263811e-07, "logits/chosen": 12.4801607131958, "logits/rejected": 13.230987548828125, "logps/chosen": -4.515798568725586, "logps/rejected": -4.810244083404541, "loss": 4.0423, "rewards/accuracies": 1.0, "rewards/chosen": -45.157989501953125, "rewards/margins": 2.944451332092285, "rewards/rejected": -48.102439880371094, "step": 4721 }, { "epoch": 0.6429738562091504, "grad_norm": 38.95367662583487, "learning_rate": 2.724937687912769e-07, "logits/chosen": 12.788124084472656, "logits/rejected": 13.296529769897461, "logps/chosen": -4.208703994750977, "logps/rejected": -4.134410858154297, "loss": 3.5773, "rewards/accuracies": 0.5, "rewards/chosen": -42.0870361328125, "rewards/margins": -0.7429304122924805, "rewards/rejected": -41.34410858154297, "step": 4722 }, { "epoch": 0.6431100217864923, "grad_norm": 62.2695730023636, "learning_rate": 2.7231356186725976e-07, "logits/chosen": 12.530231475830078, "logits/rejected": 12.971342086791992, "logps/chosen": -3.9036412239074707, "logps/rejected": -4.175050258636475, "loss": 3.7191, "rewards/accuracies": 0.75, "rewards/chosen": -39.03641128540039, "rewards/margins": 2.714090347290039, "rewards/rejected": -41.75050354003906, "step": 4723 }, { "epoch": 0.6432461873638344, "grad_norm": 42.19095318359076, "learning_rate": 2.721333837950486e-07, "logits/chosen": 12.050679206848145, "logits/rejected": 13.51168441772461, "logps/chosen": -3.9184999465942383, "logps/rejected": -4.298943519592285, "loss": 4.0064, "rewards/accuracies": 1.0, "rewards/chosen": -39.18499755859375, "rewards/margins": 3.8044376373291016, "rewards/rejected": -42.98944091796875, "step": 4724 }, { "epoch": 0.6433823529411765, "grad_norm": 44.34109072870148, "learning_rate": 2.7195323461535644e-07, "logits/chosen": 13.508871078491211, "logits/rejected": 13.98432731628418, "logps/chosen": -4.298000335693359, "logps/rejected": -4.2586350440979, "loss": 4.1559, "rewards/accuracies": 0.25, "rewards/chosen": -42.980003356933594, "rewards/margins": -0.3936491012573242, "rewards/rejected": -42.58635330200195, "step": 4725 }, { "epoch": 0.6435185185185185, "grad_norm": 49.180677892023816, "learning_rate": 2.717731143688895e-07, "logits/chosen": 11.50925064086914, "logits/rejected": 13.165992736816406, "logps/chosen": -3.5533957481384277, "logps/rejected": -4.356315612792969, "loss": 3.8893, "rewards/accuracies": 1.0, "rewards/chosen": -35.533958435058594, "rewards/margins": 8.029196739196777, "rewards/rejected": -43.56315612792969, "step": 4726 }, { "epoch": 0.6436546840958606, "grad_norm": 38.89912744576688, "learning_rate": 2.7159302309634705e-07, "logits/chosen": 13.305707931518555, "logits/rejected": 13.935077667236328, "logps/chosen": -4.639974117279053, "logps/rejected": -4.8499755859375, "loss": 4.1573, "rewards/accuracies": 0.5, "rewards/chosen": -46.399742126464844, "rewards/margins": 2.1000137329101562, "rewards/rejected": -48.499755859375, "step": 4727 }, { "epoch": 0.6437908496732027, "grad_norm": 37.00433259315994, "learning_rate": 2.7141296083842255e-07, "logits/chosen": 11.672506332397461, "logits/rejected": 12.669711112976074, "logps/chosen": -3.737727165222168, "logps/rejected": -4.214366912841797, "loss": 3.8088, "rewards/accuracies": 1.0, "rewards/chosen": -37.37727355957031, "rewards/margins": 4.7663984298706055, "rewards/rejected": -42.14366912841797, "step": 4728 }, { "epoch": 0.6439270152505446, "grad_norm": 43.59648911625058, "learning_rate": 2.712329276358026e-07, "logits/chosen": 12.37135124206543, "logits/rejected": 13.021842002868652, "logps/chosen": -3.9740004539489746, "logps/rejected": -4.268393516540527, "loss": 4.2094, "rewards/accuracies": 0.5, "rewards/chosen": -39.74000549316406, "rewards/margins": 2.9439315795898438, "rewards/rejected": -42.683937072753906, "step": 4729 }, { "epoch": 0.6440631808278867, "grad_norm": 44.3349114324084, "learning_rate": 2.710529235291669e-07, "logits/chosen": 13.135055541992188, "logits/rejected": 12.57345199584961, "logps/chosen": -4.528393268585205, "logps/rejected": -4.265558242797852, "loss": 4.435, "rewards/accuracies": 0.5, "rewards/chosen": -45.283931732177734, "rewards/margins": -2.628347396850586, "rewards/rejected": -42.65558624267578, "step": 4730 }, { "epoch": 0.6441993464052288, "grad_norm": 41.68447758902066, "learning_rate": 2.708729485591889e-07, "logits/chosen": 13.550837516784668, "logits/rejected": 13.513802528381348, "logps/chosen": -4.416945457458496, "logps/rejected": -4.434418678283691, "loss": 4.1824, "rewards/accuracies": 0.5, "rewards/chosen": -44.169456481933594, "rewards/margins": 0.17473220825195312, "rewards/rejected": -44.34418487548828, "step": 4731 }, { "epoch": 0.6443355119825708, "grad_norm": 41.027977432558146, "learning_rate": 2.7069300276653584e-07, "logits/chosen": 12.33980941772461, "logits/rejected": 13.969683647155762, "logps/chosen": -4.322264671325684, "logps/rejected": -4.817452430725098, "loss": 3.8408, "rewards/accuracies": 1.0, "rewards/chosen": -43.22264862060547, "rewards/margins": 4.951876640319824, "rewards/rejected": -48.174522399902344, "step": 4732 }, { "epoch": 0.6444716775599129, "grad_norm": 38.63739995978957, "learning_rate": 2.7051308619186744e-07, "logits/chosen": 13.124587059020996, "logits/rejected": 14.6627197265625, "logps/chosen": -4.4839301109313965, "logps/rejected": -4.663500785827637, "loss": 3.674, "rewards/accuracies": 0.75, "rewards/chosen": -44.83930206298828, "rewards/margins": 1.7957048416137695, "rewards/rejected": -46.635005950927734, "step": 4733 }, { "epoch": 0.6446078431372549, "grad_norm": 40.94318386060541, "learning_rate": 2.7033319887583765e-07, "logits/chosen": 13.541043281555176, "logits/rejected": 13.22547435760498, "logps/chosen": -3.9971938133239746, "logps/rejected": -4.118474006652832, "loss": 4.1675, "rewards/accuracies": 0.5, "rewards/chosen": -39.97193908691406, "rewards/margins": 1.2128047943115234, "rewards/rejected": -41.18474197387695, "step": 4734 }, { "epoch": 0.6447440087145969, "grad_norm": 41.27364184475187, "learning_rate": 2.701533408590935e-07, "logits/chosen": 13.347034454345703, "logits/rejected": 13.313514709472656, "logps/chosen": -4.290853500366211, "logps/rejected": -4.371848106384277, "loss": 4.0523, "rewards/accuracies": 0.5, "rewards/chosen": -42.908538818359375, "rewards/margins": 0.8099422454833984, "rewards/rejected": -43.71847915649414, "step": 4735 }, { "epoch": 0.644880174291939, "grad_norm": 42.55113502541027, "learning_rate": 2.6997351218227515e-07, "logits/chosen": 12.53220272064209, "logits/rejected": 12.157320022583008, "logps/chosen": -3.969182252883911, "logps/rejected": -3.9301669597625732, "loss": 3.8018, "rewards/accuracies": 0.25, "rewards/chosen": -39.69182205200195, "rewards/margins": -0.3901529312133789, "rewards/rejected": -39.30167007446289, "step": 4736 }, { "epoch": 0.6450163398692811, "grad_norm": 41.453989361557184, "learning_rate": 2.697937128860166e-07, "logits/chosen": 13.501602172851562, "logits/rejected": 13.406702041625977, "logps/chosen": -4.356605529785156, "logps/rejected": -4.460118293762207, "loss": 3.6165, "rewards/accuracies": 0.5, "rewards/chosen": -43.56605529785156, "rewards/margins": 1.0351324081420898, "rewards/rejected": -44.60118865966797, "step": 4737 }, { "epoch": 0.6451525054466231, "grad_norm": 40.487117170655736, "learning_rate": 2.69613943010945e-07, "logits/chosen": 13.169556617736816, "logits/rejected": 13.524930953979492, "logps/chosen": -4.340892791748047, "logps/rejected": -4.225163459777832, "loss": 3.3878, "rewards/accuracies": 0.5, "rewards/chosen": -43.408931732177734, "rewards/margins": -1.1572952270507812, "rewards/rejected": -42.25163269042969, "step": 4738 }, { "epoch": 0.6452886710239651, "grad_norm": 40.08960132633904, "learning_rate": 2.6943420259768063e-07, "logits/chosen": 13.626899719238281, "logits/rejected": 13.144411087036133, "logps/chosen": -4.60148811340332, "logps/rejected": -4.556933879852295, "loss": 4.3056, "rewards/accuracies": 0.25, "rewards/chosen": -46.0148811340332, "rewards/margins": -0.4455432891845703, "rewards/rejected": -45.5693359375, "step": 4739 }, { "epoch": 0.6454248366013072, "grad_norm": 45.36026864722153, "learning_rate": 2.6925449168683736e-07, "logits/chosen": 13.658289909362793, "logits/rejected": 13.489770889282227, "logps/chosen": -4.194999694824219, "logps/rejected": -4.430088043212891, "loss": 3.8575, "rewards/accuracies": 0.75, "rewards/chosen": -41.94999313354492, "rewards/margins": 2.350888252258301, "rewards/rejected": -44.300880432128906, "step": 4740 }, { "epoch": 0.6455610021786492, "grad_norm": 37.59279274788805, "learning_rate": 2.690748103190227e-07, "logits/chosen": 13.518171310424805, "logits/rejected": 13.818283081054688, "logps/chosen": -4.3458757400512695, "logps/rejected": -4.710341453552246, "loss": 4.3282, "rewards/accuracies": 0.75, "rewards/chosen": -43.45875549316406, "rewards/margins": 3.6446619033813477, "rewards/rejected": -47.103416442871094, "step": 4741 }, { "epoch": 0.6456971677559913, "grad_norm": 43.02304065433765, "learning_rate": 2.688951585348367e-07, "logits/chosen": 13.281390190124512, "logits/rejected": 13.859966278076172, "logps/chosen": -4.1943359375, "logps/rejected": -4.380054473876953, "loss": 3.4913, "rewards/accuracies": 0.5, "rewards/chosen": -41.943359375, "rewards/margins": 1.8571786880493164, "rewards/rejected": -43.800540924072266, "step": 4742 }, { "epoch": 0.6458333333333334, "grad_norm": 42.709877339386985, "learning_rate": 2.687155363748734e-07, "logits/chosen": 12.995776176452637, "logits/rejected": 13.85307502746582, "logps/chosen": -4.246554374694824, "logps/rejected": -4.446562767028809, "loss": 3.8302, "rewards/accuracies": 0.5, "rewards/chosen": -42.465545654296875, "rewards/margins": 2.0000810623168945, "rewards/rejected": -44.46562576293945, "step": 4743 }, { "epoch": 0.6459694989106753, "grad_norm": 42.375627334619196, "learning_rate": 2.6853594387972005e-07, "logits/chosen": 13.14261245727539, "logits/rejected": 13.120927810668945, "logps/chosen": -4.425854206085205, "logps/rejected": -4.124143600463867, "loss": 4.234, "rewards/accuracies": 0.0, "rewards/chosen": -44.258541107177734, "rewards/margins": -3.0171022415161133, "rewards/rejected": -41.24143981933594, "step": 4744 }, { "epoch": 0.6461056644880174, "grad_norm": 38.50968144375236, "learning_rate": 2.683563810899566e-07, "logits/chosen": 12.623703002929688, "logits/rejected": 13.20914077758789, "logps/chosen": -3.9249398708343506, "logps/rejected": -4.264893054962158, "loss": 3.8232, "rewards/accuracies": 1.0, "rewards/chosen": -39.24939727783203, "rewards/margins": 3.3995323181152344, "rewards/rejected": -42.64893341064453, "step": 4745 }, { "epoch": 0.6462418300653595, "grad_norm": 41.32859381235365, "learning_rate": 2.6817684804615706e-07, "logits/chosen": 13.09390640258789, "logits/rejected": 14.055522918701172, "logps/chosen": -4.219465255737305, "logps/rejected": -4.394097328186035, "loss": 3.9051, "rewards/accuracies": 0.5, "rewards/chosen": -42.19465255737305, "rewards/margins": 1.7463197708129883, "rewards/rejected": -43.94097137451172, "step": 4746 }, { "epoch": 0.6463779956427015, "grad_norm": 40.054385160938004, "learning_rate": 2.6799734478888855e-07, "logits/chosen": 13.423513412475586, "logits/rejected": 13.833961486816406, "logps/chosen": -4.22636079788208, "logps/rejected": -4.638059616088867, "loss": 4.2229, "rewards/accuracies": 0.75, "rewards/chosen": -42.263607025146484, "rewards/margins": 4.116992950439453, "rewards/rejected": -46.38059997558594, "step": 4747 }, { "epoch": 0.6465141612200436, "grad_norm": 50.49409225661293, "learning_rate": 2.6781787135871097e-07, "logits/chosen": 13.118637084960938, "logits/rejected": 14.363505363464355, "logps/chosen": -4.1138386726379395, "logps/rejected": -4.628452777862549, "loss": 3.5322, "rewards/accuracies": 1.0, "rewards/chosen": -41.13838195800781, "rewards/margins": 5.14614200592041, "rewards/rejected": -46.28452682495117, "step": 4748 }, { "epoch": 0.6466503267973857, "grad_norm": 43.97915121377121, "learning_rate": 2.6763842779617793e-07, "logits/chosen": 12.698275566101074, "logits/rejected": 12.81458854675293, "logps/chosen": -4.110878944396973, "logps/rejected": -4.238320350646973, "loss": 4.309, "rewards/accuracies": 0.5, "rewards/chosen": -41.108787536621094, "rewards/margins": 1.2744150161743164, "rewards/rejected": -42.383201599121094, "step": 4749 }, { "epoch": 0.6467864923747276, "grad_norm": 41.726353648982794, "learning_rate": 2.674590141418365e-07, "logits/chosen": 13.829376220703125, "logits/rejected": 14.113624572753906, "logps/chosen": -4.192799091339111, "logps/rejected": -4.621081352233887, "loss": 4.0188, "rewards/accuracies": 0.75, "rewards/chosen": -41.9279899597168, "rewards/margins": 4.282825469970703, "rewards/rejected": -46.2108154296875, "step": 4750 }, { "epoch": 0.6469226579520697, "grad_norm": 37.693722066816, "learning_rate": 2.672796304362262e-07, "logits/chosen": 12.91086196899414, "logits/rejected": 13.574122428894043, "logps/chosen": -4.457182884216309, "logps/rejected": -4.503661155700684, "loss": 3.7774, "rewards/accuracies": 0.5, "rewards/chosen": -44.57183074951172, "rewards/margins": 0.4647817611694336, "rewards/rejected": -45.0366096496582, "step": 4751 }, { "epoch": 0.6470588235294118, "grad_norm": 37.73077746674652, "learning_rate": 2.6710027671988044e-07, "logits/chosen": 12.898330688476562, "logits/rejected": 13.68140697479248, "logps/chosen": -4.033631324768066, "logps/rejected": -4.349745750427246, "loss": 3.7269, "rewards/accuracies": 0.75, "rewards/chosen": -40.33631896972656, "rewards/margins": 3.161139488220215, "rewards/rejected": -43.49745559692383, "step": 4752 }, { "epoch": 0.6471949891067538, "grad_norm": 43.75726216998399, "learning_rate": 2.6692095303332596e-07, "logits/chosen": 13.732332229614258, "logits/rejected": 13.840237617492676, "logps/chosen": -4.064730644226074, "logps/rejected": -4.244019508361816, "loss": 3.861, "rewards/accuracies": 0.75, "rewards/chosen": -40.64730453491211, "rewards/margins": 1.7928876876831055, "rewards/rejected": -42.44019317626953, "step": 4753 }, { "epoch": 0.6473311546840959, "grad_norm": 47.8729627911896, "learning_rate": 2.66741659417082e-07, "logits/chosen": 13.151960372924805, "logits/rejected": 12.96595573425293, "logps/chosen": -3.9982101917266846, "logps/rejected": -4.007462978363037, "loss": 3.7142, "rewards/accuracies": 0.5, "rewards/chosen": -39.98210144042969, "rewards/margins": 0.09252643585205078, "rewards/rejected": -40.07463073730469, "step": 4754 }, { "epoch": 0.6474673202614379, "grad_norm": 38.023543054283245, "learning_rate": 2.665623959116616e-07, "logits/chosen": 11.948115348815918, "logits/rejected": 12.976408004760742, "logps/chosen": -3.76536226272583, "logps/rejected": -4.317486763000488, "loss": 3.7831, "rewards/accuracies": 1.0, "rewards/chosen": -37.653621673583984, "rewards/margins": 5.521247863769531, "rewards/rejected": -43.174869537353516, "step": 4755 }, { "epoch": 0.6476034858387799, "grad_norm": 40.77710961864274, "learning_rate": 2.6638316255757094e-07, "logits/chosen": 12.574798583984375, "logits/rejected": 13.907732009887695, "logps/chosen": -4.098322868347168, "logps/rejected": -4.427359580993652, "loss": 4.3226, "rewards/accuracies": 0.75, "rewards/chosen": -40.98323059082031, "rewards/margins": 3.2903661727905273, "rewards/rejected": -44.273597717285156, "step": 4756 }, { "epoch": 0.647739651416122, "grad_norm": 39.60965882186701, "learning_rate": 2.662039593953092e-07, "logits/chosen": 12.855436325073242, "logits/rejected": 13.72675609588623, "logps/chosen": -4.029246807098389, "logps/rejected": -4.446190357208252, "loss": 4.124, "rewards/accuracies": 0.75, "rewards/chosen": -40.29247283935547, "rewards/margins": 4.16943359375, "rewards/rejected": -44.4619026184082, "step": 4757 }, { "epoch": 0.6478758169934641, "grad_norm": 39.63500693404492, "learning_rate": 2.660247864653687e-07, "logits/chosen": 12.619377136230469, "logits/rejected": 13.628809928894043, "logps/chosen": -3.958211898803711, "logps/rejected": -4.459938049316406, "loss": 3.7884, "rewards/accuracies": 1.0, "rewards/chosen": -39.58211898803711, "rewards/margins": 5.017263412475586, "rewards/rejected": -44.59938049316406, "step": 4758 }, { "epoch": 0.648011982570806, "grad_norm": 35.33458431849867, "learning_rate": 2.658456438082352e-07, "logits/chosen": 13.721002578735352, "logits/rejected": 13.78248405456543, "logps/chosen": -4.234816551208496, "logps/rejected": -4.555893898010254, "loss": 3.4294, "rewards/accuracies": 0.5, "rewards/chosen": -42.34816360473633, "rewards/margins": 3.2107810974121094, "rewards/rejected": -45.55894470214844, "step": 4759 }, { "epoch": 0.6481481481481481, "grad_norm": 41.22056337725978, "learning_rate": 2.656665314643875e-07, "logits/chosen": 13.502462387084961, "logits/rejected": 13.51340389251709, "logps/chosen": -4.084539413452148, "logps/rejected": -4.100955963134766, "loss": 4.481, "rewards/accuracies": 0.5, "rewards/chosen": -40.84539031982422, "rewards/margins": 0.1641693115234375, "rewards/rejected": -41.009559631347656, "step": 4760 }, { "epoch": 0.6482843137254902, "grad_norm": 45.56716887991532, "learning_rate": 2.6548744947429725e-07, "logits/chosen": 13.668557167053223, "logits/rejected": 12.990161895751953, "logps/chosen": -4.547068119049072, "logps/rejected": -4.282642364501953, "loss": 3.6794, "rewards/accuracies": 0.25, "rewards/chosen": -45.470680236816406, "rewards/margins": -2.644256591796875, "rewards/rejected": -42.82642364501953, "step": 4761 }, { "epoch": 0.6484204793028322, "grad_norm": 86.62404606065772, "learning_rate": 2.6530839787842986e-07, "logits/chosen": 12.324542999267578, "logits/rejected": 12.719131469726562, "logps/chosen": -4.093564987182617, "logps/rejected": -4.351241111755371, "loss": 3.7841, "rewards/accuracies": 0.75, "rewards/chosen": -40.935646057128906, "rewards/margins": 2.5767688751220703, "rewards/rejected": -43.512413024902344, "step": 4762 }, { "epoch": 0.6485566448801743, "grad_norm": 41.248829206937344, "learning_rate": 2.6512937671724315e-07, "logits/chosen": 13.082784652709961, "logits/rejected": 13.81155776977539, "logps/chosen": -4.250563621520996, "logps/rejected": -4.618842124938965, "loss": 4.0529, "rewards/accuracies": 1.0, "rewards/chosen": -42.505638122558594, "rewards/margins": 3.6827774047851562, "rewards/rejected": -46.188419342041016, "step": 4763 }, { "epoch": 0.6486928104575164, "grad_norm": 37.689299272547736, "learning_rate": 2.6495038603118873e-07, "logits/chosen": 13.755148887634277, "logits/rejected": 13.673927307128906, "logps/chosen": -4.317073822021484, "logps/rejected": -4.423145294189453, "loss": 3.8678, "rewards/accuracies": 0.5, "rewards/chosen": -43.170738220214844, "rewards/margins": 1.060715675354004, "rewards/rejected": -44.23145294189453, "step": 4764 }, { "epoch": 0.6488289760348583, "grad_norm": 41.898105385658255, "learning_rate": 2.6477142586071104e-07, "logits/chosen": 13.19127082824707, "logits/rejected": 12.964579582214355, "logps/chosen": -3.9297351837158203, "logps/rejected": -4.118960857391357, "loss": 3.5626, "rewards/accuracies": 0.75, "rewards/chosen": -39.2973518371582, "rewards/margins": 1.8922548294067383, "rewards/rejected": -41.189605712890625, "step": 4765 }, { "epoch": 0.6489651416122004, "grad_norm": 36.13488141288903, "learning_rate": 2.645924962462473e-07, "logits/chosen": 14.110034942626953, "logits/rejected": 13.514082908630371, "logps/chosen": -3.992316484451294, "logps/rejected": -4.3981781005859375, "loss": 3.542, "rewards/accuracies": 1.0, "rewards/chosen": -39.92316436767578, "rewards/margins": 4.058618545532227, "rewards/rejected": -43.98178482055664, "step": 4766 }, { "epoch": 0.6491013071895425, "grad_norm": 43.69606366817356, "learning_rate": 2.644135972282284e-07, "logits/chosen": 13.165071487426758, "logits/rejected": 12.47087287902832, "logps/chosen": -3.945094108581543, "logps/rejected": -3.961059093475342, "loss": 4.1508, "rewards/accuracies": 0.5, "rewards/chosen": -39.45094299316406, "rewards/margins": 0.1596517562866211, "rewards/rejected": -39.610591888427734, "step": 4767 }, { "epoch": 0.6492374727668845, "grad_norm": 43.88476277750638, "learning_rate": 2.6423472884707803e-07, "logits/chosen": 13.651942253112793, "logits/rejected": 13.554901123046875, "logps/chosen": -4.16910457611084, "logps/rejected": -4.387090682983398, "loss": 3.6601, "rewards/accuracies": 0.75, "rewards/chosen": -41.69104766845703, "rewards/margins": 2.1798572540283203, "rewards/rejected": -43.87090301513672, "step": 4768 }, { "epoch": 0.6493736383442266, "grad_norm": 43.10736063531388, "learning_rate": 2.640558911432128e-07, "logits/chosen": 12.892671585083008, "logits/rejected": 13.939098358154297, "logps/chosen": -4.472857475280762, "logps/rejected": -4.769109725952148, "loss": 3.9996, "rewards/accuracies": 0.75, "rewards/chosen": -44.728572845458984, "rewards/margins": 2.9625253677368164, "rewards/rejected": -47.691097259521484, "step": 4769 }, { "epoch": 0.6495098039215687, "grad_norm": 55.22194604092682, "learning_rate": 2.638770841570427e-07, "logits/chosen": 12.55816650390625, "logits/rejected": 12.709137916564941, "logps/chosen": -4.294979095458984, "logps/rejected": -4.420378684997559, "loss": 3.9966, "rewards/accuracies": 0.5, "rewards/chosen": -42.94979476928711, "rewards/margins": 1.2539958953857422, "rewards/rejected": -44.20378875732422, "step": 4770 }, { "epoch": 0.6496459694989106, "grad_norm": 42.432127547456986, "learning_rate": 2.636983079289708e-07, "logits/chosen": 13.348886489868164, "logits/rejected": 13.427177429199219, "logps/chosen": -4.3293352127075195, "logps/rejected": -4.506087303161621, "loss": 3.7917, "rewards/accuracies": 1.0, "rewards/chosen": -43.29335021972656, "rewards/margins": 1.7675209045410156, "rewards/rejected": -45.06087112426758, "step": 4771 }, { "epoch": 0.6497821350762527, "grad_norm": 42.1630721757177, "learning_rate": 2.635195624993927e-07, "logits/chosen": 12.511734962463379, "logits/rejected": 13.776480674743652, "logps/chosen": -4.120053768157959, "logps/rejected": -4.664947032928467, "loss": 4.2871, "rewards/accuracies": 0.75, "rewards/chosen": -41.20053482055664, "rewards/margins": 5.4489336013793945, "rewards/rejected": -46.64946746826172, "step": 4772 }, { "epoch": 0.6499183006535948, "grad_norm": 43.07019369460187, "learning_rate": 2.6334084790869766e-07, "logits/chosen": 13.98153018951416, "logits/rejected": 14.406198501586914, "logps/chosen": -3.9793171882629395, "logps/rejected": -4.6373066902160645, "loss": 3.4433, "rewards/accuracies": 1.0, "rewards/chosen": -39.79317092895508, "rewards/margins": 6.579896926879883, "rewards/rejected": -46.373069763183594, "step": 4773 }, { "epoch": 0.6500544662309368, "grad_norm": 37.8178313425338, "learning_rate": 2.631621641972678e-07, "logits/chosen": 13.2822904586792, "logits/rejected": 13.08568000793457, "logps/chosen": -4.079653263092041, "logps/rejected": -4.359870910644531, "loss": 3.8279, "rewards/accuracies": 0.75, "rewards/chosen": -40.796531677246094, "rewards/margins": 2.8021774291992188, "rewards/rejected": -43.59870910644531, "step": 4774 }, { "epoch": 0.6501906318082789, "grad_norm": 42.85818132214287, "learning_rate": 2.62983511405478e-07, "logits/chosen": 12.478403091430664, "logits/rejected": 12.431564331054688, "logps/chosen": -4.249885559082031, "logps/rejected": -4.388693809509277, "loss": 3.8071, "rewards/accuracies": 0.75, "rewards/chosen": -42.49885559082031, "rewards/margins": 1.3880815505981445, "rewards/rejected": -43.88693618774414, "step": 4775 }, { "epoch": 0.6503267973856209, "grad_norm": 43.24907336275471, "learning_rate": 2.628048895736963e-07, "logits/chosen": 13.876015663146973, "logits/rejected": 13.854059219360352, "logps/chosen": -4.318748950958252, "logps/rejected": -4.325465679168701, "loss": 4.3407, "rewards/accuracies": 0.5, "rewards/chosen": -43.1874885559082, "rewards/margins": 0.0671682357788086, "rewards/rejected": -43.25465774536133, "step": 4776 }, { "epoch": 0.6504629629629629, "grad_norm": 39.968808218136566, "learning_rate": 2.6262629874228386e-07, "logits/chosen": 13.091791152954102, "logits/rejected": 14.02468490600586, "logps/chosen": -4.021923542022705, "logps/rejected": -4.534682273864746, "loss": 3.1107, "rewards/accuracies": 1.0, "rewards/chosen": -40.21923828125, "rewards/margins": 5.1275835037231445, "rewards/rejected": -45.34681701660156, "step": 4777 }, { "epoch": 0.650599128540305, "grad_norm": 41.9565925109627, "learning_rate": 2.6244773895159495e-07, "logits/chosen": 13.517461776733398, "logits/rejected": 13.552875518798828, "logps/chosen": -4.044581413269043, "logps/rejected": -4.36551570892334, "loss": 3.893, "rewards/accuracies": 0.75, "rewards/chosen": -40.44581604003906, "rewards/margins": 3.209345817565918, "rewards/rejected": -43.65515899658203, "step": 4778 }, { "epoch": 0.6507352941176471, "grad_norm": 42.09235927868661, "learning_rate": 2.6226921024197627e-07, "logits/chosen": 13.51202392578125, "logits/rejected": 13.798722267150879, "logps/chosen": -4.542665004730225, "logps/rejected": -4.749600410461426, "loss": 4.151, "rewards/accuracies": 0.75, "rewards/chosen": -45.42665100097656, "rewards/margins": 2.0693531036376953, "rewards/rejected": -47.496002197265625, "step": 4779 }, { "epoch": 0.650871459694989, "grad_norm": 40.694105858790714, "learning_rate": 2.6209071265376806e-07, "logits/chosen": 12.335494995117188, "logits/rejected": 13.065662384033203, "logps/chosen": -4.01989221572876, "logps/rejected": -4.365959167480469, "loss": 4.2582, "rewards/accuracies": 0.5, "rewards/chosen": -40.19892120361328, "rewards/margins": 3.4606666564941406, "rewards/rejected": -43.65958786010742, "step": 4780 }, { "epoch": 0.6510076252723311, "grad_norm": 42.42528489407854, "learning_rate": 2.619122462273034e-07, "logits/chosen": 13.80752182006836, "logits/rejected": 13.872833251953125, "logps/chosen": -4.224213600158691, "logps/rejected": -4.453664779663086, "loss": 3.4911, "rewards/accuracies": 0.5, "rewards/chosen": -42.24213409423828, "rewards/margins": 2.294513702392578, "rewards/rejected": -44.53664779663086, "step": 4781 }, { "epoch": 0.6511437908496732, "grad_norm": 43.82940524353873, "learning_rate": 2.6173381100290803e-07, "logits/chosen": 13.14329719543457, "logits/rejected": 14.668980598449707, "logps/chosen": -4.341910362243652, "logps/rejected": -4.894413948059082, "loss": 3.3208, "rewards/accuracies": 1.0, "rewards/chosen": -43.41910171508789, "rewards/margins": 5.52503776550293, "rewards/rejected": -48.94413757324219, "step": 4782 }, { "epoch": 0.6512799564270153, "grad_norm": 36.84182020938845, "learning_rate": 2.6155540702090094e-07, "logits/chosen": 13.670907974243164, "logits/rejected": 14.165387153625488, "logps/chosen": -4.503003120422363, "logps/rejected": -4.565016746520996, "loss": 4.0056, "rewards/accuracies": 0.5, "rewards/chosen": -45.030033111572266, "rewards/margins": 0.6201391220092773, "rewards/rejected": -45.650169372558594, "step": 4783 }, { "epoch": 0.6514161220043573, "grad_norm": 42.95372394157904, "learning_rate": 2.6137703432159423e-07, "logits/chosen": 13.85477066040039, "logits/rejected": 13.583366394042969, "logps/chosen": -4.718145370483398, "logps/rejected": -4.265137195587158, "loss": 4.0097, "rewards/accuracies": 0.0, "rewards/chosen": -47.181453704833984, "rewards/margins": -4.530085563659668, "rewards/rejected": -42.651371002197266, "step": 4784 }, { "epoch": 0.6515522875816994, "grad_norm": 42.42989544105503, "learning_rate": 2.611986929452923e-07, "logits/chosen": 13.040770530700684, "logits/rejected": 13.725528717041016, "logps/chosen": -4.485618591308594, "logps/rejected": -4.858017921447754, "loss": 3.6022, "rewards/accuracies": 0.75, "rewards/chosen": -44.85618209838867, "rewards/margins": 3.7239952087402344, "rewards/rejected": -48.580177307128906, "step": 4785 }, { "epoch": 0.6516884531590414, "grad_norm": 44.55120506156882, "learning_rate": 2.6102038293229306e-07, "logits/chosen": 12.765271186828613, "logits/rejected": 12.826476097106934, "logps/chosen": -3.741741895675659, "logps/rejected": -3.8812241554260254, "loss": 3.7961, "rewards/accuracies": 0.5, "rewards/chosen": -37.41741943359375, "rewards/margins": 1.3948206901550293, "rewards/rejected": -38.81224060058594, "step": 4786 }, { "epoch": 0.6518246187363834, "grad_norm": 43.692078346167996, "learning_rate": 2.6084210432288727e-07, "logits/chosen": 14.20946979522705, "logits/rejected": 13.738553047180176, "logps/chosen": -4.805618762969971, "logps/rejected": -4.57327938079834, "loss": 4.2518, "rewards/accuracies": 0.5, "rewards/chosen": -48.056190490722656, "rewards/margins": -2.323394775390625, "rewards/rejected": -45.732791900634766, "step": 4787 }, { "epoch": 0.6519607843137255, "grad_norm": 45.39854564229851, "learning_rate": 2.6066385715735815e-07, "logits/chosen": 12.730612754821777, "logits/rejected": 12.987800598144531, "logps/chosen": -4.166662216186523, "logps/rejected": -4.1681437492370605, "loss": 4.0189, "rewards/accuracies": 0.75, "rewards/chosen": -41.66661834716797, "rewards/margins": 0.014818191528320312, "rewards/rejected": -41.681434631347656, "step": 4788 }, { "epoch": 0.6520969498910676, "grad_norm": 64.27833891502041, "learning_rate": 2.6048564147598227e-07, "logits/chosen": 13.488934516906738, "logits/rejected": 12.779394149780273, "logps/chosen": -4.3784027099609375, "logps/rejected": -4.220819473266602, "loss": 3.8545, "rewards/accuracies": 0.5, "rewards/chosen": -43.784027099609375, "rewards/margins": -1.5758323669433594, "rewards/rejected": -42.208194732666016, "step": 4789 }, { "epoch": 0.6522331154684096, "grad_norm": 44.35456287521957, "learning_rate": 2.6030745731902905e-07, "logits/chosen": 13.443710327148438, "logits/rejected": 14.115628242492676, "logps/chosen": -4.566941261291504, "logps/rejected": -4.6405229568481445, "loss": 4.142, "rewards/accuracies": 0.5, "rewards/chosen": -45.669410705566406, "rewards/margins": 0.7358160018920898, "rewards/rejected": -46.40522766113281, "step": 4790 }, { "epoch": 0.6523692810457516, "grad_norm": 40.89104400446635, "learning_rate": 2.6012930472676047e-07, "logits/chosen": 13.774370193481445, "logits/rejected": 14.786920547485352, "logps/chosen": -4.368419647216797, "logps/rejected": -4.5590996742248535, "loss": 3.6587, "rewards/accuracies": 0.75, "rewards/chosen": -43.684200286865234, "rewards/margins": 1.9067974090576172, "rewards/rejected": -45.59099578857422, "step": 4791 }, { "epoch": 0.6525054466230937, "grad_norm": 43.28200744216669, "learning_rate": 2.599511837394316e-07, "logits/chosen": 14.653675079345703, "logits/rejected": 15.042255401611328, "logps/chosen": -4.548330307006836, "logps/rejected": -4.594693660736084, "loss": 4.236, "rewards/accuracies": 0.5, "rewards/chosen": -45.48330307006836, "rewards/margins": 0.46363162994384766, "rewards/rejected": -45.946937561035156, "step": 4792 }, { "epoch": 0.6526416122004357, "grad_norm": 44.2459465748501, "learning_rate": 2.5977309439729064e-07, "logits/chosen": 13.70407485961914, "logits/rejected": 13.815661430358887, "logps/chosen": -4.1846466064453125, "logps/rejected": -4.186216354370117, "loss": 3.6986, "rewards/accuracies": 0.25, "rewards/chosen": -41.846466064453125, "rewards/margins": 0.015695571899414062, "rewards/rejected": -41.86216354370117, "step": 4793 }, { "epoch": 0.6527777777777778, "grad_norm": 44.14556927549596, "learning_rate": 2.5959503674057786e-07, "logits/chosen": 13.640771865844727, "logits/rejected": 13.642911911010742, "logps/chosen": -4.357944965362549, "logps/rejected": -4.494022846221924, "loss": 3.4355, "rewards/accuracies": 0.75, "rewards/chosen": -43.57944869995117, "rewards/margins": 1.3607807159423828, "rewards/rejected": -44.94023132324219, "step": 4794 }, { "epoch": 0.6529139433551199, "grad_norm": 46.08482940303412, "learning_rate": 2.594170108095272e-07, "logits/chosen": 13.777751922607422, "logits/rejected": 13.726024627685547, "logps/chosen": -4.527119159698486, "logps/rejected": -4.443488121032715, "loss": 4.4285, "rewards/accuracies": 0.5, "rewards/chosen": -45.27119064331055, "rewards/margins": -0.8363094329833984, "rewards/rejected": -44.434879302978516, "step": 4795 }, { "epoch": 0.6530501089324618, "grad_norm": 38.7479949733368, "learning_rate": 2.5923901664436524e-07, "logits/chosen": 13.685857772827148, "logits/rejected": 13.582772254943848, "logps/chosen": -4.094073295593262, "logps/rejected": -4.255725860595703, "loss": 3.6089, "rewards/accuracies": 0.75, "rewards/chosen": -40.94073486328125, "rewards/margins": 1.6165266036987305, "rewards/rejected": -42.55725860595703, "step": 4796 }, { "epoch": 0.6531862745098039, "grad_norm": 44.56957042805886, "learning_rate": 2.590610542853108e-07, "logits/chosen": 13.343812942504883, "logits/rejected": 13.286812782287598, "logps/chosen": -4.385466575622559, "logps/rejected": -4.448220252990723, "loss": 4.286, "rewards/accuracies": 0.5, "rewards/chosen": -43.85466003417969, "rewards/margins": 0.6275386810302734, "rewards/rejected": -44.482200622558594, "step": 4797 }, { "epoch": 0.653322440087146, "grad_norm": 41.03167380274044, "learning_rate": 2.5888312377257616e-07, "logits/chosen": 13.506175994873047, "logits/rejected": 14.200511932373047, "logps/chosen": -4.45750093460083, "logps/rejected": -4.812190055847168, "loss": 3.9292, "rewards/accuracies": 0.75, "rewards/chosen": -44.575008392333984, "rewards/margins": 3.546889305114746, "rewards/rejected": -48.12189865112305, "step": 4798 }, { "epoch": 0.653458605664488, "grad_norm": 45.78400452083908, "learning_rate": 2.587052251463663e-07, "logits/chosen": 13.182901382446289, "logits/rejected": 14.711524963378906, "logps/chosen": -3.92444109916687, "logps/rejected": -4.759163856506348, "loss": 4.0338, "rewards/accuracies": 1.0, "rewards/chosen": -39.244407653808594, "rewards/margins": 8.3472261428833, "rewards/rejected": -47.591636657714844, "step": 4799 }, { "epoch": 0.6535947712418301, "grad_norm": 39.62020167819782, "learning_rate": 2.5852735844687867e-07, "logits/chosen": 12.744604110717773, "logits/rejected": 13.682899475097656, "logps/chosen": -4.097288131713867, "logps/rejected": -4.718868255615234, "loss": 3.5467, "rewards/accuracies": 1.0, "rewards/chosen": -40.97288513183594, "rewards/margins": 6.215794563293457, "rewards/rejected": -47.188682556152344, "step": 4800 }, { "epoch": 0.6537309368191722, "grad_norm": 40.85681725132578, "learning_rate": 2.5834952371430383e-07, "logits/chosen": 13.402084350585938, "logits/rejected": 14.221044540405273, "logps/chosen": -4.232624053955078, "logps/rejected": -4.414488315582275, "loss": 3.5919, "rewards/accuracies": 0.75, "rewards/chosen": -42.32624053955078, "rewards/margins": 1.8186454772949219, "rewards/rejected": -44.14488220214844, "step": 4801 }, { "epoch": 0.6538671023965141, "grad_norm": 39.79365569655043, "learning_rate": 2.5817172098882513e-07, "logits/chosen": 13.467512130737305, "logits/rejected": 13.505762100219727, "logps/chosen": -4.376077651977539, "logps/rejected": -4.5669403076171875, "loss": 3.7481, "rewards/accuracies": 0.5, "rewards/chosen": -43.760780334472656, "rewards/margins": 1.908625602722168, "rewards/rejected": -45.66940689086914, "step": 4802 }, { "epoch": 0.6540032679738562, "grad_norm": 39.34047869393261, "learning_rate": 2.579939503106183e-07, "logits/chosen": 13.696451187133789, "logits/rejected": 13.898509979248047, "logps/chosen": -4.39787483215332, "logps/rejected": -4.707085609436035, "loss": 3.8119, "rewards/accuracies": 0.75, "rewards/chosen": -43.97874450683594, "rewards/margins": 3.0921096801757812, "rewards/rejected": -47.070858001708984, "step": 4803 }, { "epoch": 0.6541394335511983, "grad_norm": 40.934117704370536, "learning_rate": 2.5781621171985215e-07, "logits/chosen": 13.256973266601562, "logits/rejected": 13.617452621459961, "logps/chosen": -4.274383544921875, "logps/rejected": -4.303462028503418, "loss": 4.3443, "rewards/accuracies": 0.25, "rewards/chosen": -42.74383544921875, "rewards/margins": 0.29078102111816406, "rewards/rejected": -43.03461456298828, "step": 4804 }, { "epoch": 0.6542755991285403, "grad_norm": 40.29339922796425, "learning_rate": 2.5763850525668857e-07, "logits/chosen": 13.229618072509766, "logits/rejected": 14.762445449829102, "logps/chosen": -4.080269813537598, "logps/rejected": -4.704588890075684, "loss": 3.0355, "rewards/accuracies": 1.0, "rewards/chosen": -40.80270004272461, "rewards/margins": 6.243191719055176, "rewards/rejected": -47.04589080810547, "step": 4805 }, { "epoch": 0.6544117647058824, "grad_norm": 37.4958581553915, "learning_rate": 2.574608309612812e-07, "logits/chosen": 13.662322998046875, "logits/rejected": 13.590856552124023, "logps/chosen": -4.386597633361816, "logps/rejected": -4.574773788452148, "loss": 3.5164, "rewards/accuracies": 0.5, "rewards/chosen": -43.86597442626953, "rewards/margins": 1.881760597229004, "rewards/rejected": -45.74773406982422, "step": 4806 }, { "epoch": 0.6545479302832244, "grad_norm": 46.50775866929424, "learning_rate": 2.5728318887377744e-07, "logits/chosen": 13.886653900146484, "logits/rejected": 14.152176856994629, "logps/chosen": -4.184065341949463, "logps/rejected": -4.456096649169922, "loss": 3.7862, "rewards/accuracies": 0.75, "rewards/chosen": -41.84065246582031, "rewards/margins": 2.720309257507324, "rewards/rejected": -44.56096267700195, "step": 4807 }, { "epoch": 0.6546840958605664, "grad_norm": 46.39135043196925, "learning_rate": 2.571055790343169e-07, "logits/chosen": 12.236047744750977, "logits/rejected": 13.947607040405273, "logps/chosen": -3.9747016429901123, "logps/rejected": -4.54293155670166, "loss": 3.7763, "rewards/accuracies": 1.0, "rewards/chosen": -39.74701690673828, "rewards/margins": 5.68229866027832, "rewards/rejected": -45.42931365966797, "step": 4808 }, { "epoch": 0.6548202614379085, "grad_norm": 49.510101817842184, "learning_rate": 2.5692800148303193e-07, "logits/chosen": 13.394908905029297, "logits/rejected": 13.24221420288086, "logps/chosen": -3.920799970626831, "logps/rejected": -4.2840576171875, "loss": 3.9849, "rewards/accuracies": 0.75, "rewards/chosen": -39.20800018310547, "rewards/margins": 3.6325740814208984, "rewards/rejected": -42.840576171875, "step": 4809 }, { "epoch": 0.6549564270152506, "grad_norm": 41.79964434351646, "learning_rate": 2.5675045626004756e-07, "logits/chosen": 14.24901294708252, "logits/rejected": 15.232170104980469, "logps/chosen": -4.412275314331055, "logps/rejected": -4.953709602355957, "loss": 3.9294, "rewards/accuracies": 0.75, "rewards/chosen": -44.12275314331055, "rewards/margins": 5.414346694946289, "rewards/rejected": -49.5370979309082, "step": 4810 }, { "epoch": 0.6550925925925926, "grad_norm": 38.39106656252235, "learning_rate": 2.565729434054819e-07, "logits/chosen": 13.55940055847168, "logits/rejected": 13.21628189086914, "logps/chosen": -4.373117446899414, "logps/rejected": -4.418885231018066, "loss": 4.1797, "rewards/accuracies": 0.5, "rewards/chosen": -43.731170654296875, "rewards/margins": 0.4576759338378906, "rewards/rejected": -44.18885040283203, "step": 4811 }, { "epoch": 0.6552287581699346, "grad_norm": 43.125978491253505, "learning_rate": 2.563954629594451e-07, "logits/chosen": 13.674751281738281, "logits/rejected": 12.956669807434082, "logps/chosen": -4.3767499923706055, "logps/rejected": -4.2127227783203125, "loss": 4.1832, "rewards/accuracies": 0.25, "rewards/chosen": -43.76750183105469, "rewards/margins": -1.6402711868286133, "rewards/rejected": -42.127227783203125, "step": 4812 }, { "epoch": 0.6553649237472767, "grad_norm": 40.46559128068754, "learning_rate": 2.562180149620405e-07, "logits/chosen": 13.334183692932129, "logits/rejected": 14.267509460449219, "logps/chosen": -4.1642985343933105, "logps/rejected": -4.548434257507324, "loss": 3.7344, "rewards/accuracies": 0.75, "rewards/chosen": -41.642982482910156, "rewards/margins": 3.841360092163086, "rewards/rejected": -45.484344482421875, "step": 4813 }, { "epoch": 0.6555010893246187, "grad_norm": 42.08829524321939, "learning_rate": 2.56040599453364e-07, "logits/chosen": 11.972612380981445, "logits/rejected": 13.344547271728516, "logps/chosen": -4.026307106018066, "logps/rejected": -4.3779497146606445, "loss": 3.6551, "rewards/accuracies": 0.75, "rewards/chosen": -40.26306915283203, "rewards/margins": 3.516425132751465, "rewards/rejected": -43.77949523925781, "step": 4814 }, { "epoch": 0.6556372549019608, "grad_norm": 40.59412651308067, "learning_rate": 2.5586321647350405e-07, "logits/chosen": 12.657430648803711, "logits/rejected": 13.952216148376465, "logps/chosen": -3.8611857891082764, "logps/rejected": -4.312834739685059, "loss": 3.6407, "rewards/accuracies": 0.75, "rewards/chosen": -38.61185836791992, "rewards/margins": 4.516488075256348, "rewards/rejected": -43.12834930419922, "step": 4815 }, { "epoch": 0.6557734204793029, "grad_norm": 40.62756855679807, "learning_rate": 2.556858660625417e-07, "logits/chosen": 13.283796310424805, "logits/rejected": 12.969778060913086, "logps/chosen": -4.4265642166137695, "logps/rejected": -4.5610032081604, "loss": 3.9983, "rewards/accuracies": 0.75, "rewards/chosen": -44.26564407348633, "rewards/margins": 1.344386100769043, "rewards/rejected": -45.61003112792969, "step": 4816 }, { "epoch": 0.6559095860566448, "grad_norm": 45.36323329131895, "learning_rate": 2.5550854826055095e-07, "logits/chosen": 13.598344802856445, "logits/rejected": 14.458696365356445, "logps/chosen": -4.046489715576172, "logps/rejected": -4.709260940551758, "loss": 3.8442, "rewards/accuracies": 0.75, "rewards/chosen": -40.464900970458984, "rewards/margins": 6.627707481384277, "rewards/rejected": -47.09260559082031, "step": 4817 }, { "epoch": 0.6560457516339869, "grad_norm": 40.320118180695474, "learning_rate": 2.55331263107598e-07, "logits/chosen": 14.029510498046875, "logits/rejected": 13.355866432189941, "logps/chosen": -4.673018932342529, "logps/rejected": -4.412741661071777, "loss": 3.8861, "rewards/accuracies": 0.25, "rewards/chosen": -46.730186462402344, "rewards/margins": -2.602773666381836, "rewards/rejected": -44.12741470336914, "step": 4818 }, { "epoch": 0.656181917211329, "grad_norm": 44.3333719472831, "learning_rate": 2.5515401064374196e-07, "logits/chosen": 13.137012481689453, "logits/rejected": 13.677608489990234, "logps/chosen": -4.269285678863525, "logps/rejected": -4.398532867431641, "loss": 4.637, "rewards/accuracies": 0.75, "rewards/chosen": -42.69285583496094, "rewards/margins": 1.2924728393554688, "rewards/rejected": -43.985328674316406, "step": 4819 }, { "epoch": 0.656318082788671, "grad_norm": 45.186641387237714, "learning_rate": 2.549767909090346e-07, "logits/chosen": 13.208614349365234, "logits/rejected": 13.295076370239258, "logps/chosen": -4.238128662109375, "logps/rejected": -4.621414661407471, "loss": 4.1294, "rewards/accuracies": 0.75, "rewards/chosen": -42.38128662109375, "rewards/margins": 3.83286190032959, "rewards/rejected": -46.21414566040039, "step": 4820 }, { "epoch": 0.6564542483660131, "grad_norm": 43.58313349461245, "learning_rate": 2.5479960394352e-07, "logits/chosen": 12.497320175170898, "logits/rejected": 12.947196960449219, "logps/chosen": -3.804654359817505, "logps/rejected": -3.904860019683838, "loss": 3.9222, "rewards/accuracies": 0.5, "rewards/chosen": -38.04654312133789, "rewards/margins": 1.0020575523376465, "rewards/rejected": -39.04859924316406, "step": 4821 }, { "epoch": 0.6565904139433552, "grad_norm": 44.16707428364941, "learning_rate": 2.546224497872353e-07, "logits/chosen": 12.85472297668457, "logits/rejected": 13.775104522705078, "logps/chosen": -4.188927173614502, "logps/rejected": -4.57174015045166, "loss": 3.8974, "rewards/accuracies": 0.75, "rewards/chosen": -41.88927459716797, "rewards/margins": 3.8281288146972656, "rewards/rejected": -45.717403411865234, "step": 4822 }, { "epoch": 0.6567265795206971, "grad_norm": 44.71484664970943, "learning_rate": 2.544453284802097e-07, "logits/chosen": 13.288958549499512, "logits/rejected": 13.391562461853027, "logps/chosen": -4.294061660766602, "logps/rejected": -4.10745096206665, "loss": 4.2743, "rewards/accuracies": 0.25, "rewards/chosen": -42.94062042236328, "rewards/margins": -1.866109848022461, "rewards/rejected": -41.07450866699219, "step": 4823 }, { "epoch": 0.6568627450980392, "grad_norm": 48.31151143825339, "learning_rate": 2.5426824006246527e-07, "logits/chosen": 13.079172134399414, "logits/rejected": 12.792215347290039, "logps/chosen": -4.423689842224121, "logps/rejected": -4.252873420715332, "loss": 4.0697, "rewards/accuracies": 0.25, "rewards/chosen": -44.23690414428711, "rewards/margins": -1.7081642150878906, "rewards/rejected": -42.52873992919922, "step": 4824 }, { "epoch": 0.6569989106753813, "grad_norm": 40.23684315283951, "learning_rate": 2.540911845740167e-07, "logits/chosen": 13.556312561035156, "logits/rejected": 13.398798942565918, "logps/chosen": -4.3077073097229, "logps/rejected": -4.319065093994141, "loss": 3.9738, "rewards/accuracies": 0.5, "rewards/chosen": -43.07707214355469, "rewards/margins": 0.11357307434082031, "rewards/rejected": -43.19064712524414, "step": 4825 }, { "epoch": 0.6571350762527233, "grad_norm": 38.31252662088312, "learning_rate": 2.53914162054871e-07, "logits/chosen": 13.692604064941406, "logits/rejected": 13.157625198364258, "logps/chosen": -4.245841026306152, "logps/rejected": -4.317287445068359, "loss": 4.3015, "rewards/accuracies": 0.75, "rewards/chosen": -42.45840835571289, "rewards/margins": 0.7144670486450195, "rewards/rejected": -43.172874450683594, "step": 4826 }, { "epoch": 0.6572712418300654, "grad_norm": 40.58047509398669, "learning_rate": 2.537371725450279e-07, "logits/chosen": 13.587654113769531, "logits/rejected": 12.933324813842773, "logps/chosen": -4.449217319488525, "logps/rejected": -4.4615583419799805, "loss": 4.1224, "rewards/accuracies": 0.5, "rewards/chosen": -44.49217224121094, "rewards/margins": 0.1234130859375, "rewards/rejected": -44.61558532714844, "step": 4827 }, { "epoch": 0.6574074074074074, "grad_norm": 38.491357235233885, "learning_rate": 2.5356021608447967e-07, "logits/chosen": 12.580438613891602, "logits/rejected": 13.812845230102539, "logps/chosen": -4.432276725769043, "logps/rejected": -4.578025817871094, "loss": 4.1382, "rewards/accuracies": 0.5, "rewards/chosen": -44.32276916503906, "rewards/margins": 1.4574909210205078, "rewards/rejected": -45.78025817871094, "step": 4828 }, { "epoch": 0.6575435729847494, "grad_norm": 39.813705428277004, "learning_rate": 2.533832927132113e-07, "logits/chosen": 13.55910873413086, "logits/rejected": 13.302949905395508, "logps/chosen": -4.406524181365967, "logps/rejected": -4.161287784576416, "loss": 4.264, "rewards/accuracies": 0.0, "rewards/chosen": -44.065242767333984, "rewards/margins": -2.4523630142211914, "rewards/rejected": -41.61288070678711, "step": 4829 }, { "epoch": 0.6576797385620915, "grad_norm": 35.55713673887108, "learning_rate": 2.5320640247119966e-07, "logits/chosen": 13.243813514709473, "logits/rejected": 13.51505184173584, "logps/chosen": -3.870757818222046, "logps/rejected": -4.138223648071289, "loss": 3.8473, "rewards/accuracies": 0.5, "rewards/chosen": -38.70758056640625, "rewards/margins": 2.6746578216552734, "rewards/rejected": -41.382240295410156, "step": 4830 }, { "epoch": 0.6578159041394336, "grad_norm": 41.31680192120739, "learning_rate": 2.530295453984149e-07, "logits/chosen": 13.623821258544922, "logits/rejected": 12.622430801391602, "logps/chosen": -4.253951072692871, "logps/rejected": -4.087306499481201, "loss": 3.7834, "rewards/accuracies": 0.5, "rewards/chosen": -42.53950881958008, "rewards/margins": -1.6664419174194336, "rewards/rejected": -40.87306594848633, "step": 4831 }, { "epoch": 0.6579520697167756, "grad_norm": 37.570077739380366, "learning_rate": 2.5285272153481926e-07, "logits/chosen": 13.227090835571289, "logits/rejected": 12.16038703918457, "logps/chosen": -4.611159324645996, "logps/rejected": -4.243233680725098, "loss": 4.0455, "rewards/accuracies": 0.0, "rewards/chosen": -46.111595153808594, "rewards/margins": -3.679257392883301, "rewards/rejected": -42.43233871459961, "step": 4832 }, { "epoch": 0.6580882352941176, "grad_norm": 37.63614050934192, "learning_rate": 2.5267593092036754e-07, "logits/chosen": 12.792613983154297, "logits/rejected": 13.665138244628906, "logps/chosen": -4.034134864807129, "logps/rejected": -4.327916145324707, "loss": 3.7478, "rewards/accuracies": 0.75, "rewards/chosen": -40.341346740722656, "rewards/margins": 2.9378128051757812, "rewards/rejected": -43.2791633605957, "step": 4833 }, { "epoch": 0.6582244008714597, "grad_norm": 39.505914114469405, "learning_rate": 2.5249917359500685e-07, "logits/chosen": 13.043115615844727, "logits/rejected": 13.127699851989746, "logps/chosen": -4.366063594818115, "logps/rejected": -4.381473064422607, "loss": 3.8327, "rewards/accuracies": 0.5, "rewards/chosen": -43.66063690185547, "rewards/margins": 0.15408992767333984, "rewards/rejected": -43.814727783203125, "step": 4834 }, { "epoch": 0.6583605664488017, "grad_norm": 43.160219420604946, "learning_rate": 2.5232244959867734e-07, "logits/chosen": 12.984964370727539, "logits/rejected": 13.581911087036133, "logps/chosen": -4.283434867858887, "logps/rejected": -4.527683734893799, "loss": 3.9119, "rewards/accuracies": 0.75, "rewards/chosen": -42.834346771240234, "rewards/margins": 2.4424924850463867, "rewards/rejected": -45.27684020996094, "step": 4835 }, { "epoch": 0.6584967320261438, "grad_norm": 40.931268246042094, "learning_rate": 2.521457589713109e-07, "logits/chosen": 13.673454284667969, "logits/rejected": 12.953485488891602, "logps/chosen": -4.388434410095215, "logps/rejected": -4.242166519165039, "loss": 4.3848, "rewards/accuracies": 0.25, "rewards/chosen": -43.88434600830078, "rewards/margins": -1.4626798629760742, "rewards/rejected": -42.42166519165039, "step": 4836 }, { "epoch": 0.6586328976034859, "grad_norm": 43.25939101251337, "learning_rate": 2.519691017528324e-07, "logits/chosen": 13.336263656616211, "logits/rejected": 13.369277954101562, "logps/chosen": -4.546687126159668, "logps/rejected": -4.48940372467041, "loss": 3.9719, "rewards/accuracies": 0.5, "rewards/chosen": -45.46687316894531, "rewards/margins": -0.5728292465209961, "rewards/rejected": -44.89404296875, "step": 4837 }, { "epoch": 0.6587690631808278, "grad_norm": 40.01875140039348, "learning_rate": 2.517924779831592e-07, "logits/chosen": 12.98240852355957, "logits/rejected": 13.080118179321289, "logps/chosen": -4.040762901306152, "logps/rejected": -4.225780010223389, "loss": 4.2139, "rewards/accuracies": 0.75, "rewards/chosen": -40.407630920410156, "rewards/margins": 1.8501691818237305, "rewards/rejected": -42.25779724121094, "step": 4838 }, { "epoch": 0.6589052287581699, "grad_norm": 35.737246147826944, "learning_rate": 2.516158877022005e-07, "logits/chosen": 12.885457992553711, "logits/rejected": 12.989078521728516, "logps/chosen": -4.160884857177734, "logps/rejected": -4.419665336608887, "loss": 3.7438, "rewards/accuracies": 1.0, "rewards/chosen": -41.608848571777344, "rewards/margins": 2.5878067016601562, "rewards/rejected": -44.1966552734375, "step": 4839 }, { "epoch": 0.659041394335512, "grad_norm": 38.06788616929693, "learning_rate": 2.5143933094985855e-07, "logits/chosen": 13.622318267822266, "logits/rejected": 13.080211639404297, "logps/chosen": -4.159061431884766, "logps/rejected": -4.413568019866943, "loss": 4.0498, "rewards/accuracies": 0.75, "rewards/chosen": -41.590614318847656, "rewards/margins": 2.5450639724731445, "rewards/rejected": -44.135677337646484, "step": 4840 }, { "epoch": 0.659177559912854, "grad_norm": 45.14373488664754, "learning_rate": 2.512628077660279e-07, "logits/chosen": 12.97885513305664, "logits/rejected": 13.749739646911621, "logps/chosen": -4.068404197692871, "logps/rejected": -4.536762237548828, "loss": 3.9133, "rewards/accuracies": 1.0, "rewards/chosen": -40.68404006958008, "rewards/margins": 4.68358039855957, "rewards/rejected": -45.36762237548828, "step": 4841 }, { "epoch": 0.6593137254901961, "grad_norm": 37.94341767518585, "learning_rate": 2.510863181905952e-07, "logits/chosen": 12.56281566619873, "logits/rejected": 13.714435577392578, "logps/chosen": -4.28479528427124, "logps/rejected": -4.52863073348999, "loss": 4.3389, "rewards/accuracies": 1.0, "rewards/chosen": -42.84795379638672, "rewards/margins": 2.438352584838867, "rewards/rejected": -45.28630828857422, "step": 4842 }, { "epoch": 0.6594498910675382, "grad_norm": 37.207296907603585, "learning_rate": 2.509098622634398e-07, "logits/chosen": 13.398033142089844, "logits/rejected": 13.619298934936523, "logps/chosen": -4.3348493576049805, "logps/rejected": -4.39497184753418, "loss": 3.9624, "rewards/accuracies": 0.5, "rewards/chosen": -43.34849166870117, "rewards/margins": 0.6012258529663086, "rewards/rejected": -43.9497184753418, "step": 4843 }, { "epoch": 0.6595860566448801, "grad_norm": 43.003221775957314, "learning_rate": 2.507334400244336e-07, "logits/chosen": 12.896505355834961, "logits/rejected": 13.815654754638672, "logps/chosen": -4.204035758972168, "logps/rejected": -4.161462306976318, "loss": 4.7144, "rewards/accuracies": 0.5, "rewards/chosen": -42.04035949707031, "rewards/margins": -0.4257335662841797, "rewards/rejected": -41.6146240234375, "step": 4844 }, { "epoch": 0.6597222222222222, "grad_norm": 39.56574663321582, "learning_rate": 2.5055705151344033e-07, "logits/chosen": 13.127731323242188, "logits/rejected": 13.529741287231445, "logps/chosen": -4.211366176605225, "logps/rejected": -4.509433746337891, "loss": 4.1166, "rewards/accuracies": 1.0, "rewards/chosen": -42.11366271972656, "rewards/margins": 2.9806737899780273, "rewards/rejected": -45.094337463378906, "step": 4845 }, { "epoch": 0.6598583877995643, "grad_norm": 39.80072706516435, "learning_rate": 2.5038069677031657e-07, "logits/chosen": 13.131032943725586, "logits/rejected": 12.452977180480957, "logps/chosen": -4.314886569976807, "logps/rejected": -4.156721115112305, "loss": 3.9052, "rewards/accuracies": 0.5, "rewards/chosen": -43.14886474609375, "rewards/margins": -1.581650733947754, "rewards/rejected": -41.56721496582031, "step": 4846 }, { "epoch": 0.6599945533769063, "grad_norm": 41.79390030900328, "learning_rate": 2.5020437583491126e-07, "logits/chosen": 13.622657775878906, "logits/rejected": 13.833085060119629, "logps/chosen": -4.546714782714844, "logps/rejected": -4.478425025939941, "loss": 4.4163, "rewards/accuracies": 0.75, "rewards/chosen": -45.46714782714844, "rewards/margins": -0.6828937530517578, "rewards/rejected": -44.78425598144531, "step": 4847 }, { "epoch": 0.6601307189542484, "grad_norm": 40.75241536118566, "learning_rate": 2.5002808874706535e-07, "logits/chosen": 13.737089157104492, "logits/rejected": 13.344276428222656, "logps/chosen": -4.520402431488037, "logps/rejected": -4.56214714050293, "loss": 4.3853, "rewards/accuracies": 0.75, "rewards/chosen": -45.20402526855469, "rewards/margins": 0.41744232177734375, "rewards/rejected": -45.62146759033203, "step": 4848 }, { "epoch": 0.6602668845315904, "grad_norm": 44.67668539962766, "learning_rate": 2.498518355466124e-07, "logits/chosen": 12.107540130615234, "logits/rejected": 13.234146118164062, "logps/chosen": -4.100611686706543, "logps/rejected": -4.494577407836914, "loss": 3.6067, "rewards/accuracies": 0.75, "rewards/chosen": -41.00611114501953, "rewards/margins": 3.9396629333496094, "rewards/rejected": -44.94577407836914, "step": 4849 }, { "epoch": 0.6604030501089324, "grad_norm": 48.477458695054494, "learning_rate": 2.4967561627337854e-07, "logits/chosen": 13.234403610229492, "logits/rejected": 13.041242599487305, "logps/chosen": -4.3136067390441895, "logps/rejected": -4.328390121459961, "loss": 3.7742, "rewards/accuracies": 0.5, "rewards/chosen": -43.13606643676758, "rewards/margins": 0.14783382415771484, "rewards/rejected": -43.283897399902344, "step": 4850 }, { "epoch": 0.6605392156862745, "grad_norm": 39.08298561837466, "learning_rate": 2.494994309671816e-07, "logits/chosen": 13.364660263061523, "logits/rejected": 13.468482971191406, "logps/chosen": -4.234621047973633, "logps/rejected": -4.335199356079102, "loss": 4.25, "rewards/accuracies": 0.75, "rewards/chosen": -42.34621047973633, "rewards/margins": 1.0057830810546875, "rewards/rejected": -43.351993560791016, "step": 4851 }, { "epoch": 0.6606753812636166, "grad_norm": 39.576089725730085, "learning_rate": 2.493232796678323e-07, "logits/chosen": 13.185211181640625, "logits/rejected": 13.969440460205078, "logps/chosen": -4.177400588989258, "logps/rejected": -4.499831199645996, "loss": 3.9026, "rewards/accuracies": 0.75, "rewards/chosen": -41.77400207519531, "rewards/margins": 3.224308967590332, "rewards/rejected": -44.998313903808594, "step": 4852 }, { "epoch": 0.6608115468409586, "grad_norm": 47.1435455445944, "learning_rate": 2.4914716241513366e-07, "logits/chosen": 13.135504722595215, "logits/rejected": 13.486041069030762, "logps/chosen": -4.298486709594727, "logps/rejected": -4.844300270080566, "loss": 3.4776, "rewards/accuracies": 0.75, "rewards/chosen": -42.984867095947266, "rewards/margins": 5.458136558532715, "rewards/rejected": -48.4430046081543, "step": 4853 }, { "epoch": 0.6609477124183006, "grad_norm": 43.568980138871915, "learning_rate": 2.4897107924888044e-07, "logits/chosen": 13.553844451904297, "logits/rejected": 13.429682731628418, "logps/chosen": -4.603574752807617, "logps/rejected": -4.425307273864746, "loss": 4.1907, "rewards/accuracies": 0.25, "rewards/chosen": -46.03574752807617, "rewards/margins": -1.782679557800293, "rewards/rejected": -44.25307083129883, "step": 4854 }, { "epoch": 0.6610838779956427, "grad_norm": 38.58083116551113, "learning_rate": 2.4879503020886025e-07, "logits/chosen": 13.12731647491455, "logits/rejected": 13.03384017944336, "logps/chosen": -4.250631809234619, "logps/rejected": -4.246269702911377, "loss": 3.654, "rewards/accuracies": 0.5, "rewards/chosen": -42.506317138671875, "rewards/margins": -0.04362201690673828, "rewards/rejected": -42.46269989013672, "step": 4855 }, { "epoch": 0.6612200435729847, "grad_norm": 53.917352024895926, "learning_rate": 2.486190153348531e-07, "logits/chosen": 13.23365592956543, "logits/rejected": 13.242727279663086, "logps/chosen": -4.041452884674072, "logps/rejected": -4.1051344871521, "loss": 3.6419, "rewards/accuracies": 0.5, "rewards/chosen": -40.41453170776367, "rewards/margins": 0.636815071105957, "rewards/rejected": -41.05134582519531, "step": 4856 }, { "epoch": 0.6613562091503268, "grad_norm": 41.28529334699118, "learning_rate": 2.484430346666305e-07, "logits/chosen": 12.86372184753418, "logits/rejected": 13.511724472045898, "logps/chosen": -4.047082901000977, "logps/rejected": -4.3482184410095215, "loss": 4.3804, "rewards/accuracies": 0.75, "rewards/chosen": -40.47083282470703, "rewards/margins": 3.0113487243652344, "rewards/rejected": -43.482181549072266, "step": 4857 }, { "epoch": 0.6614923747276689, "grad_norm": 39.46099030195038, "learning_rate": 2.482670882439571e-07, "logits/chosen": 13.285076141357422, "logits/rejected": 13.010248184204102, "logps/chosen": -4.250953197479248, "logps/rejected": -4.252496719360352, "loss": 4.2538, "rewards/accuracies": 0.5, "rewards/chosen": -42.5095329284668, "rewards/margins": 0.015432357788085938, "rewards/rejected": -42.52496337890625, "step": 4858 }, { "epoch": 0.661628540305011, "grad_norm": 39.52338124952296, "learning_rate": 2.4809117610658943e-07, "logits/chosen": 12.407051086425781, "logits/rejected": 14.256521224975586, "logps/chosen": -4.111352920532227, "logps/rejected": -4.526444435119629, "loss": 4.2297, "rewards/accuracies": 1.0, "rewards/chosen": -41.11353302001953, "rewards/margins": 4.150912284851074, "rewards/rejected": -45.264442443847656, "step": 4859 }, { "epoch": 0.6617647058823529, "grad_norm": 46.70990456884799, "learning_rate": 2.479152982942761e-07, "logits/chosen": 12.490952491760254, "logits/rejected": 12.83728313446045, "logps/chosen": -4.03476619720459, "logps/rejected": -3.909162759780884, "loss": 3.8275, "rewards/accuracies": 0.25, "rewards/chosen": -40.34766387939453, "rewards/margins": -1.256037712097168, "rewards/rejected": -39.09162902832031, "step": 4860 }, { "epoch": 0.661900871459695, "grad_norm": 43.51399204167551, "learning_rate": 2.4773945484675824e-07, "logits/chosen": 12.340774536132812, "logits/rejected": 11.809991836547852, "logps/chosen": -4.242424011230469, "logps/rejected": -4.237431049346924, "loss": 4.4898, "rewards/accuracies": 0.5, "rewards/chosen": -42.42424392700195, "rewards/margins": -0.049933433532714844, "rewards/rejected": -42.37430953979492, "step": 4861 }, { "epoch": 0.6620370370370371, "grad_norm": 43.85555966255935, "learning_rate": 2.475636458037692e-07, "logits/chosen": 13.523294448852539, "logits/rejected": 12.896668434143066, "logps/chosen": -4.292069911956787, "logps/rejected": -4.1782026290893555, "loss": 4.1717, "rewards/accuracies": 0.25, "rewards/chosen": -42.92070007324219, "rewards/margins": -1.1386737823486328, "rewards/rejected": -41.78202438354492, "step": 4862 }, { "epoch": 0.6621732026143791, "grad_norm": 44.60173809177124, "learning_rate": 2.4738787120503454e-07, "logits/chosen": 13.34432601928711, "logits/rejected": 13.67398452758789, "logps/chosen": -4.105953216552734, "logps/rejected": -4.327817916870117, "loss": 3.9865, "rewards/accuracies": 0.75, "rewards/chosen": -41.059532165527344, "rewards/margins": 2.218644142150879, "rewards/rejected": -43.278175354003906, "step": 4863 }, { "epoch": 0.6623093681917211, "grad_norm": 59.18897498284194, "learning_rate": 2.4721213109027174e-07, "logits/chosen": 12.133050918579102, "logits/rejected": 12.901020050048828, "logps/chosen": -4.0787858963012695, "logps/rejected": -4.385400772094727, "loss": 4.4507, "rewards/accuracies": 0.75, "rewards/chosen": -40.787864685058594, "rewards/margins": 3.0661468505859375, "rewards/rejected": -43.854007720947266, "step": 4864 }, { "epoch": 0.6624455337690632, "grad_norm": 39.645851494095645, "learning_rate": 2.4703642549919095e-07, "logits/chosen": 12.954307556152344, "logits/rejected": 13.379350662231445, "logps/chosen": -4.240745544433594, "logps/rejected": -4.661426544189453, "loss": 4.0939, "rewards/accuracies": 1.0, "rewards/chosen": -42.40745162963867, "rewards/margins": 4.206813812255859, "rewards/rejected": -46.61426544189453, "step": 4865 }, { "epoch": 0.6625816993464052, "grad_norm": 39.26403689743284, "learning_rate": 2.468607544714943e-07, "logits/chosen": 12.884969711303711, "logits/rejected": 13.799625396728516, "logps/chosen": -4.21629524230957, "logps/rejected": -4.400213241577148, "loss": 3.8281, "rewards/accuracies": 0.75, "rewards/chosen": -42.1629524230957, "rewards/margins": 1.8391809463500977, "rewards/rejected": -44.00213623046875, "step": 4866 }, { "epoch": 0.6627178649237473, "grad_norm": 47.76230655870717, "learning_rate": 2.466851180468759e-07, "logits/chosen": 13.505232810974121, "logits/rejected": 13.934381484985352, "logps/chosen": -4.35368537902832, "logps/rejected": -4.748697280883789, "loss": 3.9954, "rewards/accuracies": 0.75, "rewards/chosen": -43.5368537902832, "rewards/margins": 3.9501142501831055, "rewards/rejected": -47.486968994140625, "step": 4867 }, { "epoch": 0.6628540305010894, "grad_norm": 42.423973993206026, "learning_rate": 2.4650951626502247e-07, "logits/chosen": 13.913315773010254, "logits/rejected": 13.401134490966797, "logps/chosen": -4.564023494720459, "logps/rejected": -4.264432907104492, "loss": 4.5249, "rewards/accuracies": 0.0, "rewards/chosen": -45.640235900878906, "rewards/margins": -2.995903968811035, "rewards/rejected": -42.64433288574219, "step": 4868 }, { "epoch": 0.6629901960784313, "grad_norm": 40.5124268666973, "learning_rate": 2.463339491656125e-07, "logits/chosen": 13.379480361938477, "logits/rejected": 13.222529411315918, "logps/chosen": -3.9618566036224365, "logps/rejected": -4.157569885253906, "loss": 3.4674, "rewards/accuracies": 0.5, "rewards/chosen": -39.618568420410156, "rewards/margins": 1.957137107849121, "rewards/rejected": -41.57570266723633, "step": 4869 }, { "epoch": 0.6631263616557734, "grad_norm": 44.4695033012291, "learning_rate": 2.4615841678831705e-07, "logits/chosen": 13.451008796691895, "logits/rejected": 13.011791229248047, "logps/chosen": -4.188054084777832, "logps/rejected": -4.2668023109436035, "loss": 3.8831, "rewards/accuracies": 0.5, "rewards/chosen": -41.88053894042969, "rewards/margins": 0.7874822616577148, "rewards/rejected": -42.668025970458984, "step": 4870 }, { "epoch": 0.6632625272331155, "grad_norm": 42.570215386988636, "learning_rate": 2.45982919172799e-07, "logits/chosen": 14.459634780883789, "logits/rejected": 13.498783111572266, "logps/chosen": -4.613310813903809, "logps/rejected": -4.579090118408203, "loss": 3.5758, "rewards/accuracies": 0.5, "rewards/chosen": -46.13311004638672, "rewards/margins": -0.3422088623046875, "rewards/rejected": -45.7909049987793, "step": 4871 }, { "epoch": 0.6633986928104575, "grad_norm": 42.6408516641532, "learning_rate": 2.4580745635871336e-07, "logits/chosen": 13.312277793884277, "logits/rejected": 13.778934478759766, "logps/chosen": -4.466326713562012, "logps/rejected": -4.444063663482666, "loss": 3.8236, "rewards/accuracies": 0.5, "rewards/chosen": -44.66326904296875, "rewards/margins": -0.22263622283935547, "rewards/rejected": -44.440635681152344, "step": 4872 }, { "epoch": 0.6635348583877996, "grad_norm": 39.10454494093548, "learning_rate": 2.4563202838570763e-07, "logits/chosen": 13.226156234741211, "logits/rejected": 13.861581802368164, "logps/chosen": -4.371176242828369, "logps/rejected": -4.651419639587402, "loss": 4.0634, "rewards/accuracies": 0.75, "rewards/chosen": -43.711761474609375, "rewards/margins": 2.802432060241699, "rewards/rejected": -46.514190673828125, "step": 4873 }, { "epoch": 0.6636710239651417, "grad_norm": 39.198590998844764, "learning_rate": 2.4545663529342116e-07, "logits/chosen": 13.937999725341797, "logits/rejected": 14.034158706665039, "logps/chosen": -4.30540132522583, "logps/rejected": -4.471070289611816, "loss": 4.2858, "rewards/accuracies": 0.75, "rewards/chosen": -43.054012298583984, "rewards/margins": 1.6566944122314453, "rewards/rejected": -44.71070861816406, "step": 4874 }, { "epoch": 0.6638071895424836, "grad_norm": 40.15651518464676, "learning_rate": 2.4528127712148523e-07, "logits/chosen": 13.794139862060547, "logits/rejected": 14.394777297973633, "logps/chosen": -4.351389408111572, "logps/rejected": -4.642669677734375, "loss": 3.9088, "rewards/accuracies": 0.5, "rewards/chosen": -43.513893127441406, "rewards/margins": 2.9128026962280273, "rewards/rejected": -46.42669677734375, "step": 4875 }, { "epoch": 0.6639433551198257, "grad_norm": 36.88363041252797, "learning_rate": 2.451059539095237e-07, "logits/chosen": 13.081636428833008, "logits/rejected": 12.685757637023926, "logps/chosen": -4.099449634552002, "logps/rejected": -4.470515251159668, "loss": 3.6914, "rewards/accuracies": 0.75, "rewards/chosen": -40.9944953918457, "rewards/margins": 3.71065616607666, "rewards/rejected": -44.70515441894531, "step": 4876 }, { "epoch": 0.6640795206971678, "grad_norm": 44.77111874508409, "learning_rate": 2.449306656971524e-07, "logits/chosen": 12.643003463745117, "logits/rejected": 13.698017120361328, "logps/chosen": -3.9021944999694824, "logps/rejected": -4.309817314147949, "loss": 4.2995, "rewards/accuracies": 0.75, "rewards/chosen": -39.021942138671875, "rewards/margins": 4.076229095458984, "rewards/rejected": -43.098175048828125, "step": 4877 }, { "epoch": 0.6642156862745098, "grad_norm": 47.69667764538896, "learning_rate": 2.447554125239789e-07, "logits/chosen": 13.32286262512207, "logits/rejected": 13.977788925170898, "logps/chosen": -4.223593711853027, "logps/rejected": -4.413881778717041, "loss": 4.5252, "rewards/accuracies": 0.75, "rewards/chosen": -42.235939025878906, "rewards/margins": 1.9028816223144531, "rewards/rejected": -44.138816833496094, "step": 4878 }, { "epoch": 0.6643518518518519, "grad_norm": 43.262607779639914, "learning_rate": 2.4458019442960315e-07, "logits/chosen": 13.746910095214844, "logits/rejected": 13.826690673828125, "logps/chosen": -4.471139430999756, "logps/rejected": -4.6729021072387695, "loss": 3.6861, "rewards/accuracies": 1.0, "rewards/chosen": -44.711395263671875, "rewards/margins": 2.0176219940185547, "rewards/rejected": -46.72901916503906, "step": 4879 }, { "epoch": 0.664488017429194, "grad_norm": 36.59259769678959, "learning_rate": 2.4440501145361734e-07, "logits/chosen": 12.616601943969727, "logits/rejected": 13.184237480163574, "logps/chosen": -3.940730094909668, "logps/rejected": -4.266572952270508, "loss": 3.9835, "rewards/accuracies": 1.0, "rewards/chosen": -39.40730285644531, "rewards/margins": 3.258429527282715, "rewards/rejected": -42.66572952270508, "step": 4880 }, { "epoch": 0.6646241830065359, "grad_norm": 41.50262894179587, "learning_rate": 2.442298636356052e-07, "logits/chosen": 13.347366333007812, "logits/rejected": 13.683242797851562, "logps/chosen": -4.1960601806640625, "logps/rejected": -4.508625030517578, "loss": 3.81, "rewards/accuracies": 0.75, "rewards/chosen": -41.960601806640625, "rewards/margins": 3.1256494522094727, "rewards/rejected": -45.08625030517578, "step": 4881 }, { "epoch": 0.664760348583878, "grad_norm": 39.580831653560615, "learning_rate": 2.44054751015143e-07, "logits/chosen": 12.510425567626953, "logits/rejected": 13.253423690795898, "logps/chosen": -3.8623828887939453, "logps/rejected": -4.063836097717285, "loss": 4.0628, "rewards/accuracies": 0.75, "rewards/chosen": -38.62383270263672, "rewards/margins": 2.014529228210449, "rewards/rejected": -40.63835906982422, "step": 4882 }, { "epoch": 0.6648965141612201, "grad_norm": 38.67739736061304, "learning_rate": 2.4387967363179903e-07, "logits/chosen": 13.72451400756836, "logits/rejected": 13.907447814941406, "logps/chosen": -4.208003997802734, "logps/rejected": -4.393013954162598, "loss": 3.8252, "rewards/accuracies": 0.75, "rewards/chosen": -42.080039978027344, "rewards/margins": 1.85009765625, "rewards/rejected": -43.93013381958008, "step": 4883 }, { "epoch": 0.6650326797385621, "grad_norm": 43.137924601740224, "learning_rate": 2.437046315251331e-07, "logits/chosen": 13.699214935302734, "logits/rejected": 13.523942947387695, "logps/chosen": -4.4225754737854, "logps/rejected": -4.274443626403809, "loss": 4.3233, "rewards/accuracies": 0.5, "rewards/chosen": -44.22575378417969, "rewards/margins": -1.4813194274902344, "rewards/rejected": -42.74443817138672, "step": 4884 }, { "epoch": 0.6651688453159041, "grad_norm": 40.69928011970485, "learning_rate": 2.4352962473469766e-07, "logits/chosen": 12.905139923095703, "logits/rejected": 14.316652297973633, "logps/chosen": -4.0424299240112305, "logps/rejected": -4.598393440246582, "loss": 4.0678, "rewards/accuracies": 0.75, "rewards/chosen": -40.42430114746094, "rewards/margins": 5.559628486633301, "rewards/rejected": -45.98393249511719, "step": 4885 }, { "epoch": 0.6653050108932462, "grad_norm": 41.11648632115058, "learning_rate": 2.433546533000371e-07, "logits/chosen": 14.17358112335205, "logits/rejected": 12.966264724731445, "logps/chosen": -4.440393447875977, "logps/rejected": -4.398529529571533, "loss": 3.5782, "rewards/accuracies": 0.25, "rewards/chosen": -44.40393829345703, "rewards/margins": -0.4186420440673828, "rewards/rejected": -43.98529052734375, "step": 4886 }, { "epoch": 0.6654411764705882, "grad_norm": 40.012251894875234, "learning_rate": 2.431797172606872e-07, "logits/chosen": 12.160344123840332, "logits/rejected": 13.513191223144531, "logps/chosen": -3.9704160690307617, "logps/rejected": -4.161637783050537, "loss": 4.0547, "rewards/accuracies": 0.5, "rewards/chosen": -39.704158782958984, "rewards/margins": 1.9122209548950195, "rewards/rejected": -41.61638259887695, "step": 4887 }, { "epoch": 0.6655773420479303, "grad_norm": 36.48752657559778, "learning_rate": 2.430048166561766e-07, "logits/chosen": 12.801403999328613, "logits/rejected": 12.150459289550781, "logps/chosen": -4.031839370727539, "logps/rejected": -4.023120880126953, "loss": 4.2519, "rewards/accuracies": 0.75, "rewards/chosen": -40.31839370727539, "rewards/margins": -0.08718681335449219, "rewards/rejected": -40.23120880126953, "step": 4888 }, { "epoch": 0.6657135076252724, "grad_norm": 43.1861308741953, "learning_rate": 2.428299515260255e-07, "logits/chosen": 13.781045913696289, "logits/rejected": 13.712265014648438, "logps/chosen": -4.5786638259887695, "logps/rejected": -4.430285453796387, "loss": 3.7738, "rewards/accuracies": 0.5, "rewards/chosen": -45.78663635253906, "rewards/margins": -1.4837846755981445, "rewards/rejected": -44.302852630615234, "step": 4889 }, { "epoch": 0.6658496732026143, "grad_norm": 45.11220602937502, "learning_rate": 2.426551219097459e-07, "logits/chosen": 13.375720977783203, "logits/rejected": 13.776820182800293, "logps/chosen": -4.148716926574707, "logps/rejected": -4.443337440490723, "loss": 4.2434, "rewards/accuracies": 1.0, "rewards/chosen": -41.48716735839844, "rewards/margins": 2.946208953857422, "rewards/rejected": -44.43337631225586, "step": 4890 }, { "epoch": 0.6659858387799564, "grad_norm": 41.726726858480376, "learning_rate": 2.4248032784684216e-07, "logits/chosen": 12.121271133422852, "logits/rejected": 13.424947738647461, "logps/chosen": -3.8015551567077637, "logps/rejected": -4.337018013000488, "loss": 4.049, "rewards/accuracies": 1.0, "rewards/chosen": -38.01555252075195, "rewards/margins": 5.3546295166015625, "rewards/rejected": -43.37017822265625, "step": 4891 }, { "epoch": 0.6661220043572985, "grad_norm": 39.83132408330608, "learning_rate": 2.423055693768105e-07, "logits/chosen": 13.17129898071289, "logits/rejected": 13.021276473999023, "logps/chosen": -3.9414291381835938, "logps/rejected": -4.1702423095703125, "loss": 3.8608, "rewards/accuracies": 1.0, "rewards/chosen": -39.41429138183594, "rewards/margins": 2.2881288528442383, "rewards/rejected": -41.70241928100586, "step": 4892 }, { "epoch": 0.6662581699346405, "grad_norm": 37.70960322526304, "learning_rate": 2.4213084653913886e-07, "logits/chosen": 13.872320175170898, "logits/rejected": 13.816057205200195, "logps/chosen": -4.327602863311768, "logps/rejected": -4.541011810302734, "loss": 3.9434, "rewards/accuracies": 0.5, "rewards/chosen": -43.276031494140625, "rewards/margins": 2.134087562561035, "rewards/rejected": -45.41011428833008, "step": 4893 }, { "epoch": 0.6663943355119826, "grad_norm": 37.51358191908684, "learning_rate": 2.419561593733074e-07, "logits/chosen": 13.376388549804688, "logits/rejected": 13.76553726196289, "logps/chosen": -4.176200866699219, "logps/rejected": -4.444461822509766, "loss": 4.2842, "rewards/accuracies": 0.75, "rewards/chosen": -41.76200866699219, "rewards/margins": 2.682607650756836, "rewards/rejected": -44.444618225097656, "step": 4894 }, { "epoch": 0.6665305010893247, "grad_norm": 37.02792522842502, "learning_rate": 2.417815079187883e-07, "logits/chosen": 12.932134628295898, "logits/rejected": 13.120559692382812, "logps/chosen": -4.021718978881836, "logps/rejected": -4.291897773742676, "loss": 3.4143, "rewards/accuracies": 0.5, "rewards/chosen": -40.21718978881836, "rewards/margins": 2.7017908096313477, "rewards/rejected": -42.91897964477539, "step": 4895 }, { "epoch": 0.6666666666666666, "grad_norm": 34.875627997748396, "learning_rate": 2.416068922150451e-07, "logits/chosen": 13.17052936553955, "logits/rejected": 12.73643684387207, "logps/chosen": -4.028583526611328, "logps/rejected": -3.8354718685150146, "loss": 4.0436, "rewards/accuracies": 0.5, "rewards/chosen": -40.28583526611328, "rewards/margins": -1.9311132431030273, "rewards/rejected": -38.35471725463867, "step": 4896 }, { "epoch": 0.6668028322440087, "grad_norm": 38.01883830004778, "learning_rate": 2.4143231230153397e-07, "logits/chosen": 12.746084213256836, "logits/rejected": 13.398025512695312, "logps/chosen": -4.042506217956543, "logps/rejected": -4.520597457885742, "loss": 3.6723, "rewards/accuracies": 1.0, "rewards/chosen": -40.42506408691406, "rewards/margins": 4.780911445617676, "rewards/rejected": -45.20597839355469, "step": 4897 }, { "epoch": 0.6669389978213508, "grad_norm": 43.954037174070564, "learning_rate": 2.4125776821770275e-07, "logits/chosen": 13.247049331665039, "logits/rejected": 13.810266494750977, "logps/chosen": -4.16709041595459, "logps/rejected": -4.324925899505615, "loss": 3.9962, "rewards/accuracies": 0.75, "rewards/chosen": -41.6709098815918, "rewards/margins": 1.578352928161621, "rewards/rejected": -43.24925994873047, "step": 4898 }, { "epoch": 0.6670751633986928, "grad_norm": 47.49700872471015, "learning_rate": 2.4108326000299077e-07, "logits/chosen": 14.514120101928711, "logits/rejected": 13.623897552490234, "logps/chosen": -4.599594593048096, "logps/rejected": -4.427738189697266, "loss": 4.2706, "rewards/accuracies": 0.25, "rewards/chosen": -45.99594497680664, "rewards/margins": -1.7185592651367188, "rewards/rejected": -44.27738571166992, "step": 4899 }, { "epoch": 0.6672113289760349, "grad_norm": 42.62096781786457, "learning_rate": 2.409087876968298e-07, "logits/chosen": 13.298452377319336, "logits/rejected": 14.061710357666016, "logps/chosen": -4.101459503173828, "logps/rejected": -4.461353778839111, "loss": 3.482, "rewards/accuracies": 0.75, "rewards/chosen": -41.01459503173828, "rewards/margins": 3.5989437103271484, "rewards/rejected": -44.61354064941406, "step": 4900 }, { "epoch": 0.6673474945533769, "grad_norm": 42.97503980141237, "learning_rate": 2.4073435133864353e-07, "logits/chosen": 12.747114181518555, "logits/rejected": 13.894346237182617, "logps/chosen": -4.21614408493042, "logps/rejected": -4.400963306427002, "loss": 4.0157, "rewards/accuracies": 0.75, "rewards/chosen": -42.16143798828125, "rewards/margins": 1.8481950759887695, "rewards/rejected": -44.00963592529297, "step": 4901 }, { "epoch": 0.6674836601307189, "grad_norm": 40.53029360584712, "learning_rate": 2.4055995096784696e-07, "logits/chosen": 13.274866104125977, "logits/rejected": 14.241742134094238, "logps/chosen": -4.1322126388549805, "logps/rejected": -4.504911422729492, "loss": 4.0287, "rewards/accuracies": 1.0, "rewards/chosen": -41.32212448120117, "rewards/margins": 3.726987838745117, "rewards/rejected": -45.04911422729492, "step": 4902 }, { "epoch": 0.667619825708061, "grad_norm": 42.52799007176362, "learning_rate": 2.4038558662384736e-07, "logits/chosen": 13.097719192504883, "logits/rejected": 13.30586051940918, "logps/chosen": -4.043352127075195, "logps/rejected": -4.332232475280762, "loss": 4.1227, "rewards/accuracies": 0.75, "rewards/chosen": -40.43352127075195, "rewards/margins": 2.888800621032715, "rewards/rejected": -43.322322845458984, "step": 4903 }, { "epoch": 0.6677559912854031, "grad_norm": 89.93532510295223, "learning_rate": 2.4021125834604394e-07, "logits/chosen": 12.249614715576172, "logits/rejected": 12.864141464233398, "logps/chosen": -4.143789291381836, "logps/rejected": -4.335935592651367, "loss": 3.6612, "rewards/accuracies": 0.5, "rewards/chosen": -41.437896728515625, "rewards/margins": 1.921462059020996, "rewards/rejected": -43.35935974121094, "step": 4904 }, { "epoch": 0.6678921568627451, "grad_norm": 43.379693686908894, "learning_rate": 2.400369661738275e-07, "logits/chosen": 13.503941535949707, "logits/rejected": 12.673093795776367, "logps/chosen": -4.142001152038574, "logps/rejected": -4.185891151428223, "loss": 4.4433, "rewards/accuracies": 0.5, "rewards/chosen": -41.42000961303711, "rewards/margins": 0.4389009475708008, "rewards/rejected": -41.858909606933594, "step": 4905 }, { "epoch": 0.6680283224400871, "grad_norm": 38.5486174365372, "learning_rate": 2.3986271014658076e-07, "logits/chosen": 12.098138809204102, "logits/rejected": 12.48018741607666, "logps/chosen": -4.030658721923828, "logps/rejected": -4.149990558624268, "loss": 4.2704, "rewards/accuracies": 0.75, "rewards/chosen": -40.30658721923828, "rewards/margins": 1.1933174133300781, "rewards/rejected": -41.499908447265625, "step": 4906 }, { "epoch": 0.6681644880174292, "grad_norm": 41.30235906776839, "learning_rate": 2.396884903036785e-07, "logits/chosen": 12.79210090637207, "logits/rejected": 13.341592788696289, "logps/chosen": -4.007331848144531, "logps/rejected": -4.201887130737305, "loss": 4.0808, "rewards/accuracies": 0.5, "rewards/chosen": -40.07331848144531, "rewards/margins": 1.9455556869506836, "rewards/rejected": -42.01887130737305, "step": 4907 }, { "epoch": 0.6683006535947712, "grad_norm": 36.45892728019104, "learning_rate": 2.3951430668448686e-07, "logits/chosen": 13.014259338378906, "logits/rejected": 13.471771240234375, "logps/chosen": -3.9295852184295654, "logps/rejected": -4.574204921722412, "loss": 3.7891, "rewards/accuracies": 0.75, "rewards/chosen": -39.29584884643555, "rewards/margins": 6.446197509765625, "rewards/rejected": -45.74204635620117, "step": 4908 }, { "epoch": 0.6684368191721133, "grad_norm": 36.856237965379016, "learning_rate": 2.3934015932836425e-07, "logits/chosen": 12.25615119934082, "logits/rejected": 13.370352745056152, "logps/chosen": -3.892073631286621, "logps/rejected": -4.271997451782227, "loss": 3.5009, "rewards/accuracies": 1.0, "rewards/chosen": -38.920738220214844, "rewards/margins": 3.799241065979004, "rewards/rejected": -42.71997833251953, "step": 4909 }, { "epoch": 0.6685729847494554, "grad_norm": 40.2778611634866, "learning_rate": 2.3916604827466076e-07, "logits/chosen": 14.080062866210938, "logits/rejected": 13.694249153137207, "logps/chosen": -4.2744622230529785, "logps/rejected": -4.5768280029296875, "loss": 4.4117, "rewards/accuracies": 0.75, "rewards/chosen": -42.74462127685547, "rewards/margins": 3.0236549377441406, "rewards/rejected": -45.768280029296875, "step": 4910 }, { "epoch": 0.6687091503267973, "grad_norm": 44.47164020565725, "learning_rate": 2.3899197356271804e-07, "logits/chosen": 13.371755599975586, "logits/rejected": 14.45034408569336, "logps/chosen": -4.434541702270508, "logps/rejected": -4.455202102661133, "loss": 3.3907, "rewards/accuracies": 0.75, "rewards/chosen": -44.34541702270508, "rewards/margins": 0.20660400390625, "rewards/rejected": -44.552024841308594, "step": 4911 }, { "epoch": 0.6688453159041394, "grad_norm": 39.95061052166079, "learning_rate": 2.3881793523186975e-07, "logits/chosen": 13.35377311706543, "logits/rejected": 13.29137897491455, "logps/chosen": -4.056880474090576, "logps/rejected": -4.438952445983887, "loss": 3.7962, "rewards/accuracies": 0.75, "rewards/chosen": -40.568809509277344, "rewards/margins": 3.8207168579101562, "rewards/rejected": -44.3895263671875, "step": 4912 }, { "epoch": 0.6689814814814815, "grad_norm": 40.62316331472955, "learning_rate": 2.3864393332144143e-07, "logits/chosen": 13.995685577392578, "logits/rejected": 13.334367752075195, "logps/chosen": -4.547989845275879, "logps/rejected": -4.310540199279785, "loss": 3.8489, "rewards/accuracies": 0.25, "rewards/chosen": -45.479896545410156, "rewards/margins": -2.3744916915893555, "rewards/rejected": -43.105403900146484, "step": 4913 }, { "epoch": 0.6691176470588235, "grad_norm": 41.57529984547814, "learning_rate": 2.384699678707502e-07, "logits/chosen": 12.994915008544922, "logits/rejected": 13.260377883911133, "logps/chosen": -4.242356300354004, "logps/rejected": -4.220087051391602, "loss": 4.3614, "rewards/accuracies": 0.25, "rewards/chosen": -42.423561096191406, "rewards/margins": -0.22269535064697266, "rewards/rejected": -42.20086669921875, "step": 4914 }, { "epoch": 0.6692538126361656, "grad_norm": 38.018293334908556, "learning_rate": 2.382960389191048e-07, "logits/chosen": 13.15289306640625, "logits/rejected": 14.156556129455566, "logps/chosen": -4.129151344299316, "logps/rejected": -4.685421466827393, "loss": 3.7138, "rewards/accuracies": 0.75, "rewards/chosen": -41.29151153564453, "rewards/margins": 5.562702178955078, "rewards/rejected": -46.85421371459961, "step": 4915 }, { "epoch": 0.6693899782135077, "grad_norm": 39.82947487865503, "learning_rate": 2.3812214650580622e-07, "logits/chosen": 13.597378730773926, "logits/rejected": 13.449670791625977, "logps/chosen": -4.159684181213379, "logps/rejected": -4.1844072341918945, "loss": 4.3597, "rewards/accuracies": 0.25, "rewards/chosen": -41.596839904785156, "rewards/margins": 0.24723148345947266, "rewards/rejected": -41.84407424926758, "step": 4916 }, { "epoch": 0.6695261437908496, "grad_norm": 40.470736052745046, "learning_rate": 2.3794829067014671e-07, "logits/chosen": 12.57502555847168, "logits/rejected": 12.747024536132812, "logps/chosen": -4.038156509399414, "logps/rejected": -4.2016143798828125, "loss": 3.5929, "rewards/accuracies": 0.5, "rewards/chosen": -40.381561279296875, "rewards/margins": 1.6345796585083008, "rewards/rejected": -42.016143798828125, "step": 4917 }, { "epoch": 0.6696623093681917, "grad_norm": 38.55194107306843, "learning_rate": 2.377744714514103e-07, "logits/chosen": 13.151594161987305, "logits/rejected": 13.415809631347656, "logps/chosen": -3.8494396209716797, "logps/rejected": -4.167049407958984, "loss": 3.8656, "rewards/accuracies": 0.5, "rewards/chosen": -38.49439239501953, "rewards/margins": 3.176100730895996, "rewards/rejected": -41.670494079589844, "step": 4918 }, { "epoch": 0.6697984749455338, "grad_norm": 38.965320652978015, "learning_rate": 2.3760068888887322e-07, "logits/chosen": 13.080465316772461, "logits/rejected": 13.461555480957031, "logps/chosen": -3.9396629333496094, "logps/rejected": -4.413290023803711, "loss": 4.0791, "rewards/accuracies": 0.75, "rewards/chosen": -39.396629333496094, "rewards/margins": 4.736272811889648, "rewards/rejected": -44.13290023803711, "step": 4919 }, { "epoch": 0.6699346405228758, "grad_norm": 36.04114634554208, "learning_rate": 2.3742694302180274e-07, "logits/chosen": 13.298447608947754, "logits/rejected": 13.021343231201172, "logps/chosen": -4.049065113067627, "logps/rejected": -4.175093650817871, "loss": 3.3718, "rewards/accuracies": 0.75, "rewards/chosen": -40.49065399169922, "rewards/margins": 1.2602853775024414, "rewards/rejected": -41.750938415527344, "step": 4920 }, { "epoch": 0.6700708061002179, "grad_norm": 39.111423842615366, "learning_rate": 2.3725323388945843e-07, "logits/chosen": 13.292187690734863, "logits/rejected": 13.169620513916016, "logps/chosen": -4.468936443328857, "logps/rejected": -4.32743501663208, "loss": 3.9664, "rewards/accuracies": 0.5, "rewards/chosen": -44.68936538696289, "rewards/margins": -1.4150152206420898, "rewards/rejected": -43.274349212646484, "step": 4921 }, { "epoch": 0.6702069716775599, "grad_norm": 41.00076742411749, "learning_rate": 2.3707956153109124e-07, "logits/chosen": 13.628862380981445, "logits/rejected": 14.164094924926758, "logps/chosen": -4.482698917388916, "logps/rejected": -4.817996978759766, "loss": 4.1193, "rewards/accuracies": 1.0, "rewards/chosen": -44.826988220214844, "rewards/margins": 3.3529815673828125, "rewards/rejected": -48.179969787597656, "step": 4922 }, { "epoch": 0.6703431372549019, "grad_norm": 39.258362704261884, "learning_rate": 2.369059259859437e-07, "logits/chosen": 13.17496109008789, "logits/rejected": 13.605205535888672, "logps/chosen": -3.8577470779418945, "logps/rejected": -4.265807628631592, "loss": 3.693, "rewards/accuracies": 0.75, "rewards/chosen": -38.57746887207031, "rewards/margins": 4.080605506896973, "rewards/rejected": -42.658077239990234, "step": 4923 }, { "epoch": 0.670479302832244, "grad_norm": 42.7157499256614, "learning_rate": 2.3673232729325043e-07, "logits/chosen": 13.505073547363281, "logits/rejected": 13.787788391113281, "logps/chosen": -4.376399993896484, "logps/rejected": -4.679464817047119, "loss": 3.6997, "rewards/accuracies": 0.75, "rewards/chosen": -43.76399612426758, "rewards/margins": 3.030649185180664, "rewards/rejected": -46.794647216796875, "step": 4924 }, { "epoch": 0.6706154684095861, "grad_norm": 40.495253893096724, "learning_rate": 2.365587654922374e-07, "logits/chosen": 13.44428539276123, "logits/rejected": 13.580131530761719, "logps/chosen": -4.192582130432129, "logps/rejected": -4.233584403991699, "loss": 3.9023, "rewards/accuracies": 0.5, "rewards/chosen": -41.92582321166992, "rewards/margins": 0.41002368927001953, "rewards/rejected": -42.335845947265625, "step": 4925 }, { "epoch": 0.670751633986928, "grad_norm": 44.80867400910084, "learning_rate": 2.3638524062212223e-07, "logits/chosen": 12.97348403930664, "logits/rejected": 12.899170875549316, "logps/chosen": -4.147602081298828, "logps/rejected": -4.220792770385742, "loss": 3.9743, "rewards/accuracies": 0.5, "rewards/chosen": -41.47602081298828, "rewards/margins": 0.731907844543457, "rewards/rejected": -42.20793151855469, "step": 4926 }, { "epoch": 0.6708877995642701, "grad_norm": 48.73673973697857, "learning_rate": 2.3621175272211443e-07, "logits/chosen": 12.798633575439453, "logits/rejected": 13.295235633850098, "logps/chosen": -3.9964404106140137, "logps/rejected": -4.275314807891846, "loss": 4.3055, "rewards/accuracies": 1.0, "rewards/chosen": -39.96440505981445, "rewards/margins": 2.7887449264526367, "rewards/rejected": -42.753150939941406, "step": 4927 }, { "epoch": 0.6710239651416122, "grad_norm": 100.85175635796823, "learning_rate": 2.3603830183141516e-07, "logits/chosen": 12.970130920410156, "logits/rejected": 13.362478256225586, "logps/chosen": -4.426886558532715, "logps/rejected": -4.614952564239502, "loss": 4.5354, "rewards/accuracies": 0.75, "rewards/chosen": -44.26886749267578, "rewards/margins": 1.880660057067871, "rewards/rejected": -46.1495246887207, "step": 4928 }, { "epoch": 0.6711601307189542, "grad_norm": 40.321654851353124, "learning_rate": 2.3586488798921665e-07, "logits/chosen": 13.35104751586914, "logits/rejected": 13.9188232421875, "logps/chosen": -4.28900671005249, "logps/rejected": -4.469543933868408, "loss": 3.966, "rewards/accuracies": 0.75, "rewards/chosen": -42.89006805419922, "rewards/margins": 1.8053693771362305, "rewards/rejected": -44.695438385009766, "step": 4929 }, { "epoch": 0.6712962962962963, "grad_norm": 45.25769718300974, "learning_rate": 2.3569151123470356e-07, "logits/chosen": 13.376810073852539, "logits/rejected": 13.717482566833496, "logps/chosen": -4.775904655456543, "logps/rejected": -4.895058631896973, "loss": 3.8911, "rewards/accuracies": 0.75, "rewards/chosen": -47.7590446472168, "rewards/margins": 1.1915359497070312, "rewards/rejected": -48.95058059692383, "step": 4930 }, { "epoch": 0.6714324618736384, "grad_norm": 41.76187647010142, "learning_rate": 2.3551817160705183e-07, "logits/chosen": 12.720245361328125, "logits/rejected": 13.535289764404297, "logps/chosen": -3.8391270637512207, "logps/rejected": -4.157207489013672, "loss": 3.855, "rewards/accuracies": 0.75, "rewards/chosen": -38.39126968383789, "rewards/margins": 3.1808090209960938, "rewards/rejected": -41.572078704833984, "step": 4931 }, { "epoch": 0.6715686274509803, "grad_norm": 39.64592121024222, "learning_rate": 2.3534486914542867e-07, "logits/chosen": 13.644289016723633, "logits/rejected": 13.938666343688965, "logps/chosen": -4.211638450622559, "logps/rejected": -4.418583869934082, "loss": 3.7109, "rewards/accuracies": 1.0, "rewards/chosen": -42.11638641357422, "rewards/margins": 2.0694522857666016, "rewards/rejected": -44.18583679199219, "step": 4932 }, { "epoch": 0.6717047930283224, "grad_norm": 38.42904950670155, "learning_rate": 2.3517160388899334e-07, "logits/chosen": 12.85580825805664, "logits/rejected": 14.228158950805664, "logps/chosen": -4.0236430168151855, "logps/rejected": -4.536759376525879, "loss": 3.7291, "rewards/accuracies": 0.75, "rewards/chosen": -40.236427307128906, "rewards/margins": 5.131163597106934, "rewards/rejected": -45.367591857910156, "step": 4933 }, { "epoch": 0.6718409586056645, "grad_norm": 92.00428912845325, "learning_rate": 2.3499837587689685e-07, "logits/chosen": 12.531478881835938, "logits/rejected": 14.084233283996582, "logps/chosen": -4.099097728729248, "logps/rejected": -4.623233795166016, "loss": 3.8915, "rewards/accuracies": 1.0, "rewards/chosen": -40.9909782409668, "rewards/margins": 5.24135684967041, "rewards/rejected": -46.232337951660156, "step": 4934 }, { "epoch": 0.6719771241830066, "grad_norm": 43.12862157357181, "learning_rate": 2.3482518514828103e-07, "logits/chosen": 14.070930480957031, "logits/rejected": 14.543476104736328, "logps/chosen": -3.930233955383301, "logps/rejected": -4.289680480957031, "loss": 4.3961, "rewards/accuracies": 1.0, "rewards/chosen": -39.302337646484375, "rewards/margins": 3.594470977783203, "rewards/rejected": -42.89680480957031, "step": 4935 }, { "epoch": 0.6721132897603486, "grad_norm": 36.232328973702856, "learning_rate": 2.3465203174228e-07, "logits/chosen": 13.897150039672852, "logits/rejected": 13.22000789642334, "logps/chosen": -4.690343856811523, "logps/rejected": -4.573622703552246, "loss": 3.7114, "rewards/accuracies": 0.5, "rewards/chosen": -46.9034423828125, "rewards/margins": -1.1672172546386719, "rewards/rejected": -45.73622131347656, "step": 4936 }, { "epoch": 0.6722494553376906, "grad_norm": 43.66787361785456, "learning_rate": 2.3447891569801929e-07, "logits/chosen": 13.293045997619629, "logits/rejected": 13.076496124267578, "logps/chosen": -4.289612770080566, "logps/rejected": -4.156806945800781, "loss": 3.9421, "rewards/accuracies": 0.5, "rewards/chosen": -42.89612579345703, "rewards/margins": -1.328054428100586, "rewards/rejected": -41.56806945800781, "step": 4937 }, { "epoch": 0.6723856209150327, "grad_norm": 35.46325773437777, "learning_rate": 2.3430583705461565e-07, "logits/chosen": 12.727237701416016, "logits/rejected": 14.06879711151123, "logps/chosen": -4.035923957824707, "logps/rejected": -4.548882961273193, "loss": 3.5507, "rewards/accuracies": 1.0, "rewards/chosen": -40.35923767089844, "rewards/margins": 5.129590034484863, "rewards/rejected": -45.48883056640625, "step": 4938 }, { "epoch": 0.6725217864923747, "grad_norm": 38.009065913715716, "learning_rate": 2.341327958511778e-07, "logits/chosen": 13.046637535095215, "logits/rejected": 13.302786827087402, "logps/chosen": -4.082365989685059, "logps/rejected": -4.372934818267822, "loss": 4.079, "rewards/accuracies": 0.5, "rewards/chosen": -40.82365417480469, "rewards/margins": 2.9056921005249023, "rewards/rejected": -43.72935104370117, "step": 4939 }, { "epoch": 0.6726579520697168, "grad_norm": 39.56604976838384, "learning_rate": 2.3395979212680594e-07, "logits/chosen": 12.641833305358887, "logits/rejected": 12.542794227600098, "logps/chosen": -4.081704139709473, "logps/rejected": -4.096685409545898, "loss": 4.335, "rewards/accuracies": 0.5, "rewards/chosen": -40.817039489746094, "rewards/margins": 0.14981555938720703, "rewards/rejected": -40.96685791015625, "step": 4940 }, { "epoch": 0.6727941176470589, "grad_norm": 42.076587940600234, "learning_rate": 2.3378682592059138e-07, "logits/chosen": 13.327604293823242, "logits/rejected": 13.457964897155762, "logps/chosen": -4.256242752075195, "logps/rejected": -4.326305389404297, "loss": 4.1769, "rewards/accuracies": 0.5, "rewards/chosen": -42.56243133544922, "rewards/margins": 0.70062255859375, "rewards/rejected": -43.26305389404297, "step": 4941 }, { "epoch": 0.6729302832244008, "grad_norm": 41.13558503053033, "learning_rate": 2.3361389727161743e-07, "logits/chosen": 13.131362915039062, "logits/rejected": 13.786177635192871, "logps/chosen": -4.080310821533203, "logps/rejected": -4.3658318519592285, "loss": 4.3093, "rewards/accuracies": 0.75, "rewards/chosen": -40.80310821533203, "rewards/margins": 2.8552093505859375, "rewards/rejected": -43.65831756591797, "step": 4942 }, { "epoch": 0.6730664488017429, "grad_norm": 38.91579393315796, "learning_rate": 2.3344100621895894e-07, "logits/chosen": 12.533937454223633, "logits/rejected": 12.534757614135742, "logps/chosen": -3.853740692138672, "logps/rejected": -3.9679627418518066, "loss": 3.806, "rewards/accuracies": 0.5, "rewards/chosen": -38.53740692138672, "rewards/margins": 1.1422200202941895, "rewards/rejected": -39.67962646484375, "step": 4943 }, { "epoch": 0.673202614379085, "grad_norm": 41.87879241294401, "learning_rate": 2.3326815280168168e-07, "logits/chosen": 12.866392135620117, "logits/rejected": 13.9177885055542, "logps/chosen": -3.8524394035339355, "logps/rejected": -4.4038190841674805, "loss": 3.7847, "rewards/accuracies": 1.0, "rewards/chosen": -38.52439498901367, "rewards/margins": 5.513792037963867, "rewards/rejected": -44.038185119628906, "step": 4944 }, { "epoch": 0.673338779956427, "grad_norm": 37.847241861103456, "learning_rate": 2.3309533705884355e-07, "logits/chosen": 13.740185737609863, "logits/rejected": 14.21971321105957, "logps/chosen": -4.34060001373291, "logps/rejected": -4.5720319747924805, "loss": 3.1112, "rewards/accuracies": 1.0, "rewards/chosen": -43.40599822998047, "rewards/margins": 2.3143205642700195, "rewards/rejected": -45.72032165527344, "step": 4945 }, { "epoch": 0.6734749455337691, "grad_norm": 42.81496758873803, "learning_rate": 2.3292255902949384e-07, "logits/chosen": 13.380810737609863, "logits/rejected": 13.59064769744873, "logps/chosen": -4.202611923217773, "logps/rejected": -4.326764106750488, "loss": 3.6024, "rewards/accuracies": 0.5, "rewards/chosen": -42.026123046875, "rewards/margins": 1.2415199279785156, "rewards/rejected": -43.26763916015625, "step": 4946 }, { "epoch": 0.6736111111111112, "grad_norm": 39.27160006295176, "learning_rate": 2.327498187526728e-07, "logits/chosen": 13.70859146118164, "logits/rejected": 13.661648750305176, "logps/chosen": -4.349617004394531, "logps/rejected": -4.346102237701416, "loss": 3.9279, "rewards/accuracies": 0.5, "rewards/chosen": -43.49617004394531, "rewards/margins": -0.03514671325683594, "rewards/rejected": -43.461021423339844, "step": 4947 }, { "epoch": 0.6737472766884531, "grad_norm": 42.498174584873446, "learning_rate": 2.3257711626741275e-07, "logits/chosen": 12.72642707824707, "logits/rejected": 12.883936882019043, "logps/chosen": -4.209265232086182, "logps/rejected": -4.295578479766846, "loss": 4.1128, "rewards/accuracies": 0.5, "rewards/chosen": -42.0926513671875, "rewards/margins": 0.8631343841552734, "rewards/rejected": -42.95578384399414, "step": 4948 }, { "epoch": 0.6738834422657952, "grad_norm": 38.328027170092604, "learning_rate": 2.3240445161273735e-07, "logits/chosen": 13.627479553222656, "logits/rejected": 13.13992977142334, "logps/chosen": -4.296594142913818, "logps/rejected": -4.262022018432617, "loss": 3.964, "rewards/accuracies": 0.25, "rewards/chosen": -42.9659423828125, "rewards/margins": -0.3457212448120117, "rewards/rejected": -42.62022399902344, "step": 4949 }, { "epoch": 0.6740196078431373, "grad_norm": 41.89044576993032, "learning_rate": 2.322318248276613e-07, "logits/chosen": 12.80429458618164, "logits/rejected": 13.791561126708984, "logps/chosen": -3.98539400100708, "logps/rejected": -4.478819847106934, "loss": 4.5867, "rewards/accuracies": 0.75, "rewards/chosen": -39.853939056396484, "rewards/margins": 4.934259414672852, "rewards/rejected": -44.78820037841797, "step": 4950 }, { "epoch": 0.6741557734204793, "grad_norm": 43.092444460949636, "learning_rate": 2.3205923595119122e-07, "logits/chosen": 13.104654312133789, "logits/rejected": 13.80563735961914, "logps/chosen": -3.956214189529419, "logps/rejected": -4.392328262329102, "loss": 4.4064, "rewards/accuracies": 1.0, "rewards/chosen": -39.56214141845703, "rewards/margins": 4.361142158508301, "rewards/rejected": -43.923282623291016, "step": 4951 }, { "epoch": 0.6742919389978214, "grad_norm": 41.083240678495066, "learning_rate": 2.3188668502232515e-07, "logits/chosen": 12.679129600524902, "logits/rejected": 13.747627258300781, "logps/chosen": -3.822826623916626, "logps/rejected": -4.348001480102539, "loss": 3.805, "rewards/accuracies": 1.0, "rewards/chosen": -38.22826385498047, "rewards/margins": 5.2517499923706055, "rewards/rejected": -43.480018615722656, "step": 4952 }, { "epoch": 0.6744281045751634, "grad_norm": 42.661029481584116, "learning_rate": 2.3171417208005207e-07, "logits/chosen": 12.910053253173828, "logits/rejected": 13.254698753356934, "logps/chosen": -3.870077610015869, "logps/rejected": -4.03593111038208, "loss": 4.0766, "rewards/accuracies": 0.75, "rewards/chosen": -38.700775146484375, "rewards/margins": 1.6585330963134766, "rewards/rejected": -40.359310150146484, "step": 4953 }, { "epoch": 0.6745642701525054, "grad_norm": 45.20548835122774, "learning_rate": 2.315416971633529e-07, "logits/chosen": 13.45748519897461, "logits/rejected": 13.320718765258789, "logps/chosen": -4.082826137542725, "logps/rejected": -4.359434127807617, "loss": 4.1116, "rewards/accuracies": 0.75, "rewards/chosen": -40.82826232910156, "rewards/margins": 2.766079902648926, "rewards/rejected": -43.594337463378906, "step": 4954 }, { "epoch": 0.6747004357298475, "grad_norm": 38.80160981449967, "learning_rate": 2.313692603111999e-07, "logits/chosen": 13.931467056274414, "logits/rejected": 12.721430778503418, "logps/chosen": -4.392159461975098, "logps/rejected": -4.186511039733887, "loss": 4.1121, "rewards/accuracies": 0.5, "rewards/chosen": -43.92159652709961, "rewards/margins": -2.0564870834350586, "rewards/rejected": -41.8651123046875, "step": 4955 }, { "epoch": 0.6748366013071896, "grad_norm": 46.17995740556627, "learning_rate": 2.3119686156255622e-07, "logits/chosen": 13.751055717468262, "logits/rejected": 13.790000915527344, "logps/chosen": -4.432290077209473, "logps/rejected": -4.469852447509766, "loss": 4.2748, "rewards/accuracies": 0.75, "rewards/chosen": -44.322898864746094, "rewards/margins": 0.3756284713745117, "rewards/rejected": -44.69852828979492, "step": 4956 }, { "epoch": 0.6749727668845316, "grad_norm": 39.174604706775135, "learning_rate": 2.3102450095637712e-07, "logits/chosen": 12.762162208557129, "logits/rejected": 13.875711441040039, "logps/chosen": -4.134888172149658, "logps/rejected": -4.302210807800293, "loss": 3.6689, "rewards/accuracies": 0.5, "rewards/chosen": -41.348880767822266, "rewards/margins": 1.6732301712036133, "rewards/rejected": -43.02210998535156, "step": 4957 }, { "epoch": 0.6751089324618736, "grad_norm": 41.482762155769194, "learning_rate": 2.3085217853160888e-07, "logits/chosen": 12.946890830993652, "logits/rejected": 13.522164344787598, "logps/chosen": -4.291780471801758, "logps/rejected": -4.2463178634643555, "loss": 3.6798, "rewards/accuracies": 0.5, "rewards/chosen": -42.917808532714844, "rewards/margins": -0.45462894439697266, "rewards/rejected": -42.46317672729492, "step": 4958 }, { "epoch": 0.6752450980392157, "grad_norm": 38.001183021685684, "learning_rate": 2.3067989432718896e-07, "logits/chosen": 12.612038612365723, "logits/rejected": 13.934855461120605, "logps/chosen": -4.20914363861084, "logps/rejected": -4.536675930023193, "loss": 4.0731, "rewards/accuracies": 0.75, "rewards/chosen": -42.09143829345703, "rewards/margins": 3.275320053100586, "rewards/rejected": -45.36676025390625, "step": 4959 }, { "epoch": 0.6753812636165577, "grad_norm": 41.43882464158987, "learning_rate": 2.3050764838204652e-07, "logits/chosen": 13.669378280639648, "logits/rejected": 13.343902587890625, "logps/chosen": -4.4706315994262695, "logps/rejected": -4.0665154457092285, "loss": 3.9239, "rewards/accuracies": 0.25, "rewards/chosen": -44.70631790161133, "rewards/margins": -4.041162490844727, "rewards/rejected": -40.66515350341797, "step": 4960 }, { "epoch": 0.6755174291938998, "grad_norm": 44.55759665442543, "learning_rate": 2.3033544073510213e-07, "logits/chosen": 14.341272354125977, "logits/rejected": 13.779598236083984, "logps/chosen": -4.41197395324707, "logps/rejected": -4.390603065490723, "loss": 3.5932, "rewards/accuracies": 0.5, "rewards/chosen": -44.1197395324707, "rewards/margins": -0.21370506286621094, "rewards/rejected": -43.906036376953125, "step": 4961 }, { "epoch": 0.6756535947712419, "grad_norm": 40.67034372181485, "learning_rate": 2.301632714252672e-07, "logits/chosen": 12.844625473022461, "logits/rejected": 13.831127166748047, "logps/chosen": -4.209582328796387, "logps/rejected": -4.672047138214111, "loss": 4.038, "rewards/accuracies": 1.0, "rewards/chosen": -42.095821380615234, "rewards/margins": 4.624650001525879, "rewards/rejected": -46.72047424316406, "step": 4962 }, { "epoch": 0.6757897603485838, "grad_norm": 41.88452852847074, "learning_rate": 2.29991140491445e-07, "logits/chosen": 13.386459350585938, "logits/rejected": 12.338380813598633, "logps/chosen": -4.310650825500488, "logps/rejected": -4.033294200897217, "loss": 4.3887, "rewards/accuracies": 0.25, "rewards/chosen": -43.10650634765625, "rewards/margins": -2.773564338684082, "rewards/rejected": -40.332942962646484, "step": 4963 }, { "epoch": 0.6759259259259259, "grad_norm": 46.768092076944335, "learning_rate": 2.2981904797253002e-07, "logits/chosen": 13.478754997253418, "logits/rejected": 13.89002513885498, "logps/chosen": -4.291598320007324, "logps/rejected": -4.55536413192749, "loss": 4.2222, "rewards/accuracies": 0.5, "rewards/chosen": -42.915985107421875, "rewards/margins": 2.6376523971557617, "rewards/rejected": -45.55363845825195, "step": 4964 }, { "epoch": 0.676062091503268, "grad_norm": 45.02443209504626, "learning_rate": 2.296469939074078e-07, "logits/chosen": 14.396387100219727, "logits/rejected": 13.926675796508789, "logps/chosen": -4.397966384887695, "logps/rejected": -4.411970138549805, "loss": 4.479, "rewards/accuracies": 0.25, "rewards/chosen": -43.97966766357422, "rewards/margins": 0.1400318145751953, "rewards/rejected": -44.11969757080078, "step": 4965 }, { "epoch": 0.67619825708061, "grad_norm": 37.989976683007235, "learning_rate": 2.294749783349554e-07, "logits/chosen": 13.713943481445312, "logits/rejected": 14.113121032714844, "logps/chosen": -4.362071514129639, "logps/rejected": -4.6934967041015625, "loss": 3.7902, "rewards/accuracies": 1.0, "rewards/chosen": -43.6207160949707, "rewards/margins": 3.314253807067871, "rewards/rejected": -46.934967041015625, "step": 4966 }, { "epoch": 0.6763344226579521, "grad_norm": 44.70361670289076, "learning_rate": 2.2930300129404138e-07, "logits/chosen": 13.652780532836914, "logits/rejected": 13.51252555847168, "logps/chosen": -4.545634746551514, "logps/rejected": -4.557656288146973, "loss": 3.8151, "rewards/accuracies": 0.5, "rewards/chosen": -45.45634841918945, "rewards/margins": 0.12021350860595703, "rewards/rejected": -45.576560974121094, "step": 4967 }, { "epoch": 0.6764705882352942, "grad_norm": 37.89972911408226, "learning_rate": 2.2913106282352506e-07, "logits/chosen": 13.08674430847168, "logits/rejected": 13.315250396728516, "logps/chosen": -4.192854881286621, "logps/rejected": -4.181083679199219, "loss": 3.8075, "rewards/accuracies": 0.5, "rewards/chosen": -41.92854309082031, "rewards/margins": -0.1177053451538086, "rewards/rejected": -41.81083679199219, "step": 4968 }, { "epoch": 0.6766067538126361, "grad_norm": 41.403141848070064, "learning_rate": 2.2895916296225755e-07, "logits/chosen": 14.582677841186523, "logits/rejected": 14.684574127197266, "logps/chosen": -4.521269798278809, "logps/rejected": -4.765774726867676, "loss": 4.3099, "rewards/accuracies": 1.0, "rewards/chosen": -45.21269989013672, "rewards/margins": 2.445052146911621, "rewards/rejected": -47.65774917602539, "step": 4969 }, { "epoch": 0.6767429193899782, "grad_norm": 42.281168677586464, "learning_rate": 2.2878730174908116e-07, "logits/chosen": 12.910408973693848, "logits/rejected": 12.158500671386719, "logps/chosen": -3.9646644592285156, "logps/rejected": -3.969877004623413, "loss": 4.3744, "rewards/accuracies": 0.5, "rewards/chosen": -39.646644592285156, "rewards/margins": 0.05212593078613281, "rewards/rejected": -39.698768615722656, "step": 4970 }, { "epoch": 0.6768790849673203, "grad_norm": 39.26650435607001, "learning_rate": 2.28615479222829e-07, "logits/chosen": 13.608556747436523, "logits/rejected": 14.258621215820312, "logps/chosen": -4.543118476867676, "logps/rejected": -4.8982391357421875, "loss": 4.219, "rewards/accuracies": 1.0, "rewards/chosen": -45.43118667602539, "rewards/margins": 3.551205635070801, "rewards/rejected": -48.982391357421875, "step": 4971 }, { "epoch": 0.6770152505446623, "grad_norm": 41.24124800048993, "learning_rate": 2.2844369542232598e-07, "logits/chosen": 13.350610733032227, "logits/rejected": 12.747207641601562, "logps/chosen": -4.320433616638184, "logps/rejected": -4.320938587188721, "loss": 3.6656, "rewards/accuracies": 0.75, "rewards/chosen": -43.20433807373047, "rewards/margins": 0.005047798156738281, "rewards/rejected": -43.209388732910156, "step": 4972 }, { "epoch": 0.6771514161220044, "grad_norm": 42.382871890395606, "learning_rate": 2.2827195038638826e-07, "logits/chosen": 13.505454063415527, "logits/rejected": 14.128864288330078, "logps/chosen": -4.32094144821167, "logps/rejected": -4.714899063110352, "loss": 3.8435, "rewards/accuracies": 0.75, "rewards/chosen": -43.20941162109375, "rewards/margins": 3.939577102661133, "rewards/rejected": -47.14898681640625, "step": 4973 }, { "epoch": 0.6772875816993464, "grad_norm": 45.82348534321375, "learning_rate": 2.2810024415382271e-07, "logits/chosen": 13.358112335205078, "logits/rejected": 14.433165550231934, "logps/chosen": -4.221440315246582, "logps/rejected": -4.678542137145996, "loss": 4.1052, "rewards/accuracies": 1.0, "rewards/chosen": -42.21440124511719, "rewards/margins": 4.571018218994141, "rewards/rejected": -46.785423278808594, "step": 4974 }, { "epoch": 0.6774237472766884, "grad_norm": 45.66271390123518, "learning_rate": 2.2792857676342794e-07, "logits/chosen": 12.850638389587402, "logits/rejected": 13.273168563842773, "logps/chosen": -3.7299914360046387, "logps/rejected": -4.212181091308594, "loss": 4.3006, "rewards/accuracies": 0.75, "rewards/chosen": -37.29991149902344, "rewards/margins": 4.821900844573975, "rewards/rejected": -42.12181091308594, "step": 4975 }, { "epoch": 0.6775599128540305, "grad_norm": 38.78391762433479, "learning_rate": 2.2775694825399375e-07, "logits/chosen": 13.030179977416992, "logits/rejected": 13.696657180786133, "logps/chosen": -3.979689121246338, "logps/rejected": -4.132749557495117, "loss": 3.9766, "rewards/accuracies": 0.75, "rewards/chosen": -39.79689025878906, "rewards/margins": 1.530604362487793, "rewards/rejected": -41.32749557495117, "step": 4976 }, { "epoch": 0.6776960784313726, "grad_norm": 42.12087295465012, "learning_rate": 2.2758535866430074e-07, "logits/chosen": 14.044414520263672, "logits/rejected": 13.809249877929688, "logps/chosen": -4.426638603210449, "logps/rejected": -4.31118631362915, "loss": 4.0496, "rewards/accuracies": 0.5, "rewards/chosen": -44.26638412475586, "rewards/margins": -1.1545209884643555, "rewards/rejected": -43.11186218261719, "step": 4977 }, { "epoch": 0.6778322440087146, "grad_norm": 50.60174090563682, "learning_rate": 2.2741380803312115e-07, "logits/chosen": 13.46957778930664, "logits/rejected": 13.956361770629883, "logps/chosen": -4.4231085777282715, "logps/rejected": -4.732943534851074, "loss": 4.2611, "rewards/accuracies": 0.75, "rewards/chosen": -44.23108673095703, "rewards/margins": 3.0983505249023438, "rewards/rejected": -47.329437255859375, "step": 4978 }, { "epoch": 0.6779684095860566, "grad_norm": 44.097181601124475, "learning_rate": 2.2724229639921836e-07, "logits/chosen": 13.48399543762207, "logits/rejected": 13.376222610473633, "logps/chosen": -4.391899108886719, "logps/rejected": -4.474495887756348, "loss": 4.1709, "rewards/accuracies": 0.5, "rewards/chosen": -43.91899108886719, "rewards/margins": 0.8259687423706055, "rewards/rejected": -44.744964599609375, "step": 4979 }, { "epoch": 0.6781045751633987, "grad_norm": 41.83672516227491, "learning_rate": 2.2707082380134656e-07, "logits/chosen": 13.619266510009766, "logits/rejected": 13.95325756072998, "logps/chosen": -4.191530227661133, "logps/rejected": -4.387983798980713, "loss": 4.4137, "rewards/accuracies": 0.75, "rewards/chosen": -41.91529846191406, "rewards/margins": 1.9645376205444336, "rewards/rejected": -43.87983703613281, "step": 4980 }, { "epoch": 0.6782407407407407, "grad_norm": 41.969051092193595, "learning_rate": 2.2689939027825163e-07, "logits/chosen": 13.563741683959961, "logits/rejected": 14.658437728881836, "logps/chosen": -4.00307035446167, "logps/rejected": -4.529201984405518, "loss": 3.5447, "rewards/accuracies": 0.75, "rewards/chosen": -40.030704498291016, "rewards/margins": 5.261316299438477, "rewards/rejected": -45.292022705078125, "step": 4981 }, { "epoch": 0.6783769063180828, "grad_norm": 38.48989874229108, "learning_rate": 2.2672799586867043e-07, "logits/chosen": 14.261775970458984, "logits/rejected": 13.794002532958984, "logps/chosen": -4.653382301330566, "logps/rejected": -4.5676655769348145, "loss": 4.2246, "rewards/accuracies": 0.5, "rewards/chosen": -46.53382110595703, "rewards/margins": -0.8571681976318359, "rewards/rejected": -45.67665481567383, "step": 4982 }, { "epoch": 0.6785130718954249, "grad_norm": 38.52150013652426, "learning_rate": 2.265566406113307e-07, "logits/chosen": 14.024913787841797, "logits/rejected": 14.267110824584961, "logps/chosen": -4.249746322631836, "logps/rejected": -4.430027961730957, "loss": 3.7959, "rewards/accuracies": 0.75, "rewards/chosen": -42.497459411621094, "rewards/margins": 1.8028192520141602, "rewards/rejected": -44.3002815246582, "step": 4983 }, { "epoch": 0.6786492374727668, "grad_norm": 39.32817680426428, "learning_rate": 2.2638532454495176e-07, "logits/chosen": 13.00296401977539, "logits/rejected": 13.175000190734863, "logps/chosen": -4.345083236694336, "logps/rejected": -4.424365043640137, "loss": 4.0556, "rewards/accuracies": 0.5, "rewards/chosen": -43.450828552246094, "rewards/margins": 0.7928199768066406, "rewards/rejected": -44.243648529052734, "step": 4984 }, { "epoch": 0.6787854030501089, "grad_norm": 41.83089247752888, "learning_rate": 2.2621404770824398e-07, "logits/chosen": 13.550725936889648, "logits/rejected": 14.312129974365234, "logps/chosen": -4.25447940826416, "logps/rejected": -4.734950065612793, "loss": 4.0867, "rewards/accuracies": 1.0, "rewards/chosen": -42.54479217529297, "rewards/margins": 4.804704666137695, "rewards/rejected": -47.3494987487793, "step": 4985 }, { "epoch": 0.678921568627451, "grad_norm": 51.494225611194224, "learning_rate": 2.2604281013990846e-07, "logits/chosen": 13.755313873291016, "logits/rejected": 13.794614791870117, "logps/chosen": -4.124956130981445, "logps/rejected": -4.582821369171143, "loss": 4.4312, "rewards/accuracies": 1.0, "rewards/chosen": -41.24956130981445, "rewards/margins": 4.578649520874023, "rewards/rejected": -45.82821273803711, "step": 4986 }, { "epoch": 0.679057734204793, "grad_norm": 42.994979201216694, "learning_rate": 2.25871611878638e-07, "logits/chosen": 13.470891952514648, "logits/rejected": 13.434338569641113, "logps/chosen": -4.223642826080322, "logps/rejected": -4.1192426681518555, "loss": 3.702, "rewards/accuracies": 0.25, "rewards/chosen": -42.236427307128906, "rewards/margins": -1.0440006256103516, "rewards/rejected": -41.19242858886719, "step": 4987 }, { "epoch": 0.6791938997821351, "grad_norm": 39.99989417916391, "learning_rate": 2.2570045296311613e-07, "logits/chosen": 13.182533264160156, "logits/rejected": 13.127737045288086, "logps/chosen": -4.256429672241211, "logps/rejected": -4.3897247314453125, "loss": 3.8905, "rewards/accuracies": 0.5, "rewards/chosen": -42.564300537109375, "rewards/margins": 1.33294677734375, "rewards/rejected": -43.897247314453125, "step": 4988 }, { "epoch": 0.6793300653594772, "grad_norm": 38.202588823863856, "learning_rate": 2.2552933343201796e-07, "logits/chosen": 12.70975399017334, "logits/rejected": 13.3775634765625, "logps/chosen": -3.9999818801879883, "logps/rejected": -4.332620620727539, "loss": 3.977, "rewards/accuracies": 0.75, "rewards/chosen": -39.99981689453125, "rewards/margins": 3.3263893127441406, "rewards/rejected": -43.32620620727539, "step": 4989 }, { "epoch": 0.6794662309368191, "grad_norm": 41.8497910613398, "learning_rate": 2.253582533240088e-07, "logits/chosen": 13.251211166381836, "logits/rejected": 13.467212677001953, "logps/chosen": -4.159136772155762, "logps/rejected": -4.489786148071289, "loss": 3.9311, "rewards/accuracies": 0.75, "rewards/chosen": -41.59136962890625, "rewards/margins": 3.306490898132324, "rewards/rejected": -44.89786148071289, "step": 4990 }, { "epoch": 0.6796023965141612, "grad_norm": 40.86967809645212, "learning_rate": 2.2518721267774597e-07, "logits/chosen": 13.439704895019531, "logits/rejected": 14.070022583007812, "logps/chosen": -4.168747425079346, "logps/rejected": -4.310511589050293, "loss": 4.3496, "rewards/accuracies": 0.75, "rewards/chosen": -41.68747329711914, "rewards/margins": 1.4176445007324219, "rewards/rejected": -43.10511779785156, "step": 4991 }, { "epoch": 0.6797385620915033, "grad_norm": 45.28817141219185, "learning_rate": 2.2501621153187762e-07, "logits/chosen": 13.131338119506836, "logits/rejected": 13.791618347167969, "logps/chosen": -4.238438606262207, "logps/rejected": -4.438187599182129, "loss": 4.1404, "rewards/accuracies": 1.0, "rewards/chosen": -42.38438415527344, "rewards/margins": 1.9974899291992188, "rewards/rejected": -44.381874084472656, "step": 4992 }, { "epoch": 0.6798747276688453, "grad_norm": 43.443735825615065, "learning_rate": 2.2484524992504251e-07, "logits/chosen": 13.716025352478027, "logits/rejected": 13.730600357055664, "logps/chosen": -4.321527481079102, "logps/rejected": -4.365349769592285, "loss": 4.2463, "rewards/accuracies": 0.5, "rewards/chosen": -43.215274810791016, "rewards/margins": 0.43822193145751953, "rewards/rejected": -43.65349578857422, "step": 4993 }, { "epoch": 0.6800108932461874, "grad_norm": 41.90921954515491, "learning_rate": 2.2467432789587103e-07, "logits/chosen": 13.739531517028809, "logits/rejected": 13.62101936340332, "logps/chosen": -4.245566368103027, "logps/rejected": -4.356466293334961, "loss": 3.7632, "rewards/accuracies": 0.5, "rewards/chosen": -42.45566177368164, "rewards/margins": 1.108999252319336, "rewards/rejected": -43.564659118652344, "step": 4994 }, { "epoch": 0.6801470588235294, "grad_norm": 45.1205453518669, "learning_rate": 2.2450344548298444e-07, "logits/chosen": 13.78978157043457, "logits/rejected": 13.06533432006836, "logps/chosen": -4.141720771789551, "logps/rejected": -4.268557548522949, "loss": 4.6626, "rewards/accuracies": 0.75, "rewards/chosen": -41.417205810546875, "rewards/margins": 1.2683725357055664, "rewards/rejected": -42.685577392578125, "step": 4995 }, { "epoch": 0.6802832244008714, "grad_norm": 42.60925001114597, "learning_rate": 2.2433260272499513e-07, "logits/chosen": 12.972877502441406, "logits/rejected": 13.221961975097656, "logps/chosen": -4.509316444396973, "logps/rejected": -4.431931018829346, "loss": 4.202, "rewards/accuracies": 0.25, "rewards/chosen": -45.093162536621094, "rewards/margins": -0.7738494873046875, "rewards/rejected": -44.319313049316406, "step": 4996 }, { "epoch": 0.6804193899782135, "grad_norm": 41.16656966272399, "learning_rate": 2.241617996605062e-07, "logits/chosen": 13.828176498413086, "logits/rejected": 13.688586235046387, "logps/chosen": -4.414057731628418, "logps/rejected": -4.750887870788574, "loss": 4.2912, "rewards/accuracies": 0.75, "rewards/chosen": -44.14057540893555, "rewards/margins": 3.368307113647461, "rewards/rejected": -47.508880615234375, "step": 4997 }, { "epoch": 0.6805555555555556, "grad_norm": 49.27457854599534, "learning_rate": 2.2399103632811206e-07, "logits/chosen": 13.280336380004883, "logits/rejected": 13.199195861816406, "logps/chosen": -3.9228296279907227, "logps/rejected": -4.009457588195801, "loss": 3.7353, "rewards/accuracies": 0.5, "rewards/chosen": -39.228294372558594, "rewards/margins": 0.8662843704223633, "rewards/rejected": -40.094581604003906, "step": 4998 }, { "epoch": 0.6806917211328976, "grad_norm": 37.98858488036612, "learning_rate": 2.2382031276639842e-07, "logits/chosen": 14.453033447265625, "logits/rejected": 14.305671691894531, "logps/chosen": -4.264438152313232, "logps/rejected": -4.555366516113281, "loss": 3.8535, "rewards/accuracies": 0.75, "rewards/chosen": -42.644378662109375, "rewards/margins": 2.9092884063720703, "rewards/rejected": -45.55366516113281, "step": 4999 }, { "epoch": 0.6808278867102396, "grad_norm": 41.94811246076298, "learning_rate": 2.2364962901394123e-07, "logits/chosen": 13.95418643951416, "logits/rejected": 14.303840637207031, "logps/chosen": -4.367353916168213, "logps/rejected": -4.350443363189697, "loss": 4.3544, "rewards/accuracies": 0.5, "rewards/chosen": -43.67354202270508, "rewards/margins": -0.1691122055053711, "rewards/rejected": -43.504432678222656, "step": 5000 }, { "epoch": 0.6809640522875817, "grad_norm": 40.78901527217579, "learning_rate": 2.234789851093081e-07, "logits/chosen": 13.165685653686523, "logits/rejected": 13.787588119506836, "logps/chosen": -4.396967887878418, "logps/rejected": -4.640708923339844, "loss": 4.1612, "rewards/accuracies": 1.0, "rewards/chosen": -43.96968078613281, "rewards/margins": 2.437413215637207, "rewards/rejected": -46.40708923339844, "step": 5001 }, { "epoch": 0.6811002178649237, "grad_norm": 51.69896004211716, "learning_rate": 2.2330838109105737e-07, "logits/chosen": 12.834299087524414, "logits/rejected": 13.363394737243652, "logps/chosen": -4.170774459838867, "logps/rejected": -4.564031600952148, "loss": 3.6398, "rewards/accuracies": 1.0, "rewards/chosen": -41.707740783691406, "rewards/margins": 3.9325733184814453, "rewards/rejected": -45.640316009521484, "step": 5002 }, { "epoch": 0.6812363834422658, "grad_norm": 39.31688607065951, "learning_rate": 2.231378169977387e-07, "logits/chosen": 13.472673416137695, "logits/rejected": 13.981507301330566, "logps/chosen": -4.350903511047363, "logps/rejected": -4.5102715492248535, "loss": 3.9274, "rewards/accuracies": 0.75, "rewards/chosen": -43.509037017822266, "rewards/margins": 1.5936784744262695, "rewards/rejected": -45.10271453857422, "step": 5003 }, { "epoch": 0.6813725490196079, "grad_norm": 43.3097706720235, "learning_rate": 2.2296729286789207e-07, "logits/chosen": 13.00706672668457, "logits/rejected": 13.026527404785156, "logps/chosen": -4.151429176330566, "logps/rejected": -4.192720890045166, "loss": 3.6572, "rewards/accuracies": 0.5, "rewards/chosen": -41.51428985595703, "rewards/margins": 0.4129199981689453, "rewards/rejected": -41.927207946777344, "step": 5004 }, { "epoch": 0.6815087145969498, "grad_norm": 40.46508647888254, "learning_rate": 2.2279680874004895e-07, "logits/chosen": 12.87918472290039, "logits/rejected": 12.738926887512207, "logps/chosen": -4.0249857902526855, "logps/rejected": -3.7901792526245117, "loss": 3.9561, "rewards/accuracies": 0.25, "rewards/chosen": -40.24985885620117, "rewards/margins": -2.3480682373046875, "rewards/rejected": -37.901790618896484, "step": 5005 }, { "epoch": 0.6816448801742919, "grad_norm": 42.657144962642704, "learning_rate": 2.2262636465273187e-07, "logits/chosen": 13.024589538574219, "logits/rejected": 13.917949676513672, "logps/chosen": -4.119035243988037, "logps/rejected": -4.41915225982666, "loss": 4.1094, "rewards/accuracies": 0.75, "rewards/chosen": -41.19035339355469, "rewards/margins": 3.001173973083496, "rewards/rejected": -44.191524505615234, "step": 5006 }, { "epoch": 0.681781045751634, "grad_norm": 39.65319402985123, "learning_rate": 2.224559606444537e-07, "logits/chosen": 13.730683326721191, "logits/rejected": 13.946415901184082, "logps/chosen": -4.100929260253906, "logps/rejected": -4.584770679473877, "loss": 4.1546, "rewards/accuracies": 1.0, "rewards/chosen": -41.00929260253906, "rewards/margins": 4.838413238525391, "rewards/rejected": -45.84770965576172, "step": 5007 }, { "epoch": 0.681917211328976, "grad_norm": 39.86175031420708, "learning_rate": 2.222855967537188e-07, "logits/chosen": 14.2041654586792, "logits/rejected": 13.898109436035156, "logps/chosen": -4.743954658508301, "logps/rejected": -4.75688362121582, "loss": 4.0666, "rewards/accuracies": 0.25, "rewards/chosen": -47.439552307128906, "rewards/margins": 0.1292867660522461, "rewards/rejected": -47.5688362121582, "step": 5008 }, { "epoch": 0.6820533769063181, "grad_norm": 38.9134953724926, "learning_rate": 2.2211527301902252e-07, "logits/chosen": 13.90983772277832, "logits/rejected": 14.315377235412598, "logps/chosen": -4.336110591888428, "logps/rejected": -4.475339889526367, "loss": 4.2528, "rewards/accuracies": 0.5, "rewards/chosen": -43.361106872558594, "rewards/margins": 1.3922948837280273, "rewards/rejected": -44.75340270996094, "step": 5009 }, { "epoch": 0.6821895424836601, "grad_norm": 43.98320805657753, "learning_rate": 2.2194498947885055e-07, "logits/chosen": 13.144036293029785, "logits/rejected": 13.657408714294434, "logps/chosen": -4.300394058227539, "logps/rejected": -4.447192192077637, "loss": 3.842, "rewards/accuracies": 0.75, "rewards/chosen": -43.00394058227539, "rewards/margins": 1.4679861068725586, "rewards/rejected": -44.471923828125, "step": 5010 }, { "epoch": 0.6823257080610022, "grad_norm": 41.69247926153457, "learning_rate": 2.2177474617168e-07, "logits/chosen": 13.624702453613281, "logits/rejected": 13.71026611328125, "logps/chosen": -4.155765533447266, "logps/rejected": -4.383963584899902, "loss": 4.2194, "rewards/accuracies": 0.5, "rewards/chosen": -41.557655334472656, "rewards/margins": 2.281982421875, "rewards/rejected": -43.83963394165039, "step": 5011 }, { "epoch": 0.6824618736383442, "grad_norm": 42.409154841352006, "learning_rate": 2.216045431359789e-07, "logits/chosen": 14.370423316955566, "logits/rejected": 13.751537322998047, "logps/chosen": -4.684075355529785, "logps/rejected": -4.48436164855957, "loss": 4.3122, "rewards/accuracies": 0.25, "rewards/chosen": -46.84075164794922, "rewards/margins": -1.9971342086791992, "rewards/rejected": -44.8436164855957, "step": 5012 }, { "epoch": 0.6825980392156863, "grad_norm": 43.8978278075162, "learning_rate": 2.214343804102058e-07, "logits/chosen": 13.822977066040039, "logits/rejected": 13.777944564819336, "logps/chosen": -4.605093955993652, "logps/rejected": -4.531925678253174, "loss": 4.0202, "rewards/accuracies": 0.25, "rewards/chosen": -46.050941467285156, "rewards/margins": -0.7316799163818359, "rewards/rejected": -45.31925964355469, "step": 5013 }, { "epoch": 0.6827342047930284, "grad_norm": 40.8573471581041, "learning_rate": 2.2126425803281048e-07, "logits/chosen": 13.439957618713379, "logits/rejected": 13.684158325195312, "logps/chosen": -4.2599263191223145, "logps/rejected": -4.236257076263428, "loss": 4.024, "rewards/accuracies": 0.5, "rewards/chosen": -42.59926223754883, "rewards/margins": -0.2366952896118164, "rewards/rejected": -42.36256790161133, "step": 5014 }, { "epoch": 0.6828703703703703, "grad_norm": 46.98463970774202, "learning_rate": 2.2109417604223366e-07, "logits/chosen": 13.41465950012207, "logits/rejected": 13.211991310119629, "logps/chosen": -4.368566513061523, "logps/rejected": -4.239809036254883, "loss": 4.346, "rewards/accuracies": 0.5, "rewards/chosen": -43.6856689453125, "rewards/margins": -1.2875728607177734, "rewards/rejected": -42.398094177246094, "step": 5015 }, { "epoch": 0.6830065359477124, "grad_norm": 42.716463885391974, "learning_rate": 2.2092413447690643e-07, "logits/chosen": 13.91878890991211, "logits/rejected": 14.42819881439209, "logps/chosen": -4.463340759277344, "logps/rejected": -4.573822975158691, "loss": 4.2024, "rewards/accuracies": 0.75, "rewards/chosen": -44.63340759277344, "rewards/margins": 1.1048259735107422, "rewards/rejected": -45.73823547363281, "step": 5016 }, { "epoch": 0.6831427015250545, "grad_norm": 40.573232414927446, "learning_rate": 2.2075413337525132e-07, "logits/chosen": 13.59170913696289, "logits/rejected": 14.119364738464355, "logps/chosen": -4.4066243171691895, "logps/rejected": -4.74156379699707, "loss": 4.0478, "rewards/accuracies": 0.75, "rewards/chosen": -44.06624221801758, "rewards/margins": 3.349398612976074, "rewards/rejected": -47.41564178466797, "step": 5017 }, { "epoch": 0.6832788671023965, "grad_norm": 42.42780324254925, "learning_rate": 2.2058417277568157e-07, "logits/chosen": 14.032445907592773, "logits/rejected": 14.119268417358398, "logps/chosen": -4.577078819274902, "logps/rejected": -4.8383965492248535, "loss": 4.4352, "rewards/accuracies": 1.0, "rewards/chosen": -45.770790100097656, "rewards/margins": 2.613173484802246, "rewards/rejected": -48.38396453857422, "step": 5018 }, { "epoch": 0.6834150326797386, "grad_norm": 45.09492785044118, "learning_rate": 2.2041425271660085e-07, "logits/chosen": 12.941164016723633, "logits/rejected": 13.158987045288086, "logps/chosen": -4.172703266143799, "logps/rejected": -4.470229148864746, "loss": 3.4536, "rewards/accuracies": 1.0, "rewards/chosen": -41.72703170776367, "rewards/margins": 2.9752607345581055, "rewards/rejected": -44.702293395996094, "step": 5019 }, { "epoch": 0.6835511982570807, "grad_norm": 48.838699181294565, "learning_rate": 2.2024437323640427e-07, "logits/chosen": 13.451563835144043, "logits/rejected": 13.46609115600586, "logps/chosen": -4.249394416809082, "logps/rejected": -4.611000061035156, "loss": 3.7599, "rewards/accuracies": 1.0, "rewards/chosen": -42.49394226074219, "rewards/margins": 3.616060256958008, "rewards/rejected": -46.11000061035156, "step": 5020 }, { "epoch": 0.6836873638344226, "grad_norm": 39.2276000519796, "learning_rate": 2.2007453437347757e-07, "logits/chosen": 13.963098526000977, "logits/rejected": 13.917725563049316, "logps/chosen": -4.216917514801025, "logps/rejected": -4.361137390136719, "loss": 4.0139, "rewards/accuracies": 0.75, "rewards/chosen": -42.16917419433594, "rewards/margins": 1.4422016143798828, "rewards/rejected": -43.61137771606445, "step": 5021 }, { "epoch": 0.6838235294117647, "grad_norm": 42.80077574170862, "learning_rate": 2.199047361661969e-07, "logits/chosen": 13.302776336669922, "logits/rejected": 13.442911148071289, "logps/chosen": -4.06459379196167, "logps/rejected": -4.444958209991455, "loss": 3.849, "rewards/accuracies": 0.75, "rewards/chosen": -40.645938873291016, "rewards/margins": 3.8036413192749023, "rewards/rejected": -44.44957733154297, "step": 5022 }, { "epoch": 0.6839596949891068, "grad_norm": 37.02378961276401, "learning_rate": 2.1973497865292984e-07, "logits/chosen": 13.425372123718262, "logits/rejected": 13.780284881591797, "logps/chosen": -4.098623275756836, "logps/rejected": -4.578459739685059, "loss": 3.8554, "rewards/accuracies": 1.0, "rewards/chosen": -40.986236572265625, "rewards/margins": 4.798361778259277, "rewards/rejected": -45.78459930419922, "step": 5023 }, { "epoch": 0.6840958605664488, "grad_norm": 41.32235014811329, "learning_rate": 2.1956526187203454e-07, "logits/chosen": 14.994256019592285, "logits/rejected": 14.74650764465332, "logps/chosen": -4.812558174133301, "logps/rejected": -4.9672346115112305, "loss": 4.1528, "rewards/accuracies": 0.5, "rewards/chosen": -48.125579833984375, "rewards/margins": 1.5467605590820312, "rewards/rejected": -49.672340393066406, "step": 5024 }, { "epoch": 0.6842320261437909, "grad_norm": 40.45730084503296, "learning_rate": 2.193955858618597e-07, "logits/chosen": 12.574077606201172, "logits/rejected": 12.616768836975098, "logps/chosen": -4.365527153015137, "logps/rejected": -4.112958908081055, "loss": 4.0049, "rewards/accuracies": 0.25, "rewards/chosen": -43.6552734375, "rewards/margins": -2.525684356689453, "rewards/rejected": -41.12958908081055, "step": 5025 }, { "epoch": 0.684368191721133, "grad_norm": 37.56528853277175, "learning_rate": 2.192259506607451e-07, "logits/chosen": 12.778242111206055, "logits/rejected": 13.83846664428711, "logps/chosen": -4.3068389892578125, "logps/rejected": -4.631358623504639, "loss": 4.1155, "rewards/accuracies": 0.75, "rewards/chosen": -43.068389892578125, "rewards/margins": 3.2451982498168945, "rewards/rejected": -46.31358337402344, "step": 5026 }, { "epoch": 0.6845043572984749, "grad_norm": 44.15132308434973, "learning_rate": 2.190563563070214e-07, "logits/chosen": 13.866601943969727, "logits/rejected": 13.919923782348633, "logps/chosen": -4.627959251403809, "logps/rejected": -4.472896575927734, "loss": 3.9813, "rewards/accuracies": 0.25, "rewards/chosen": -46.27959442138672, "rewards/margins": -1.5506315231323242, "rewards/rejected": -44.72896194458008, "step": 5027 }, { "epoch": 0.684640522875817, "grad_norm": 41.539292481757734, "learning_rate": 2.1888680283900952e-07, "logits/chosen": 13.302159309387207, "logits/rejected": 14.003076553344727, "logps/chosen": -4.345753192901611, "logps/rejected": -4.610883712768555, "loss": 3.7356, "rewards/accuracies": 1.0, "rewards/chosen": -43.45753479003906, "rewards/margins": 2.651301383972168, "rewards/rejected": -46.10883712768555, "step": 5028 }, { "epoch": 0.6847766884531591, "grad_norm": 41.89981168673526, "learning_rate": 2.1871729029502166e-07, "logits/chosen": 13.565690040588379, "logits/rejected": 13.907215118408203, "logps/chosen": -4.279422283172607, "logps/rejected": -4.388872146606445, "loss": 3.896, "rewards/accuracies": 0.75, "rewards/chosen": -42.794219970703125, "rewards/margins": 1.094496726989746, "rewards/rejected": -43.88871765136719, "step": 5029 }, { "epoch": 0.6849128540305011, "grad_norm": 41.89160843588649, "learning_rate": 2.185478187133607e-07, "logits/chosen": 13.600160598754883, "logits/rejected": 14.831604957580566, "logps/chosen": -4.394327640533447, "logps/rejected": -4.738616943359375, "loss": 4.1053, "rewards/accuracies": 0.75, "rewards/chosen": -43.943275451660156, "rewards/margins": 3.4428930282592773, "rewards/rejected": -47.38616943359375, "step": 5030 }, { "epoch": 0.6850490196078431, "grad_norm": 42.12499879276136, "learning_rate": 2.1837838813231984e-07, "logits/chosen": 12.900238990783691, "logits/rejected": 13.538312911987305, "logps/chosen": -4.07705545425415, "logps/rejected": -4.57674503326416, "loss": 3.8413, "rewards/accuracies": 1.0, "rewards/chosen": -40.77055358886719, "rewards/margins": 4.996895790100098, "rewards/rejected": -45.76744842529297, "step": 5031 }, { "epoch": 0.6851851851851852, "grad_norm": 43.66637392011086, "learning_rate": 2.182089985901835e-07, "logits/chosen": 13.407716751098633, "logits/rejected": 13.758161544799805, "logps/chosen": -4.2622551918029785, "logps/rejected": -4.309041500091553, "loss": 4.3707, "rewards/accuracies": 0.5, "rewards/chosen": -42.62255096435547, "rewards/margins": 0.4678630828857422, "rewards/rejected": -43.09041213989258, "step": 5032 }, { "epoch": 0.6853213507625272, "grad_norm": 42.743614642532116, "learning_rate": 2.180396501252268e-07, "logits/chosen": 13.536470413208008, "logits/rejected": 13.650853157043457, "logps/chosen": -4.2621235847473145, "logps/rejected": -4.592800140380859, "loss": 3.9033, "rewards/accuracies": 1.0, "rewards/chosen": -42.621238708496094, "rewards/margins": 3.3067684173583984, "rewards/rejected": -45.92800521850586, "step": 5033 }, { "epoch": 0.6854575163398693, "grad_norm": 40.96031392046798, "learning_rate": 2.17870342775715e-07, "logits/chosen": 12.512065887451172, "logits/rejected": 12.839315414428711, "logps/chosen": -4.231919288635254, "logps/rejected": -4.357069969177246, "loss": 3.4081, "rewards/accuracies": 0.75, "rewards/chosen": -42.31919860839844, "rewards/margins": 1.2515039443969727, "rewards/rejected": -43.570701599121094, "step": 5034 }, { "epoch": 0.6855936819172114, "grad_norm": 37.54030753446383, "learning_rate": 2.1770107657990486e-07, "logits/chosen": 14.254210472106934, "logits/rejected": 14.354803085327148, "logps/chosen": -4.6660003662109375, "logps/rejected": -4.8659563064575195, "loss": 3.6438, "rewards/accuracies": 0.75, "rewards/chosen": -46.660003662109375, "rewards/margins": 1.9995546340942383, "rewards/rejected": -48.65956115722656, "step": 5035 }, { "epoch": 0.6857298474945533, "grad_norm": 39.160259857954436, "learning_rate": 2.175318515760435e-07, "logits/chosen": 12.856311798095703, "logits/rejected": 13.967535018920898, "logps/chosen": -3.756572961807251, "logps/rejected": -4.370994567871094, "loss": 3.5725, "rewards/accuracies": 1.0, "rewards/chosen": -37.565731048583984, "rewards/margins": 6.144213676452637, "rewards/rejected": -43.70994567871094, "step": 5036 }, { "epoch": 0.6858660130718954, "grad_norm": 42.117112195601536, "learning_rate": 2.173626678023684e-07, "logits/chosen": 13.458192825317383, "logits/rejected": 13.263712882995605, "logps/chosen": -4.429783821105957, "logps/rejected": -4.482925891876221, "loss": 3.6839, "rewards/accuracies": 0.5, "rewards/chosen": -44.2978401184082, "rewards/margins": 0.5314197540283203, "rewards/rejected": -44.829261779785156, "step": 5037 }, { "epoch": 0.6860021786492375, "grad_norm": 43.327777436605245, "learning_rate": 2.1719352529710817e-07, "logits/chosen": 13.878326416015625, "logits/rejected": 14.192203521728516, "logps/chosen": -4.496834754943848, "logps/rejected": -4.61407470703125, "loss": 3.7033, "rewards/accuracies": 0.5, "rewards/chosen": -44.968345642089844, "rewards/margins": 1.172403335571289, "rewards/rejected": -46.1407470703125, "step": 5038 }, { "epoch": 0.6861383442265795, "grad_norm": 43.25432132950634, "learning_rate": 2.1702442409848217e-07, "logits/chosen": 12.927835464477539, "logits/rejected": 13.479970932006836, "logps/chosen": -3.983790636062622, "logps/rejected": -4.559229850769043, "loss": 3.4097, "rewards/accuracies": 1.0, "rewards/chosen": -39.83790588378906, "rewards/margins": 5.75439453125, "rewards/rejected": -45.59230041503906, "step": 5039 }, { "epoch": 0.6862745098039216, "grad_norm": 45.58601567539898, "learning_rate": 2.1685536424469992e-07, "logits/chosen": 14.449775695800781, "logits/rejected": 14.723917007446289, "logps/chosen": -5.061694622039795, "logps/rejected": -4.923243522644043, "loss": 3.9849, "rewards/accuracies": 0.25, "rewards/chosen": -50.616947174072266, "rewards/margins": -1.3845109939575195, "rewards/rejected": -49.23243713378906, "step": 5040 }, { "epoch": 0.6864106753812637, "grad_norm": 47.84255782106799, "learning_rate": 2.1668634577396198e-07, "logits/chosen": 13.297242164611816, "logits/rejected": 13.420615196228027, "logps/chosen": -4.337591171264648, "logps/rejected": -4.467556953430176, "loss": 3.499, "rewards/accuracies": 0.75, "rewards/chosen": -43.375911712646484, "rewards/margins": 1.2996559143066406, "rewards/rejected": -44.675567626953125, "step": 5041 }, { "epoch": 0.6865468409586056, "grad_norm": 45.24925979567889, "learning_rate": 2.1651736872445965e-07, "logits/chosen": 13.463484764099121, "logits/rejected": 15.125666618347168, "logps/chosen": -4.140844345092773, "logps/rejected": -5.007050514221191, "loss": 4.2897, "rewards/accuracies": 1.0, "rewards/chosen": -41.408447265625, "rewards/margins": 8.662060737609863, "rewards/rejected": -50.07050323486328, "step": 5042 }, { "epoch": 0.6866830065359477, "grad_norm": 42.242960725915495, "learning_rate": 2.1634843313437437e-07, "logits/chosen": 13.199676513671875, "logits/rejected": 14.019630432128906, "logps/chosen": -4.230906009674072, "logps/rejected": -4.716821670532227, "loss": 3.6316, "rewards/accuracies": 0.75, "rewards/chosen": -42.309059143066406, "rewards/margins": 4.859156608581543, "rewards/rejected": -47.168216705322266, "step": 5043 }, { "epoch": 0.6868191721132898, "grad_norm": 42.69790606339553, "learning_rate": 2.1617953904187875e-07, "logits/chosen": 13.95077133178711, "logits/rejected": 13.871588706970215, "logps/chosen": -4.366288185119629, "logps/rejected": -4.410467624664307, "loss": 4.4175, "rewards/accuracies": 0.5, "rewards/chosen": -43.66288757324219, "rewards/margins": 0.4417877197265625, "rewards/rejected": -44.10467529296875, "step": 5044 }, { "epoch": 0.6869553376906318, "grad_norm": 43.27180210118367, "learning_rate": 2.1601068648513588e-07, "logits/chosen": 13.272359848022461, "logits/rejected": 14.151491165161133, "logps/chosen": -4.274847030639648, "logps/rejected": -4.6629228591918945, "loss": 3.9044, "rewards/accuracies": 0.75, "rewards/chosen": -42.748470306396484, "rewards/margins": 3.880758285522461, "rewards/rejected": -46.62923049926758, "step": 5045 }, { "epoch": 0.6870915032679739, "grad_norm": 44.19788855710561, "learning_rate": 2.158418755022991e-07, "logits/chosen": 14.19868278503418, "logits/rejected": 14.266611099243164, "logps/chosen": -4.763640880584717, "logps/rejected": -4.5059685707092285, "loss": 4.2445, "rewards/accuracies": 0.5, "rewards/chosen": -47.63640594482422, "rewards/margins": -2.5767202377319336, "rewards/rejected": -45.05968475341797, "step": 5046 }, { "epoch": 0.6872276688453159, "grad_norm": 37.53079276850813, "learning_rate": 2.1567310613151287e-07, "logits/chosen": 12.452136039733887, "logits/rejected": 14.220528602600098, "logps/chosen": -3.946361780166626, "logps/rejected": -4.663974285125732, "loss": 3.321, "rewards/accuracies": 1.0, "rewards/chosen": -39.46361541748047, "rewards/margins": 7.176124572753906, "rewards/rejected": -46.63974380493164, "step": 5047 }, { "epoch": 0.6873638344226579, "grad_norm": 53.9992274510413, "learning_rate": 2.1550437841091206e-07, "logits/chosen": 12.970587730407715, "logits/rejected": 14.488672256469727, "logps/chosen": -4.0139265060424805, "logps/rejected": -4.487324237823486, "loss": 3.7372, "rewards/accuracies": 0.75, "rewards/chosen": -40.1392707824707, "rewards/margins": 4.733972549438477, "rewards/rejected": -44.87324142456055, "step": 5048 }, { "epoch": 0.6875, "grad_norm": 40.809631001887, "learning_rate": 2.1533569237862186e-07, "logits/chosen": 12.428160667419434, "logits/rejected": 13.518182754516602, "logps/chosen": -4.012902736663818, "logps/rejected": -4.466946601867676, "loss": 3.8278, "rewards/accuracies": 1.0, "rewards/chosen": -40.1290283203125, "rewards/margins": 4.54044246673584, "rewards/rejected": -44.66946792602539, "step": 5049 }, { "epoch": 0.6876361655773421, "grad_norm": 44.594424879328805, "learning_rate": 2.151670480727585e-07, "logits/chosen": 14.73651123046875, "logits/rejected": 14.67249870300293, "logps/chosen": -4.5192461013793945, "logps/rejected": -4.53315544128418, "loss": 3.9018, "rewards/accuracies": 0.5, "rewards/chosen": -45.19246292114258, "rewards/margins": 0.13908767700195312, "rewards/rejected": -45.33155059814453, "step": 5050 }, { "epoch": 0.6877723311546841, "grad_norm": 41.25089244063081, "learning_rate": 2.1499844553142855e-07, "logits/chosen": 13.691286087036133, "logits/rejected": 14.143214225769043, "logps/chosen": -4.392005443572998, "logps/rejected": -4.494807243347168, "loss": 4.1444, "rewards/accuracies": 0.75, "rewards/chosen": -43.9200553894043, "rewards/margins": 1.02801513671875, "rewards/rejected": -44.94807052612305, "step": 5051 }, { "epoch": 0.6879084967320261, "grad_norm": 38.487525976002686, "learning_rate": 2.1482988479272893e-07, "logits/chosen": 13.783309936523438, "logits/rejected": 13.695435523986816, "logps/chosen": -4.372703552246094, "logps/rejected": -4.54793643951416, "loss": 3.7719, "rewards/accuracies": 0.75, "rewards/chosen": -43.72703552246094, "rewards/margins": 1.7523307800292969, "rewards/rejected": -45.479366302490234, "step": 5052 }, { "epoch": 0.6880446623093682, "grad_norm": 43.05359271217701, "learning_rate": 2.1466136589474747e-07, "logits/chosen": 13.812551498413086, "logits/rejected": 12.83680534362793, "logps/chosen": -4.266291618347168, "logps/rejected": -4.062941074371338, "loss": 3.9956, "rewards/accuracies": 0.25, "rewards/chosen": -42.66291046142578, "rewards/margins": -2.033503532409668, "rewards/rejected": -40.62940979003906, "step": 5053 }, { "epoch": 0.6881808278867102, "grad_norm": 47.708984990193926, "learning_rate": 2.1449288887556256e-07, "logits/chosen": 14.179397583007812, "logits/rejected": 13.96268367767334, "logps/chosen": -4.67371940612793, "logps/rejected": -4.560037612915039, "loss": 3.5771, "rewards/accuracies": 0.5, "rewards/chosen": -46.73719024658203, "rewards/margins": -1.1368141174316406, "rewards/rejected": -45.600379943847656, "step": 5054 }, { "epoch": 0.6883169934640523, "grad_norm": 39.88617020922525, "learning_rate": 2.1432445377324268e-07, "logits/chosen": 13.438325881958008, "logits/rejected": 13.679304122924805, "logps/chosen": -4.237137794494629, "logps/rejected": -4.793191909790039, "loss": 4.1952, "rewards/accuracies": 0.75, "rewards/chosen": -42.37137985229492, "rewards/margins": 5.560540199279785, "rewards/rejected": -47.93191909790039, "step": 5055 }, { "epoch": 0.6884531590413944, "grad_norm": 41.376116080029945, "learning_rate": 2.1415606062584727e-07, "logits/chosen": 12.516794204711914, "logits/rejected": 13.6596097946167, "logps/chosen": -4.022603988647461, "logps/rejected": -4.287923812866211, "loss": 3.6999, "rewards/accuracies": 0.75, "rewards/chosen": -40.22603988647461, "rewards/margins": 2.6531972885131836, "rewards/rejected": -42.879234313964844, "step": 5056 }, { "epoch": 0.6885893246187363, "grad_norm": 48.230666412626206, "learning_rate": 2.1398770947142632e-07, "logits/chosen": 13.696203231811523, "logits/rejected": 13.676042556762695, "logps/chosen": -3.9933180809020996, "logps/rejected": -4.128590106964111, "loss": 4.4716, "rewards/accuracies": 0.75, "rewards/chosen": -39.93318176269531, "rewards/margins": 1.3527183532714844, "rewards/rejected": -41.2859001159668, "step": 5057 }, { "epoch": 0.6887254901960784, "grad_norm": 52.08813073709919, "learning_rate": 2.1381940034801986e-07, "logits/chosen": 12.883903503417969, "logits/rejected": 13.003467559814453, "logps/chosen": -4.093922138214111, "logps/rejected": -4.1950883865356445, "loss": 4.562, "rewards/accuracies": 0.75, "rewards/chosen": -40.9392204284668, "rewards/margins": 1.0116605758666992, "rewards/rejected": -41.95088195800781, "step": 5058 }, { "epoch": 0.6888616557734205, "grad_norm": 42.393408137209185, "learning_rate": 2.136511332936589e-07, "logits/chosen": 12.004070281982422, "logits/rejected": 13.98047924041748, "logps/chosen": -4.022098541259766, "logps/rejected": -4.52042293548584, "loss": 3.8706, "rewards/accuracies": 0.75, "rewards/chosen": -40.220985412597656, "rewards/margins": 4.983241081237793, "rewards/rejected": -45.204227447509766, "step": 5059 }, { "epoch": 0.6889978213507625, "grad_norm": 40.35287874051388, "learning_rate": 2.1348290834636492e-07, "logits/chosen": 14.215036392211914, "logits/rejected": 14.008841514587402, "logps/chosen": -4.314678192138672, "logps/rejected": -4.662087917327881, "loss": 4.2108, "rewards/accuracies": 0.75, "rewards/chosen": -43.14678192138672, "rewards/margins": 3.4740991592407227, "rewards/rejected": -46.620880126953125, "step": 5060 }, { "epoch": 0.6891339869281046, "grad_norm": 43.27689018761523, "learning_rate": 2.1331472554414933e-07, "logits/chosen": 13.383419036865234, "logits/rejected": 13.46914291381836, "logps/chosen": -4.656763076782227, "logps/rejected": -4.732245445251465, "loss": 4.1581, "rewards/accuracies": 0.5, "rewards/chosen": -46.56763458251953, "rewards/margins": 0.7548236846923828, "rewards/rejected": -47.32245635986328, "step": 5061 }, { "epoch": 0.6892701525054467, "grad_norm": 61.27612620321013, "learning_rate": 2.131465849250147e-07, "logits/chosen": 13.791242599487305, "logits/rejected": 13.974440574645996, "logps/chosen": -4.313846111297607, "logps/rejected": -4.462347030639648, "loss": 3.9636, "rewards/accuracies": 0.75, "rewards/chosen": -43.138458251953125, "rewards/margins": 1.4850101470947266, "rewards/rejected": -44.62346649169922, "step": 5062 }, { "epoch": 0.6894063180827886, "grad_norm": 41.97032007143911, "learning_rate": 2.1297848652695395e-07, "logits/chosen": 13.333269119262695, "logits/rejected": 13.827139854431152, "logps/chosen": -4.500308990478516, "logps/rejected": -4.670294284820557, "loss": 3.7705, "rewards/accuracies": 0.75, "rewards/chosen": -45.003089904785156, "rewards/margins": 1.6998577117919922, "rewards/rejected": -46.702945709228516, "step": 5063 }, { "epoch": 0.6895424836601307, "grad_norm": 47.98500503081346, "learning_rate": 2.128104303879499e-07, "logits/chosen": 13.70368766784668, "logits/rejected": 13.828487396240234, "logps/chosen": -4.718319892883301, "logps/rejected": -4.714456558227539, "loss": 4.314, "rewards/accuracies": 0.25, "rewards/chosen": -47.183204650878906, "rewards/margins": -0.03863811492919922, "rewards/rejected": -47.144561767578125, "step": 5064 }, { "epoch": 0.6896786492374728, "grad_norm": 40.721417047784584, "learning_rate": 2.126424165459764e-07, "logits/chosen": 12.899665832519531, "logits/rejected": 13.803180694580078, "logps/chosen": -4.068606376647949, "logps/rejected": -4.495388507843018, "loss": 4.0035, "rewards/accuracies": 1.0, "rewards/chosen": -40.686065673828125, "rewards/margins": 4.267820358276367, "rewards/rejected": -44.95388412475586, "step": 5065 }, { "epoch": 0.6898148148148148, "grad_norm": 42.82262710381326, "learning_rate": 2.124744450389978e-07, "logits/chosen": 13.108783721923828, "logits/rejected": 14.115621566772461, "logps/chosen": -4.5188493728637695, "logps/rejected": -4.867621421813965, "loss": 4.0253, "rewards/accuracies": 1.0, "rewards/chosen": -45.18849563598633, "rewards/margins": 3.487720489501953, "rewards/rejected": -48.67621612548828, "step": 5066 }, { "epoch": 0.6899509803921569, "grad_norm": 39.818915372306115, "learning_rate": 2.1230651590496826e-07, "logits/chosen": 13.391151428222656, "logits/rejected": 13.234872817993164, "logps/chosen": -4.413756370544434, "logps/rejected": -4.441619873046875, "loss": 4.0771, "rewards/accuracies": 0.75, "rewards/chosen": -44.13756561279297, "rewards/margins": 0.27863216400146484, "rewards/rejected": -44.41619873046875, "step": 5067 }, { "epoch": 0.6900871459694989, "grad_norm": 39.3740042464666, "learning_rate": 2.1213862918183296e-07, "logits/chosen": 12.61839485168457, "logits/rejected": 13.271062850952148, "logps/chosen": -4.126917839050293, "logps/rejected": -4.385824203491211, "loss": 3.7294, "rewards/accuracies": 0.75, "rewards/chosen": -41.26918029785156, "rewards/margins": 2.5890655517578125, "rewards/rejected": -43.85824203491211, "step": 5068 }, { "epoch": 0.6902233115468409, "grad_norm": 41.63325798159692, "learning_rate": 2.119707849075274e-07, "logits/chosen": 13.743759155273438, "logits/rejected": 14.049776077270508, "logps/chosen": -4.566286087036133, "logps/rejected": -4.583836555480957, "loss": 4.2231, "rewards/accuracies": 0.25, "rewards/chosen": -45.66285705566406, "rewards/margins": 0.17550277709960938, "rewards/rejected": -45.83836364746094, "step": 5069 }, { "epoch": 0.690359477124183, "grad_norm": 47.84423855785404, "learning_rate": 2.1180298311997716e-07, "logits/chosen": 13.705587387084961, "logits/rejected": 13.399707794189453, "logps/chosen": -4.536466121673584, "logps/rejected": -4.456170082092285, "loss": 3.9485, "rewards/accuracies": 0.5, "rewards/chosen": -45.364662170410156, "rewards/margins": -0.8029565811157227, "rewards/rejected": -44.561702728271484, "step": 5070 }, { "epoch": 0.6904956427015251, "grad_norm": 44.30002894689206, "learning_rate": 2.1163522385709852e-07, "logits/chosen": 12.84009075164795, "logits/rejected": 12.999507904052734, "logps/chosen": -4.23394775390625, "logps/rejected": -4.579129219055176, "loss": 3.9364, "rewards/accuracies": 1.0, "rewards/chosen": -42.3394775390625, "rewards/margins": 3.4518165588378906, "rewards/rejected": -45.791290283203125, "step": 5071 }, { "epoch": 0.690631808278867, "grad_norm": 41.9331489431367, "learning_rate": 2.1146750715679822e-07, "logits/chosen": 13.660833358764648, "logits/rejected": 13.050054550170898, "logps/chosen": -4.042286396026611, "logps/rejected": -4.057928562164307, "loss": 4.0239, "rewards/accuracies": 0.5, "rewards/chosen": -40.42286682128906, "rewards/margins": 0.1564188003540039, "rewards/rejected": -40.57928466796875, "step": 5072 }, { "epoch": 0.6907679738562091, "grad_norm": 38.5057302890517, "learning_rate": 2.1129983305697294e-07, "logits/chosen": 13.363285064697266, "logits/rejected": 13.740050315856934, "logps/chosen": -4.343874931335449, "logps/rejected": -4.542196273803711, "loss": 3.4224, "rewards/accuracies": 0.75, "rewards/chosen": -43.438743591308594, "rewards/margins": 1.9832210540771484, "rewards/rejected": -45.421966552734375, "step": 5073 }, { "epoch": 0.6909041394335512, "grad_norm": 39.68664462251153, "learning_rate": 2.1113220159551025e-07, "logits/chosen": 12.992027282714844, "logits/rejected": 13.298467636108398, "logps/chosen": -4.263635635375977, "logps/rejected": -4.441850662231445, "loss": 3.9791, "rewards/accuracies": 0.75, "rewards/chosen": -42.6363525390625, "rewards/margins": 1.7821540832519531, "rewards/rejected": -44.41851043701172, "step": 5074 }, { "epoch": 0.6910403050108932, "grad_norm": 44.25353541089607, "learning_rate": 2.109646128102879e-07, "logits/chosen": 14.142789840698242, "logits/rejected": 13.936809539794922, "logps/chosen": -4.07859468460083, "logps/rejected": -4.577127456665039, "loss": 4.2883, "rewards/accuracies": 0.75, "rewards/chosen": -40.785945892333984, "rewards/margins": 4.9853315353393555, "rewards/rejected": -45.771278381347656, "step": 5075 }, { "epoch": 0.6911764705882353, "grad_norm": 41.257018067871684, "learning_rate": 2.1079706673917374e-07, "logits/chosen": 13.170450210571289, "logits/rejected": 13.68282699584961, "logps/chosen": -4.195496559143066, "logps/rejected": -4.410134315490723, "loss": 4.3763, "rewards/accuracies": 0.75, "rewards/chosen": -41.95496368408203, "rewards/margins": 2.1463794708251953, "rewards/rejected": -44.101341247558594, "step": 5076 }, { "epoch": 0.6913126361655774, "grad_norm": 40.49063465066177, "learning_rate": 2.106295634200263e-07, "logits/chosen": 13.158891677856445, "logits/rejected": 13.354315757751465, "logps/chosen": -4.375474452972412, "logps/rejected": -4.513238430023193, "loss": 4.2859, "rewards/accuracies": 0.75, "rewards/chosen": -43.75474548339844, "rewards/margins": 1.3776378631591797, "rewards/rejected": -45.13238525390625, "step": 5077 }, { "epoch": 0.6914488017429193, "grad_norm": 41.33825560505502, "learning_rate": 2.104621028906945e-07, "logits/chosen": 13.362775802612305, "logits/rejected": 14.156366348266602, "logps/chosen": -4.336331367492676, "logps/rejected": -4.483623504638672, "loss": 3.9592, "rewards/accuracies": 0.75, "rewards/chosen": -43.363311767578125, "rewards/margins": 1.4729204177856445, "rewards/rejected": -44.83623504638672, "step": 5078 }, { "epoch": 0.6915849673202614, "grad_norm": 40.373386548488384, "learning_rate": 2.102946851890172e-07, "logits/chosen": 13.304162979125977, "logits/rejected": 14.278861999511719, "logps/chosen": -4.629725456237793, "logps/rejected": -4.911787033081055, "loss": 3.8663, "rewards/accuracies": 0.75, "rewards/chosen": -46.2972526550293, "rewards/margins": 2.8206214904785156, "rewards/rejected": -49.11787414550781, "step": 5079 }, { "epoch": 0.6917211328976035, "grad_norm": 41.13423341286676, "learning_rate": 2.1012731035282382e-07, "logits/chosen": 12.762391090393066, "logits/rejected": 13.558043479919434, "logps/chosen": -4.110299110412598, "logps/rejected": -4.359222412109375, "loss": 3.3072, "rewards/accuracies": 1.0, "rewards/chosen": -41.102996826171875, "rewards/margins": 2.489227294921875, "rewards/rejected": -43.59222412109375, "step": 5080 }, { "epoch": 0.6918572984749455, "grad_norm": 41.31584275122111, "learning_rate": 2.0995997841993435e-07, "logits/chosen": 13.327335357666016, "logits/rejected": 14.191000938415527, "logps/chosen": -4.202797889709473, "logps/rejected": -4.330366134643555, "loss": 3.7874, "rewards/accuracies": 0.75, "rewards/chosen": -42.027976989746094, "rewards/margins": 1.2756824493408203, "rewards/rejected": -43.30365753173828, "step": 5081 }, { "epoch": 0.6919934640522876, "grad_norm": 40.9266667549336, "learning_rate": 2.097926894281585e-07, "logits/chosen": 13.562143325805664, "logits/rejected": 12.990341186523438, "logps/chosen": -4.079766273498535, "logps/rejected": -4.306893348693848, "loss": 4.0316, "rewards/accuracies": 0.75, "rewards/chosen": -40.79766082763672, "rewards/margins": 2.271273612976074, "rewards/rejected": -43.06893539428711, "step": 5082 }, { "epoch": 0.6921296296296297, "grad_norm": 40.22071514171541, "learning_rate": 2.0962544341529678e-07, "logits/chosen": 12.782796859741211, "logits/rejected": 14.042491912841797, "logps/chosen": -4.547945022583008, "logps/rejected": -4.818963527679443, "loss": 3.8082, "rewards/accuracies": 0.75, "rewards/chosen": -45.47944641113281, "rewards/margins": 2.7101898193359375, "rewards/rejected": -48.189640045166016, "step": 5083 }, { "epoch": 0.6922657952069716, "grad_norm": 41.22192978736261, "learning_rate": 2.0945824041913985e-07, "logits/chosen": 13.265483856201172, "logits/rejected": 13.058601379394531, "logps/chosen": -4.2526021003723145, "logps/rejected": -4.237612247467041, "loss": 3.955, "rewards/accuracies": 0.25, "rewards/chosen": -42.526023864746094, "rewards/margins": -0.14989852905273438, "rewards/rejected": -42.376121520996094, "step": 5084 }, { "epoch": 0.6924019607843137, "grad_norm": 37.0035667988636, "learning_rate": 2.0929108047746839e-07, "logits/chosen": 14.387584686279297, "logits/rejected": 13.778642654418945, "logps/chosen": -4.529292106628418, "logps/rejected": -4.487575531005859, "loss": 3.7497, "rewards/accuracies": 0.5, "rewards/chosen": -45.29291534423828, "rewards/margins": -0.4171600341796875, "rewards/rejected": -44.875755310058594, "step": 5085 }, { "epoch": 0.6925381263616558, "grad_norm": 41.293509919144604, "learning_rate": 2.0912396362805377e-07, "logits/chosen": 14.503765106201172, "logits/rejected": 13.650392532348633, "logps/chosen": -4.497783660888672, "logps/rejected": -4.6581549644470215, "loss": 3.5107, "rewards/accuracies": 0.5, "rewards/chosen": -44.97783660888672, "rewards/margins": 1.6037101745605469, "rewards/rejected": -46.581546783447266, "step": 5086 }, { "epoch": 0.6926742919389978, "grad_norm": 40.74615752373075, "learning_rate": 2.0895688990865735e-07, "logits/chosen": 13.380255699157715, "logits/rejected": 13.970422744750977, "logps/chosen": -4.389219760894775, "logps/rejected": -4.535592555999756, "loss": 3.4015, "rewards/accuracies": 0.75, "rewards/chosen": -43.89219665527344, "rewards/margins": 1.4637298583984375, "rewards/rejected": -45.355926513671875, "step": 5087 }, { "epoch": 0.6928104575163399, "grad_norm": 46.71519995943659, "learning_rate": 2.0878985935703092e-07, "logits/chosen": 13.770597457885742, "logits/rejected": 13.654386520385742, "logps/chosen": -4.721039772033691, "logps/rejected": -4.512668609619141, "loss": 3.7812, "rewards/accuracies": 0.25, "rewards/chosen": -47.21039581298828, "rewards/margins": -2.083711624145508, "rewards/rejected": -45.12668228149414, "step": 5088 }, { "epoch": 0.6929466230936819, "grad_norm": 38.73860667687815, "learning_rate": 2.0862287201091626e-07, "logits/chosen": 12.86759090423584, "logits/rejected": 13.177826881408691, "logps/chosen": -4.306591033935547, "logps/rejected": -4.423086166381836, "loss": 4.2213, "rewards/accuracies": 0.5, "rewards/chosen": -43.06591033935547, "rewards/margins": 1.1649551391601562, "rewards/rejected": -44.230865478515625, "step": 5089 }, { "epoch": 0.693082788671024, "grad_norm": 39.08342380598343, "learning_rate": 2.084559279080456e-07, "logits/chosen": 13.15573787689209, "logits/rejected": 12.900150299072266, "logps/chosen": -4.057318210601807, "logps/rejected": -4.116828441619873, "loss": 4.1118, "rewards/accuracies": 0.5, "rewards/chosen": -40.57318115234375, "rewards/margins": 0.5951032638549805, "rewards/rejected": -41.16828155517578, "step": 5090 }, { "epoch": 0.693218954248366, "grad_norm": 42.07082985814409, "learning_rate": 2.0828902708614144e-07, "logits/chosen": 12.76496696472168, "logits/rejected": 13.702133178710938, "logps/chosen": -4.219656467437744, "logps/rejected": -4.386173725128174, "loss": 4.1548, "rewards/accuracies": 0.75, "rewards/chosen": -42.196563720703125, "rewards/margins": 1.665170669555664, "rewards/rejected": -43.861732482910156, "step": 5091 }, { "epoch": 0.6933551198257081, "grad_norm": 41.143852318605184, "learning_rate": 2.081221695829162e-07, "logits/chosen": 13.572877883911133, "logits/rejected": 13.668222427368164, "logps/chosen": -4.517581939697266, "logps/rejected": -4.037419319152832, "loss": 3.7583, "rewards/accuracies": 0.25, "rewards/chosen": -45.175819396972656, "rewards/margins": -4.801627159118652, "rewards/rejected": -40.37419128417969, "step": 5092 }, { "epoch": 0.6934912854030502, "grad_norm": 39.545483953070274, "learning_rate": 2.079553554360728e-07, "logits/chosen": 13.699578285217285, "logits/rejected": 13.422443389892578, "logps/chosen": -4.321961402893066, "logps/rejected": -4.344786643981934, "loss": 3.7942, "rewards/accuracies": 0.5, "rewards/chosen": -43.21961212158203, "rewards/margins": 0.22825145721435547, "rewards/rejected": -43.44786834716797, "step": 5093 }, { "epoch": 0.6936274509803921, "grad_norm": 40.555816510176264, "learning_rate": 2.077885846833043e-07, "logits/chosen": 13.54434585571289, "logits/rejected": 13.621641159057617, "logps/chosen": -4.245604991912842, "logps/rejected": -4.224218368530273, "loss": 3.8526, "rewards/accuracies": 0.5, "rewards/chosen": -42.45604705810547, "rewards/margins": -0.21386432647705078, "rewards/rejected": -42.242183685302734, "step": 5094 }, { "epoch": 0.6937636165577342, "grad_norm": 41.003900308692906, "learning_rate": 2.0762185736229409e-07, "logits/chosen": 13.715967178344727, "logits/rejected": 13.430397033691406, "logps/chosen": -4.340754985809326, "logps/rejected": -4.362268924713135, "loss": 4.0052, "rewards/accuracies": 0.5, "rewards/chosen": -43.40755081176758, "rewards/margins": 0.21514129638671875, "rewards/rejected": -43.62268829345703, "step": 5095 }, { "epoch": 0.6938997821350763, "grad_norm": 37.912283367604935, "learning_rate": 2.0745517351071528e-07, "logits/chosen": 13.47255802154541, "logits/rejected": 14.681856155395508, "logps/chosen": -4.358994007110596, "logps/rejected": -4.764190673828125, "loss": 3.5601, "rewards/accuracies": 0.5, "rewards/chosen": -43.589942932128906, "rewards/margins": 4.051967620849609, "rewards/rejected": -47.64190673828125, "step": 5096 }, { "epoch": 0.6940359477124183, "grad_norm": 44.84337025340423, "learning_rate": 2.0728853316623162e-07, "logits/chosen": 14.087149620056152, "logits/rejected": 13.511069297790527, "logps/chosen": -4.22297477722168, "logps/rejected": -4.288403034210205, "loss": 4.7286, "rewards/accuracies": 0.5, "rewards/chosen": -42.2297477722168, "rewards/margins": 0.6542844772338867, "rewards/rejected": -42.884033203125, "step": 5097 }, { "epoch": 0.6941721132897604, "grad_norm": 42.223845059236666, "learning_rate": 2.0712193636649697e-07, "logits/chosen": 14.433207511901855, "logits/rejected": 13.800060272216797, "logps/chosen": -4.388302803039551, "logps/rejected": -4.388149261474609, "loss": 4.2549, "rewards/accuracies": 0.5, "rewards/chosen": -43.883026123046875, "rewards/margins": -0.0015316009521484375, "rewards/rejected": -43.881492614746094, "step": 5098 }, { "epoch": 0.6943082788671024, "grad_norm": 41.430996697022735, "learning_rate": 2.0695538314915501e-07, "logits/chosen": 13.943016052246094, "logits/rejected": 13.939346313476562, "logps/chosen": -4.47157096862793, "logps/rejected": -4.657182693481445, "loss": 4.0219, "rewards/accuracies": 0.75, "rewards/chosen": -44.71571350097656, "rewards/margins": 1.8561172485351562, "rewards/rejected": -46.57183074951172, "step": 5099 }, { "epoch": 0.6944444444444444, "grad_norm": 41.445199432831906, "learning_rate": 2.0678887355183998e-07, "logits/chosen": 13.876853942871094, "logits/rejected": 13.878110885620117, "logps/chosen": -4.3015570640563965, "logps/rejected": -4.5112714767456055, "loss": 4.193, "rewards/accuracies": 0.5, "rewards/chosen": -43.01557159423828, "rewards/margins": 2.0971450805664062, "rewards/rejected": -45.11271667480469, "step": 5100 }, { "epoch": 0.6945806100217865, "grad_norm": 42.26929405383839, "learning_rate": 2.0662240761217605e-07, "logits/chosen": 12.877202987670898, "logits/rejected": 13.223124504089355, "logps/chosen": -4.313320159912109, "logps/rejected": -4.405317306518555, "loss": 4.2324, "rewards/accuracies": 0.5, "rewards/chosen": -43.133201599121094, "rewards/margins": 0.9199695587158203, "rewards/rejected": -44.05317687988281, "step": 5101 }, { "epoch": 0.6947167755991286, "grad_norm": 38.165112122904105, "learning_rate": 2.0645598536777774e-07, "logits/chosen": 12.656169891357422, "logits/rejected": 14.062711715698242, "logps/chosen": -4.094114303588867, "logps/rejected": -4.528383255004883, "loss": 3.8682, "rewards/accuracies": 1.0, "rewards/chosen": -40.94114685058594, "rewards/margins": 4.34268856048584, "rewards/rejected": -45.283836364746094, "step": 5102 }, { "epoch": 0.6948529411764706, "grad_norm": 39.406596138157695, "learning_rate": 2.062896068562492e-07, "logits/chosen": 13.650413513183594, "logits/rejected": 14.314994812011719, "logps/chosen": -4.1140947341918945, "logps/rejected": -4.622213363647461, "loss": 3.2642, "rewards/accuracies": 0.75, "rewards/chosen": -41.14094924926758, "rewards/margins": 5.0811872482299805, "rewards/rejected": -46.222137451171875, "step": 5103 }, { "epoch": 0.6949891067538126, "grad_norm": 40.77723040888827, "learning_rate": 2.0612327211518524e-07, "logits/chosen": 12.430075645446777, "logits/rejected": 13.131691932678223, "logps/chosen": -4.144225120544434, "logps/rejected": -4.530037879943848, "loss": 4.1828, "rewards/accuracies": 0.75, "rewards/chosen": -41.44225311279297, "rewards/margins": 3.8581314086914062, "rewards/rejected": -45.30038070678711, "step": 5104 }, { "epoch": 0.6951252723311547, "grad_norm": 46.80019921372937, "learning_rate": 2.0595698118217072e-07, "logits/chosen": 14.082620620727539, "logits/rejected": 13.000936508178711, "logps/chosen": -4.458235740661621, "logps/rejected": -4.081647872924805, "loss": 4.6682, "rewards/accuracies": 0.0, "rewards/chosen": -44.582359313964844, "rewards/margins": -3.7658796310424805, "rewards/rejected": -40.81647872924805, "step": 5105 }, { "epoch": 0.6952614379084967, "grad_norm": 45.110272770797216, "learning_rate": 2.057907340947801e-07, "logits/chosen": 13.501541137695312, "logits/rejected": 13.6808443069458, "logps/chosen": -4.404853820800781, "logps/rejected": -4.518847465515137, "loss": 3.7116, "rewards/accuracies": 0.5, "rewards/chosen": -44.04853820800781, "rewards/margins": 1.1399364471435547, "rewards/rejected": -45.1884765625, "step": 5106 }, { "epoch": 0.6953976034858388, "grad_norm": 38.11959399280645, "learning_rate": 2.056245308905785e-07, "logits/chosen": 13.974937438964844, "logits/rejected": 14.20182991027832, "logps/chosen": -4.184617042541504, "logps/rejected": -4.657515525817871, "loss": 4.2692, "rewards/accuracies": 1.0, "rewards/chosen": -41.84617614746094, "rewards/margins": 4.728979110717773, "rewards/rejected": -46.57515335083008, "step": 5107 }, { "epoch": 0.6955337690631809, "grad_norm": 40.76505795516192, "learning_rate": 2.0545837160712098e-07, "logits/chosen": 13.269615173339844, "logits/rejected": 13.242137908935547, "logps/chosen": -3.8712728023529053, "logps/rejected": -4.101598262786865, "loss": 4.0612, "rewards/accuracies": 1.0, "rewards/chosen": -38.712730407714844, "rewards/margins": 2.303248405456543, "rewards/rejected": -41.0159797668457, "step": 5108 }, { "epoch": 0.6956699346405228, "grad_norm": 39.236258130530466, "learning_rate": 2.0529225628195235e-07, "logits/chosen": 12.690223693847656, "logits/rejected": 13.241146087646484, "logps/chosen": -4.313358306884766, "logps/rejected": -4.266595840454102, "loss": 4.0185, "rewards/accuracies": 0.25, "rewards/chosen": -43.13358688354492, "rewards/margins": -0.46762657165527344, "rewards/rejected": -42.66596221923828, "step": 5109 }, { "epoch": 0.6958061002178649, "grad_norm": 40.567749671409075, "learning_rate": 2.051261849526079e-07, "logits/chosen": 13.244162559509277, "logits/rejected": 13.799945831298828, "logps/chosen": -4.121114730834961, "logps/rejected": -4.517578601837158, "loss": 3.9283, "rewards/accuracies": 1.0, "rewards/chosen": -41.21114730834961, "rewards/margins": 3.9646425247192383, "rewards/rejected": -45.17578887939453, "step": 5110 }, { "epoch": 0.695942265795207, "grad_norm": 42.83485786304523, "learning_rate": 2.0496015765661294e-07, "logits/chosen": 12.87813949584961, "logits/rejected": 13.851580619812012, "logps/chosen": -4.028752326965332, "logps/rejected": -4.337527275085449, "loss": 3.965, "rewards/accuracies": 0.75, "rewards/chosen": -40.28752517700195, "rewards/margins": 3.0877466201782227, "rewards/rejected": -43.375274658203125, "step": 5111 }, { "epoch": 0.696078431372549, "grad_norm": 44.541215444602756, "learning_rate": 2.0479417443148247e-07, "logits/chosen": 13.100870132446289, "logits/rejected": 13.67669677734375, "logps/chosen": -4.23206090927124, "logps/rejected": -4.388735771179199, "loss": 3.8617, "rewards/accuracies": 0.5, "rewards/chosen": -42.32061004638672, "rewards/margins": 1.566746711730957, "rewards/rejected": -43.887359619140625, "step": 5112 }, { "epoch": 0.6962145969498911, "grad_norm": 42.36543471149327, "learning_rate": 2.0462823531472177e-07, "logits/chosen": 13.953012466430664, "logits/rejected": 14.228744506835938, "logps/chosen": -4.628755569458008, "logps/rejected": -4.67325496673584, "loss": 3.8583, "rewards/accuracies": 0.75, "rewards/chosen": -46.28755569458008, "rewards/margins": 0.44499683380126953, "rewards/rejected": -46.73255157470703, "step": 5113 }, { "epoch": 0.6963507625272332, "grad_norm": 42.986342923314766, "learning_rate": 2.044623403438265e-07, "logits/chosen": 12.845898628234863, "logits/rejected": 13.966684341430664, "logps/chosen": -4.202999114990234, "logps/rejected": -4.631914138793945, "loss": 3.5898, "rewards/accuracies": 1.0, "rewards/chosen": -42.029991149902344, "rewards/margins": 4.289148330688477, "rewards/rejected": -46.31914138793945, "step": 5114 }, { "epoch": 0.6964869281045751, "grad_norm": 46.96692645024869, "learning_rate": 2.0429648955628157e-07, "logits/chosen": 13.630184173583984, "logits/rejected": 13.976577758789062, "logps/chosen": -4.313559532165527, "logps/rejected": -4.519721984863281, "loss": 4.0974, "rewards/accuracies": 0.5, "rewards/chosen": -43.135597229003906, "rewards/margins": 2.061624526977539, "rewards/rejected": -45.19722366333008, "step": 5115 }, { "epoch": 0.6966230936819172, "grad_norm": 42.420267751238704, "learning_rate": 2.0413068298956255e-07, "logits/chosen": 13.378213882446289, "logits/rejected": 13.453021049499512, "logps/chosen": -3.77614164352417, "logps/rejected": -4.237223148345947, "loss": 3.704, "rewards/accuracies": 0.75, "rewards/chosen": -37.76141357421875, "rewards/margins": 4.61081600189209, "rewards/rejected": -42.37223434448242, "step": 5116 }, { "epoch": 0.6967592592592593, "grad_norm": 43.23259454368196, "learning_rate": 2.0396492068113492e-07, "logits/chosen": 13.486570358276367, "logits/rejected": 13.653175354003906, "logps/chosen": -4.089382648468018, "logps/rejected": -4.315431594848633, "loss": 3.7542, "rewards/accuracies": 0.75, "rewards/chosen": -40.893829345703125, "rewards/margins": 2.2604856491088867, "rewards/rejected": -43.15431213378906, "step": 5117 }, { "epoch": 0.6968954248366013, "grad_norm": 40.672196930116826, "learning_rate": 2.0379920266845376e-07, "logits/chosen": 13.215167999267578, "logits/rejected": 13.351835250854492, "logps/chosen": -4.416976451873779, "logps/rejected": -4.558827877044678, "loss": 3.805, "rewards/accuracies": 0.75, "rewards/chosen": -44.16976547241211, "rewards/margins": 1.4185152053833008, "rewards/rejected": -45.588279724121094, "step": 5118 }, { "epoch": 0.6970315904139434, "grad_norm": 42.62194797495019, "learning_rate": 2.0363352898896458e-07, "logits/chosen": 13.821325302124023, "logits/rejected": 13.842549324035645, "logps/chosen": -4.507485866546631, "logps/rejected": -4.467202186584473, "loss": 4.2452, "rewards/accuracies": 0.25, "rewards/chosen": -45.074859619140625, "rewards/margins": -0.40283775329589844, "rewards/rejected": -44.672019958496094, "step": 5119 }, { "epoch": 0.6971677559912854, "grad_norm": 44.06340366794141, "learning_rate": 2.0346789968010283e-07, "logits/chosen": 13.450079917907715, "logits/rejected": 13.647623062133789, "logps/chosen": -4.184709072113037, "logps/rejected": -4.393232345581055, "loss": 3.7119, "rewards/accuracies": 0.5, "rewards/chosen": -41.84709167480469, "rewards/margins": 2.085230827331543, "rewards/rejected": -43.93232727050781, "step": 5120 }, { "epoch": 0.6973039215686274, "grad_norm": 44.028303771368556, "learning_rate": 2.0330231477929356e-07, "logits/chosen": 13.070433616638184, "logits/rejected": 12.927669525146484, "logps/chosen": -3.945128917694092, "logps/rejected": -3.997920036315918, "loss": 4.1372, "rewards/accuracies": 0.5, "rewards/chosen": -39.45128631591797, "rewards/margins": 0.5279130935668945, "rewards/rejected": -39.97920227050781, "step": 5121 }, { "epoch": 0.6974400871459695, "grad_norm": 42.785015099956006, "learning_rate": 2.0313677432395217e-07, "logits/chosen": 12.701772689819336, "logits/rejected": 13.246987342834473, "logps/chosen": -4.282366752624512, "logps/rejected": -4.352101802825928, "loss": 3.8154, "rewards/accuracies": 0.5, "rewards/chosen": -42.823665618896484, "rewards/margins": 0.6973505020141602, "rewards/rejected": -43.521018981933594, "step": 5122 }, { "epoch": 0.6975762527233116, "grad_norm": 41.75215881251502, "learning_rate": 2.0297127835148408e-07, "logits/chosen": 13.013542175292969, "logits/rejected": 13.838224411010742, "logps/chosen": -4.257994651794434, "logps/rejected": -4.493561744689941, "loss": 4.1522, "rewards/accuracies": 1.0, "rewards/chosen": -42.57994842529297, "rewards/margins": 2.355670928955078, "rewards/rejected": -44.93561553955078, "step": 5123 }, { "epoch": 0.6977124183006536, "grad_norm": 47.12357108153056, "learning_rate": 2.0280582689928419e-07, "logits/chosen": 13.802391052246094, "logits/rejected": 13.862401962280273, "logps/chosen": -4.205333232879639, "logps/rejected": -4.583533763885498, "loss": 3.5868, "rewards/accuracies": 1.0, "rewards/chosen": -42.0533332824707, "rewards/margins": 3.78200626373291, "rewards/rejected": -45.8353385925293, "step": 5124 }, { "epoch": 0.6978485838779956, "grad_norm": 40.65057692923315, "learning_rate": 2.0264042000473768e-07, "logits/chosen": 13.130156517028809, "logits/rejected": 13.395992279052734, "logps/chosen": -4.40715217590332, "logps/rejected": -4.728185653686523, "loss": 3.7053, "rewards/accuracies": 1.0, "rewards/chosen": -44.07152557373047, "rewards/margins": 3.210331916809082, "rewards/rejected": -47.281856536865234, "step": 5125 }, { "epoch": 0.6979847494553377, "grad_norm": 42.141948929386814, "learning_rate": 2.024750577052198e-07, "logits/chosen": 12.291927337646484, "logits/rejected": 12.547689437866211, "logps/chosen": -4.115011692047119, "logps/rejected": -4.373581886291504, "loss": 3.997, "rewards/accuracies": 0.75, "rewards/chosen": -41.150115966796875, "rewards/margins": 2.5857019424438477, "rewards/rejected": -43.735816955566406, "step": 5126 }, { "epoch": 0.6981209150326797, "grad_norm": 38.05849161727105, "learning_rate": 2.0230974003809528e-07, "logits/chosen": 12.678089141845703, "logits/rejected": 13.402814865112305, "logps/chosen": -4.200478553771973, "logps/rejected": -4.060709476470947, "loss": 3.8686, "rewards/accuracies": 0.25, "rewards/chosen": -42.004791259765625, "rewards/margins": -1.3976945877075195, "rewards/rejected": -40.607093811035156, "step": 5127 }, { "epoch": 0.6982570806100218, "grad_norm": 40.473008883715444, "learning_rate": 2.0214446704071908e-07, "logits/chosen": 13.15432357788086, "logits/rejected": 12.843770980834961, "logps/chosen": -4.371124267578125, "logps/rejected": -4.067192077636719, "loss": 4.0307, "rewards/accuracies": 0.0, "rewards/chosen": -43.71124267578125, "rewards/margins": -3.0393190383911133, "rewards/rejected": -40.67192077636719, "step": 5128 }, { "epoch": 0.6983932461873639, "grad_norm": 41.264030559729896, "learning_rate": 2.0197923875043625e-07, "logits/chosen": 13.31997299194336, "logits/rejected": 13.584463119506836, "logps/chosen": -4.059009075164795, "logps/rejected": -4.230879306793213, "loss": 4.1087, "rewards/accuracies": 0.75, "rewards/chosen": -40.590091705322266, "rewards/margins": 1.7187013626098633, "rewards/rejected": -42.30879211425781, "step": 5129 }, { "epoch": 0.6985294117647058, "grad_norm": 44.31742707697218, "learning_rate": 2.0181405520458106e-07, "logits/chosen": 14.150163650512695, "logits/rejected": 13.446487426757812, "logps/chosen": -4.553497791290283, "logps/rejected": -4.404016494750977, "loss": 3.7668, "rewards/accuracies": 0.25, "rewards/chosen": -45.53498077392578, "rewards/margins": -1.4948101043701172, "rewards/rejected": -44.04016876220703, "step": 5130 }, { "epoch": 0.6986655773420479, "grad_norm": 43.578075874100506, "learning_rate": 2.0164891644047838e-07, "logits/chosen": 13.811586380004883, "logits/rejected": 12.873224258422852, "logps/chosen": -4.150848388671875, "logps/rejected": -4.030863285064697, "loss": 3.9394, "rewards/accuracies": 0.25, "rewards/chosen": -41.50848388671875, "rewards/margins": -1.1998510360717773, "rewards/rejected": -40.30863571166992, "step": 5131 }, { "epoch": 0.69880174291939, "grad_norm": 38.94165922712331, "learning_rate": 2.0148382249544275e-07, "logits/chosen": 13.51932144165039, "logits/rejected": 13.896886825561523, "logps/chosen": -4.339369297027588, "logps/rejected": -4.6904096603393555, "loss": 3.6783, "rewards/accuracies": 0.75, "rewards/chosen": -43.39369583129883, "rewards/margins": 3.510401725769043, "rewards/rejected": -46.90409851074219, "step": 5132 }, { "epoch": 0.698937908496732, "grad_norm": 43.294579097623355, "learning_rate": 2.0131877340677818e-07, "logits/chosen": 13.713279724121094, "logits/rejected": 14.548666000366211, "logps/chosen": -4.608349800109863, "logps/rejected": -4.599088191986084, "loss": 3.8334, "rewards/accuracies": 0.5, "rewards/chosen": -46.083499908447266, "rewards/margins": -0.09261703491210938, "rewards/rejected": -45.990882873535156, "step": 5133 }, { "epoch": 0.6990740740740741, "grad_norm": 42.82426825579712, "learning_rate": 2.0115376921177916e-07, "logits/chosen": 14.223770141601562, "logits/rejected": 14.005464553833008, "logps/chosen": -4.367291450500488, "logps/rejected": -4.670817852020264, "loss": 3.7528, "rewards/accuracies": 0.75, "rewards/chosen": -43.672916412353516, "rewards/margins": 3.0352611541748047, "rewards/rejected": -46.70817947387695, "step": 5134 }, { "epoch": 0.6992102396514162, "grad_norm": 60.28745293937405, "learning_rate": 2.0098880994772976e-07, "logits/chosen": 13.117071151733398, "logits/rejected": 13.256426811218262, "logps/chosen": -4.383004188537598, "logps/rejected": -4.393882751464844, "loss": 3.9864, "rewards/accuracies": 0.5, "rewards/chosen": -43.830039978027344, "rewards/margins": 0.10878372192382812, "rewards/rejected": -43.93882751464844, "step": 5135 }, { "epoch": 0.6993464052287581, "grad_norm": 40.88696893317685, "learning_rate": 2.0082389565190368e-07, "logits/chosen": 13.521434783935547, "logits/rejected": 13.590792655944824, "logps/chosen": -4.306130409240723, "logps/rejected": -4.414851188659668, "loss": 3.9467, "rewards/accuracies": 0.5, "rewards/chosen": -43.061309814453125, "rewards/margins": 1.0872087478637695, "rewards/rejected": -44.14851379394531, "step": 5136 }, { "epoch": 0.6994825708061002, "grad_norm": 38.692393577844804, "learning_rate": 2.0065902636156476e-07, "logits/chosen": 12.948348999023438, "logits/rejected": 13.03712272644043, "logps/chosen": -3.948113441467285, "logps/rejected": -4.11521053314209, "loss": 3.8335, "rewards/accuracies": 0.5, "rewards/chosen": -39.481136322021484, "rewards/margins": 1.6709699630737305, "rewards/rejected": -41.152103424072266, "step": 5137 }, { "epoch": 0.6996187363834423, "grad_norm": 41.81734689269193, "learning_rate": 2.0049420211396676e-07, "logits/chosen": 13.679122924804688, "logits/rejected": 14.134281158447266, "logps/chosen": -4.196521759033203, "logps/rejected": -4.496137619018555, "loss": 3.5919, "rewards/accuracies": 0.75, "rewards/chosen": -41.96521759033203, "rewards/margins": 2.996161460876465, "rewards/rejected": -44.96138000488281, "step": 5138 }, { "epoch": 0.6997549019607843, "grad_norm": 45.69553160274961, "learning_rate": 2.0032942294635276e-07, "logits/chosen": 13.140216827392578, "logits/rejected": 13.323986053466797, "logps/chosen": -4.412029266357422, "logps/rejected": -4.54908561706543, "loss": 3.781, "rewards/accuracies": 0.75, "rewards/chosen": -44.12029266357422, "rewards/margins": 1.3705577850341797, "rewards/rejected": -45.49085235595703, "step": 5139 }, { "epoch": 0.6998910675381264, "grad_norm": 47.10370394915393, "learning_rate": 2.0016468889595611e-07, "logits/chosen": 13.366504669189453, "logits/rejected": 12.181703567504883, "logps/chosen": -4.266447067260742, "logps/rejected": -4.083125591278076, "loss": 4.2668, "rewards/accuracies": 0.5, "rewards/chosen": -42.664466857910156, "rewards/margins": -1.833211898803711, "rewards/rejected": -40.83125686645508, "step": 5140 }, { "epoch": 0.7000272331154684, "grad_norm": 46.94495376865438, "learning_rate": 2.0000000000000007e-07, "logits/chosen": 13.028897285461426, "logits/rejected": 13.531664848327637, "logps/chosen": -4.195003509521484, "logps/rejected": -4.583918571472168, "loss": 4.0857, "rewards/accuracies": 0.75, "rewards/chosen": -41.950035095214844, "rewards/margins": 3.8891525268554688, "rewards/rejected": -45.83918762207031, "step": 5141 }, { "epoch": 0.7001633986928104, "grad_norm": 44.7207012915434, "learning_rate": 1.9983535629569707e-07, "logits/chosen": 12.83725357055664, "logits/rejected": 13.131043434143066, "logps/chosen": -4.126620292663574, "logps/rejected": -4.472866535186768, "loss": 3.8841, "rewards/accuracies": 0.75, "rewards/chosen": -41.266204833984375, "rewards/margins": 3.4624624252319336, "rewards/rejected": -44.728668212890625, "step": 5142 }, { "epoch": 0.7002995642701525, "grad_norm": 42.33560206225021, "learning_rate": 1.9967075782024988e-07, "logits/chosen": 13.176040649414062, "logits/rejected": 12.887588500976562, "logps/chosen": -4.33934211730957, "logps/rejected": -4.34018087387085, "loss": 3.848, "rewards/accuracies": 0.5, "rewards/chosen": -43.39341735839844, "rewards/margins": 0.008391380310058594, "rewards/rejected": -43.40180969238281, "step": 5143 }, { "epoch": 0.7004357298474946, "grad_norm": 41.115651201869404, "learning_rate": 1.995062046108511e-07, "logits/chosen": 13.482433319091797, "logits/rejected": 14.008451461791992, "logps/chosen": -4.214125156402588, "logps/rejected": -4.574815273284912, "loss": 3.3154, "rewards/accuracies": 0.75, "rewards/chosen": -42.14125061035156, "rewards/margins": 3.6068992614746094, "rewards/rejected": -45.74815368652344, "step": 5144 }, { "epoch": 0.7005718954248366, "grad_norm": 45.62122048736546, "learning_rate": 1.9934169670468252e-07, "logits/chosen": 13.81904411315918, "logits/rejected": 13.550070762634277, "logps/chosen": -4.4785614013671875, "logps/rejected": -4.2217116355896, "loss": 4.0656, "rewards/accuracies": 0.0, "rewards/chosen": -44.78561019897461, "rewards/margins": -2.5684947967529297, "rewards/rejected": -42.21711730957031, "step": 5145 }, { "epoch": 0.7007080610021786, "grad_norm": 40.661250532690374, "learning_rate": 1.991772341389162e-07, "logits/chosen": 13.634906768798828, "logits/rejected": 13.64134407043457, "logps/chosen": -4.261451244354248, "logps/rejected": -4.396420955657959, "loss": 3.5749, "rewards/accuracies": 0.5, "rewards/chosen": -42.61450958251953, "rewards/margins": 1.3496980667114258, "rewards/rejected": -43.964210510253906, "step": 5146 }, { "epoch": 0.7008442265795207, "grad_norm": 40.70952414029627, "learning_rate": 1.9901281695071397e-07, "logits/chosen": 13.662918090820312, "logits/rejected": 13.784160614013672, "logps/chosen": -4.0681352615356445, "logps/rejected": -4.487529754638672, "loss": 3.6767, "rewards/accuracies": 1.0, "rewards/chosen": -40.681358337402344, "rewards/margins": 4.193946838378906, "rewards/rejected": -44.875301361083984, "step": 5147 }, { "epoch": 0.7009803921568627, "grad_norm": 42.907902845429305, "learning_rate": 1.9884844517722704e-07, "logits/chosen": 13.53775691986084, "logits/rejected": 13.765995025634766, "logps/chosen": -4.086750507354736, "logps/rejected": -4.090607643127441, "loss": 4.1851, "rewards/accuracies": 0.5, "rewards/chosen": -40.86750411987305, "rewards/margins": 0.03857231140136719, "rewards/rejected": -40.90607452392578, "step": 5148 }, { "epoch": 0.7011165577342048, "grad_norm": 48.8220054475929, "learning_rate": 1.986841188555966e-07, "logits/chosen": 14.13778305053711, "logits/rejected": 12.393817901611328, "logps/chosen": -4.427501678466797, "logps/rejected": -3.951476812362671, "loss": 4.2447, "rewards/accuracies": 0.0, "rewards/chosen": -44.27501678466797, "rewards/margins": -4.760245323181152, "rewards/rejected": -39.5147705078125, "step": 5149 }, { "epoch": 0.7012527233115469, "grad_norm": 44.168764856949544, "learning_rate": 1.985198380229538e-07, "logits/chosen": 14.14738655090332, "logits/rejected": 13.982873916625977, "logps/chosen": -4.765775680541992, "logps/rejected": -4.414572715759277, "loss": 3.9789, "rewards/accuracies": 0.0, "rewards/chosen": -47.65775680541992, "rewards/margins": -3.512028694152832, "rewards/rejected": -44.145729064941406, "step": 5150 }, { "epoch": 0.7013888888888888, "grad_norm": 38.69992108043724, "learning_rate": 1.9835560271641887e-07, "logits/chosen": 12.565925598144531, "logits/rejected": 13.720312118530273, "logps/chosen": -4.020280838012695, "logps/rejected": -4.298962593078613, "loss": 3.4973, "rewards/accuracies": 0.75, "rewards/chosen": -40.20280456542969, "rewards/margins": 2.786823272705078, "rewards/rejected": -42.98963165283203, "step": 5151 }, { "epoch": 0.7015250544662309, "grad_norm": 43.904274846914106, "learning_rate": 1.9819141297310233e-07, "logits/chosen": 12.980688095092773, "logits/rejected": 13.557988166809082, "logps/chosen": -4.137541770935059, "logps/rejected": -4.432927131652832, "loss": 4.5287, "rewards/accuracies": 0.75, "rewards/chosen": -41.37541961669922, "rewards/margins": 2.953852653503418, "rewards/rejected": -44.32927322387695, "step": 5152 }, { "epoch": 0.701661220043573, "grad_norm": 41.881130854989294, "learning_rate": 1.9802726883010435e-07, "logits/chosen": 13.609830856323242, "logits/rejected": 13.876567840576172, "logps/chosen": -3.9976487159729004, "logps/rejected": -4.262933254241943, "loss": 3.5834, "rewards/accuracies": 0.75, "rewards/chosen": -39.97648620605469, "rewards/margins": 2.6528453826904297, "rewards/rejected": -42.62933349609375, "step": 5153 }, { "epoch": 0.701797385620915, "grad_norm": 46.376841792962054, "learning_rate": 1.9786317032451435e-07, "logits/chosen": 12.881016731262207, "logits/rejected": 12.38532829284668, "logps/chosen": -4.0645904541015625, "logps/rejected": -4.115062713623047, "loss": 3.6633, "rewards/accuracies": 0.5, "rewards/chosen": -40.645904541015625, "rewards/margins": 0.5047197341918945, "rewards/rejected": -41.15062713623047, "step": 5154 }, { "epoch": 0.7019335511982571, "grad_norm": 43.782722815747476, "learning_rate": 1.9769911749341186e-07, "logits/chosen": 13.363875389099121, "logits/rejected": 13.631282806396484, "logps/chosen": -4.361894130706787, "logps/rejected": -4.457499980926514, "loss": 3.854, "rewards/accuracies": 0.5, "rewards/chosen": -43.61894226074219, "rewards/margins": 0.9560575485229492, "rewards/rejected": -44.57499694824219, "step": 5155 }, { "epoch": 0.7020697167755992, "grad_norm": 43.646510543596136, "learning_rate": 1.9753511037386619e-07, "logits/chosen": 13.008121490478516, "logits/rejected": 13.807672500610352, "logps/chosen": -3.998999834060669, "logps/rejected": -4.026605606079102, "loss": 4.3513, "rewards/accuracies": 0.5, "rewards/chosen": -39.98999786376953, "rewards/margins": 0.2760591506958008, "rewards/rejected": -40.26605987548828, "step": 5156 }, { "epoch": 0.7022058823529411, "grad_norm": 40.4820715063934, "learning_rate": 1.9737114900293578e-07, "logits/chosen": 12.705286026000977, "logits/rejected": 12.991537094116211, "logps/chosen": -3.8836472034454346, "logps/rejected": -4.206111907958984, "loss": 3.9396, "rewards/accuracies": 1.0, "rewards/chosen": -38.83647155761719, "rewards/margins": 3.2246475219726562, "rewards/rejected": -42.061119079589844, "step": 5157 }, { "epoch": 0.7023420479302832, "grad_norm": 43.97895246390895, "learning_rate": 1.972072334176692e-07, "logits/chosen": 13.111054420471191, "logits/rejected": 13.986037254333496, "logps/chosen": -4.364346504211426, "logps/rejected": -4.479188919067383, "loss": 3.7091, "rewards/accuracies": 0.75, "rewards/chosen": -43.643470764160156, "rewards/margins": 1.148416519165039, "rewards/rejected": -44.79188537597656, "step": 5158 }, { "epoch": 0.7024782135076253, "grad_norm": 43.225202663978685, "learning_rate": 1.9704336365510464e-07, "logits/chosen": 13.698001861572266, "logits/rejected": 13.796252250671387, "logps/chosen": -4.1569905281066895, "logps/rejected": -4.529109477996826, "loss": 3.7937, "rewards/accuracies": 0.75, "rewards/chosen": -41.56990432739258, "rewards/margins": 3.721189498901367, "rewards/rejected": -45.29109191894531, "step": 5159 }, { "epoch": 0.7026143790849673, "grad_norm": 45.00824423621246, "learning_rate": 1.968795397522696e-07, "logits/chosen": 14.13144302368164, "logits/rejected": 13.977537155151367, "logps/chosen": -4.2321672439575195, "logps/rejected": -4.234480857849121, "loss": 4.0772, "rewards/accuracies": 0.5, "rewards/chosen": -42.321678161621094, "rewards/margins": 0.023130416870117188, "rewards/rejected": -42.34480667114258, "step": 5160 }, { "epoch": 0.7027505446623094, "grad_norm": 39.226899501650756, "learning_rate": 1.9671576174618156e-07, "logits/chosen": 13.35714340209961, "logits/rejected": 13.88271713256836, "logps/chosen": -4.306342124938965, "logps/rejected": -4.432406425476074, "loss": 3.7184, "rewards/accuracies": 0.5, "rewards/chosen": -43.06342315673828, "rewards/margins": 1.2606439590454102, "rewards/rejected": -44.324066162109375, "step": 5161 }, { "epoch": 0.7028867102396514, "grad_norm": 43.12552623323507, "learning_rate": 1.9655202967384766e-07, "logits/chosen": 12.869343757629395, "logits/rejected": 13.585455894470215, "logps/chosen": -4.339933395385742, "logps/rejected": -4.551843643188477, "loss": 3.8316, "rewards/accuracies": 0.75, "rewards/chosen": -43.399330139160156, "rewards/margins": 2.119108200073242, "rewards/rejected": -45.518436431884766, "step": 5162 }, { "epoch": 0.7030228758169934, "grad_norm": 37.65379566954797, "learning_rate": 1.9638834357226425e-07, "logits/chosen": 13.097785949707031, "logits/rejected": 13.162337303161621, "logps/chosen": -3.9964160919189453, "logps/rejected": -4.096813201904297, "loss": 3.4539, "rewards/accuracies": 0.5, "rewards/chosen": -39.96415710449219, "rewards/margins": 1.0039749145507812, "rewards/rejected": -40.96813201904297, "step": 5163 }, { "epoch": 0.7031590413943355, "grad_norm": 43.59427124907987, "learning_rate": 1.9622470347841764e-07, "logits/chosen": 13.046014785766602, "logits/rejected": 13.26268196105957, "logps/chosen": -4.2663750648498535, "logps/rejected": -4.303436279296875, "loss": 4.3495, "rewards/accuracies": 0.75, "rewards/chosen": -42.66374969482422, "rewards/margins": 0.37061214447021484, "rewards/rejected": -43.03436279296875, "step": 5164 }, { "epoch": 0.7032952069716776, "grad_norm": 39.05648239274157, "learning_rate": 1.960611094292839e-07, "logits/chosen": 14.527814865112305, "logits/rejected": 14.501303672790527, "logps/chosen": -4.484958648681641, "logps/rejected": -5.078371524810791, "loss": 3.6543, "rewards/accuracies": 1.0, "rewards/chosen": -44.849586486816406, "rewards/margins": 5.9341278076171875, "rewards/rejected": -50.783714294433594, "step": 5165 }, { "epoch": 0.7034313725490197, "grad_norm": 43.15838277689122, "learning_rate": 1.9589756146182809e-07, "logits/chosen": 13.46672534942627, "logits/rejected": 14.13743782043457, "logps/chosen": -4.0846028327941895, "logps/rejected": -4.220608711242676, "loss": 3.7876, "rewards/accuracies": 0.75, "rewards/chosen": -40.84602737426758, "rewards/margins": 1.3600635528564453, "rewards/rejected": -42.206092834472656, "step": 5166 }, { "epoch": 0.7035675381263616, "grad_norm": 38.62486462973846, "learning_rate": 1.957340596130054e-07, "logits/chosen": 13.6766357421875, "logits/rejected": 13.358510971069336, "logps/chosen": -4.290409088134766, "logps/rejected": -4.102843761444092, "loss": 4.1375, "rewards/accuracies": 0.25, "rewards/chosen": -42.904090881347656, "rewards/margins": -1.875655174255371, "rewards/rejected": -41.028438568115234, "step": 5167 }, { "epoch": 0.7037037037037037, "grad_norm": 45.72494627561545, "learning_rate": 1.9557060391976053e-07, "logits/chosen": 12.864178657531738, "logits/rejected": 13.281697273254395, "logps/chosen": -4.085831642150879, "logps/rejected": -4.240403175354004, "loss": 3.9019, "rewards/accuracies": 0.75, "rewards/chosen": -40.858314514160156, "rewards/margins": 1.5457134246826172, "rewards/rejected": -42.404029846191406, "step": 5168 }, { "epoch": 0.7038398692810458, "grad_norm": 36.326643001044594, "learning_rate": 1.9540719441902742e-07, "logits/chosen": 14.339338302612305, "logits/rejected": 13.840803146362305, "logps/chosen": -4.444511413574219, "logps/rejected": -4.364354133605957, "loss": 3.7202, "rewards/accuracies": 0.5, "rewards/chosen": -44.44511032104492, "rewards/margins": -0.8015651702880859, "rewards/rejected": -43.64354705810547, "step": 5169 }, { "epoch": 0.7039760348583878, "grad_norm": 63.30335722985156, "learning_rate": 1.9524383114772992e-07, "logits/chosen": 13.386041641235352, "logits/rejected": 13.483926773071289, "logps/chosen": -4.352120399475098, "logps/rejected": -4.624840259552002, "loss": 4.3945, "rewards/accuracies": 0.75, "rewards/chosen": -43.521202087402344, "rewards/margins": 2.727200508117676, "rewards/rejected": -46.24840545654297, "step": 5170 }, { "epoch": 0.7041122004357299, "grad_norm": 43.370174783580644, "learning_rate": 1.9508051414278147e-07, "logits/chosen": 13.103769302368164, "logits/rejected": 13.513044357299805, "logps/chosen": -4.163671493530273, "logps/rejected": -4.453301906585693, "loss": 3.3425, "rewards/accuracies": 0.5, "rewards/chosen": -41.636714935302734, "rewards/margins": 2.896306037902832, "rewards/rejected": -44.53302001953125, "step": 5171 }, { "epoch": 0.704248366013072, "grad_norm": 42.18204636812059, "learning_rate": 1.9491724344108452e-07, "logits/chosen": 13.218171119689941, "logits/rejected": 13.352376937866211, "logps/chosen": -4.128216743469238, "logps/rejected": -4.198334217071533, "loss": 3.5375, "rewards/accuracies": 0.5, "rewards/chosen": -41.28216552734375, "rewards/margins": 0.7011737823486328, "rewards/rejected": -41.983341217041016, "step": 5172 }, { "epoch": 0.7043845315904139, "grad_norm": 38.856102001855916, "learning_rate": 1.947540190795317e-07, "logits/chosen": 13.886436462402344, "logits/rejected": 13.988212585449219, "logps/chosen": -4.495659828186035, "logps/rejected": -4.6651716232299805, "loss": 3.6758, "rewards/accuracies": 0.75, "rewards/chosen": -44.956600189208984, "rewards/margins": 1.6951208114624023, "rewards/rejected": -46.6517219543457, "step": 5173 }, { "epoch": 0.704520697167756, "grad_norm": 43.107261053147305, "learning_rate": 1.9459084109500497e-07, "logits/chosen": 13.2537841796875, "logits/rejected": 12.712905883789062, "logps/chosen": -4.335926055908203, "logps/rejected": -4.13970947265625, "loss": 4.2949, "rewards/accuracies": 0.5, "rewards/chosen": -43.35926055908203, "rewards/margins": -1.9621658325195312, "rewards/rejected": -41.3970947265625, "step": 5174 }, { "epoch": 0.7046568627450981, "grad_norm": 43.039465819828585, "learning_rate": 1.9442770952437547e-07, "logits/chosen": 13.798532485961914, "logits/rejected": 14.275324821472168, "logps/chosen": -4.64646577835083, "logps/rejected": -4.76863956451416, "loss": 3.6699, "rewards/accuracies": 0.5, "rewards/chosen": -46.46465301513672, "rewards/margins": 1.2217426300048828, "rewards/rejected": -47.6864013671875, "step": 5175 }, { "epoch": 0.7047930283224401, "grad_norm": 45.30328148256404, "learning_rate": 1.942646244045043e-07, "logits/chosen": 13.47463321685791, "logits/rejected": 13.492254257202148, "logps/chosen": -4.481554985046387, "logps/rejected": -4.599623680114746, "loss": 3.8974, "rewards/accuracies": 0.5, "rewards/chosen": -44.815547943115234, "rewards/margins": 1.1806879043579102, "rewards/rejected": -45.99623489379883, "step": 5176 }, { "epoch": 0.7049291938997821, "grad_norm": 41.30197628540169, "learning_rate": 1.9410158577224203e-07, "logits/chosen": 13.823383331298828, "logits/rejected": 14.192505836486816, "logps/chosen": -4.3265204429626465, "logps/rejected": -4.359183311462402, "loss": 4.0996, "rewards/accuracies": 0.5, "rewards/chosen": -43.26520538330078, "rewards/margins": 0.3266267776489258, "rewards/rejected": -43.59183120727539, "step": 5177 }, { "epoch": 0.7050653594771242, "grad_norm": 42.142289868388964, "learning_rate": 1.9393859366442827e-07, "logits/chosen": 13.350468635559082, "logits/rejected": 13.241811752319336, "logps/chosen": -4.201394081115723, "logps/rejected": -4.44530725479126, "loss": 4.1758, "rewards/accuracies": 0.75, "rewards/chosen": -42.01394271850586, "rewards/margins": 2.4391307830810547, "rewards/rejected": -44.45307540893555, "step": 5178 }, { "epoch": 0.7052015250544662, "grad_norm": 37.59647700159345, "learning_rate": 1.9377564811789258e-07, "logits/chosen": 14.13862419128418, "logits/rejected": 14.395928382873535, "logps/chosen": -4.244136333465576, "logps/rejected": -4.669121742248535, "loss": 3.8145, "rewards/accuracies": 1.0, "rewards/chosen": -42.44136047363281, "rewards/margins": 4.249853134155273, "rewards/rejected": -46.69121551513672, "step": 5179 }, { "epoch": 0.7053376906318083, "grad_norm": 95.56260827484512, "learning_rate": 1.9361274916945401e-07, "logits/chosen": 13.402263641357422, "logits/rejected": 12.728599548339844, "logps/chosen": -4.354326248168945, "logps/rejected": -4.025872230529785, "loss": 4.4248, "rewards/accuracies": 0.25, "rewards/chosen": -43.54326629638672, "rewards/margins": -3.2845420837402344, "rewards/rejected": -40.258724212646484, "step": 5180 }, { "epoch": 0.7054738562091504, "grad_norm": 44.266678667462685, "learning_rate": 1.9344989685592065e-07, "logits/chosen": 12.87228012084961, "logits/rejected": 13.372376441955566, "logps/chosen": -4.0923357009887695, "logps/rejected": -4.336688995361328, "loss": 3.7247, "rewards/accuracies": 0.75, "rewards/chosen": -40.92335891723633, "rewards/margins": 2.4435348510742188, "rewards/rejected": -43.36688995361328, "step": 5181 }, { "epoch": 0.7056100217864923, "grad_norm": 40.3023074130774, "learning_rate": 1.9328709121409042e-07, "logits/chosen": 13.125814437866211, "logits/rejected": 13.528183937072754, "logps/chosen": -4.246404647827148, "logps/rejected": -4.504514217376709, "loss": 4.1447, "rewards/accuracies": 0.75, "rewards/chosen": -42.46405029296875, "rewards/margins": 2.5810928344726562, "rewards/rejected": -45.045143127441406, "step": 5182 }, { "epoch": 0.7057461873638344, "grad_norm": 44.55997158103814, "learning_rate": 1.9312433228075083e-07, "logits/chosen": 13.72146224975586, "logits/rejected": 13.983524322509766, "logps/chosen": -4.094071388244629, "logps/rejected": -4.361820697784424, "loss": 4.1051, "rewards/accuracies": 0.75, "rewards/chosen": -40.94071578979492, "rewards/margins": 2.677490234375, "rewards/rejected": -43.61820602416992, "step": 5183 }, { "epoch": 0.7058823529411765, "grad_norm": 41.46561387893873, "learning_rate": 1.9296162009267824e-07, "logits/chosen": 13.736823081970215, "logits/rejected": 13.462882041931152, "logps/chosen": -4.457399845123291, "logps/rejected": -4.387206077575684, "loss": 3.7207, "rewards/accuracies": 0.25, "rewards/chosen": -44.573997497558594, "rewards/margins": -0.7019338607788086, "rewards/rejected": -43.87206268310547, "step": 5184 }, { "epoch": 0.7060185185185185, "grad_norm": 41.17130658988053, "learning_rate": 1.92798954686639e-07, "logits/chosen": 12.344404220581055, "logits/rejected": 12.046137809753418, "logps/chosen": -3.935318946838379, "logps/rejected": -3.799199104309082, "loss": 3.8505, "rewards/accuracies": 0.5, "rewards/chosen": -39.353187561035156, "rewards/margins": -1.3611984252929688, "rewards/rejected": -37.99198913574219, "step": 5185 }, { "epoch": 0.7061546840958606, "grad_norm": 45.863961837241334, "learning_rate": 1.926363360993887e-07, "logits/chosen": 13.965023040771484, "logits/rejected": 14.178829193115234, "logps/chosen": -4.193129062652588, "logps/rejected": -4.476328372955322, "loss": 3.7612, "rewards/accuracies": 0.75, "rewards/chosen": -41.93128967285156, "rewards/margins": 2.8319950103759766, "rewards/rejected": -44.763282775878906, "step": 5186 }, { "epoch": 0.7062908496732027, "grad_norm": 44.5029638257824, "learning_rate": 1.9247376436767246e-07, "logits/chosen": 13.63190746307373, "logits/rejected": 13.975732803344727, "logps/chosen": -3.9727158546447754, "logps/rejected": -4.608236312866211, "loss": 4.1865, "rewards/accuracies": 1.0, "rewards/chosen": -39.72715759277344, "rewards/margins": 6.355207443237305, "rewards/rejected": -46.082366943359375, "step": 5187 }, { "epoch": 0.7064270152505446, "grad_norm": 39.37430721260823, "learning_rate": 1.9231123952822444e-07, "logits/chosen": 14.452347755432129, "logits/rejected": 14.230806350708008, "logps/chosen": -4.346803188323975, "logps/rejected": -4.3286943435668945, "loss": 3.8602, "rewards/accuracies": 0.5, "rewards/chosen": -43.46803283691406, "rewards/margins": -0.18108892440795898, "rewards/rejected": -43.28694152832031, "step": 5188 }, { "epoch": 0.7065631808278867, "grad_norm": 38.518656454367296, "learning_rate": 1.9214876161776865e-07, "logits/chosen": 13.074542045593262, "logits/rejected": 14.09396743774414, "logps/chosen": -4.109341621398926, "logps/rejected": -4.809821605682373, "loss": 3.2352, "rewards/accuracies": 1.0, "rewards/chosen": -41.093414306640625, "rewards/margins": 7.0048017501831055, "rewards/rejected": -48.09821319580078, "step": 5189 }, { "epoch": 0.7066993464052288, "grad_norm": 39.172333306357615, "learning_rate": 1.919863306730184e-07, "logits/chosen": 13.185213088989258, "logits/rejected": 13.831697463989258, "logps/chosen": -4.00417423248291, "logps/rejected": -4.3724164962768555, "loss": 3.6938, "rewards/accuracies": 0.75, "rewards/chosen": -40.0417366027832, "rewards/margins": 3.682431221008301, "rewards/rejected": -43.72416687011719, "step": 5190 }, { "epoch": 0.7068355119825708, "grad_norm": 40.18892973933238, "learning_rate": 1.918239467306761e-07, "logits/chosen": 12.48154067993164, "logits/rejected": 13.312064170837402, "logps/chosen": -3.9121651649475098, "logps/rejected": -4.2992658615112305, "loss": 4.142, "rewards/accuracies": 0.75, "rewards/chosen": -39.12165069580078, "rewards/margins": 3.871006965637207, "rewards/rejected": -42.99265670776367, "step": 5191 }, { "epoch": 0.7069716775599129, "grad_norm": 42.54893341800595, "learning_rate": 1.9166160982743382e-07, "logits/chosen": 13.42888355255127, "logits/rejected": 14.033821105957031, "logps/chosen": -4.173810958862305, "logps/rejected": -4.628559589385986, "loss": 3.7443, "rewards/accuracies": 1.0, "rewards/chosen": -41.73810958862305, "rewards/margins": 4.547489166259766, "rewards/rejected": -46.28559875488281, "step": 5192 }, { "epoch": 0.7071078431372549, "grad_norm": 42.748465683094906, "learning_rate": 1.914993199999729e-07, "logits/chosen": 14.531684875488281, "logits/rejected": 14.45634651184082, "logps/chosen": -4.677365303039551, "logps/rejected": -4.857008457183838, "loss": 4.0951, "rewards/accuracies": 0.5, "rewards/chosen": -46.773651123046875, "rewards/margins": 1.7964324951171875, "rewards/rejected": -48.57008361816406, "step": 5193 }, { "epoch": 0.7072440087145969, "grad_norm": 42.31531741900248, "learning_rate": 1.9133707728496428e-07, "logits/chosen": 14.091171264648438, "logits/rejected": 13.987665176391602, "logps/chosen": -4.870955467224121, "logps/rejected": -4.6323137283325195, "loss": 4.2067, "rewards/accuracies": 0.25, "rewards/chosen": -48.709556579589844, "rewards/margins": -2.386415481567383, "rewards/rejected": -46.32313919067383, "step": 5194 }, { "epoch": 0.707380174291939, "grad_norm": 50.63123457129773, "learning_rate": 1.9117488171906774e-07, "logits/chosen": 12.873165130615234, "logits/rejected": 13.87209701538086, "logps/chosen": -4.474983215332031, "logps/rejected": -4.695067882537842, "loss": 4.3456, "rewards/accuracies": 0.5, "rewards/chosen": -44.74983215332031, "rewards/margins": 2.2008466720581055, "rewards/rejected": -46.950679779052734, "step": 5195 }, { "epoch": 0.7075163398692811, "grad_norm": 43.216060381233305, "learning_rate": 1.9101273333893285e-07, "logits/chosen": 13.941976547241211, "logits/rejected": 13.936647415161133, "logps/chosen": -4.48119592666626, "logps/rejected": -4.373660087585449, "loss": 4.3487, "rewards/accuracies": 0.5, "rewards/chosen": -44.81195831298828, "rewards/margins": -1.0753583908081055, "rewards/rejected": -43.73659896850586, "step": 5196 }, { "epoch": 0.7076525054466231, "grad_norm": 42.97380433108709, "learning_rate": 1.9085063218119851e-07, "logits/chosen": 13.315546989440918, "logits/rejected": 14.720296859741211, "logps/chosen": -4.239866256713867, "logps/rejected": -4.638000965118408, "loss": 4.1055, "rewards/accuracies": 1.0, "rewards/chosen": -42.39866638183594, "rewards/margins": 3.9813461303710938, "rewards/rejected": -46.38001251220703, "step": 5197 }, { "epoch": 0.7077886710239651, "grad_norm": 40.20913503521909, "learning_rate": 1.9068857828249253e-07, "logits/chosen": 13.00398063659668, "logits/rejected": 13.732182502746582, "logps/chosen": -4.234971523284912, "logps/rejected": -4.418067932128906, "loss": 4.032, "rewards/accuracies": 1.0, "rewards/chosen": -42.34971618652344, "rewards/margins": 1.8309640884399414, "rewards/rejected": -44.18067932128906, "step": 5198 }, { "epoch": 0.7079248366013072, "grad_norm": 40.67234669090635, "learning_rate": 1.9052657167943242e-07, "logits/chosen": 13.718608856201172, "logits/rejected": 13.920936584472656, "logps/chosen": -4.180695056915283, "logps/rejected": -4.428947925567627, "loss": 3.8141, "rewards/accuracies": 0.75, "rewards/chosen": -41.806949615478516, "rewards/margins": 2.482529640197754, "rewards/rejected": -44.28948211669922, "step": 5199 }, { "epoch": 0.7080610021786492, "grad_norm": 43.27660677647476, "learning_rate": 1.9036461240862502e-07, "logits/chosen": 13.679054260253906, "logits/rejected": 15.358495712280273, "logps/chosen": -4.109796524047852, "logps/rejected": -4.6680474281311035, "loss": 3.5431, "rewards/accuracies": 0.75, "rewards/chosen": -41.09796905517578, "rewards/margins": 5.582507133483887, "rewards/rejected": -46.68047332763672, "step": 5200 }, { "epoch": 0.7081971677559913, "grad_norm": 39.69058561467603, "learning_rate": 1.902027005066664e-07, "logits/chosen": 14.14743423461914, "logits/rejected": 14.583112716674805, "logps/chosen": -4.491461753845215, "logps/rejected": -4.976922035217285, "loss": 3.7986, "rewards/accuracies": 1.0, "rewards/chosen": -44.914615631103516, "rewards/margins": 4.854602813720703, "rewards/rejected": -49.76921844482422, "step": 5201 }, { "epoch": 0.7083333333333334, "grad_norm": 45.90554036619822, "learning_rate": 1.9004083601014173e-07, "logits/chosen": 13.460456848144531, "logits/rejected": 14.275218963623047, "logps/chosen": -4.43563175201416, "logps/rejected": -4.492310047149658, "loss": 3.7426, "rewards/accuracies": 0.75, "rewards/chosen": -44.35631561279297, "rewards/margins": 0.5667819976806641, "rewards/rejected": -44.923099517822266, "step": 5202 }, { "epoch": 0.7084694989106753, "grad_norm": 39.25072059252871, "learning_rate": 1.8987901895562568e-07, "logits/chosen": 14.102334976196289, "logits/rejected": 14.178750991821289, "logps/chosen": -4.09364652633667, "logps/rejected": -4.419984817504883, "loss": 3.7872, "rewards/accuracies": 1.0, "rewards/chosen": -40.93646240234375, "rewards/margins": 3.2633838653564453, "rewards/rejected": -44.199851989746094, "step": 5203 }, { "epoch": 0.7086056644880174, "grad_norm": 41.23862522382576, "learning_rate": 1.8971724937968231e-07, "logits/chosen": 13.559350967407227, "logits/rejected": 14.473811149597168, "logps/chosen": -4.401334762573242, "logps/rejected": -4.721283912658691, "loss": 4.2689, "rewards/accuracies": 0.75, "rewards/chosen": -44.013343811035156, "rewards/margins": 3.1994895935058594, "rewards/rejected": -47.212833404541016, "step": 5204 }, { "epoch": 0.7087418300653595, "grad_norm": 45.98391993679931, "learning_rate": 1.8955552731886453e-07, "logits/chosen": 12.520511627197266, "logits/rejected": 12.465607643127441, "logps/chosen": -4.381992340087891, "logps/rejected": -4.078753471374512, "loss": 3.9448, "rewards/accuracies": 0.0, "rewards/chosen": -43.81991958618164, "rewards/margins": -3.0323848724365234, "rewards/rejected": -40.78753662109375, "step": 5205 }, { "epoch": 0.7088779956427015, "grad_norm": 38.57749334079023, "learning_rate": 1.8939385280971485e-07, "logits/chosen": 13.830537796020508, "logits/rejected": 13.680055618286133, "logps/chosen": -4.368220329284668, "logps/rejected": -4.4508280754089355, "loss": 3.7678, "rewards/accuracies": 0.75, "rewards/chosen": -43.68220520019531, "rewards/margins": 0.8260784149169922, "rewards/rejected": -44.50828552246094, "step": 5206 }, { "epoch": 0.7090141612200436, "grad_norm": 44.61512957495157, "learning_rate": 1.892322258887652e-07, "logits/chosen": 13.733024597167969, "logits/rejected": 13.548181533813477, "logps/chosen": -4.107558250427246, "logps/rejected": -4.127320766448975, "loss": 4.6706, "rewards/accuracies": 0.5, "rewards/chosen": -41.07558059692383, "rewards/margins": 0.19762420654296875, "rewards/rejected": -41.27320861816406, "step": 5207 }, { "epoch": 0.7091503267973857, "grad_norm": 40.835456777553325, "learning_rate": 1.890706465925362e-07, "logits/chosen": 13.014001846313477, "logits/rejected": 13.321874618530273, "logps/chosen": -4.118344306945801, "logps/rejected": -4.294760704040527, "loss": 3.2875, "rewards/accuracies": 0.75, "rewards/chosen": -41.183448791503906, "rewards/margins": 1.7641572952270508, "rewards/rejected": -42.94760513305664, "step": 5208 }, { "epoch": 0.7092864923747276, "grad_norm": 45.68853365995374, "learning_rate": 1.8890911495753814e-07, "logits/chosen": 13.863533020019531, "logits/rejected": 13.564062118530273, "logps/chosen": -4.0390448570251465, "logps/rejected": -4.604536056518555, "loss": 4.7196, "rewards/accuracies": 0.75, "rewards/chosen": -40.39044952392578, "rewards/margins": 5.654911994934082, "rewards/rejected": -46.04535675048828, "step": 5209 }, { "epoch": 0.7094226579520697, "grad_norm": 42.59577536593494, "learning_rate": 1.887476310202706e-07, "logits/chosen": 13.207059860229492, "logits/rejected": 13.540250778198242, "logps/chosen": -4.101773262023926, "logps/rejected": -4.462867736816406, "loss": 4.2361, "rewards/accuracies": 0.75, "rewards/chosen": -41.017730712890625, "rewards/margins": 3.6109466552734375, "rewards/rejected": -44.62867736816406, "step": 5210 }, { "epoch": 0.7095588235294118, "grad_norm": 44.07142168298345, "learning_rate": 1.8858619481722195e-07, "logits/chosen": 13.85702133178711, "logits/rejected": 13.736087799072266, "logps/chosen": -4.025105953216553, "logps/rejected": -4.401989459991455, "loss": 4.197, "rewards/accuracies": 1.0, "rewards/chosen": -40.251060485839844, "rewards/margins": 3.7688350677490234, "rewards/rejected": -44.019893646240234, "step": 5211 }, { "epoch": 0.7096949891067538, "grad_norm": 40.73789509815583, "learning_rate": 1.8842480638487007e-07, "logits/chosen": 13.393402099609375, "logits/rejected": 13.217253684997559, "logps/chosen": -4.448318958282471, "logps/rejected": -4.565449237823486, "loss": 3.8157, "rewards/accuracies": 0.25, "rewards/chosen": -44.48318862915039, "rewards/margins": 1.1713027954101562, "rewards/rejected": -45.65449142456055, "step": 5212 }, { "epoch": 0.7098311546840959, "grad_norm": 39.02217097101121, "learning_rate": 1.882634657596823e-07, "logits/chosen": 13.538019180297852, "logits/rejected": 14.151466369628906, "logps/chosen": -4.100942134857178, "logps/rejected": -4.408446788787842, "loss": 3.3111, "rewards/accuracies": 1.0, "rewards/chosen": -41.00941848754883, "rewards/margins": 3.0750484466552734, "rewards/rejected": -44.08446502685547, "step": 5213 }, { "epoch": 0.7099673202614379, "grad_norm": 40.208108356323024, "learning_rate": 1.881021729781145e-07, "logits/chosen": 13.192781448364258, "logits/rejected": 13.671239852905273, "logps/chosen": -4.3284807205200195, "logps/rejected": -4.68547248840332, "loss": 3.3892, "rewards/accuracies": 0.75, "rewards/chosen": -43.28480529785156, "rewards/margins": 3.5699167251586914, "rewards/rejected": -46.8547248840332, "step": 5214 }, { "epoch": 0.7101034858387799, "grad_norm": 42.19661398674028, "learning_rate": 1.879409280766123e-07, "logits/chosen": 13.823448181152344, "logits/rejected": 13.504533767700195, "logps/chosen": -4.33211612701416, "logps/rejected": -4.517520904541016, "loss": 3.3669, "rewards/accuracies": 0.75, "rewards/chosen": -43.32115936279297, "rewards/margins": 1.8540496826171875, "rewards/rejected": -45.17520523071289, "step": 5215 }, { "epoch": 0.710239651416122, "grad_norm": 38.64708407665575, "learning_rate": 1.8777973109161046e-07, "logits/chosen": 14.06934928894043, "logits/rejected": 14.458362579345703, "logps/chosen": -4.417301654815674, "logps/rejected": -4.771516799926758, "loss": 3.9629, "rewards/accuracies": 1.0, "rewards/chosen": -44.17301559448242, "rewards/margins": 3.542154312133789, "rewards/rejected": -47.71516799926758, "step": 5216 }, { "epoch": 0.7103758169934641, "grad_norm": 38.8113629325148, "learning_rate": 1.8761858205953241e-07, "logits/chosen": 14.480245590209961, "logits/rejected": 14.209793090820312, "logps/chosen": -4.446780681610107, "logps/rejected": -4.332367897033691, "loss": 4.0261, "rewards/accuracies": 0.25, "rewards/chosen": -44.467803955078125, "rewards/margins": -1.1441287994384766, "rewards/rejected": -43.32367706298828, "step": 5217 }, { "epoch": 0.710511982570806, "grad_norm": 35.394716803957486, "learning_rate": 1.874574810167913e-07, "logits/chosen": 13.400984764099121, "logits/rejected": 14.010169982910156, "logps/chosen": -4.458774566650391, "logps/rejected": -4.578591346740723, "loss": 3.5501, "rewards/accuracies": 0.5, "rewards/chosen": -44.587745666503906, "rewards/margins": 1.1981658935546875, "rewards/rejected": -45.785911560058594, "step": 5218 }, { "epoch": 0.7106481481481481, "grad_norm": 40.405185557147696, "learning_rate": 1.8729642799978946e-07, "logits/chosen": 12.949063301086426, "logits/rejected": 13.971677780151367, "logps/chosen": -4.000112056732178, "logps/rejected": -4.332635402679443, "loss": 3.4929, "rewards/accuracies": 0.75, "rewards/chosen": -40.001121520996094, "rewards/margins": 3.3252334594726562, "rewards/rejected": -43.32635498046875, "step": 5219 }, { "epoch": 0.7107843137254902, "grad_norm": 82.9156032843232, "learning_rate": 1.8713542304491777e-07, "logits/chosen": 14.118358612060547, "logits/rejected": 13.583066940307617, "logps/chosen": -4.429953575134277, "logps/rejected": -4.26430082321167, "loss": 4.2115, "rewards/accuracies": 0.5, "rewards/chosen": -44.299537658691406, "rewards/margins": -1.656529426574707, "rewards/rejected": -42.64300537109375, "step": 5220 }, { "epoch": 0.7109204793028322, "grad_norm": 41.87787169442215, "learning_rate": 1.869744661885568e-07, "logits/chosen": 13.332374572753906, "logits/rejected": 13.419322967529297, "logps/chosen": -4.08321475982666, "logps/rejected": -4.362923622131348, "loss": 4.2573, "rewards/accuracies": 1.0, "rewards/chosen": -40.832149505615234, "rewards/margins": 2.797086715698242, "rewards/rejected": -43.629234313964844, "step": 5221 }, { "epoch": 0.7110566448801743, "grad_norm": 44.5863383704526, "learning_rate": 1.868135574670762e-07, "logits/chosen": 13.734107971191406, "logits/rejected": 13.851099014282227, "logps/chosen": -4.477564811706543, "logps/rejected": -4.433114528656006, "loss": 4.4199, "rewards/accuracies": 0.5, "rewards/chosen": -44.7756462097168, "rewards/margins": -0.4445018768310547, "rewards/rejected": -44.33114242553711, "step": 5222 }, { "epoch": 0.7111928104575164, "grad_norm": 39.76812524017522, "learning_rate": 1.8665269691683437e-07, "logits/chosen": 12.807101249694824, "logits/rejected": 13.87124252319336, "logps/chosen": -3.853013753890991, "logps/rejected": -4.321375846862793, "loss": 3.6515, "rewards/accuracies": 0.75, "rewards/chosen": -38.5301399230957, "rewards/margins": 4.683619499206543, "rewards/rejected": -43.21376037597656, "step": 5223 }, { "epoch": 0.7113289760348583, "grad_norm": 41.295948499923874, "learning_rate": 1.8649188457417923e-07, "logits/chosen": 14.184236526489258, "logits/rejected": 15.008621215820312, "logps/chosen": -4.477975845336914, "logps/rejected": -4.562062740325928, "loss": 4.2573, "rewards/accuracies": 0.5, "rewards/chosen": -44.77975845336914, "rewards/margins": 0.8408699035644531, "rewards/rejected": -45.620628356933594, "step": 5224 }, { "epoch": 0.7114651416122004, "grad_norm": 46.2882784401345, "learning_rate": 1.8633112047544776e-07, "logits/chosen": 13.455244064331055, "logits/rejected": 13.509764671325684, "logps/chosen": -4.493990898132324, "logps/rejected": -4.381580829620361, "loss": 4.1559, "rewards/accuracies": 0.5, "rewards/chosen": -44.939910888671875, "rewards/margins": -1.124100685119629, "rewards/rejected": -43.81581115722656, "step": 5225 }, { "epoch": 0.7116013071895425, "grad_norm": 38.946540744274785, "learning_rate": 1.8617040465696573e-07, "logits/chosen": 13.692152976989746, "logits/rejected": 13.28453540802002, "logps/chosen": -4.229203224182129, "logps/rejected": -4.347476005554199, "loss": 3.6271, "rewards/accuracies": 0.5, "rewards/chosen": -42.292030334472656, "rewards/margins": 1.1827306747436523, "rewards/rejected": -43.474761962890625, "step": 5226 }, { "epoch": 0.7117374727668845, "grad_norm": 38.1476040213523, "learning_rate": 1.8600973715504828e-07, "logits/chosen": 13.525708198547363, "logits/rejected": 13.349381446838379, "logps/chosen": -4.309608459472656, "logps/rejected": -4.603906631469727, "loss": 3.7646, "rewards/accuracies": 0.75, "rewards/chosen": -43.09608840942383, "rewards/margins": 2.9429798126220703, "rewards/rejected": -46.039066314697266, "step": 5227 }, { "epoch": 0.7118736383442266, "grad_norm": 40.722552977700886, "learning_rate": 1.8584911800599974e-07, "logits/chosen": 13.478126525878906, "logits/rejected": 13.755483627319336, "logps/chosen": -4.219771385192871, "logps/rejected": -4.525272369384766, "loss": 4.1574, "rewards/accuracies": 0.75, "rewards/chosen": -42.197715759277344, "rewards/margins": 3.055008888244629, "rewards/rejected": -45.252723693847656, "step": 5228 }, { "epoch": 0.7120098039215687, "grad_norm": 50.940755823290544, "learning_rate": 1.8568854724611298e-07, "logits/chosen": 13.471578598022461, "logits/rejected": 13.59752368927002, "logps/chosen": -4.16356086730957, "logps/rejected": -4.127001762390137, "loss": 3.6108, "rewards/accuracies": 0.25, "rewards/chosen": -41.6356086730957, "rewards/margins": -0.3655853271484375, "rewards/rejected": -41.27001953125, "step": 5229 }, { "epoch": 0.7121459694989106, "grad_norm": 42.2665653047963, "learning_rate": 1.8552802491167053e-07, "logits/chosen": 12.176398277282715, "logits/rejected": 12.656822204589844, "logps/chosen": -3.574134349822998, "logps/rejected": -3.779359817504883, "loss": 4.1782, "rewards/accuracies": 0.5, "rewards/chosen": -35.7413444519043, "rewards/margins": 2.052255630493164, "rewards/rejected": -37.793601989746094, "step": 5230 }, { "epoch": 0.7122821350762527, "grad_norm": 41.68547960547775, "learning_rate": 1.853675510389438e-07, "logits/chosen": 12.937158584594727, "logits/rejected": 12.626331329345703, "logps/chosen": -4.311771392822266, "logps/rejected": -4.398390769958496, "loss": 4.3517, "rewards/accuracies": 0.5, "rewards/chosen": -43.117713928222656, "rewards/margins": 0.8661909103393555, "rewards/rejected": -43.98390579223633, "step": 5231 }, { "epoch": 0.7124183006535948, "grad_norm": 39.444172436499144, "learning_rate": 1.85207125664193e-07, "logits/chosen": 13.921066284179688, "logits/rejected": 14.254398345947266, "logps/chosen": -4.325573921203613, "logps/rejected": -4.626905918121338, "loss": 4.1805, "rewards/accuracies": 0.75, "rewards/chosen": -43.2557373046875, "rewards/margins": 3.013317108154297, "rewards/rejected": -46.26905822753906, "step": 5232 }, { "epoch": 0.7125544662309368, "grad_norm": 43.47120555260076, "learning_rate": 1.8504674882366758e-07, "logits/chosen": 13.508023262023926, "logits/rejected": 13.778787612915039, "logps/chosen": -4.412399768829346, "logps/rejected": -4.686089992523193, "loss": 4.2016, "rewards/accuracies": 0.75, "rewards/chosen": -44.124000549316406, "rewards/margins": 2.7369003295898438, "rewards/rejected": -46.86090087890625, "step": 5233 }, { "epoch": 0.7126906318082789, "grad_norm": 40.883317914237956, "learning_rate": 1.848864205536063e-07, "logits/chosen": 13.312984466552734, "logits/rejected": 14.245140075683594, "logps/chosen": -4.064992427825928, "logps/rejected": -4.384574890136719, "loss": 3.9828, "rewards/accuracies": 0.75, "rewards/chosen": -40.649925231933594, "rewards/margins": 3.195826530456543, "rewards/rejected": -43.84574890136719, "step": 5234 }, { "epoch": 0.7128267973856209, "grad_norm": 40.151981446670305, "learning_rate": 1.8472614089023625e-07, "logits/chosen": 13.73742389678955, "logits/rejected": 13.608314514160156, "logps/chosen": -4.245212554931641, "logps/rejected": -4.18256139755249, "loss": 3.8282, "rewards/accuracies": 0.75, "rewards/chosen": -42.45212936401367, "rewards/margins": -0.6265172958374023, "rewards/rejected": -41.82561492919922, "step": 5235 }, { "epoch": 0.7129629629629629, "grad_norm": 37.362078313588974, "learning_rate": 1.845659098697742e-07, "logits/chosen": 13.064871788024902, "logits/rejected": 13.486572265625, "logps/chosen": -4.367506980895996, "logps/rejected": -4.4636406898498535, "loss": 4.1694, "rewards/accuracies": 0.5, "rewards/chosen": -43.67506790161133, "rewards/margins": 0.9613409042358398, "rewards/rejected": -44.63640594482422, "step": 5236 }, { "epoch": 0.713099128540305, "grad_norm": 44.49871684378892, "learning_rate": 1.844057275284257e-07, "logits/chosen": 13.743049621582031, "logits/rejected": 14.377878189086914, "logps/chosen": -4.330362319946289, "logps/rejected": -4.692364692687988, "loss": 4.2453, "rewards/accuracies": 1.0, "rewards/chosen": -43.303619384765625, "rewards/margins": 3.620020866394043, "rewards/rejected": -46.92364501953125, "step": 5237 }, { "epoch": 0.7132352941176471, "grad_norm": 41.55077795615367, "learning_rate": 1.8424559390238504e-07, "logits/chosen": 13.681110382080078, "logits/rejected": 14.470584869384766, "logps/chosen": -4.193024635314941, "logps/rejected": -4.197604179382324, "loss": 4.421, "rewards/accuracies": 0.75, "rewards/chosen": -41.93024444580078, "rewards/margins": 0.045798301696777344, "rewards/rejected": -41.976043701171875, "step": 5238 }, { "epoch": 0.713371459694989, "grad_norm": 38.81730659117115, "learning_rate": 1.8408550902783588e-07, "logits/chosen": 13.614437103271484, "logits/rejected": 14.014528274536133, "logps/chosen": -4.277912616729736, "logps/rejected": -4.848016738891602, "loss": 3.5362, "rewards/accuracies": 0.75, "rewards/chosen": -42.77912521362305, "rewards/margins": 5.701043128967285, "rewards/rejected": -48.480167388916016, "step": 5239 }, { "epoch": 0.7135076252723311, "grad_norm": 47.142037371096386, "learning_rate": 1.8392547294095092e-07, "logits/chosen": 13.571754455566406, "logits/rejected": 14.20675277709961, "logps/chosen": -4.531246185302734, "logps/rejected": -4.564929008483887, "loss": 4.4849, "rewards/accuracies": 0.5, "rewards/chosen": -45.312461853027344, "rewards/margins": 0.3368263244628906, "rewards/rejected": -45.649288177490234, "step": 5240 }, { "epoch": 0.7136437908496732, "grad_norm": 40.58859600029027, "learning_rate": 1.8376548567789123e-07, "logits/chosen": 13.623238563537598, "logits/rejected": 13.58039665222168, "logps/chosen": -4.330004692077637, "logps/rejected": -4.599020957946777, "loss": 3.7908, "rewards/accuracies": 0.75, "rewards/chosen": -43.300048828125, "rewards/margins": 2.6901626586914062, "rewards/rejected": -45.990211486816406, "step": 5241 }, { "epoch": 0.7137799564270153, "grad_norm": 39.58496894993976, "learning_rate": 1.8360554727480749e-07, "logits/chosen": 13.656579971313477, "logits/rejected": 14.358687400817871, "logps/chosen": -4.2644429206848145, "logps/rejected": -4.615915775299072, "loss": 3.7367, "rewards/accuracies": 1.0, "rewards/chosen": -42.644432067871094, "rewards/margins": 3.514725685119629, "rewards/rejected": -46.159156799316406, "step": 5242 }, { "epoch": 0.7139161220043573, "grad_norm": 70.71426987361623, "learning_rate": 1.834456577678392e-07, "logits/chosen": 13.498531341552734, "logits/rejected": 14.072604179382324, "logps/chosen": -4.100085258483887, "logps/rejected": -4.606512546539307, "loss": 3.8891, "rewards/accuracies": 1.0, "rewards/chosen": -41.0008544921875, "rewards/margins": 5.064272880554199, "rewards/rejected": -46.06512451171875, "step": 5243 }, { "epoch": 0.7140522875816994, "grad_norm": 36.49676696416437, "learning_rate": 1.832858171931145e-07, "logits/chosen": 13.343828201293945, "logits/rejected": 14.218308448791504, "logps/chosen": -4.137424468994141, "logps/rejected": -4.570987224578857, "loss": 3.7553, "rewards/accuracies": 0.75, "rewards/chosen": -41.374244689941406, "rewards/margins": 4.335630416870117, "rewards/rejected": -45.70987319946289, "step": 5244 }, { "epoch": 0.7141884531590414, "grad_norm": 37.98853018523473, "learning_rate": 1.8312602558675074e-07, "logits/chosen": 13.812400817871094, "logits/rejected": 13.621963500976562, "logps/chosen": -4.3514580726623535, "logps/rejected": -4.166697025299072, "loss": 3.8804, "rewards/accuracies": 0.25, "rewards/chosen": -43.51457977294922, "rewards/margins": -1.847609519958496, "rewards/rejected": -41.66697311401367, "step": 5245 }, { "epoch": 0.7143246187363834, "grad_norm": 42.930955139170614, "learning_rate": 1.8296628298485436e-07, "logits/chosen": 14.317060470581055, "logits/rejected": 13.916618347167969, "logps/chosen": -4.417217254638672, "logps/rejected": -4.24761438369751, "loss": 3.7271, "rewards/accuracies": 0.25, "rewards/chosen": -44.17217254638672, "rewards/margins": -1.6960296630859375, "rewards/rejected": -42.47614288330078, "step": 5246 }, { "epoch": 0.7144607843137255, "grad_norm": 39.79461234360932, "learning_rate": 1.8280658942352017e-07, "logits/chosen": 13.515560150146484, "logits/rejected": 13.312347412109375, "logps/chosen": -3.9926223754882812, "logps/rejected": -3.9967188835144043, "loss": 4.0432, "rewards/accuracies": 0.25, "rewards/chosen": -39.92622375488281, "rewards/margins": 0.04096508026123047, "rewards/rejected": -39.96718978881836, "step": 5247 }, { "epoch": 0.7145969498910676, "grad_norm": 38.6574431937874, "learning_rate": 1.8264694493883251e-07, "logits/chosen": 13.266298294067383, "logits/rejected": 13.474472999572754, "logps/chosen": -4.028264999389648, "logps/rejected": -4.1710405349731445, "loss": 3.9054, "rewards/accuracies": 0.5, "rewards/chosen": -40.282649993896484, "rewards/margins": 1.4277496337890625, "rewards/rejected": -41.71039962768555, "step": 5248 }, { "epoch": 0.7147331154684096, "grad_norm": 39.70285020902079, "learning_rate": 1.824873495668644e-07, "logits/chosen": 13.408098220825195, "logits/rejected": 13.88101577758789, "logps/chosen": -4.273736000061035, "logps/rejected": -4.580848217010498, "loss": 4.0916, "rewards/accuracies": 1.0, "rewards/chosen": -42.73736572265625, "rewards/margins": 3.071117401123047, "rewards/rejected": -45.80847930908203, "step": 5249 }, { "epoch": 0.7148692810457516, "grad_norm": 40.09621963609866, "learning_rate": 1.8232780334367752e-07, "logits/chosen": 13.808862686157227, "logits/rejected": 14.170005798339844, "logps/chosen": -4.501155853271484, "logps/rejected": -4.5319623947143555, "loss": 3.981, "rewards/accuracies": 0.5, "rewards/chosen": -45.01156234741211, "rewards/margins": 0.30806636810302734, "rewards/rejected": -45.31962585449219, "step": 5250 }, { "epoch": 0.7150054466230937, "grad_norm": 39.39383801433163, "learning_rate": 1.8216830630532276e-07, "logits/chosen": 13.671063423156738, "logits/rejected": 14.249942779541016, "logps/chosen": -4.023536682128906, "logps/rejected": -4.481890678405762, "loss": 3.8763, "rewards/accuracies": 1.0, "rewards/chosen": -40.235374450683594, "rewards/margins": 4.583532333374023, "rewards/rejected": -44.818904876708984, "step": 5251 }, { "epoch": 0.7151416122004357, "grad_norm": 42.033903957509395, "learning_rate": 1.820088584878399e-07, "logits/chosen": 12.880812644958496, "logits/rejected": 13.360334396362305, "logps/chosen": -4.074260711669922, "logps/rejected": -4.085079193115234, "loss": 4.3877, "rewards/accuracies": 0.5, "rewards/chosen": -40.74260711669922, "rewards/margins": 0.10818004608154297, "rewards/rejected": -40.85078811645508, "step": 5252 }, { "epoch": 0.7152777777777778, "grad_norm": 40.619426422667246, "learning_rate": 1.8184945992725732e-07, "logits/chosen": 13.983561515808105, "logits/rejected": 13.623706817626953, "logps/chosen": -4.859367370605469, "logps/rejected": -4.68877649307251, "loss": 4.255, "rewards/accuracies": 0.25, "rewards/chosen": -48.59367370605469, "rewards/margins": -1.7059087753295898, "rewards/rejected": -46.88776397705078, "step": 5253 }, { "epoch": 0.7154139433551199, "grad_norm": 36.073605786331306, "learning_rate": 1.816901106595925e-07, "logits/chosen": 14.174457550048828, "logits/rejected": 13.462299346923828, "logps/chosen": -4.693617343902588, "logps/rejected": -4.562121391296387, "loss": 3.6986, "rewards/accuracies": 0.25, "rewards/chosen": -46.93617248535156, "rewards/margins": -1.3149595260620117, "rewards/rejected": -45.6212158203125, "step": 5254 }, { "epoch": 0.7155501089324618, "grad_norm": 38.36180046687954, "learning_rate": 1.815308107208519e-07, "logits/chosen": 13.165380477905273, "logits/rejected": 13.978334426879883, "logps/chosen": -4.064701557159424, "logps/rejected": -4.311805248260498, "loss": 3.6125, "rewards/accuracies": 0.5, "rewards/chosen": -40.64701461791992, "rewards/margins": 2.471038818359375, "rewards/rejected": -43.11804962158203, "step": 5255 }, { "epoch": 0.7156862745098039, "grad_norm": 42.03702654633132, "learning_rate": 1.8137156014703034e-07, "logits/chosen": 12.665449142456055, "logits/rejected": 13.280961990356445, "logps/chosen": -4.034445762634277, "logps/rejected": -4.173486709594727, "loss": 4.1169, "rewards/accuracies": 0.5, "rewards/chosen": -40.344451904296875, "rewards/margins": 1.3904132843017578, "rewards/rejected": -41.73487091064453, "step": 5256 }, { "epoch": 0.715822440087146, "grad_norm": 39.372271423061704, "learning_rate": 1.8121235897411195e-07, "logits/chosen": 12.446274757385254, "logits/rejected": 13.018949508666992, "logps/chosen": -4.050912857055664, "logps/rejected": -4.303005218505859, "loss": 3.9079, "rewards/accuracies": 0.75, "rewards/chosen": -40.509124755859375, "rewards/margins": 2.5209264755249023, "rewards/rejected": -43.030052185058594, "step": 5257 }, { "epoch": 0.715958605664488, "grad_norm": 37.74199199457962, "learning_rate": 1.810532072380697e-07, "logits/chosen": 13.431509971618652, "logits/rejected": 13.491697311401367, "logps/chosen": -4.569997787475586, "logps/rejected": -4.509734153747559, "loss": 3.8595, "rewards/accuracies": 0.25, "rewards/chosen": -45.69997787475586, "rewards/margins": -0.6026391983032227, "rewards/rejected": -45.09733963012695, "step": 5258 }, { "epoch": 0.7160947712418301, "grad_norm": 40.053064208245694, "learning_rate": 1.8089410497486503e-07, "logits/chosen": 12.924703598022461, "logits/rejected": 12.996248245239258, "logps/chosen": -4.202860355377197, "logps/rejected": -4.4963812828063965, "loss": 3.8342, "rewards/accuracies": 0.75, "rewards/chosen": -42.02859878540039, "rewards/margins": 2.935211181640625, "rewards/rejected": -44.96381378173828, "step": 5259 }, { "epoch": 0.7162309368191722, "grad_norm": 44.85618238315113, "learning_rate": 1.8073505222044844e-07, "logits/chosen": 13.164891242980957, "logits/rejected": 14.19556713104248, "logps/chosen": -3.927485704421997, "logps/rejected": -4.191860675811768, "loss": 4.2087, "rewards/accuracies": 0.5, "rewards/chosen": -39.27485656738281, "rewards/margins": 2.6437454223632812, "rewards/rejected": -41.918601989746094, "step": 5260 }, { "epoch": 0.7163671023965141, "grad_norm": 40.01025465993723, "learning_rate": 1.8057604901075942e-07, "logits/chosen": 13.75517463684082, "logits/rejected": 13.083642959594727, "logps/chosen": -4.398122787475586, "logps/rejected": -4.305856704711914, "loss": 4.2474, "rewards/accuracies": 0.5, "rewards/chosen": -43.981224060058594, "rewards/margins": -0.9226608276367188, "rewards/rejected": -43.05856704711914, "step": 5261 }, { "epoch": 0.7165032679738562, "grad_norm": 38.61629094442718, "learning_rate": 1.8041709538172577e-07, "logits/chosen": 13.394304275512695, "logits/rejected": 14.224003791809082, "logps/chosen": -4.110197067260742, "logps/rejected": -4.620372295379639, "loss": 3.4309, "rewards/accuracies": 0.75, "rewards/chosen": -41.10197067260742, "rewards/margins": 5.101755142211914, "rewards/rejected": -46.20372772216797, "step": 5262 }, { "epoch": 0.7166394335511983, "grad_norm": 38.12296482048082, "learning_rate": 1.802581913692645e-07, "logits/chosen": 13.505918502807617, "logits/rejected": 13.188817977905273, "logps/chosen": -4.123427867889404, "logps/rejected": -4.20407772064209, "loss": 3.385, "rewards/accuracies": 0.75, "rewards/chosen": -41.234275817871094, "rewards/margins": 0.8065004348754883, "rewards/rejected": -42.04077911376953, "step": 5263 }, { "epoch": 0.7167755991285403, "grad_norm": 38.82076296451599, "learning_rate": 1.8009933700928142e-07, "logits/chosen": 13.023507118225098, "logits/rejected": 13.126792907714844, "logps/chosen": -4.2733564376831055, "logps/rejected": -4.3942108154296875, "loss": 3.9341, "rewards/accuracies": 0.75, "rewards/chosen": -42.73356628417969, "rewards/margins": 1.2085437774658203, "rewards/rejected": -43.942108154296875, "step": 5264 }, { "epoch": 0.7169117647058824, "grad_norm": 37.364785800852935, "learning_rate": 1.7994053233767072e-07, "logits/chosen": 13.43962287902832, "logits/rejected": 13.579100608825684, "logps/chosen": -4.502019882202148, "logps/rejected": -4.604353427886963, "loss": 3.8924, "rewards/accuracies": 0.75, "rewards/chosen": -45.020198822021484, "rewards/margins": 1.0233354568481445, "rewards/rejected": -46.04353332519531, "step": 5265 }, { "epoch": 0.7170479302832244, "grad_norm": 39.639432826578044, "learning_rate": 1.7978177739031577e-07, "logits/chosen": 13.754083633422852, "logits/rejected": 13.300241470336914, "logps/chosen": -4.147886276245117, "logps/rejected": -4.411247730255127, "loss": 3.3417, "rewards/accuracies": 0.75, "rewards/chosen": -41.47886657714844, "rewards/margins": 2.633612632751465, "rewards/rejected": -44.11247634887695, "step": 5266 }, { "epoch": 0.7171840958605664, "grad_norm": 40.20478214714394, "learning_rate": 1.7962307220308874e-07, "logits/chosen": 13.213953971862793, "logits/rejected": 13.29541301727295, "logps/chosen": -4.20864725112915, "logps/rejected": -4.159008979797363, "loss": 3.7132, "rewards/accuracies": 0.5, "rewards/chosen": -42.08647155761719, "rewards/margins": -0.4963827133178711, "rewards/rejected": -41.590087890625, "step": 5267 }, { "epoch": 0.7173202614379085, "grad_norm": 42.6194681642925, "learning_rate": 1.7946441681185003e-07, "logits/chosen": 13.229637145996094, "logits/rejected": 13.547220230102539, "logps/chosen": -4.305068016052246, "logps/rejected": -4.35806131362915, "loss": 4.5034, "rewards/accuracies": 0.5, "rewards/chosen": -43.050682067871094, "rewards/margins": 0.5299320220947266, "rewards/rejected": -43.58061218261719, "step": 5268 }, { "epoch": 0.7174564270152506, "grad_norm": 37.40597400837326, "learning_rate": 1.793058112524493e-07, "logits/chosen": 13.240486145019531, "logits/rejected": 14.359964370727539, "logps/chosen": -4.212830543518066, "logps/rejected": -4.78416633605957, "loss": 3.9812, "rewards/accuracies": 1.0, "rewards/chosen": -42.12830352783203, "rewards/margins": 5.713360786437988, "rewards/rejected": -47.8416633605957, "step": 5269 }, { "epoch": 0.7175925925925926, "grad_norm": 40.5474263941232, "learning_rate": 1.7914725556072491e-07, "logits/chosen": 13.355947494506836, "logits/rejected": 13.643779754638672, "logps/chosen": -4.276450157165527, "logps/rejected": -4.422099590301514, "loss": 4.0923, "rewards/accuracies": 0.75, "rewards/chosen": -42.76449966430664, "rewards/margins": 1.456496238708496, "rewards/rejected": -44.22099685668945, "step": 5270 }, { "epoch": 0.7177287581699346, "grad_norm": 46.939393316716554, "learning_rate": 1.7898874977250363e-07, "logits/chosen": 14.265237808227539, "logits/rejected": 14.001163482666016, "logps/chosen": -4.223361015319824, "logps/rejected": -4.417430877685547, "loss": 4.0477, "rewards/accuracies": 0.75, "rewards/chosen": -42.233604431152344, "rewards/margins": 1.9407033920288086, "rewards/rejected": -44.17430877685547, "step": 5271 }, { "epoch": 0.7178649237472767, "grad_norm": 42.02353893268543, "learning_rate": 1.7883029392360123e-07, "logits/chosen": 13.942441940307617, "logits/rejected": 14.110116004943848, "logps/chosen": -4.133846282958984, "logps/rejected": -4.447174072265625, "loss": 3.9568, "rewards/accuracies": 0.75, "rewards/chosen": -41.338462829589844, "rewards/margins": 3.133275032043457, "rewards/rejected": -44.47174072265625, "step": 5272 }, { "epoch": 0.7180010893246187, "grad_norm": 40.59772500823311, "learning_rate": 1.7867188804982223e-07, "logits/chosen": 14.019868850708008, "logits/rejected": 15.017905235290527, "logps/chosen": -4.361129283905029, "logps/rejected": -4.7715044021606445, "loss": 3.9651, "rewards/accuracies": 0.75, "rewards/chosen": -43.61129379272461, "rewards/margins": 4.1037492752075195, "rewards/rejected": -47.71504211425781, "step": 5273 }, { "epoch": 0.7181372549019608, "grad_norm": 49.211449112845266, "learning_rate": 1.7851353218695952e-07, "logits/chosen": 13.869478225708008, "logits/rejected": 14.312955856323242, "logps/chosen": -4.480283737182617, "logps/rejected": -5.033023357391357, "loss": 4.1347, "rewards/accuracies": 1.0, "rewards/chosen": -44.802833557128906, "rewards/margins": 5.527396202087402, "rewards/rejected": -50.330230712890625, "step": 5274 }, { "epoch": 0.7182734204793029, "grad_norm": 38.9429295253177, "learning_rate": 1.7835522637079504e-07, "logits/chosen": 13.670001983642578, "logits/rejected": 13.807419776916504, "logps/chosen": -4.433756351470947, "logps/rejected": -4.541982650756836, "loss": 4.3011, "rewards/accuracies": 0.75, "rewards/chosen": -44.337562561035156, "rewards/margins": 1.0822620391845703, "rewards/rejected": -45.41982650756836, "step": 5275 }, { "epoch": 0.7184095860566448, "grad_norm": 51.98954064244077, "learning_rate": 1.7819697063709942e-07, "logits/chosen": 12.88571548461914, "logits/rejected": 13.649757385253906, "logps/chosen": -4.086601734161377, "logps/rejected": -4.217309951782227, "loss": 4.4865, "rewards/accuracies": 0.5, "rewards/chosen": -40.86601638793945, "rewards/margins": 1.3070793151855469, "rewards/rejected": -42.173095703125, "step": 5276 }, { "epoch": 0.7185457516339869, "grad_norm": 45.309154882317976, "learning_rate": 1.780387650216316e-07, "logits/chosen": 14.023114204406738, "logits/rejected": 14.10267448425293, "logps/chosen": -4.110753059387207, "logps/rejected": -4.456368923187256, "loss": 4.6561, "rewards/accuracies": 0.75, "rewards/chosen": -41.10752868652344, "rewards/margins": 3.4561614990234375, "rewards/rejected": -44.563690185546875, "step": 5277 }, { "epoch": 0.718681917211329, "grad_norm": 54.74023337587237, "learning_rate": 1.778806095601396e-07, "logits/chosen": 13.168610572814941, "logits/rejected": 13.924898147583008, "logps/chosen": -3.8879473209381104, "logps/rejected": -4.404070854187012, "loss": 3.0902, "rewards/accuracies": 1.0, "rewards/chosen": -38.87947463989258, "rewards/margins": 5.161233901977539, "rewards/rejected": -44.040706634521484, "step": 5278 }, { "epoch": 0.718818082788671, "grad_norm": 39.38133948898772, "learning_rate": 1.7772250428836002e-07, "logits/chosen": 13.704957962036133, "logits/rejected": 14.548795700073242, "logps/chosen": -4.148191452026367, "logps/rejected": -4.759110450744629, "loss": 3.4027, "rewards/accuracies": 1.0, "rewards/chosen": -41.48191452026367, "rewards/margins": 6.109186172485352, "rewards/rejected": -47.591102600097656, "step": 5279 }, { "epoch": 0.7189542483660131, "grad_norm": 47.5163197951515, "learning_rate": 1.7756444924201786e-07, "logits/chosen": 13.575371742248535, "logits/rejected": 14.129085540771484, "logps/chosen": -4.0803704261779785, "logps/rejected": -4.351914405822754, "loss": 3.928, "rewards/accuracies": 0.75, "rewards/chosen": -40.803707122802734, "rewards/margins": 2.7154417037963867, "rewards/rejected": -43.51914978027344, "step": 5280 }, { "epoch": 0.7190904139433552, "grad_norm": 40.76540721803641, "learning_rate": 1.7740644445682701e-07, "logits/chosen": 13.711374282836914, "logits/rejected": 14.536216735839844, "logps/chosen": -4.170173168182373, "logps/rejected": -4.668457984924316, "loss": 3.9815, "rewards/accuracies": 1.0, "rewards/chosen": -41.70173263549805, "rewards/margins": 4.982850074768066, "rewards/rejected": -46.6845817565918, "step": 5281 }, { "epoch": 0.7192265795206971, "grad_norm": 39.741835538628806, "learning_rate": 1.772484899684902e-07, "logits/chosen": 14.470558166503906, "logits/rejected": 14.496360778808594, "logps/chosen": -4.351552963256836, "logps/rejected": -4.45851993560791, "loss": 3.5149, "rewards/accuracies": 0.75, "rewards/chosen": -43.51552963256836, "rewards/margins": 1.0696678161621094, "rewards/rejected": -44.58519744873047, "step": 5282 }, { "epoch": 0.7193627450980392, "grad_norm": 40.32539244420521, "learning_rate": 1.770905858126982e-07, "logits/chosen": 14.166243553161621, "logits/rejected": 14.697314262390137, "logps/chosen": -4.466863632202148, "logps/rejected": -4.801723957061768, "loss": 4.0596, "rewards/accuracies": 0.75, "rewards/chosen": -44.668636322021484, "rewards/margins": 3.348605155944824, "rewards/rejected": -48.017242431640625, "step": 5283 }, { "epoch": 0.7194989106753813, "grad_norm": 41.04748452103214, "learning_rate": 1.7693273202513096e-07, "logits/chosen": 13.607314109802246, "logits/rejected": 13.944695472717285, "logps/chosen": -4.3098554611206055, "logps/rejected": -4.63536262512207, "loss": 3.4953, "rewards/accuracies": 0.75, "rewards/chosen": -43.09855270385742, "rewards/margins": 3.2550697326660156, "rewards/rejected": -46.35362243652344, "step": 5284 }, { "epoch": 0.7196350762527233, "grad_norm": 42.85907118942793, "learning_rate": 1.7677492864145678e-07, "logits/chosen": 14.287464141845703, "logits/rejected": 13.937631607055664, "logps/chosen": -4.129934310913086, "logps/rejected": -4.245062828063965, "loss": 4.1367, "rewards/accuracies": 0.75, "rewards/chosen": -41.299346923828125, "rewards/margins": 1.151285171508789, "rewards/rejected": -42.45063018798828, "step": 5285 }, { "epoch": 0.7197712418300654, "grad_norm": 40.313955124362984, "learning_rate": 1.7661717569733284e-07, "logits/chosen": 14.394041061401367, "logits/rejected": 13.823302268981934, "logps/chosen": -4.578636646270752, "logps/rejected": -4.601856231689453, "loss": 3.9091, "rewards/accuracies": 0.25, "rewards/chosen": -45.78636932373047, "rewards/margins": 0.2321929931640625, "rewards/rejected": -46.01856231689453, "step": 5286 }, { "epoch": 0.7199074074074074, "grad_norm": 40.32732743457064, "learning_rate": 1.7645947322840437e-07, "logits/chosen": 13.970748901367188, "logits/rejected": 14.259592056274414, "logps/chosen": -4.380767822265625, "logps/rejected": -4.619205474853516, "loss": 3.5181, "rewards/accuracies": 0.75, "rewards/chosen": -43.80767822265625, "rewards/margins": 2.384370803833008, "rewards/rejected": -46.19205093383789, "step": 5287 }, { "epoch": 0.7200435729847494, "grad_norm": 47.657685391819854, "learning_rate": 1.7630182127030576e-07, "logits/chosen": 13.787201881408691, "logits/rejected": 14.039556503295898, "logps/chosen": -4.3781633377075195, "logps/rejected": -4.339783668518066, "loss": 4.5255, "rewards/accuracies": 0.25, "rewards/chosen": -43.78163528442383, "rewards/margins": -0.3838005065917969, "rewards/rejected": -43.39783477783203, "step": 5288 }, { "epoch": 0.7201797385620915, "grad_norm": 42.280523405763965, "learning_rate": 1.7614421985865984e-07, "logits/chosen": 14.3713960647583, "logits/rejected": 13.762206077575684, "logps/chosen": -4.501652717590332, "logps/rejected": -4.454667091369629, "loss": 3.7983, "rewards/accuracies": 0.5, "rewards/chosen": -45.01652908325195, "rewards/margins": -0.4698610305786133, "rewards/rejected": -44.546669006347656, "step": 5289 }, { "epoch": 0.7203159041394336, "grad_norm": 41.49873294663742, "learning_rate": 1.7598666902907776e-07, "logits/chosen": 13.864543914794922, "logits/rejected": 14.493827819824219, "logps/chosen": -4.352217674255371, "logps/rejected": -4.889129161834717, "loss": 4.0774, "rewards/accuracies": 1.0, "rewards/chosen": -43.52217483520508, "rewards/margins": 5.369117736816406, "rewards/rejected": -48.891292572021484, "step": 5290 }, { "epoch": 0.7204520697167756, "grad_norm": 41.88993301982128, "learning_rate": 1.758291688171595e-07, "logits/chosen": 13.316864013671875, "logits/rejected": 13.309621810913086, "logps/chosen": -4.078346252441406, "logps/rejected": -4.146320343017578, "loss": 4.198, "rewards/accuracies": 0.5, "rewards/chosen": -40.7834587097168, "rewards/margins": 0.6797428131103516, "rewards/rejected": -41.46320343017578, "step": 5291 }, { "epoch": 0.7205882352941176, "grad_norm": 44.014156001737746, "learning_rate": 1.7567171925849354e-07, "logits/chosen": 13.382268905639648, "logits/rejected": 13.848758697509766, "logps/chosen": -4.344661712646484, "logps/rejected": -4.440443515777588, "loss": 3.9061, "rewards/accuracies": 0.25, "rewards/chosen": -43.446617126464844, "rewards/margins": 0.9578189849853516, "rewards/rejected": -44.40443420410156, "step": 5292 }, { "epoch": 0.7207244008714597, "grad_norm": 39.81964429108218, "learning_rate": 1.7551432038865714e-07, "logits/chosen": 13.473219871520996, "logits/rejected": 14.492400169372559, "logps/chosen": -4.233223915100098, "logps/rejected": -4.751204013824463, "loss": 3.4067, "rewards/accuracies": 0.75, "rewards/chosen": -42.332237243652344, "rewards/margins": 5.179804801940918, "rewards/rejected": -47.51203918457031, "step": 5293 }, { "epoch": 0.7208605664488017, "grad_norm": 48.839746967176175, "learning_rate": 1.7535697224321546e-07, "logits/chosen": 13.353652954101562, "logits/rejected": 13.209785461425781, "logps/chosen": -4.398528099060059, "logps/rejected": -4.333682537078857, "loss": 4.046, "rewards/accuracies": 0.25, "rewards/chosen": -43.98527526855469, "rewards/margins": -0.6484527587890625, "rewards/rejected": -43.336822509765625, "step": 5294 }, { "epoch": 0.7209967320261438, "grad_norm": 50.815179506186084, "learning_rate": 1.7519967485772286e-07, "logits/chosen": 13.875381469726562, "logits/rejected": 14.210689544677734, "logps/chosen": -4.578690528869629, "logps/rejected": -4.892499923706055, "loss": 3.2944, "rewards/accuracies": 1.0, "rewards/chosen": -45.786903381347656, "rewards/margins": 3.13809871673584, "rewards/rejected": -48.92500305175781, "step": 5295 }, { "epoch": 0.7211328976034859, "grad_norm": 39.108166697624796, "learning_rate": 1.7504242826772208e-07, "logits/chosen": 13.516740798950195, "logits/rejected": 14.165283203125, "logps/chosen": -4.418638229370117, "logps/rejected": -4.8140459060668945, "loss": 3.4899, "rewards/accuracies": 1.0, "rewards/chosen": -44.18638610839844, "rewards/margins": 3.954075813293457, "rewards/rejected": -48.14046096801758, "step": 5296 }, { "epoch": 0.7212690631808278, "grad_norm": 43.3886493541181, "learning_rate": 1.74885232508744e-07, "logits/chosen": 14.116753578186035, "logits/rejected": 14.643823623657227, "logps/chosen": -4.208134174346924, "logps/rejected": -4.464313983917236, "loss": 3.5248, "rewards/accuracies": 1.0, "rewards/chosen": -42.08134078979492, "rewards/margins": 2.5617990493774414, "rewards/rejected": -44.64313888549805, "step": 5297 }, { "epoch": 0.7214052287581699, "grad_norm": 46.85563515712901, "learning_rate": 1.7472808761630845e-07, "logits/chosen": 14.31031608581543, "logits/rejected": 14.970474243164062, "logps/chosen": -4.639158248901367, "logps/rejected": -4.729525089263916, "loss": 4.2486, "rewards/accuracies": 0.75, "rewards/chosen": -46.39158248901367, "rewards/margins": 0.9036664962768555, "rewards/rejected": -47.295249938964844, "step": 5298 }, { "epoch": 0.721541394335512, "grad_norm": 45.459393800039535, "learning_rate": 1.745709936259236e-07, "logits/chosen": 14.71323013305664, "logits/rejected": 15.206731796264648, "logps/chosen": -4.49012565612793, "logps/rejected": -4.594394683837891, "loss": 3.9356, "rewards/accuracies": 0.75, "rewards/chosen": -44.9012565612793, "rewards/margins": 1.0426921844482422, "rewards/rejected": -45.943946838378906, "step": 5299 }, { "epoch": 0.721677559912854, "grad_norm": 44.14303427523109, "learning_rate": 1.7441395057308634e-07, "logits/chosen": 14.478212356567383, "logits/rejected": 14.569247245788574, "logps/chosen": -4.816022872924805, "logps/rejected": -4.952582359313965, "loss": 3.6156, "rewards/accuracies": 0.75, "rewards/chosen": -48.16022491455078, "rewards/margins": 1.3655929565429688, "rewards/rejected": -49.525821685791016, "step": 5300 }, { "epoch": 0.7218137254901961, "grad_norm": 41.49816728258632, "learning_rate": 1.742569584932815e-07, "logits/chosen": 14.40926742553711, "logits/rejected": 15.298450469970703, "logps/chosen": -4.270304203033447, "logps/rejected": -4.565212726593018, "loss": 3.9445, "rewards/accuracies": 0.75, "rewards/chosen": -42.703041076660156, "rewards/margins": 2.9490833282470703, "rewards/rejected": -45.65212631225586, "step": 5301 }, { "epoch": 0.7219498910675382, "grad_norm": 43.31593275143848, "learning_rate": 1.7410001742198288e-07, "logits/chosen": 13.616958618164062, "logits/rejected": 13.913797378540039, "logps/chosen": -4.288795471191406, "logps/rejected": -4.347745418548584, "loss": 3.991, "rewards/accuracies": 0.5, "rewards/chosen": -42.88795471191406, "rewards/margins": 0.5894994735717773, "rewards/rejected": -43.477455139160156, "step": 5302 }, { "epoch": 0.7220860566448801, "grad_norm": 47.1633775810273, "learning_rate": 1.7394312739465282e-07, "logits/chosen": 14.423961639404297, "logits/rejected": 13.721309661865234, "logps/chosen": -4.7382659912109375, "logps/rejected": -4.505459785461426, "loss": 4.0256, "rewards/accuracies": 0.25, "rewards/chosen": -47.382659912109375, "rewards/margins": -2.328062057495117, "rewards/rejected": -45.054595947265625, "step": 5303 }, { "epoch": 0.7222222222222222, "grad_norm": 47.17747985021074, "learning_rate": 1.7378628844674154e-07, "logits/chosen": 13.765593528747559, "logits/rejected": 14.015020370483398, "logps/chosen": -4.540074825286865, "logps/rejected": -4.61287784576416, "loss": 3.6201, "rewards/accuracies": 0.75, "rewards/chosen": -45.40074920654297, "rewards/margins": 0.7280330657958984, "rewards/rejected": -46.128780364990234, "step": 5304 }, { "epoch": 0.7223583877995643, "grad_norm": 48.16494249211623, "learning_rate": 1.736295006136883e-07, "logits/chosen": 13.889095306396484, "logits/rejected": 13.71537971496582, "logps/chosen": -4.487019062042236, "logps/rejected": -4.48323392868042, "loss": 3.8406, "rewards/accuracies": 0.5, "rewards/chosen": -44.87018966674805, "rewards/margins": -0.037853240966796875, "rewards/rejected": -44.83233642578125, "step": 5305 }, { "epoch": 0.7224945533769063, "grad_norm": 49.22143709406512, "learning_rate": 1.7347276393092076e-07, "logits/chosen": 14.464719772338867, "logits/rejected": 14.3330078125, "logps/chosen": -4.376938819885254, "logps/rejected": -4.636961936950684, "loss": 4.4311, "rewards/accuracies": 0.75, "rewards/chosen": -43.76939010620117, "rewards/margins": 2.6002283096313477, "rewards/rejected": -46.3696174621582, "step": 5306 }, { "epoch": 0.7226307189542484, "grad_norm": 39.540646893892685, "learning_rate": 1.7331607843385454e-07, "logits/chosen": 13.591596603393555, "logits/rejected": 13.828428268432617, "logps/chosen": -4.297165393829346, "logps/rejected": -4.517151832580566, "loss": 3.7099, "rewards/accuracies": 1.0, "rewards/chosen": -42.97165298461914, "rewards/margins": 2.19986629486084, "rewards/rejected": -45.1715202331543, "step": 5307 }, { "epoch": 0.7227668845315904, "grad_norm": 42.833611480082084, "learning_rate": 1.731594441578942e-07, "logits/chosen": 14.308446884155273, "logits/rejected": 14.721994400024414, "logps/chosen": -4.708198070526123, "logps/rejected": -4.622364521026611, "loss": 3.9361, "rewards/accuracies": 0.25, "rewards/chosen": -47.08197784423828, "rewards/margins": -0.8583354949951172, "rewards/rejected": -46.22364807128906, "step": 5308 }, { "epoch": 0.7229030501089324, "grad_norm": 42.21485095757023, "learning_rate": 1.7300286113843266e-07, "logits/chosen": 12.643213272094727, "logits/rejected": 13.297567367553711, "logps/chosen": -3.9783716201782227, "logps/rejected": -4.416194915771484, "loss": 3.8799, "rewards/accuracies": 0.75, "rewards/chosen": -39.783714294433594, "rewards/margins": 4.378232955932617, "rewards/rejected": -44.16194534301758, "step": 5309 }, { "epoch": 0.7230392156862745, "grad_norm": 43.24594738470446, "learning_rate": 1.728463294108509e-07, "logits/chosen": 14.00330638885498, "logits/rejected": 13.778190612792969, "logps/chosen": -4.595667362213135, "logps/rejected": -4.704740524291992, "loss": 3.8236, "rewards/accuracies": 0.5, "rewards/chosen": -45.95667266845703, "rewards/margins": 1.0907316207885742, "rewards/rejected": -47.04740524291992, "step": 5310 }, { "epoch": 0.7231753812636166, "grad_norm": 43.874470126653996, "learning_rate": 1.726898490105187e-07, "logits/chosen": 13.748725891113281, "logits/rejected": 14.084705352783203, "logps/chosen": -4.606954574584961, "logps/rejected": -4.677448272705078, "loss": 3.8737, "rewards/accuracies": 0.75, "rewards/chosen": -46.06954574584961, "rewards/margins": 0.7049341201782227, "rewards/rejected": -46.774478912353516, "step": 5311 }, { "epoch": 0.7233115468409586, "grad_norm": 39.72866689620585, "learning_rate": 1.725334199727942e-07, "logits/chosen": 13.011102676391602, "logits/rejected": 14.387396812438965, "logps/chosen": -4.236118316650391, "logps/rejected": -4.5723090171813965, "loss": 3.7561, "rewards/accuracies": 0.75, "rewards/chosen": -42.361183166503906, "rewards/margins": 3.3619089126586914, "rewards/rejected": -45.72309112548828, "step": 5312 }, { "epoch": 0.7234477124183006, "grad_norm": 43.04575986685416, "learning_rate": 1.7237704233302353e-07, "logits/chosen": 14.0779390335083, "logits/rejected": 14.099526405334473, "logps/chosen": -4.36965799331665, "logps/rejected": -4.390444755554199, "loss": 3.2256, "rewards/accuracies": 0.5, "rewards/chosen": -43.69657897949219, "rewards/margins": 0.2078714370727539, "rewards/rejected": -43.904449462890625, "step": 5313 }, { "epoch": 0.7235838779956427, "grad_norm": 41.523076048733365, "learning_rate": 1.7222071612654174e-07, "logits/chosen": 14.123856544494629, "logits/rejected": 14.896846771240234, "logps/chosen": -4.4499287605285645, "logps/rejected": -4.493516445159912, "loss": 4.1622, "rewards/accuracies": 0.5, "rewards/chosen": -44.49928283691406, "rewards/margins": 0.4358816146850586, "rewards/rejected": -44.93516540527344, "step": 5314 }, { "epoch": 0.7237200435729847, "grad_norm": 44.216220373402514, "learning_rate": 1.720644413886721e-07, "logits/chosen": 13.916975021362305, "logits/rejected": 14.628746032714844, "logps/chosen": -4.526090145111084, "logps/rejected": -4.897173881530762, "loss": 4.4579, "rewards/accuracies": 0.75, "rewards/chosen": -45.260902404785156, "rewards/margins": 3.7108373641967773, "rewards/rejected": -48.97174072265625, "step": 5315 }, { "epoch": 0.7238562091503268, "grad_norm": 42.96210509646332, "learning_rate": 1.7190821815472595e-07, "logits/chosen": 13.605329513549805, "logits/rejected": 13.663084030151367, "logps/chosen": -4.413234710693359, "logps/rejected": -4.546624660491943, "loss": 3.6333, "rewards/accuracies": 0.5, "rewards/chosen": -44.132347106933594, "rewards/margins": 1.3339014053344727, "rewards/rejected": -45.46624755859375, "step": 5316 }, { "epoch": 0.7239923747276689, "grad_norm": 46.29005674117974, "learning_rate": 1.717520464600033e-07, "logits/chosen": 14.167924880981445, "logits/rejected": 13.677986145019531, "logps/chosen": -4.479180335998535, "logps/rejected": -4.63275671005249, "loss": 4.2984, "rewards/accuracies": 0.75, "rewards/chosen": -44.79180145263672, "rewards/margins": 1.5357666015625, "rewards/rejected": -46.327571868896484, "step": 5317 }, { "epoch": 0.724128540305011, "grad_norm": 39.926590880680926, "learning_rate": 1.7159592633979263e-07, "logits/chosen": 13.303789138793945, "logits/rejected": 13.922738075256348, "logps/chosen": -4.194716453552246, "logps/rejected": -4.21044397354126, "loss": 4.0514, "rewards/accuracies": 0.5, "rewards/chosen": -41.94715881347656, "rewards/margins": 0.15728092193603516, "rewards/rejected": -42.10444259643555, "step": 5318 }, { "epoch": 0.7242647058823529, "grad_norm": 40.322884936959305, "learning_rate": 1.7143985782937026e-07, "logits/chosen": 13.837532997131348, "logits/rejected": 14.872583389282227, "logps/chosen": -4.178309440612793, "logps/rejected": -4.865655422210693, "loss": 3.4086, "rewards/accuracies": 1.0, "rewards/chosen": -41.78309631347656, "rewards/margins": 6.8734588623046875, "rewards/rejected": -48.65655517578125, "step": 5319 }, { "epoch": 0.724400871459695, "grad_norm": 42.64856741622218, "learning_rate": 1.7128384096400136e-07, "logits/chosen": 13.771547317504883, "logits/rejected": 14.404321670532227, "logps/chosen": -4.186805725097656, "logps/rejected": -4.4207868576049805, "loss": 4.381, "rewards/accuracies": 0.75, "rewards/chosen": -41.86805725097656, "rewards/margins": 2.3398170471191406, "rewards/rejected": -44.20787048339844, "step": 5320 }, { "epoch": 0.7245370370370371, "grad_norm": 40.87167650069443, "learning_rate": 1.711278757789393e-07, "logits/chosen": 13.034911155700684, "logits/rejected": 13.423517227172852, "logps/chosen": -4.255364418029785, "logps/rejected": -4.564948081970215, "loss": 3.6736, "rewards/accuracies": 0.75, "rewards/chosen": -42.55364227294922, "rewards/margins": 3.0958404541015625, "rewards/rejected": -45.64948272705078, "step": 5321 }, { "epoch": 0.7246732026143791, "grad_norm": 40.95701212173249, "learning_rate": 1.7097196230942542e-07, "logits/chosen": 14.067911148071289, "logits/rejected": 14.348494529724121, "logps/chosen": -4.349518775939941, "logps/rejected": -4.433755874633789, "loss": 3.3377, "rewards/accuracies": 0.5, "rewards/chosen": -43.49518585205078, "rewards/margins": 0.8423728942871094, "rewards/rejected": -44.337562561035156, "step": 5322 }, { "epoch": 0.7248093681917211, "grad_norm": 42.37826809406078, "learning_rate": 1.708161005906898e-07, "logits/chosen": 13.690824508666992, "logits/rejected": 13.29957389831543, "logps/chosen": -4.123040199279785, "logps/rejected": -4.4958271980285645, "loss": 3.5491, "rewards/accuracies": 0.75, "rewards/chosen": -41.230403900146484, "rewards/margins": 3.727869987487793, "rewards/rejected": -44.958274841308594, "step": 5323 }, { "epoch": 0.7249455337690632, "grad_norm": 40.00020065438423, "learning_rate": 1.7066029065795088e-07, "logits/chosen": 14.217169761657715, "logits/rejected": 14.449246406555176, "logps/chosen": -4.673315525054932, "logps/rejected": -4.6622514724731445, "loss": 4.1495, "rewards/accuracies": 0.5, "rewards/chosen": -46.733158111572266, "rewards/margins": -0.1106405258178711, "rewards/rejected": -46.62251663208008, "step": 5324 }, { "epoch": 0.7250816993464052, "grad_norm": 43.475805996780906, "learning_rate": 1.705045325464149e-07, "logits/chosen": 13.90989875793457, "logits/rejected": 14.011391639709473, "logps/chosen": -4.287370681762695, "logps/rejected": -4.480611801147461, "loss": 3.7438, "rewards/accuracies": 0.5, "rewards/chosen": -42.87371063232422, "rewards/margins": 1.9324073791503906, "rewards/rejected": -44.806114196777344, "step": 5325 }, { "epoch": 0.7252178649237473, "grad_norm": 46.349481373662606, "learning_rate": 1.703488262912768e-07, "logits/chosen": 13.745584487915039, "logits/rejected": 14.682061195373535, "logps/chosen": -4.381411552429199, "logps/rejected": -4.459504127502441, "loss": 3.1732, "rewards/accuracies": 0.75, "rewards/chosen": -43.814117431640625, "rewards/margins": 0.7809257507324219, "rewards/rejected": -44.59504318237305, "step": 5326 }, { "epoch": 0.7253540305010894, "grad_norm": 45.59249157055057, "learning_rate": 1.7019317192771988e-07, "logits/chosen": 13.989357948303223, "logits/rejected": 14.529438018798828, "logps/chosen": -4.258185386657715, "logps/rejected": -4.696250915527344, "loss": 4.5877, "rewards/accuracies": 0.75, "rewards/chosen": -42.58185577392578, "rewards/margins": 4.3806562423706055, "rewards/rejected": -46.96250915527344, "step": 5327 }, { "epoch": 0.7254901960784313, "grad_norm": 44.96552337967237, "learning_rate": 1.7003756949091518e-07, "logits/chosen": 13.194368362426758, "logits/rejected": 13.49785041809082, "logps/chosen": -4.034738063812256, "logps/rejected": -4.46315860748291, "loss": 4.1345, "rewards/accuracies": 1.0, "rewards/chosen": -40.34737777709961, "rewards/margins": 4.284205436706543, "rewards/rejected": -44.63158416748047, "step": 5328 }, { "epoch": 0.7256263616557734, "grad_norm": 54.96142203730251, "learning_rate": 1.6988201901602258e-07, "logits/chosen": 13.73179817199707, "logits/rejected": 14.184566497802734, "logps/chosen": -4.307783126831055, "logps/rejected": -4.58388614654541, "loss": 4.5261, "rewards/accuracies": 0.75, "rewards/chosen": -43.07783508300781, "rewards/margins": 2.7610273361206055, "rewards/rejected": -45.83885955810547, "step": 5329 }, { "epoch": 0.7257625272331155, "grad_norm": 41.01160871414413, "learning_rate": 1.6972652053819004e-07, "logits/chosen": 15.257381439208984, "logits/rejected": 14.5994873046875, "logps/chosen": -4.626118183135986, "logps/rejected": -4.941766738891602, "loss": 3.5748, "rewards/accuracies": 0.75, "rewards/chosen": -46.26118087768555, "rewards/margins": 3.1564865112304688, "rewards/rejected": -49.417667388916016, "step": 5330 }, { "epoch": 0.7258986928104575, "grad_norm": 45.5899246923672, "learning_rate": 1.6957107409255355e-07, "logits/chosen": 13.646171569824219, "logits/rejected": 12.827637672424316, "logps/chosen": -4.621671199798584, "logps/rejected": -4.465423583984375, "loss": 4.153, "rewards/accuracies": 0.25, "rewards/chosen": -46.216712951660156, "rewards/margins": -1.5624799728393555, "rewards/rejected": -44.654232025146484, "step": 5331 }, { "epoch": 0.7260348583877996, "grad_norm": 41.74483987807624, "learning_rate": 1.694156797142376e-07, "logits/chosen": 14.080741882324219, "logits/rejected": 14.4273681640625, "logps/chosen": -4.814694404602051, "logps/rejected": -4.529231071472168, "loss": 3.5175, "rewards/accuracies": 0.25, "rewards/chosen": -48.146942138671875, "rewards/margins": -2.8546323776245117, "rewards/rejected": -45.29231262207031, "step": 5332 }, { "epoch": 0.7261710239651417, "grad_norm": 44.265571120338805, "learning_rate": 1.6926033743835503e-07, "logits/chosen": 14.01894760131836, "logits/rejected": 13.913108825683594, "logps/chosen": -4.374636650085449, "logps/rejected": -4.434388160705566, "loss": 3.3639, "rewards/accuracies": 0.75, "rewards/chosen": -43.746368408203125, "rewards/margins": 0.5975093841552734, "rewards/rejected": -44.34387969970703, "step": 5333 }, { "epoch": 0.7263071895424836, "grad_norm": 42.683856565753786, "learning_rate": 1.6910504730000635e-07, "logits/chosen": 14.22774887084961, "logits/rejected": 14.18600845336914, "logps/chosen": -4.391031265258789, "logps/rejected": -4.408731937408447, "loss": 3.8079, "rewards/accuracies": 0.75, "rewards/chosen": -43.910316467285156, "rewards/margins": 0.17700576782226562, "rewards/rejected": -44.087318420410156, "step": 5334 }, { "epoch": 0.7264433551198257, "grad_norm": 58.03329651725504, "learning_rate": 1.6894980933428085e-07, "logits/chosen": 14.08273696899414, "logits/rejected": 14.41860580444336, "logps/chosen": -4.253661155700684, "logps/rejected": -4.764466762542725, "loss": 3.6913, "rewards/accuracies": 0.75, "rewards/chosen": -42.53661346435547, "rewards/margins": 5.108055591583252, "rewards/rejected": -47.64466857910156, "step": 5335 }, { "epoch": 0.7265795206971678, "grad_norm": 43.30573480690903, "learning_rate": 1.6879462357625592e-07, "logits/chosen": 13.749443054199219, "logits/rejected": 14.810382843017578, "logps/chosen": -4.253431797027588, "logps/rejected": -4.734308242797852, "loss": 3.7099, "rewards/accuracies": 0.75, "rewards/chosen": -42.53431701660156, "rewards/margins": 4.80876350402832, "rewards/rejected": -47.34307861328125, "step": 5336 }, { "epoch": 0.7267156862745098, "grad_norm": 44.44787678332708, "learning_rate": 1.6863949006099684e-07, "logits/chosen": 14.541762351989746, "logits/rejected": 14.659132957458496, "logps/chosen": -4.763396263122559, "logps/rejected": -4.730592727661133, "loss": 3.6349, "rewards/accuracies": 0.25, "rewards/chosen": -47.63396453857422, "rewards/margins": -0.3280363082885742, "rewards/rejected": -47.30592727661133, "step": 5337 }, { "epoch": 0.7268518518518519, "grad_norm": 46.55628028741865, "learning_rate": 1.6848440882355744e-07, "logits/chosen": 13.23411750793457, "logits/rejected": 13.717939376831055, "logps/chosen": -3.8782176971435547, "logps/rejected": -4.3112101554870605, "loss": 3.9394, "rewards/accuracies": 1.0, "rewards/chosen": -38.78217697143555, "rewards/margins": 4.329923629760742, "rewards/rejected": -43.112098693847656, "step": 5338 }, { "epoch": 0.726988017429194, "grad_norm": 43.85861587133862, "learning_rate": 1.6832937989897967e-07, "logits/chosen": 14.939203262329102, "logits/rejected": 14.654911994934082, "logps/chosen": -4.772723197937012, "logps/rejected": -4.710069179534912, "loss": 3.8442, "rewards/accuracies": 0.25, "rewards/chosen": -47.727230072021484, "rewards/margins": -0.6265373229980469, "rewards/rejected": -47.10069274902344, "step": 5339 }, { "epoch": 0.7271241830065359, "grad_norm": 43.14305437106007, "learning_rate": 1.6817440332229346e-07, "logits/chosen": 14.184905052185059, "logits/rejected": 13.33143424987793, "logps/chosen": -4.375591278076172, "logps/rejected": -3.9720754623413086, "loss": 3.9174, "rewards/accuracies": 0.0, "rewards/chosen": -43.755916595458984, "rewards/margins": -4.035161018371582, "rewards/rejected": -39.72075653076172, "step": 5340 }, { "epoch": 0.727260348583878, "grad_norm": 41.630055039914126, "learning_rate": 1.6801947912851703e-07, "logits/chosen": 14.541360855102539, "logits/rejected": 14.133930206298828, "logps/chosen": -4.447466850280762, "logps/rejected": -4.579226016998291, "loss": 3.7026, "rewards/accuracies": 0.75, "rewards/chosen": -44.47467041015625, "rewards/margins": 1.3175907135009766, "rewards/rejected": -45.792259216308594, "step": 5341 }, { "epoch": 0.7273965141612201, "grad_norm": 40.08304135462943, "learning_rate": 1.6786460735265706e-07, "logits/chosen": 14.014005661010742, "logits/rejected": 14.758018493652344, "logps/chosen": -4.52820348739624, "logps/rejected": -5.0413289070129395, "loss": 4.0334, "rewards/accuracies": 1.0, "rewards/chosen": -45.28203582763672, "rewards/margins": 5.131251335144043, "rewards/rejected": -50.41328811645508, "step": 5342 }, { "epoch": 0.7275326797385621, "grad_norm": 42.149327266678085, "learning_rate": 1.6770978802970776e-07, "logits/chosen": 12.821052551269531, "logits/rejected": 14.203329086303711, "logps/chosen": -3.9955105781555176, "logps/rejected": -4.352138519287109, "loss": 3.6424, "rewards/accuracies": 0.75, "rewards/chosen": -39.955108642578125, "rewards/margins": 3.5662755966186523, "rewards/rejected": -43.521385192871094, "step": 5343 }, { "epoch": 0.7276688453159041, "grad_norm": 43.40562995606044, "learning_rate": 1.6755502119465197e-07, "logits/chosen": 14.516939163208008, "logits/rejected": 15.148481369018555, "logps/chosen": -4.731081962585449, "logps/rejected": -5.056042194366455, "loss": 4.2178, "rewards/accuracies": 1.0, "rewards/chosen": -47.310821533203125, "rewards/margins": 3.249600410461426, "rewards/rejected": -50.5604248046875, "step": 5344 }, { "epoch": 0.7278050108932462, "grad_norm": 51.50993885079415, "learning_rate": 1.674003068824607e-07, "logits/chosen": 13.436712265014648, "logits/rejected": 13.820159912109375, "logps/chosen": -4.31476354598999, "logps/rejected": -4.530271530151367, "loss": 4.3981, "rewards/accuracies": 0.5, "rewards/chosen": -43.14763641357422, "rewards/margins": 2.155083656311035, "rewards/rejected": -45.30271911621094, "step": 5345 }, { "epoch": 0.7279411764705882, "grad_norm": 42.971665397318844, "learning_rate": 1.6724564512809266e-07, "logits/chosen": 13.953267097473145, "logits/rejected": 15.398926734924316, "logps/chosen": -4.443672180175781, "logps/rejected": -5.166438102722168, "loss": 3.4312, "rewards/accuracies": 0.75, "rewards/chosen": -44.43671798706055, "rewards/margins": 7.227666854858398, "rewards/rejected": -51.66438293457031, "step": 5346 }, { "epoch": 0.7280773420479303, "grad_norm": 45.78924637001789, "learning_rate": 1.6709103596649502e-07, "logits/chosen": 12.633962631225586, "logits/rejected": 13.783217430114746, "logps/chosen": -4.124434947967529, "logps/rejected": -4.572056293487549, "loss": 4.1416, "rewards/accuracies": 0.75, "rewards/chosen": -41.24435043334961, "rewards/margins": 4.4762115478515625, "rewards/rejected": -45.720558166503906, "step": 5347 }, { "epoch": 0.7282135076252724, "grad_norm": 43.12608671366383, "learning_rate": 1.6693647943260323e-07, "logits/chosen": 13.055583953857422, "logits/rejected": 14.167829513549805, "logps/chosen": -4.177371501922607, "logps/rejected": -4.704863548278809, "loss": 3.5463, "rewards/accuracies": 1.0, "rewards/chosen": -41.773712158203125, "rewards/margins": 5.274920463562012, "rewards/rejected": -47.04863739013672, "step": 5348 }, { "epoch": 0.7283496732026143, "grad_norm": 43.68005721975116, "learning_rate": 1.667819755613403e-07, "logits/chosen": 14.251897811889648, "logits/rejected": 14.181818008422852, "logps/chosen": -4.55173397064209, "logps/rejected": -4.7264790534973145, "loss": 3.8787, "rewards/accuracies": 0.5, "rewards/chosen": -45.517337799072266, "rewards/margins": 1.7474536895751953, "rewards/rejected": -47.264793395996094, "step": 5349 }, { "epoch": 0.7284858387799564, "grad_norm": 41.51941876161432, "learning_rate": 1.6662752438761776e-07, "logits/chosen": 14.24756145477295, "logits/rejected": 14.405402183532715, "logps/chosen": -4.845302581787109, "logps/rejected": -4.690787315368652, "loss": 3.8263, "rewards/accuracies": 0.25, "rewards/chosen": -48.45302200317383, "rewards/margins": -1.5451507568359375, "rewards/rejected": -46.90787124633789, "step": 5350 }, { "epoch": 0.7286220043572985, "grad_norm": 43.05949184510235, "learning_rate": 1.6647312594633532e-07, "logits/chosen": 14.510424613952637, "logits/rejected": 14.887173652648926, "logps/chosen": -4.564831256866455, "logps/rejected": -4.793883323669434, "loss": 4.0776, "rewards/accuracies": 0.5, "rewards/chosen": -45.648311614990234, "rewards/margins": 2.290522575378418, "rewards/rejected": -47.93883514404297, "step": 5351 }, { "epoch": 0.7287581699346405, "grad_norm": 40.21943790267196, "learning_rate": 1.6631878027238027e-07, "logits/chosen": 14.208173751831055, "logits/rejected": 14.12386703491211, "logps/chosen": -4.45077657699585, "logps/rejected": -4.514582633972168, "loss": 3.7295, "rewards/accuracies": 0.5, "rewards/chosen": -44.50776672363281, "rewards/margins": 0.6380548477172852, "rewards/rejected": -45.14582443237305, "step": 5352 }, { "epoch": 0.7288943355119826, "grad_norm": 49.56744152065581, "learning_rate": 1.6616448740062845e-07, "logits/chosen": 13.735234260559082, "logits/rejected": 13.807613372802734, "logps/chosen": -4.56027889251709, "logps/rejected": -4.4011640548706055, "loss": 4.495, "rewards/accuracies": 0.25, "rewards/chosen": -45.602783203125, "rewards/margins": -1.5911455154418945, "rewards/rejected": -44.01164245605469, "step": 5353 }, { "epoch": 0.7290305010893247, "grad_norm": 46.97924874789019, "learning_rate": 1.6601024736594376e-07, "logits/chosen": 14.220200538635254, "logits/rejected": 14.156526565551758, "logps/chosen": -4.8409013748168945, "logps/rejected": -4.627614498138428, "loss": 4.0967, "rewards/accuracies": 0.25, "rewards/chosen": -48.40901565551758, "rewards/margins": -2.132870674133301, "rewards/rejected": -46.27614212036133, "step": 5354 }, { "epoch": 0.7291666666666666, "grad_norm": 41.58836409648836, "learning_rate": 1.6585606020317772e-07, "logits/chosen": 14.221790313720703, "logits/rejected": 15.378671646118164, "logps/chosen": -4.4672064781188965, "logps/rejected": -5.147215843200684, "loss": 3.6115, "rewards/accuracies": 1.0, "rewards/chosen": -44.67206573486328, "rewards/margins": 6.800092697143555, "rewards/rejected": -51.47216033935547, "step": 5355 }, { "epoch": 0.7293028322440087, "grad_norm": 39.69510604944999, "learning_rate": 1.6570192594717032e-07, "logits/chosen": 13.52559757232666, "logits/rejected": 13.91398811340332, "logps/chosen": -4.229820251464844, "logps/rejected": -4.50543212890625, "loss": 4.0089, "rewards/accuracies": 0.75, "rewards/chosen": -42.29820251464844, "rewards/margins": 2.756114959716797, "rewards/rejected": -45.0543212890625, "step": 5356 }, { "epoch": 0.7294389978213508, "grad_norm": 41.82109194584261, "learning_rate": 1.655478446327496e-07, "logits/chosen": 14.089801788330078, "logits/rejected": 14.116788864135742, "logps/chosen": -4.070101737976074, "logps/rejected": -4.14650297164917, "loss": 4.2781, "rewards/accuracies": 0.5, "rewards/chosen": -40.70102310180664, "rewards/margins": 0.7640066146850586, "rewards/rejected": -41.46502685546875, "step": 5357 }, { "epoch": 0.7295751633986928, "grad_norm": 41.27421435724965, "learning_rate": 1.653938162947313e-07, "logits/chosen": 14.26763916015625, "logits/rejected": 14.214910507202148, "logps/chosen": -4.638478755950928, "logps/rejected": -4.564753532409668, "loss": 4.0051, "rewards/accuracies": 0.5, "rewards/chosen": -46.384788513183594, "rewards/margins": -0.7372522354125977, "rewards/rejected": -45.64753723144531, "step": 5358 }, { "epoch": 0.7297113289760349, "grad_norm": 46.004559926740754, "learning_rate": 1.6523984096791944e-07, "logits/chosen": 14.344520568847656, "logits/rejected": 14.625673294067383, "logps/chosen": -4.50339412689209, "logps/rejected": -4.809350967407227, "loss": 4.1314, "rewards/accuracies": 0.75, "rewards/chosen": -45.033939361572266, "rewards/margins": 3.059572219848633, "rewards/rejected": -48.09351348876953, "step": 5359 }, { "epoch": 0.7298474945533769, "grad_norm": 41.41377653746607, "learning_rate": 1.650859186871062e-07, "logits/chosen": 13.641873359680176, "logits/rejected": 14.597074508666992, "logps/chosen": -4.413386344909668, "logps/rejected": -4.699707508087158, "loss": 4.0852, "rewards/accuracies": 0.75, "rewards/chosen": -44.13386535644531, "rewards/margins": 2.8632125854492188, "rewards/rejected": -46.99707794189453, "step": 5360 }, { "epoch": 0.7299836601307189, "grad_norm": 55.51341990558047, "learning_rate": 1.6493204948707132e-07, "logits/chosen": 14.181375503540039, "logits/rejected": 13.843734741210938, "logps/chosen": -4.2526044845581055, "logps/rejected": -4.213237285614014, "loss": 3.7691, "rewards/accuracies": 0.5, "rewards/chosen": -42.52604293823242, "rewards/margins": -0.39367103576660156, "rewards/rejected": -42.13237380981445, "step": 5361 }, { "epoch": 0.730119825708061, "grad_norm": 43.31321618717673, "learning_rate": 1.6477823340258295e-07, "logits/chosen": 13.668488502502441, "logits/rejected": 13.831387519836426, "logps/chosen": -4.0361738204956055, "logps/rejected": -4.312006950378418, "loss": 4.138, "rewards/accuracies": 0.75, "rewards/chosen": -40.36173629760742, "rewards/margins": 2.758334159851074, "rewards/rejected": -43.12007141113281, "step": 5362 }, { "epoch": 0.7302559912854031, "grad_norm": 45.452427600051394, "learning_rate": 1.6462447046839727e-07, "logits/chosen": 14.507272720336914, "logits/rejected": 14.570671081542969, "logps/chosen": -4.614851951599121, "logps/rejected": -4.418300628662109, "loss": 4.0032, "rewards/accuracies": 0.0, "rewards/chosen": -46.14851379394531, "rewards/margins": -1.9655065536499023, "rewards/rejected": -44.183006286621094, "step": 5363 }, { "epoch": 0.7303921568627451, "grad_norm": 41.84978811238785, "learning_rate": 1.6447076071925792e-07, "logits/chosen": 14.558370590209961, "logits/rejected": 13.522550582885742, "logps/chosen": -4.6118927001953125, "logps/rejected": -4.2691450119018555, "loss": 3.9914, "rewards/accuracies": 0.0, "rewards/chosen": -46.118927001953125, "rewards/margins": -3.4274768829345703, "rewards/rejected": -42.69144821166992, "step": 5364 }, { "epoch": 0.7305283224400871, "grad_norm": 42.64720896795318, "learning_rate": 1.6431710418989715e-07, "logits/chosen": 15.264996528625488, "logits/rejected": 15.474002838134766, "logps/chosen": -4.785537242889404, "logps/rejected": -4.972231864929199, "loss": 3.958, "rewards/accuracies": 0.75, "rewards/chosen": -47.855369567871094, "rewards/margins": 1.8669452667236328, "rewards/rejected": -49.72231674194336, "step": 5365 }, { "epoch": 0.7306644880174292, "grad_norm": 41.265963674440265, "learning_rate": 1.6416350091503498e-07, "logits/chosen": 14.448539733886719, "logits/rejected": 15.27087688446045, "logps/chosen": -4.446028709411621, "logps/rejected": -4.763977527618408, "loss": 4.1152, "rewards/accuracies": 0.75, "rewards/chosen": -44.460289001464844, "rewards/margins": 3.1794872283935547, "rewards/rejected": -47.63977813720703, "step": 5366 }, { "epoch": 0.7308006535947712, "grad_norm": 40.693019669727484, "learning_rate": 1.6400995092937908e-07, "logits/chosen": 13.929259300231934, "logits/rejected": 14.115528106689453, "logps/chosen": -4.566892147064209, "logps/rejected": -4.481049537658691, "loss": 4.3194, "rewards/accuracies": 0.5, "rewards/chosen": -45.668922424316406, "rewards/margins": -0.8584251403808594, "rewards/rejected": -44.81049346923828, "step": 5367 }, { "epoch": 0.7309368191721133, "grad_norm": 47.141511028563436, "learning_rate": 1.6385645426762547e-07, "logits/chosen": 14.826906204223633, "logits/rejected": 14.560245513916016, "logps/chosen": -4.5342936515808105, "logps/rejected": -4.522229194641113, "loss": 4.0237, "rewards/accuracies": 0.25, "rewards/chosen": -45.342933654785156, "rewards/margins": -0.12064552307128906, "rewards/rejected": -45.2222900390625, "step": 5368 }, { "epoch": 0.7310729847494554, "grad_norm": 42.6511770868018, "learning_rate": 1.6370301096445816e-07, "logits/chosen": 13.177411079406738, "logits/rejected": 14.201732635498047, "logps/chosen": -4.406002998352051, "logps/rejected": -4.545248031616211, "loss": 3.8089, "rewards/accuracies": 0.75, "rewards/chosen": -44.060035705566406, "rewards/margins": 1.3924503326416016, "rewards/rejected": -45.452484130859375, "step": 5369 }, { "epoch": 0.7312091503267973, "grad_norm": 42.793614097564344, "learning_rate": 1.635496210545486e-07, "logits/chosen": 14.632362365722656, "logits/rejected": 14.862627029418945, "logps/chosen": -4.647820472717285, "logps/rejected": -4.825932502746582, "loss": 3.3648, "rewards/accuracies": 0.5, "rewards/chosen": -46.47820281982422, "rewards/margins": 1.7811212539672852, "rewards/rejected": -48.25932312011719, "step": 5370 }, { "epoch": 0.7313453159041394, "grad_norm": 52.78249079100306, "learning_rate": 1.6339628457255673e-07, "logits/chosen": 14.986942291259766, "logits/rejected": 14.462059020996094, "logps/chosen": -5.043713569641113, "logps/rejected": -4.392594337463379, "loss": 4.3644, "rewards/accuracies": 0.5, "rewards/chosen": -50.437137603759766, "rewards/margins": -6.51119327545166, "rewards/rejected": -43.92594528198242, "step": 5371 }, { "epoch": 0.7314814814814815, "grad_norm": 68.27727957769925, "learning_rate": 1.6324300155313025e-07, "logits/chosen": 13.843080520629883, "logits/rejected": 14.12027359008789, "logps/chosen": -4.497690200805664, "logps/rejected": -4.503329277038574, "loss": 4.0879, "rewards/accuracies": 0.5, "rewards/chosen": -44.97690200805664, "rewards/margins": 0.05639076232910156, "rewards/rejected": -45.033294677734375, "step": 5372 }, { "epoch": 0.7316176470588235, "grad_norm": 43.16401406744747, "learning_rate": 1.6308977203090453e-07, "logits/chosen": 13.692159652709961, "logits/rejected": 13.937484741210938, "logps/chosen": -4.61353063583374, "logps/rejected": -4.56165885925293, "loss": 4.1509, "rewards/accuracies": 0.5, "rewards/chosen": -46.13530731201172, "rewards/margins": -0.5187225341796875, "rewards/rejected": -45.61658477783203, "step": 5373 }, { "epoch": 0.7317538126361656, "grad_norm": 48.23889304150226, "learning_rate": 1.629365960405031e-07, "logits/chosen": 13.677178382873535, "logits/rejected": 14.203226089477539, "logps/chosen": -4.152364730834961, "logps/rejected": -4.646674156188965, "loss": 4.1019, "rewards/accuracies": 0.75, "rewards/chosen": -41.52364730834961, "rewards/margins": 4.94309139251709, "rewards/rejected": -46.46673583984375, "step": 5374 }, { "epoch": 0.7318899782135077, "grad_norm": 40.90377600537425, "learning_rate": 1.6278347361653753e-07, "logits/chosen": 14.237361907958984, "logits/rejected": 14.688493728637695, "logps/chosen": -4.585893630981445, "logps/rejected": -4.749578475952148, "loss": 3.7967, "rewards/accuracies": 0.75, "rewards/chosen": -45.85893630981445, "rewards/margins": 1.6368474960327148, "rewards/rejected": -47.49578094482422, "step": 5375 }, { "epoch": 0.7320261437908496, "grad_norm": 43.54842609909796, "learning_rate": 1.6263040479360682e-07, "logits/chosen": 13.747936248779297, "logits/rejected": 14.240219116210938, "logps/chosen": -4.198674201965332, "logps/rejected": -4.456415176391602, "loss": 4.2409, "rewards/accuracies": 0.75, "rewards/chosen": -41.98674011230469, "rewards/margins": 2.5774097442626953, "rewards/rejected": -44.56414794921875, "step": 5376 }, { "epoch": 0.7321623093681917, "grad_norm": 42.9802897379674, "learning_rate": 1.6247738960629823e-07, "logits/chosen": 13.907670974731445, "logits/rejected": 13.77934455871582, "logps/chosen": -4.301027297973633, "logps/rejected": -4.460107803344727, "loss": 3.7733, "rewards/accuracies": 0.5, "rewards/chosen": -43.010276794433594, "rewards/margins": 1.5908069610595703, "rewards/rejected": -44.60108184814453, "step": 5377 }, { "epoch": 0.7322984749455338, "grad_norm": 40.81175934886451, "learning_rate": 1.6232442808918702e-07, "logits/chosen": 14.564737319946289, "logits/rejected": 14.554533004760742, "logps/chosen": -4.409273624420166, "logps/rejected": -4.409609794616699, "loss": 3.7822, "rewards/accuracies": 0.5, "rewards/chosen": -44.092735290527344, "rewards/margins": 0.0033617019653320312, "rewards/rejected": -44.096099853515625, "step": 5378 }, { "epoch": 0.7324346405228758, "grad_norm": 41.49567897025854, "learning_rate": 1.6217152027683576e-07, "logits/chosen": 13.819265365600586, "logits/rejected": 14.271308898925781, "logps/chosen": -4.33896541595459, "logps/rejected": -4.2844390869140625, "loss": 4.2978, "rewards/accuracies": 0.25, "rewards/chosen": -43.38965606689453, "rewards/margins": -0.5452651977539062, "rewards/rejected": -42.844390869140625, "step": 5379 }, { "epoch": 0.7325708061002179, "grad_norm": 45.45214275060196, "learning_rate": 1.620186662037954e-07, "logits/chosen": 14.024946212768555, "logits/rejected": 14.88606071472168, "logps/chosen": -4.5561323165893555, "logps/rejected": -4.963335037231445, "loss": 3.7107, "rewards/accuracies": 0.75, "rewards/chosen": -45.56132507324219, "rewards/margins": 4.072021484375, "rewards/rejected": -49.63334655761719, "step": 5380 }, { "epoch": 0.7327069716775599, "grad_norm": 45.26143333739788, "learning_rate": 1.6186586590460473e-07, "logits/chosen": 14.821338653564453, "logits/rejected": 14.22905158996582, "logps/chosen": -4.767187118530273, "logps/rejected": -4.7388787269592285, "loss": 4.3995, "rewards/accuracies": 0.25, "rewards/chosen": -47.671871185302734, "rewards/margins": -0.2830839157104492, "rewards/rejected": -47.38878631591797, "step": 5381 }, { "epoch": 0.7328431372549019, "grad_norm": 41.75663737479618, "learning_rate": 1.6171311941379e-07, "logits/chosen": 14.511131286621094, "logits/rejected": 15.140300750732422, "logps/chosen": -4.3317084312438965, "logps/rejected": -4.812013149261475, "loss": 3.8723, "rewards/accuracies": 1.0, "rewards/chosen": -43.31708526611328, "rewards/margins": 4.803045272827148, "rewards/rejected": -48.12013244628906, "step": 5382 }, { "epoch": 0.732979302832244, "grad_norm": 43.074151785276605, "learning_rate": 1.615604267658656e-07, "logits/chosen": 13.332941055297852, "logits/rejected": 14.000741958618164, "logps/chosen": -4.012871742248535, "logps/rejected": -4.533642768859863, "loss": 3.5627, "rewards/accuracies": 0.75, "rewards/chosen": -40.12871551513672, "rewards/margins": 5.207714080810547, "rewards/rejected": -45.33643341064453, "step": 5383 }, { "epoch": 0.7331154684095861, "grad_norm": 50.798865932833614, "learning_rate": 1.6140778799533373e-07, "logits/chosen": 13.655649185180664, "logits/rejected": 13.452159881591797, "logps/chosen": -4.144996166229248, "logps/rejected": -4.2028093338012695, "loss": 4.596, "rewards/accuracies": 0.5, "rewards/chosen": -41.44995880126953, "rewards/margins": 0.5781316757202148, "rewards/rejected": -42.02809143066406, "step": 5384 }, { "epoch": 0.733251633986928, "grad_norm": 45.20330200966978, "learning_rate": 1.6125520313668456e-07, "logits/chosen": 13.963855743408203, "logits/rejected": 13.912869453430176, "logps/chosen": -4.555041790008545, "logps/rejected": -4.689803123474121, "loss": 4.3195, "rewards/accuracies": 0.5, "rewards/chosen": -45.550418853759766, "rewards/margins": 1.3476142883300781, "rewards/rejected": -46.898033142089844, "step": 5385 }, { "epoch": 0.7333877995642701, "grad_norm": 47.68573114952263, "learning_rate": 1.611026722243955e-07, "logits/chosen": 14.457870483398438, "logits/rejected": 14.614019393920898, "logps/chosen": -4.5654754638671875, "logps/rejected": -4.418299674987793, "loss": 4.6313, "rewards/accuracies": 0.25, "rewards/chosen": -45.654754638671875, "rewards/margins": -1.4717521667480469, "rewards/rejected": -44.18300247192383, "step": 5386 }, { "epoch": 0.7335239651416122, "grad_norm": 107.7246249830472, "learning_rate": 1.609501952929325e-07, "logits/chosen": 14.082923889160156, "logits/rejected": 14.062650680541992, "logps/chosen": -4.598697662353516, "logps/rejected": -4.535879611968994, "loss": 4.0875, "rewards/accuracies": 0.5, "rewards/chosen": -45.986976623535156, "rewards/margins": -0.6281824111938477, "rewards/rejected": -45.358795166015625, "step": 5387 }, { "epoch": 0.7336601307189542, "grad_norm": 47.304710684159936, "learning_rate": 1.6079777237674895e-07, "logits/chosen": 15.006139755249023, "logits/rejected": 14.831480026245117, "logps/chosen": -4.692952632904053, "logps/rejected": -4.785041332244873, "loss": 4.3095, "rewards/accuracies": 0.75, "rewards/chosen": -46.929527282714844, "rewards/margins": 0.9208860397338867, "rewards/rejected": -47.85041427612305, "step": 5388 }, { "epoch": 0.7337962962962963, "grad_norm": 42.427557270499236, "learning_rate": 1.606454035102859e-07, "logits/chosen": 14.077345848083496, "logits/rejected": 14.275671005249023, "logps/chosen": -4.33568000793457, "logps/rejected": -4.411751747131348, "loss": 4.1618, "rewards/accuracies": 0.5, "rewards/chosen": -43.3568000793457, "rewards/margins": 0.7607154846191406, "rewards/rejected": -44.117515563964844, "step": 5389 }, { "epoch": 0.7339324618736384, "grad_norm": 38.46408122393428, "learning_rate": 1.6049308872797242e-07, "logits/chosen": 13.978561401367188, "logits/rejected": 15.07009506225586, "logps/chosen": -4.347104549407959, "logps/rejected": -4.874898910522461, "loss": 3.7294, "rewards/accuracies": 1.0, "rewards/chosen": -43.471046447753906, "rewards/margins": 5.277945518493652, "rewards/rejected": -48.748992919921875, "step": 5390 }, { "epoch": 0.7340686274509803, "grad_norm": 41.728842356033155, "learning_rate": 1.6034082806422532e-07, "logits/chosen": 14.485228538513184, "logits/rejected": 14.584651947021484, "logps/chosen": -4.354855537414551, "logps/rejected": -4.249222755432129, "loss": 3.5644, "rewards/accuracies": 0.25, "rewards/chosen": -43.548553466796875, "rewards/margins": -1.0563278198242188, "rewards/rejected": -42.492225646972656, "step": 5391 }, { "epoch": 0.7342047930283224, "grad_norm": 47.001891722229985, "learning_rate": 1.6018862155344932e-07, "logits/chosen": 13.819329261779785, "logits/rejected": 14.852493286132812, "logps/chosen": -4.36052131652832, "logps/rejected": -4.682969570159912, "loss": 3.714, "rewards/accuracies": 0.75, "rewards/chosen": -43.6052131652832, "rewards/margins": 3.224482536315918, "rewards/rejected": -46.82969665527344, "step": 5392 }, { "epoch": 0.7343409586056645, "grad_norm": 39.10023468296453, "learning_rate": 1.6003646923003644e-07, "logits/chosen": 13.649979591369629, "logits/rejected": 14.254743576049805, "logps/chosen": -4.345530986785889, "logps/rejected": -4.607789516448975, "loss": 3.8258, "rewards/accuracies": 0.5, "rewards/chosen": -43.45530700683594, "rewards/margins": 2.622586250305176, "rewards/rejected": -46.07789611816406, "step": 5393 }, { "epoch": 0.7344771241830066, "grad_norm": 47.98765620130527, "learning_rate": 1.5988437112836692e-07, "logits/chosen": 14.428953170776367, "logits/rejected": 14.481800079345703, "logps/chosen": -4.381925582885742, "logps/rejected": -4.5562591552734375, "loss": 3.5908, "rewards/accuracies": 0.5, "rewards/chosen": -43.81925582885742, "rewards/margins": 1.7433366775512695, "rewards/rejected": -45.56259536743164, "step": 5394 }, { "epoch": 0.7346132897603486, "grad_norm": 41.85756498763493, "learning_rate": 1.5973232728280864e-07, "logits/chosen": 13.289958000183105, "logits/rejected": 14.069049835205078, "logps/chosen": -4.239373207092285, "logps/rejected": -4.653918266296387, "loss": 3.8646, "rewards/accuracies": 1.0, "rewards/chosen": -42.393733978271484, "rewards/margins": 4.145450592041016, "rewards/rejected": -46.539180755615234, "step": 5395 }, { "epoch": 0.7347494553376906, "grad_norm": 46.921110287822266, "learning_rate": 1.5958033772771698e-07, "logits/chosen": 13.487171173095703, "logits/rejected": 13.83712387084961, "logps/chosen": -4.315615653991699, "logps/rejected": -4.283838748931885, "loss": 4.0464, "rewards/accuracies": 0.5, "rewards/chosen": -43.156158447265625, "rewards/margins": -0.31777191162109375, "rewards/rejected": -42.83838653564453, "step": 5396 }, { "epoch": 0.7348856209150327, "grad_norm": 43.06023100145666, "learning_rate": 1.5942840249743536e-07, "logits/chosen": 14.170358657836914, "logits/rejected": 14.483245849609375, "logps/chosen": -4.531786918640137, "logps/rejected": -4.72633171081543, "loss": 4.1956, "rewards/accuracies": 0.5, "rewards/chosen": -45.31787109375, "rewards/margins": 1.9454460144042969, "rewards/rejected": -47.26332092285156, "step": 5397 }, { "epoch": 0.7350217864923747, "grad_norm": 44.026945831619074, "learning_rate": 1.5927652162629475e-07, "logits/chosen": 14.757359504699707, "logits/rejected": 14.635444641113281, "logps/chosen": -4.275364875793457, "logps/rejected": -4.4218430519104, "loss": 3.8293, "rewards/accuracies": 0.75, "rewards/chosen": -42.7536506652832, "rewards/margins": 1.4647750854492188, "rewards/rejected": -44.21842575073242, "step": 5398 }, { "epoch": 0.7351579520697168, "grad_norm": 39.98776580736612, "learning_rate": 1.591246951486141e-07, "logits/chosen": 13.637848854064941, "logits/rejected": 14.26667308807373, "logps/chosen": -4.277209281921387, "logps/rejected": -4.6667160987854, "loss": 3.7381, "rewards/accuracies": 1.0, "rewards/chosen": -42.772090911865234, "rewards/margins": 3.895069122314453, "rewards/rejected": -46.66716003417969, "step": 5399 }, { "epoch": 0.7352941176470589, "grad_norm": 40.76334106517327, "learning_rate": 1.589729230986995e-07, "logits/chosen": 13.931272506713867, "logits/rejected": 14.041139602661133, "logps/chosen": -4.265024185180664, "logps/rejected": -4.473114013671875, "loss": 3.8759, "rewards/accuracies": 0.5, "rewards/chosen": -42.650238037109375, "rewards/margins": 2.0808982849121094, "rewards/rejected": -44.73114013671875, "step": 5400 }, { "epoch": 0.7354302832244008, "grad_norm": 41.78924711944786, "learning_rate": 1.5882120551084527e-07, "logits/chosen": 13.734642028808594, "logits/rejected": 13.998008728027344, "logps/chosen": -4.430659294128418, "logps/rejected": -4.6593475341796875, "loss": 3.7751, "rewards/accuracies": 0.75, "rewards/chosen": -44.30659484863281, "rewards/margins": 2.2868785858154297, "rewards/rejected": -46.593475341796875, "step": 5401 }, { "epoch": 0.7355664488017429, "grad_norm": 46.58101530360389, "learning_rate": 1.5866954241933344e-07, "logits/chosen": 14.398614883422852, "logits/rejected": 14.453542709350586, "logps/chosen": -4.603699684143066, "logps/rejected": -4.591870307922363, "loss": 4.1251, "rewards/accuracies": 0.5, "rewards/chosen": -46.03699493408203, "rewards/margins": -0.11829948425292969, "rewards/rejected": -45.918701171875, "step": 5402 }, { "epoch": 0.735702614379085, "grad_norm": 42.117300958858344, "learning_rate": 1.5851793385843318e-07, "logits/chosen": 14.000370025634766, "logits/rejected": 14.194491386413574, "logps/chosen": -4.102439880371094, "logps/rejected": -4.3197784423828125, "loss": 3.7921, "rewards/accuracies": 0.75, "rewards/chosen": -41.02439880371094, "rewards/margins": 2.1733884811401367, "rewards/rejected": -43.197784423828125, "step": 5403 }, { "epoch": 0.735838779956427, "grad_norm": 41.96744265548187, "learning_rate": 1.5836637986240189e-07, "logits/chosen": 14.196622848510742, "logits/rejected": 14.77701187133789, "logps/chosen": -4.145113468170166, "logps/rejected": -4.442923545837402, "loss": 3.7596, "rewards/accuracies": 0.75, "rewards/chosen": -41.451133728027344, "rewards/margins": 2.978099822998047, "rewards/rejected": -44.429237365722656, "step": 5404 }, { "epoch": 0.7359749455337691, "grad_norm": 44.764770799684, "learning_rate": 1.5821488046548455e-07, "logits/chosen": 14.430517196655273, "logits/rejected": 13.94941520690918, "logps/chosen": -4.329547882080078, "logps/rejected": -4.268083572387695, "loss": 4.09, "rewards/accuracies": 0.25, "rewards/chosen": -43.29547882080078, "rewards/margins": -0.6146383285522461, "rewards/rejected": -42.68083953857422, "step": 5405 }, { "epoch": 0.7361111111111112, "grad_norm": 48.250026714551595, "learning_rate": 1.5806343570191346e-07, "logits/chosen": 14.184457778930664, "logits/rejected": 14.322973251342773, "logps/chosen": -4.3404693603515625, "logps/rejected": -4.585130214691162, "loss": 4.4205, "rewards/accuracies": 0.75, "rewards/chosen": -43.404693603515625, "rewards/margins": 2.4466047286987305, "rewards/rejected": -45.85130310058594, "step": 5406 }, { "epoch": 0.7362472766884531, "grad_norm": 45.548556432682396, "learning_rate": 1.5791204560590897e-07, "logits/chosen": 13.090197563171387, "logits/rejected": 14.303041458129883, "logps/chosen": -4.279348850250244, "logps/rejected": -4.605652332305908, "loss": 4.1966, "rewards/accuracies": 0.75, "rewards/chosen": -42.793495178222656, "rewards/margins": 3.2630300521850586, "rewards/rejected": -46.056522369384766, "step": 5407 }, { "epoch": 0.7363834422657952, "grad_norm": 46.4990444106675, "learning_rate": 1.57760710211679e-07, "logits/chosen": 14.036439895629883, "logits/rejected": 14.315313339233398, "logps/chosen": -4.370880126953125, "logps/rejected": -4.411109447479248, "loss": 4.2485, "rewards/accuracies": 0.5, "rewards/chosen": -43.70880126953125, "rewards/margins": 0.4022941589355469, "rewards/rejected": -44.11109161376953, "step": 5408 }, { "epoch": 0.7365196078431373, "grad_norm": 41.38037891960832, "learning_rate": 1.5760942955341876e-07, "logits/chosen": 13.787874221801758, "logits/rejected": 14.140886306762695, "logps/chosen": -4.539867401123047, "logps/rejected": -4.597917556762695, "loss": 3.7255, "rewards/accuracies": 0.75, "rewards/chosen": -45.39867401123047, "rewards/margins": 0.5805025100708008, "rewards/rejected": -45.97917556762695, "step": 5409 }, { "epoch": 0.7366557734204793, "grad_norm": 42.07105703388624, "learning_rate": 1.5745820366531159e-07, "logits/chosen": 14.133758544921875, "logits/rejected": 14.023660659790039, "logps/chosen": -4.413108825683594, "logps/rejected": -4.445870399475098, "loss": 3.9751, "rewards/accuracies": 0.25, "rewards/chosen": -44.13108825683594, "rewards/margins": 0.3276195526123047, "rewards/rejected": -44.45870590209961, "step": 5410 }, { "epoch": 0.7367919389978214, "grad_norm": 44.36409703257962, "learning_rate": 1.573070325815283e-07, "logits/chosen": 14.783010482788086, "logits/rejected": 14.84288215637207, "logps/chosen": -4.392108917236328, "logps/rejected": -4.736142158508301, "loss": 3.9633, "rewards/accuracies": 0.75, "rewards/chosen": -43.92108917236328, "rewards/margins": 3.440328598022461, "rewards/rejected": -47.361419677734375, "step": 5411 }, { "epoch": 0.7369281045751634, "grad_norm": 42.77404597792231, "learning_rate": 1.5715591633622697e-07, "logits/chosen": 14.617782592773438, "logits/rejected": 14.70148754119873, "logps/chosen": -4.595239639282227, "logps/rejected": -5.010453224182129, "loss": 3.6076, "rewards/accuracies": 0.75, "rewards/chosen": -45.95240020751953, "rewards/margins": 4.152130126953125, "rewards/rejected": -50.104530334472656, "step": 5412 }, { "epoch": 0.7370642701525054, "grad_norm": 47.670344185188036, "learning_rate": 1.5700485496355368e-07, "logits/chosen": 14.645076751708984, "logits/rejected": 14.707778930664062, "logps/chosen": -4.282772064208984, "logps/rejected": -4.121750831604004, "loss": 4.925, "rewards/accuracies": 0.25, "rewards/chosen": -42.827720642089844, "rewards/margins": -1.6102123260498047, "rewards/rejected": -41.21751022338867, "step": 5413 }, { "epoch": 0.7372004357298475, "grad_norm": 40.318561068122094, "learning_rate": 1.5685384849764222e-07, "logits/chosen": 14.402944564819336, "logits/rejected": 14.027027130126953, "logps/chosen": -4.576754093170166, "logps/rejected": -4.316403388977051, "loss": 3.9816, "rewards/accuracies": 0.25, "rewards/chosen": -45.76754379272461, "rewards/margins": -2.603512763977051, "rewards/rejected": -43.16402816772461, "step": 5414 }, { "epoch": 0.7373366013071896, "grad_norm": 42.72226825528045, "learning_rate": 1.567028969726134e-07, "logits/chosen": 14.227468490600586, "logits/rejected": 14.002546310424805, "logps/chosen": -4.577827453613281, "logps/rejected": -4.597468376159668, "loss": 3.6424, "rewards/accuracies": 0.5, "rewards/chosen": -45.77827453613281, "rewards/margins": 0.1964120864868164, "rewards/rejected": -45.97468566894531, "step": 5415 }, { "epoch": 0.7374727668845316, "grad_norm": 42.025479077788056, "learning_rate": 1.5655200042257612e-07, "logits/chosen": 13.683372497558594, "logits/rejected": 14.340553283691406, "logps/chosen": -4.3858232498168945, "logps/rejected": -4.782446384429932, "loss": 3.8295, "rewards/accuracies": 1.0, "rewards/chosen": -43.85823059082031, "rewards/margins": 3.966231346130371, "rewards/rejected": -47.824462890625, "step": 5416 }, { "epoch": 0.7376089324618736, "grad_norm": 44.615764644123736, "learning_rate": 1.5640115888162687e-07, "logits/chosen": 13.835800170898438, "logits/rejected": 14.155207633972168, "logps/chosen": -4.193817138671875, "logps/rejected": -4.6089606285095215, "loss": 3.7677, "rewards/accuracies": 0.75, "rewards/chosen": -41.93817138671875, "rewards/margins": 4.151433944702148, "rewards/rejected": -46.089603424072266, "step": 5417 }, { "epoch": 0.7377450980392157, "grad_norm": 38.534388074161974, "learning_rate": 1.5625037238384922e-07, "logits/chosen": 14.019521713256836, "logits/rejected": 14.400171279907227, "logps/chosen": -4.370392799377441, "logps/rejected": -4.499345779418945, "loss": 3.7752, "rewards/accuracies": 0.5, "rewards/chosen": -43.70392608642578, "rewards/margins": 1.2895288467407227, "rewards/rejected": -44.99345397949219, "step": 5418 }, { "epoch": 0.7378812636165577, "grad_norm": 37.38482672365028, "learning_rate": 1.5609964096331481e-07, "logits/chosen": 14.403813362121582, "logits/rejected": 13.868124008178711, "logps/chosen": -4.251900672912598, "logps/rejected": -4.393197059631348, "loss": 4.1735, "rewards/accuracies": 1.0, "rewards/chosen": -42.519004821777344, "rewards/margins": 1.4129638671875, "rewards/rejected": -43.931968688964844, "step": 5419 }, { "epoch": 0.7380174291938998, "grad_norm": 42.39568408925358, "learning_rate": 1.5594896465408272e-07, "logits/chosen": 13.689788818359375, "logits/rejected": 14.420819282531738, "logps/chosen": -3.9728379249572754, "logps/rejected": -4.348128318786621, "loss": 4.2352, "rewards/accuracies": 1.0, "rewards/chosen": -39.72837829589844, "rewards/margins": 3.7529067993164062, "rewards/rejected": -43.481285095214844, "step": 5420 }, { "epoch": 0.7381535947712419, "grad_norm": 43.49211137166415, "learning_rate": 1.557983434901993e-07, "logits/chosen": 13.819595336914062, "logits/rejected": 14.739348411560059, "logps/chosen": -4.352775573730469, "logps/rejected": -4.817131996154785, "loss": 3.9905, "rewards/accuracies": 0.75, "rewards/chosen": -43.52775955200195, "rewards/margins": 4.643558502197266, "rewards/rejected": -48.17131805419922, "step": 5421 }, { "epoch": 0.7382897603485838, "grad_norm": 40.17894951691985, "learning_rate": 1.5564777750569876e-07, "logits/chosen": 15.038663864135742, "logits/rejected": 14.707237243652344, "logps/chosen": -4.492982864379883, "logps/rejected": -4.2509846687316895, "loss": 3.776, "rewards/accuracies": 0.25, "rewards/chosen": -44.92982482910156, "rewards/margins": -2.419976234436035, "rewards/rejected": -42.509849548339844, "step": 5422 }, { "epoch": 0.7384259259259259, "grad_norm": 42.71128580404261, "learning_rate": 1.5549726673460284e-07, "logits/chosen": 14.102386474609375, "logits/rejected": 14.392204284667969, "logps/chosen": -4.70963191986084, "logps/rejected": -4.774716377258301, "loss": 3.7265, "rewards/accuracies": 0.75, "rewards/chosen": -47.096317291259766, "rewards/margins": 0.6508445739746094, "rewards/rejected": -47.747161865234375, "step": 5423 }, { "epoch": 0.738562091503268, "grad_norm": 44.627106656825596, "learning_rate": 1.5534681121092047e-07, "logits/chosen": 13.901041030883789, "logits/rejected": 13.859843254089355, "logps/chosen": -4.555853366851807, "logps/rejected": -4.514616966247559, "loss": 4.0186, "rewards/accuracies": 0.5, "rewards/chosen": -45.55853271484375, "rewards/margins": -0.41236305236816406, "rewards/rejected": -45.14617156982422, "step": 5424 }, { "epoch": 0.73869825708061, "grad_norm": 44.735279302856455, "learning_rate": 1.5519641096864842e-07, "logits/chosen": 14.179864883422852, "logits/rejected": 14.12420654296875, "logps/chosen": -4.580041885375977, "logps/rejected": -4.6879143714904785, "loss": 4.3149, "rewards/accuracies": 0.75, "rewards/chosen": -45.800418853759766, "rewards/margins": 1.078725814819336, "rewards/rejected": -46.87914276123047, "step": 5425 }, { "epoch": 0.7388344226579521, "grad_norm": 38.794264961037854, "learning_rate": 1.5504606604177103e-07, "logits/chosen": 13.751747131347656, "logits/rejected": 13.420263290405273, "logps/chosen": -4.139040470123291, "logps/rejected": -4.227713584899902, "loss": 3.7571, "rewards/accuracies": 0.5, "rewards/chosen": -41.390403747558594, "rewards/margins": 0.8867292404174805, "rewards/rejected": -42.27713394165039, "step": 5426 }, { "epoch": 0.7389705882352942, "grad_norm": 41.06971697512317, "learning_rate": 1.5489577646425968e-07, "logits/chosen": 14.500029563903809, "logits/rejected": 14.319341659545898, "logps/chosen": -4.7964677810668945, "logps/rejected": -4.6228837966918945, "loss": 3.8816, "rewards/accuracies": 0.5, "rewards/chosen": -47.96467971801758, "rewards/margins": -1.73583984375, "rewards/rejected": -46.22883987426758, "step": 5427 }, { "epoch": 0.7391067538126361, "grad_norm": 42.70797689875717, "learning_rate": 1.5474554227007368e-07, "logits/chosen": 14.329980850219727, "logits/rejected": 14.489444732666016, "logps/chosen": -4.811483383178711, "logps/rejected": -4.72852087020874, "loss": 3.7836, "rewards/accuracies": 0.5, "rewards/chosen": -48.11483383178711, "rewards/margins": -0.8296270370483398, "rewards/rejected": -47.28520965576172, "step": 5428 }, { "epoch": 0.7392429193899782, "grad_norm": 59.09958711392956, "learning_rate": 1.5459536349315988e-07, "logits/chosen": 13.451820373535156, "logits/rejected": 13.655986785888672, "logps/chosen": -4.387300968170166, "logps/rejected": -4.278979301452637, "loss": 3.8696, "rewards/accuracies": 0.25, "rewards/chosen": -43.873008728027344, "rewards/margins": -1.0832176208496094, "rewards/rejected": -42.789791107177734, "step": 5429 }, { "epoch": 0.7393790849673203, "grad_norm": 44.62470196455368, "learning_rate": 1.5444524016745204e-07, "logits/chosen": 14.89819049835205, "logits/rejected": 14.057182312011719, "logps/chosen": -4.831101894378662, "logps/rejected": -4.683600902557373, "loss": 3.5759, "rewards/accuracies": 0.5, "rewards/chosen": -48.31101989746094, "rewards/margins": -1.475010871887207, "rewards/rejected": -46.83600616455078, "step": 5430 }, { "epoch": 0.7395152505446623, "grad_norm": 44.268092203131864, "learning_rate": 1.5429517232687198e-07, "logits/chosen": 14.774702072143555, "logits/rejected": 14.27185344696045, "logps/chosen": -4.6845197677612305, "logps/rejected": -4.568290710449219, "loss": 4.5996, "rewards/accuracies": 0.25, "rewards/chosen": -46.84519958496094, "rewards/margins": -1.1622896194458008, "rewards/rejected": -45.68290710449219, "step": 5431 }, { "epoch": 0.7396514161220044, "grad_norm": 39.490874202607664, "learning_rate": 1.541451600053289e-07, "logits/chosen": 13.577152252197266, "logits/rejected": 14.802499771118164, "logps/chosen": -4.4805707931518555, "logps/rejected": -5.0002923011779785, "loss": 3.8537, "rewards/accuracies": 1.0, "rewards/chosen": -44.80570983886719, "rewards/margins": 5.197212219238281, "rewards/rejected": -50.00292205810547, "step": 5432 }, { "epoch": 0.7397875816993464, "grad_norm": 41.32718762971425, "learning_rate": 1.5399520323671902e-07, "logits/chosen": 14.425422668457031, "logits/rejected": 14.554627418518066, "logps/chosen": -4.317817687988281, "logps/rejected": -4.594667434692383, "loss": 3.5606, "rewards/accuracies": 0.75, "rewards/chosen": -43.17817687988281, "rewards/margins": 2.768495559692383, "rewards/rejected": -45.94667434692383, "step": 5433 }, { "epoch": 0.7399237472766884, "grad_norm": 42.472667600311695, "learning_rate": 1.5384530205492648e-07, "logits/chosen": 13.80140495300293, "logits/rejected": 13.962024688720703, "logps/chosen": -4.296474933624268, "logps/rejected": -4.534829616546631, "loss": 3.724, "rewards/accuracies": 0.75, "rewards/chosen": -42.964752197265625, "rewards/margins": 2.383546829223633, "rewards/rejected": -45.348297119140625, "step": 5434 }, { "epoch": 0.7400599128540305, "grad_norm": 47.47810135331611, "learning_rate": 1.5369545649382282e-07, "logits/chosen": 14.441390037536621, "logits/rejected": 14.798319816589355, "logps/chosen": -4.389739990234375, "logps/rejected": -4.947010040283203, "loss": 3.6149, "rewards/accuracies": 1.0, "rewards/chosen": -43.89739990234375, "rewards/margins": 5.572705268859863, "rewards/rejected": -49.47010040283203, "step": 5435 }, { "epoch": 0.7401960784313726, "grad_norm": 42.52901712329852, "learning_rate": 1.5354566658726657e-07, "logits/chosen": 13.405195236206055, "logits/rejected": 13.689643859863281, "logps/chosen": -4.2282280921936035, "logps/rejected": -4.4643049240112305, "loss": 3.7883, "rewards/accuracies": 0.75, "rewards/chosen": -42.28227996826172, "rewards/margins": 2.3607702255249023, "rewards/rejected": -44.64305114746094, "step": 5436 }, { "epoch": 0.7403322440087146, "grad_norm": 39.683045641100044, "learning_rate": 1.5339593236910419e-07, "logits/chosen": 14.37653923034668, "logits/rejected": 14.200448989868164, "logps/chosen": -4.471620082855225, "logps/rejected": -4.406613826751709, "loss": 3.962, "rewards/accuracies": 0.5, "rewards/chosen": -44.7161979675293, "rewards/margins": -0.6500606536865234, "rewards/rejected": -44.066139221191406, "step": 5437 }, { "epoch": 0.7404684095860566, "grad_norm": 43.31918568526976, "learning_rate": 1.5324625387316948e-07, "logits/chosen": 14.183884620666504, "logits/rejected": 14.128992080688477, "logps/chosen": -4.721408843994141, "logps/rejected": -4.714097023010254, "loss": 4.0038, "rewards/accuracies": 0.5, "rewards/chosen": -47.214088439941406, "rewards/margins": -0.07311725616455078, "rewards/rejected": -47.14097213745117, "step": 5438 }, { "epoch": 0.7406045751633987, "grad_norm": 40.136627434201266, "learning_rate": 1.5309663113328325e-07, "logits/chosen": 14.091449737548828, "logits/rejected": 14.076484680175781, "logps/chosen": -4.496587753295898, "logps/rejected": -4.64665412902832, "loss": 3.8314, "rewards/accuracies": 0.75, "rewards/chosen": -44.96587371826172, "rewards/margins": 1.5006685256958008, "rewards/rejected": -46.46654510498047, "step": 5439 }, { "epoch": 0.7407407407407407, "grad_norm": 39.919197885419045, "learning_rate": 1.5294706418325412e-07, "logits/chosen": 13.53271484375, "logits/rejected": 14.255624771118164, "logps/chosen": -4.26705265045166, "logps/rejected": -4.63532829284668, "loss": 3.7905, "rewards/accuracies": 1.0, "rewards/chosen": -42.67052459716797, "rewards/margins": 3.6827545166015625, "rewards/rejected": -46.35327911376953, "step": 5440 }, { "epoch": 0.7408769063180828, "grad_norm": 42.439089452721134, "learning_rate": 1.527975530568782e-07, "logits/chosen": 13.864828109741211, "logits/rejected": 14.865559577941895, "logps/chosen": -4.260132789611816, "logps/rejected": -4.345977783203125, "loss": 3.5548, "rewards/accuracies": 0.5, "rewards/chosen": -42.60133361816406, "rewards/margins": 0.8584470748901367, "rewards/rejected": -43.45977783203125, "step": 5441 }, { "epoch": 0.7410130718954249, "grad_norm": 42.235695724233175, "learning_rate": 1.5264809778793836e-07, "logits/chosen": 13.914705276489258, "logits/rejected": 14.335433959960938, "logps/chosen": -4.713471412658691, "logps/rejected": -4.6789727210998535, "loss": 4.369, "rewards/accuracies": 0.25, "rewards/chosen": -47.13471221923828, "rewards/margins": -0.3449831008911133, "rewards/rejected": -46.78972625732422, "step": 5442 }, { "epoch": 0.7411492374727668, "grad_norm": 43.89155293990667, "learning_rate": 1.5249869841020547e-07, "logits/chosen": 14.50577163696289, "logits/rejected": 14.967227935791016, "logps/chosen": -4.518385887145996, "logps/rejected": -4.790492057800293, "loss": 4.272, "rewards/accuracies": 0.75, "rewards/chosen": -45.18386459350586, "rewards/margins": 2.721057891845703, "rewards/rejected": -47.90492248535156, "step": 5443 }, { "epoch": 0.7412854030501089, "grad_norm": 54.929848536385215, "learning_rate": 1.5234935495743768e-07, "logits/chosen": 15.04031753540039, "logits/rejected": 14.707724571228027, "logps/chosen": -4.924221038818359, "logps/rejected": -4.883448600769043, "loss": 4.8245, "rewards/accuracies": 0.75, "rewards/chosen": -49.24220657348633, "rewards/margins": -0.4077186584472656, "rewards/rejected": -48.83448791503906, "step": 5444 }, { "epoch": 0.741421568627451, "grad_norm": 44.287518250287604, "learning_rate": 1.522000674633801e-07, "logits/chosen": 14.143306732177734, "logits/rejected": 14.659833908081055, "logps/chosen": -4.174224853515625, "logps/rejected": -4.484931468963623, "loss": 3.6658, "rewards/accuracies": 1.0, "rewards/chosen": -41.74224853515625, "rewards/margins": 3.1070661544799805, "rewards/rejected": -44.84931182861328, "step": 5445 }, { "epoch": 0.741557734204793, "grad_norm": 41.643247031718715, "learning_rate": 1.5205083596176565e-07, "logits/chosen": 13.257936477661133, "logits/rejected": 13.599464416503906, "logps/chosen": -4.1290283203125, "logps/rejected": -4.497766494750977, "loss": 4.0858, "rewards/accuracies": 0.75, "rewards/chosen": -41.290279388427734, "rewards/margins": 3.6873855590820312, "rewards/rejected": -44.977664947509766, "step": 5446 }, { "epoch": 0.7416938997821351, "grad_norm": 41.230254689348385, "learning_rate": 1.5190166048631445e-07, "logits/chosen": 14.405771255493164, "logits/rejected": 14.028753280639648, "logps/chosen": -4.755336761474609, "logps/rejected": -4.6707048416137695, "loss": 4.6922, "rewards/accuracies": 0.5, "rewards/chosen": -47.553367614746094, "rewards/margins": -0.8463211059570312, "rewards/rejected": -46.70704650878906, "step": 5447 }, { "epoch": 0.7418300653594772, "grad_norm": 45.63720308443713, "learning_rate": 1.517525410707338e-07, "logits/chosen": 14.346076965332031, "logits/rejected": 14.458751678466797, "logps/chosen": -4.533449172973633, "logps/rejected": -4.799484729766846, "loss": 4.4768, "rewards/accuracies": 0.5, "rewards/chosen": -45.33449172973633, "rewards/margins": 2.6603574752807617, "rewards/rejected": -47.994850158691406, "step": 5448 }, { "epoch": 0.7419662309368191, "grad_norm": 43.3272061773992, "learning_rate": 1.5160347774871846e-07, "logits/chosen": 13.79320240020752, "logits/rejected": 14.508504867553711, "logps/chosen": -4.45078706741333, "logps/rejected": -4.840309143066406, "loss": 3.6677, "rewards/accuracies": 0.75, "rewards/chosen": -44.507869720458984, "rewards/margins": 3.895218849182129, "rewards/rejected": -48.40309143066406, "step": 5449 }, { "epoch": 0.7421023965141612, "grad_norm": 41.09971408450804, "learning_rate": 1.5145447055395074e-07, "logits/chosen": 13.929110527038574, "logits/rejected": 14.723341941833496, "logps/chosen": -4.322943687438965, "logps/rejected": -5.0189595222473145, "loss": 3.9099, "rewards/accuracies": 1.0, "rewards/chosen": -43.22943878173828, "rewards/margins": 6.9601545333862305, "rewards/rejected": -50.189598083496094, "step": 5450 }, { "epoch": 0.7422385620915033, "grad_norm": 43.02580180496934, "learning_rate": 1.513055195200998e-07, "logits/chosen": 13.962217330932617, "logits/rejected": 14.08229923248291, "logps/chosen": -4.327293872833252, "logps/rejected": -4.577426433563232, "loss": 3.5316, "rewards/accuracies": 0.75, "rewards/chosen": -43.27294158935547, "rewards/margins": 2.5013256072998047, "rewards/rejected": -45.774261474609375, "step": 5451 }, { "epoch": 0.7423747276688453, "grad_norm": 44.64100236409239, "learning_rate": 1.5115662468082247e-07, "logits/chosen": 12.579521179199219, "logits/rejected": 13.833194732666016, "logps/chosen": -4.247134685516357, "logps/rejected": -4.586188793182373, "loss": 3.5857, "rewards/accuracies": 0.75, "rewards/chosen": -42.471351623535156, "rewards/margins": 3.3905391693115234, "rewards/rejected": -45.86188888549805, "step": 5452 }, { "epoch": 0.7425108932461874, "grad_norm": 44.4105045084016, "learning_rate": 1.5100778606976287e-07, "logits/chosen": 13.980716705322266, "logits/rejected": 14.284022331237793, "logps/chosen": -4.606436729431152, "logps/rejected": -4.684549331665039, "loss": 3.6976, "rewards/accuracies": 0.75, "rewards/chosen": -46.06436538696289, "rewards/margins": 0.7811288833618164, "rewards/rejected": -46.84549331665039, "step": 5453 }, { "epoch": 0.7426470588235294, "grad_norm": 38.71311596733635, "learning_rate": 1.5085900372055203e-07, "logits/chosen": 13.261310577392578, "logits/rejected": 14.530014038085938, "logps/chosen": -4.128615379333496, "logps/rejected": -4.790870666503906, "loss": 3.3464, "rewards/accuracies": 1.0, "rewards/chosen": -41.28614807128906, "rewards/margins": 6.622555732727051, "rewards/rejected": -47.90870666503906, "step": 5454 }, { "epoch": 0.7427832244008714, "grad_norm": 41.83631312658746, "learning_rate": 1.5071027766680872e-07, "logits/chosen": 14.512406349182129, "logits/rejected": 13.96700382232666, "logps/chosen": -4.365511417388916, "logps/rejected": -4.465458393096924, "loss": 4.0127, "rewards/accuracies": 0.5, "rewards/chosen": -43.655113220214844, "rewards/margins": 0.9994688034057617, "rewards/rejected": -44.65458297729492, "step": 5455 }, { "epoch": 0.7429193899782135, "grad_norm": 44.99931517472169, "learning_rate": 1.5056160794213897e-07, "logits/chosen": 13.719121932983398, "logits/rejected": 14.89742660522461, "logps/chosen": -4.208337783813477, "logps/rejected": -4.526669025421143, "loss": 4.3111, "rewards/accuracies": 0.75, "rewards/chosen": -42.08338165283203, "rewards/margins": 3.183309555053711, "rewards/rejected": -45.266693115234375, "step": 5456 }, { "epoch": 0.7430555555555556, "grad_norm": 43.714962511633615, "learning_rate": 1.5041299458013566e-07, "logits/chosen": 14.38945198059082, "logits/rejected": 14.940633773803711, "logps/chosen": -4.751345634460449, "logps/rejected": -4.727937698364258, "loss": 4.1529, "rewards/accuracies": 0.25, "rewards/chosen": -47.513458251953125, "rewards/margins": -0.2340831756591797, "rewards/rejected": -47.27937316894531, "step": 5457 }, { "epoch": 0.7431917211328976, "grad_norm": 41.56083964020006, "learning_rate": 1.502644376143793e-07, "logits/chosen": 15.33795166015625, "logits/rejected": 14.884867668151855, "logps/chosen": -4.478609085083008, "logps/rejected": -4.542115211486816, "loss": 3.9512, "rewards/accuracies": 0.25, "rewards/chosen": -44.78609085083008, "rewards/margins": 0.6350584030151367, "rewards/rejected": -45.42115020751953, "step": 5458 }, { "epoch": 0.7433278867102396, "grad_norm": 44.85360352270225, "learning_rate": 1.5011593707843777e-07, "logits/chosen": 13.805295944213867, "logits/rejected": 14.596891403198242, "logps/chosen": -4.503508567810059, "logps/rejected": -4.760305404663086, "loss": 3.7412, "rewards/accuracies": 0.75, "rewards/chosen": -45.03508758544922, "rewards/margins": 2.56796932220459, "rewards/rejected": -47.60305404663086, "step": 5459 }, { "epoch": 0.7434640522875817, "grad_norm": 44.33564052291152, "learning_rate": 1.4996749300586567e-07, "logits/chosen": 14.26333999633789, "logits/rejected": 14.428604125976562, "logps/chosen": -4.330901622772217, "logps/rejected": -4.495295524597168, "loss": 3.9106, "rewards/accuracies": 0.5, "rewards/chosen": -43.309017181396484, "rewards/margins": 1.6439380645751953, "rewards/rejected": -44.95295715332031, "step": 5460 }, { "epoch": 0.7436002178649237, "grad_norm": 41.946444196996296, "learning_rate": 1.4981910543020532e-07, "logits/chosen": 14.926104545593262, "logits/rejected": 14.590536117553711, "logps/chosen": -4.7682647705078125, "logps/rejected": -4.624405860900879, "loss": 3.9038, "rewards/accuracies": 0.25, "rewards/chosen": -47.682647705078125, "rewards/margins": -1.4385900497436523, "rewards/rejected": -46.244056701660156, "step": 5461 }, { "epoch": 0.7437363834422658, "grad_norm": 39.956304400757006, "learning_rate": 1.4967077438498623e-07, "logits/chosen": 13.952007293701172, "logits/rejected": 14.505254745483398, "logps/chosen": -4.531441688537598, "logps/rejected": -4.911023139953613, "loss": 3.9761, "rewards/accuracies": 0.75, "rewards/chosen": -45.314414978027344, "rewards/margins": 3.7958173751831055, "rewards/rejected": -49.110233306884766, "step": 5462 }, { "epoch": 0.7438725490196079, "grad_norm": 45.74800140261451, "learning_rate": 1.4952249990372477e-07, "logits/chosen": 14.31818962097168, "logits/rejected": 14.535871505737305, "logps/chosen": -4.489619255065918, "logps/rejected": -4.495584011077881, "loss": 3.7611, "rewards/accuracies": 0.5, "rewards/chosen": -44.89619445800781, "rewards/margins": 0.059645652770996094, "rewards/rejected": -44.955841064453125, "step": 5463 }, { "epoch": 0.7440087145969498, "grad_norm": 47.96775066419478, "learning_rate": 1.4937428201992496e-07, "logits/chosen": 14.746230125427246, "logits/rejected": 14.813009262084961, "logps/chosen": -4.386482238769531, "logps/rejected": -4.507190704345703, "loss": 4.0453, "rewards/accuracies": 0.5, "rewards/chosen": -43.86482620239258, "rewards/margins": 1.2070808410644531, "rewards/rejected": -45.07190704345703, "step": 5464 }, { "epoch": 0.7441448801742919, "grad_norm": 42.98267120785298, "learning_rate": 1.4922612076707796e-07, "logits/chosen": 14.427715301513672, "logits/rejected": 14.154739379882812, "logps/chosen": -4.268169403076172, "logps/rejected": -4.570536136627197, "loss": 3.8516, "rewards/accuracies": 0.75, "rewards/chosen": -42.68170166015625, "rewards/margins": 3.023660659790039, "rewards/rejected": -45.705360412597656, "step": 5465 }, { "epoch": 0.744281045751634, "grad_norm": 42.29705099222088, "learning_rate": 1.4907801617866173e-07, "logits/chosen": 13.279064178466797, "logits/rejected": 15.187150955200195, "logps/chosen": -4.211440563201904, "logps/rejected": -4.844564914703369, "loss": 3.9672, "rewards/accuracies": 0.75, "rewards/chosen": -42.11440658569336, "rewards/margins": 6.331241607666016, "rewards/rejected": -48.445648193359375, "step": 5466 }, { "epoch": 0.744417211328976, "grad_norm": 45.514554913243465, "learning_rate": 1.4892996828814188e-07, "logits/chosen": 13.865959167480469, "logits/rejected": 14.363895416259766, "logps/chosen": -4.338778972625732, "logps/rejected": -4.666019916534424, "loss": 4.2748, "rewards/accuracies": 0.75, "rewards/chosen": -43.387786865234375, "rewards/margins": 3.2724132537841797, "rewards/rejected": -46.66020202636719, "step": 5467 }, { "epoch": 0.7445533769063181, "grad_norm": 49.263506513659344, "learning_rate": 1.487819771289712e-07, "logits/chosen": 14.19382095336914, "logits/rejected": 14.112513542175293, "logps/chosen": -4.658526420593262, "logps/rejected": -4.675398826599121, "loss": 4.3751, "rewards/accuracies": 0.25, "rewards/chosen": -46.58526611328125, "rewards/margins": 0.16872787475585938, "rewards/rejected": -46.75399398803711, "step": 5468 }, { "epoch": 0.7446895424836601, "grad_norm": 45.14326890393155, "learning_rate": 1.4863404273458927e-07, "logits/chosen": 13.960752487182617, "logits/rejected": 14.173135757446289, "logps/chosen": -4.430381774902344, "logps/rejected": -4.762746334075928, "loss": 3.7998, "rewards/accuracies": 1.0, "rewards/chosen": -44.3038215637207, "rewards/margins": 3.323638916015625, "rewards/rejected": -47.62746047973633, "step": 5469 }, { "epoch": 0.7448257080610022, "grad_norm": 46.68570728830705, "learning_rate": 1.4848616513842317e-07, "logits/chosen": 13.982772827148438, "logits/rejected": 14.322397232055664, "logps/chosen": -4.830490589141846, "logps/rejected": -4.973640441894531, "loss": 3.8034, "rewards/accuracies": 0.5, "rewards/chosen": -48.304908752441406, "rewards/margins": 1.4315013885498047, "rewards/rejected": -49.73640823364258, "step": 5470 }, { "epoch": 0.7449618736383442, "grad_norm": 42.616519194249506, "learning_rate": 1.4833834437388722e-07, "logits/chosen": 14.366891860961914, "logits/rejected": 14.336244583129883, "logps/chosen": -4.854938983917236, "logps/rejected": -4.666205406188965, "loss": 4.4668, "rewards/accuracies": 0.25, "rewards/chosen": -48.54938888549805, "rewards/margins": -1.8873329162597656, "rewards/rejected": -46.66205596923828, "step": 5471 }, { "epoch": 0.7450980392156863, "grad_norm": 45.40498362838827, "learning_rate": 1.4819058047438251e-07, "logits/chosen": 14.50629997253418, "logits/rejected": 14.270963668823242, "logps/chosen": -4.70151948928833, "logps/rejected": -4.839215278625488, "loss": 4.6701, "rewards/accuracies": 0.75, "rewards/chosen": -47.015193939208984, "rewards/margins": 1.376962661743164, "rewards/rejected": -48.392154693603516, "step": 5472 }, { "epoch": 0.7452342047930284, "grad_norm": 44.35638835060536, "learning_rate": 1.480428734732976e-07, "logits/chosen": 13.815757751464844, "logits/rejected": 14.22480583190918, "logps/chosen": -4.387690544128418, "logps/rejected": -4.576696395874023, "loss": 3.5076, "rewards/accuracies": 0.75, "rewards/chosen": -43.87690734863281, "rewards/margins": 1.8900575637817383, "rewards/rejected": -45.7669677734375, "step": 5473 }, { "epoch": 0.7453703703703703, "grad_norm": 45.5491758428868, "learning_rate": 1.4789522340400825e-07, "logits/chosen": 13.177091598510742, "logits/rejected": 13.976634979248047, "logps/chosen": -4.344108581542969, "logps/rejected": -4.6371564865112305, "loss": 3.4891, "rewards/accuracies": 0.75, "rewards/chosen": -43.44108963012695, "rewards/margins": 2.930476188659668, "rewards/rejected": -46.37156295776367, "step": 5474 }, { "epoch": 0.7455065359477124, "grad_norm": 46.64732572258008, "learning_rate": 1.4774763029987697e-07, "logits/chosen": 14.410686492919922, "logits/rejected": 14.991451263427734, "logps/chosen": -4.619297027587891, "logps/rejected": -4.9707818031311035, "loss": 3.9158, "rewards/accuracies": 0.75, "rewards/chosen": -46.192970275878906, "rewards/margins": 3.514845848083496, "rewards/rejected": -49.70781707763672, "step": 5475 }, { "epoch": 0.7456427015250545, "grad_norm": 46.013891397721395, "learning_rate": 1.4760009419425377e-07, "logits/chosen": 13.872416496276855, "logits/rejected": 14.29672622680664, "logps/chosen": -4.66727876663208, "logps/rejected": -4.8265485763549805, "loss": 4.0388, "rewards/accuracies": 0.5, "rewards/chosen": -46.67279052734375, "rewards/margins": 1.592696189880371, "rewards/rejected": -48.26548385620117, "step": 5476 }, { "epoch": 0.7457788671023965, "grad_norm": 42.58040986930179, "learning_rate": 1.474526151204758e-07, "logits/chosen": 13.815773010253906, "logits/rejected": 13.847529411315918, "logps/chosen": -4.224152088165283, "logps/rejected": -4.395170211791992, "loss": 3.6342, "rewards/accuracies": 0.75, "rewards/chosen": -42.241519927978516, "rewards/margins": 1.7101802825927734, "rewards/rejected": -43.951698303222656, "step": 5477 }, { "epoch": 0.7459150326797386, "grad_norm": 42.53810735238684, "learning_rate": 1.4730519311186681e-07, "logits/chosen": 13.615621566772461, "logits/rejected": 14.14344596862793, "logps/chosen": -4.210310935974121, "logps/rejected": -4.528855800628662, "loss": 3.6663, "rewards/accuracies": 0.5, "rewards/chosen": -42.103111267089844, "rewards/margins": 3.1854476928710938, "rewards/rejected": -45.28855895996094, "step": 5478 }, { "epoch": 0.7460511982570807, "grad_norm": 44.5052762488898, "learning_rate": 1.4715782820173832e-07, "logits/chosen": 13.084190368652344, "logits/rejected": 13.329450607299805, "logps/chosen": -4.218250274658203, "logps/rejected": -4.3057942390441895, "loss": 4.3388, "rewards/accuracies": 0.5, "rewards/chosen": -42.18250274658203, "rewards/margins": 0.8754377365112305, "rewards/rejected": -43.05794143676758, "step": 5479 }, { "epoch": 0.7461873638344226, "grad_norm": 40.894732268963914, "learning_rate": 1.4701052042338865e-07, "logits/chosen": 13.748632431030273, "logits/rejected": 14.355236053466797, "logps/chosen": -4.098917007446289, "logps/rejected": -4.406817436218262, "loss": 3.8442, "rewards/accuracies": 0.75, "rewards/chosen": -40.98917007446289, "rewards/margins": 3.07900333404541, "rewards/rejected": -44.06817626953125, "step": 5480 }, { "epoch": 0.7463235294117647, "grad_norm": 43.22339667826893, "learning_rate": 1.4686326981010303e-07, "logits/chosen": 14.382302284240723, "logits/rejected": 14.588274002075195, "logps/chosen": -4.534640312194824, "logps/rejected": -4.666416168212891, "loss": 3.7612, "rewards/accuracies": 0.75, "rewards/chosen": -45.34640884399414, "rewards/margins": 1.3177528381347656, "rewards/rejected": -46.664161682128906, "step": 5481 }, { "epoch": 0.7464596949891068, "grad_norm": 44.30750578657253, "learning_rate": 1.4671607639515399e-07, "logits/chosen": 13.669805526733398, "logits/rejected": 13.82363510131836, "logps/chosen": -4.555163383483887, "logps/rejected": -4.879586696624756, "loss": 3.9307, "rewards/accuracies": 1.0, "rewards/chosen": -45.5516357421875, "rewards/margins": 3.244232177734375, "rewards/rejected": -48.795867919921875, "step": 5482 }, { "epoch": 0.7465958605664488, "grad_norm": 42.757619827018985, "learning_rate": 1.4656894021180116e-07, "logits/chosen": 13.506519317626953, "logits/rejected": 13.817108154296875, "logps/chosen": -4.519565582275391, "logps/rejected": -4.573781967163086, "loss": 3.7388, "rewards/accuracies": 0.25, "rewards/chosen": -45.195655822753906, "rewards/margins": 0.5421600341796875, "rewards/rejected": -45.737815856933594, "step": 5483 }, { "epoch": 0.7467320261437909, "grad_norm": 47.73388705225704, "learning_rate": 1.4642186129329134e-07, "logits/chosen": 14.030057907104492, "logits/rejected": 15.376203536987305, "logps/chosen": -4.398565292358398, "logps/rejected": -5.015161037445068, "loss": 3.9969, "rewards/accuracies": 0.75, "rewards/chosen": -43.985652923583984, "rewards/margins": 6.16595458984375, "rewards/rejected": -50.151611328125, "step": 5484 }, { "epoch": 0.746868191721133, "grad_norm": 41.502377583618056, "learning_rate": 1.462748396728579e-07, "logits/chosen": 13.663347244262695, "logits/rejected": 13.391023635864258, "logps/chosen": -4.501564979553223, "logps/rejected": -4.384079456329346, "loss": 3.9897, "rewards/accuracies": 0.25, "rewards/chosen": -45.015647888183594, "rewards/margins": -1.1748552322387695, "rewards/rejected": -43.84079360961914, "step": 5485 }, { "epoch": 0.7470043572984749, "grad_norm": 45.88723626259312, "learning_rate": 1.4612787538372175e-07, "logits/chosen": 14.13058090209961, "logits/rejected": 14.42576789855957, "logps/chosen": -4.36597204208374, "logps/rejected": -4.85162353515625, "loss": 4.1951, "rewards/accuracies": 0.75, "rewards/chosen": -43.65972137451172, "rewards/margins": 4.856514930725098, "rewards/rejected": -48.5162353515625, "step": 5486 }, { "epoch": 0.747140522875817, "grad_norm": 45.180046171588884, "learning_rate": 1.4598096845909086e-07, "logits/chosen": 13.389056205749512, "logits/rejected": 14.90806770324707, "logps/chosen": -4.364879131317139, "logps/rejected": -4.6877031326293945, "loss": 3.8904, "rewards/accuracies": 0.75, "rewards/chosen": -43.6487922668457, "rewards/margins": 3.228239059448242, "rewards/rejected": -46.87702941894531, "step": 5487 }, { "epoch": 0.7472766884531591, "grad_norm": 46.25632210298671, "learning_rate": 1.458341189321597e-07, "logits/chosen": 14.283693313598633, "logits/rejected": 13.698801040649414, "logps/chosen": -4.517834663391113, "logps/rejected": -4.52208948135376, "loss": 4.3542, "rewards/accuracies": 0.5, "rewards/chosen": -45.1783447265625, "rewards/margins": 0.042548179626464844, "rewards/rejected": -45.22089385986328, "step": 5488 }, { "epoch": 0.7474128540305011, "grad_norm": 38.81503559124096, "learning_rate": 1.4568732683611034e-07, "logits/chosen": 13.236467361450195, "logits/rejected": 13.736907005310059, "logps/chosen": -3.8841962814331055, "logps/rejected": -4.216126441955566, "loss": 3.8644, "rewards/accuracies": 0.75, "rewards/chosen": -38.84196472167969, "rewards/margins": 3.319302558898926, "rewards/rejected": -42.16127014160156, "step": 5489 }, { "epoch": 0.7475490196078431, "grad_norm": 43.94792598013837, "learning_rate": 1.4554059220411167e-07, "logits/chosen": 13.53754711151123, "logits/rejected": 13.798517227172852, "logps/chosen": -4.275317192077637, "logps/rejected": -4.45587158203125, "loss": 3.7, "rewards/accuracies": 0.5, "rewards/chosen": -42.75316619873047, "rewards/margins": 1.8055496215820312, "rewards/rejected": -44.5587158203125, "step": 5490 }, { "epoch": 0.7476851851851852, "grad_norm": 45.2922406823269, "learning_rate": 1.4539391506931971e-07, "logits/chosen": 14.763008117675781, "logits/rejected": 14.02536392211914, "logps/chosen": -4.6402997970581055, "logps/rejected": -4.626402854919434, "loss": 4.1095, "rewards/accuracies": 0.25, "rewards/chosen": -46.40299606323242, "rewards/margins": -0.1389636993408203, "rewards/rejected": -46.26403045654297, "step": 5491 }, { "epoch": 0.7478213507625272, "grad_norm": 42.50310697264887, "learning_rate": 1.4524729546487708e-07, "logits/chosen": 15.07483196258545, "logits/rejected": 14.894195556640625, "logps/chosen": -4.390935897827148, "logps/rejected": -4.593371391296387, "loss": 4.3977, "rewards/accuracies": 0.75, "rewards/chosen": -43.90935516357422, "rewards/margins": 2.024354934692383, "rewards/rejected": -45.933712005615234, "step": 5492 }, { "epoch": 0.7479575163398693, "grad_norm": 43.92023130203368, "learning_rate": 1.4510073342391387e-07, "logits/chosen": 14.85277271270752, "logits/rejected": 14.124088287353516, "logps/chosen": -4.402336597442627, "logps/rejected": -4.292013168334961, "loss": 3.6427, "rewards/accuracies": 0.5, "rewards/chosen": -44.02336502075195, "rewards/margins": -1.103236198425293, "rewards/rejected": -42.920127868652344, "step": 5493 }, { "epoch": 0.7480936819172114, "grad_norm": 40.27218624522346, "learning_rate": 1.4495422897954707e-07, "logits/chosen": 13.696622848510742, "logits/rejected": 15.005800247192383, "logps/chosen": -4.288699150085449, "logps/rejected": -4.91312313079834, "loss": 4.0237, "rewards/accuracies": 1.0, "rewards/chosen": -42.88698959350586, "rewards/margins": 6.244246482849121, "rewards/rejected": -49.1312370300293, "step": 5494 }, { "epoch": 0.7482298474945533, "grad_norm": 46.627526459143795, "learning_rate": 1.4480778216488032e-07, "logits/chosen": 15.035636901855469, "logits/rejected": 15.09243392944336, "logps/chosen": -4.244182586669922, "logps/rejected": -4.611093044281006, "loss": 3.9197, "rewards/accuracies": 0.75, "rewards/chosen": -42.44182586669922, "rewards/margins": 3.6691036224365234, "rewards/rejected": -46.11092758178711, "step": 5495 }, { "epoch": 0.7483660130718954, "grad_norm": 40.65743370342081, "learning_rate": 1.4466139301300468e-07, "logits/chosen": 14.646705627441406, "logits/rejected": 13.969463348388672, "logps/chosen": -4.454790115356445, "logps/rejected": -4.325623512268066, "loss": 3.8609, "rewards/accuracies": 0.25, "rewards/chosen": -44.54790496826172, "rewards/margins": -1.2916688919067383, "rewards/rejected": -43.25623321533203, "step": 5496 }, { "epoch": 0.7485021786492375, "grad_norm": 40.01237919403067, "learning_rate": 1.4451506155699788e-07, "logits/chosen": 14.479059219360352, "logits/rejected": 14.214422225952148, "logps/chosen": -4.438210487365723, "logps/rejected": -4.3575639724731445, "loss": 3.5114, "rewards/accuracies": 0.25, "rewards/chosen": -44.382102966308594, "rewards/margins": -0.8064584732055664, "rewards/rejected": -43.575645446777344, "step": 5497 }, { "epoch": 0.7486383442265795, "grad_norm": 45.11750849874741, "learning_rate": 1.4436878782992496e-07, "logits/chosen": 14.227529525756836, "logits/rejected": 14.968231201171875, "logps/chosen": -4.468814849853516, "logps/rejected": -4.864315986633301, "loss": 3.6641, "rewards/accuracies": 0.75, "rewards/chosen": -44.68815231323242, "rewards/margins": 3.9550085067749023, "rewards/rejected": -48.64316177368164, "step": 5498 }, { "epoch": 0.7487745098039216, "grad_norm": 46.794020228905104, "learning_rate": 1.4422257186483733e-07, "logits/chosen": 14.81210994720459, "logits/rejected": 14.513858795166016, "logps/chosen": -4.485302925109863, "logps/rejected": -4.648400783538818, "loss": 3.6348, "rewards/accuracies": 0.25, "rewards/chosen": -44.853031158447266, "rewards/margins": 1.6309776306152344, "rewards/rejected": -46.4840087890625, "step": 5499 }, { "epoch": 0.7489106753812637, "grad_norm": 44.09684013051102, "learning_rate": 1.440764136947739e-07, "logits/chosen": 14.328533172607422, "logits/rejected": 14.796465873718262, "logps/chosen": -4.024467945098877, "logps/rejected": -4.539923667907715, "loss": 3.566, "rewards/accuracies": 1.0, "rewards/chosen": -40.24467849731445, "rewards/margins": 5.154561996459961, "rewards/rejected": -45.39923858642578, "step": 5500 }, { "epoch": 0.7490468409586056, "grad_norm": 44.98479104163784, "learning_rate": 1.4393031335276039e-07, "logits/chosen": 13.115739822387695, "logits/rejected": 13.813814163208008, "logps/chosen": -4.137472629547119, "logps/rejected": -4.588232040405273, "loss": 4.4661, "rewards/accuracies": 1.0, "rewards/chosen": -41.374725341796875, "rewards/margins": 4.50759220123291, "rewards/rejected": -45.88231658935547, "step": 5501 }, { "epoch": 0.7491830065359477, "grad_norm": 35.78502425936155, "learning_rate": 1.4378427087180915e-07, "logits/chosen": 12.999406814575195, "logits/rejected": 13.339261054992676, "logps/chosen": -4.240560531616211, "logps/rejected": -4.319223880767822, "loss": 3.3974, "rewards/accuracies": 0.5, "rewards/chosen": -42.405609130859375, "rewards/margins": 0.7866296768188477, "rewards/rejected": -43.192237854003906, "step": 5502 }, { "epoch": 0.7493191721132898, "grad_norm": 42.70739374953344, "learning_rate": 1.4363828628491982e-07, "logits/chosen": 14.109925270080566, "logits/rejected": 14.05429458618164, "logps/chosen": -4.812209606170654, "logps/rejected": -4.516165733337402, "loss": 3.9287, "rewards/accuracies": 0.25, "rewards/chosen": -48.122093200683594, "rewards/margins": -2.9604387283325195, "rewards/rejected": -45.161659240722656, "step": 5503 }, { "epoch": 0.7494553376906318, "grad_norm": 40.81129250350984, "learning_rate": 1.434923596250789e-07, "logits/chosen": 13.601696014404297, "logits/rejected": 13.724414825439453, "logps/chosen": -4.256522178649902, "logps/rejected": -4.194916248321533, "loss": 4.1003, "rewards/accuracies": 0.75, "rewards/chosen": -42.565223693847656, "rewards/margins": -0.6160564422607422, "rewards/rejected": -41.94916534423828, "step": 5504 }, { "epoch": 0.7495915032679739, "grad_norm": 41.95669765197228, "learning_rate": 1.4334649092525956e-07, "logits/chosen": 14.516956329345703, "logits/rejected": 14.768980979919434, "logps/chosen": -4.477294921875, "logps/rejected": -4.713834285736084, "loss": 4.0403, "rewards/accuracies": 0.75, "rewards/chosen": -44.77294921875, "rewards/margins": 2.36539363861084, "rewards/rejected": -47.138343811035156, "step": 5505 }, { "epoch": 0.7497276688453159, "grad_norm": 42.8510106989742, "learning_rate": 1.4320068021842207e-07, "logits/chosen": 14.508901596069336, "logits/rejected": 14.676305770874023, "logps/chosen": -4.353009223937988, "logps/rejected": -4.6614766120910645, "loss": 3.6483, "rewards/accuracies": 0.5, "rewards/chosen": -43.53009033203125, "rewards/margins": 3.0846776962280273, "rewards/rejected": -46.614768981933594, "step": 5506 }, { "epoch": 0.7498638344226579, "grad_norm": 41.09634833500747, "learning_rate": 1.4305492753751377e-07, "logits/chosen": 14.123457908630371, "logits/rejected": 14.360352516174316, "logps/chosen": -4.539734840393066, "logps/rejected": -4.867062568664551, "loss": 3.8989, "rewards/accuracies": 0.75, "rewards/chosen": -45.39734649658203, "rewards/margins": 3.2732810974121094, "rewards/rejected": -48.670631408691406, "step": 5507 }, { "epoch": 0.75, "grad_norm": 39.8322868020892, "learning_rate": 1.4290923291546836e-07, "logits/chosen": 13.07156753540039, "logits/rejected": 13.26789379119873, "logps/chosen": -4.035396099090576, "logps/rejected": -4.3080010414123535, "loss": 3.7901, "rewards/accuracies": 1.0, "rewards/chosen": -40.35395812988281, "rewards/margins": 2.726048469543457, "rewards/rejected": -43.08000946044922, "step": 5508 }, { "epoch": 0.7501361655773421, "grad_norm": 44.663413827209475, "learning_rate": 1.4276359638520693e-07, "logits/chosen": 14.1781005859375, "logits/rejected": 13.458771705627441, "logps/chosen": -4.373190879821777, "logps/rejected": -4.296321868896484, "loss": 3.9829, "rewards/accuracies": 0.5, "rewards/chosen": -43.731910705566406, "rewards/margins": -0.7686891555786133, "rewards/rejected": -42.963218688964844, "step": 5509 }, { "epoch": 0.7502723311546841, "grad_norm": 47.20325011116652, "learning_rate": 1.4261801797963725e-07, "logits/chosen": 14.357370376586914, "logits/rejected": 13.752832412719727, "logps/chosen": -4.5412821769714355, "logps/rejected": -4.493595123291016, "loss": 4.5166, "rewards/accuracies": 0.25, "rewards/chosen": -45.41282272338867, "rewards/margins": -0.4768686294555664, "rewards/rejected": -44.93595504760742, "step": 5510 }, { "epoch": 0.7504084967320261, "grad_norm": 38.14745793616609, "learning_rate": 1.4247249773165405e-07, "logits/chosen": 14.107450485229492, "logits/rejected": 14.713665962219238, "logps/chosen": -4.445197582244873, "logps/rejected": -4.651039123535156, "loss": 3.41, "rewards/accuracies": 0.75, "rewards/chosen": -44.45197296142578, "rewards/margins": 2.058415412902832, "rewards/rejected": -46.51039123535156, "step": 5511 }, { "epoch": 0.7505446623093682, "grad_norm": 50.64952711971185, "learning_rate": 1.4232703567413862e-07, "logits/chosen": 13.222392082214355, "logits/rejected": 14.082759857177734, "logps/chosen": -4.2840986251831055, "logps/rejected": -4.725857734680176, "loss": 4.1194, "rewards/accuracies": 1.0, "rewards/chosen": -42.84098815917969, "rewards/margins": 4.417593955993652, "rewards/rejected": -47.25857925415039, "step": 5512 }, { "epoch": 0.7506808278867102, "grad_norm": 41.82876490364115, "learning_rate": 1.4218163183995938e-07, "logits/chosen": 14.425609588623047, "logits/rejected": 15.518945693969727, "logps/chosen": -4.765176773071289, "logps/rejected": -5.0708699226379395, "loss": 3.8468, "rewards/accuracies": 1.0, "rewards/chosen": -47.65176773071289, "rewards/margins": 3.056934356689453, "rewards/rejected": -50.708702087402344, "step": 5513 }, { "epoch": 0.7508169934640523, "grad_norm": 46.30853018794779, "learning_rate": 1.4203628626197177e-07, "logits/chosen": 14.053057670593262, "logits/rejected": 14.42262077331543, "logps/chosen": -4.306370735168457, "logps/rejected": -4.588834285736084, "loss": 4.0329, "rewards/accuracies": 0.75, "rewards/chosen": -43.06370544433594, "rewards/margins": 2.8246326446533203, "rewards/rejected": -45.888343811035156, "step": 5514 }, { "epoch": 0.7509531590413944, "grad_norm": 46.0261595400236, "learning_rate": 1.4189099897301743e-07, "logits/chosen": 15.255910873413086, "logits/rejected": 14.50876235961914, "logps/chosen": -5.033120155334473, "logps/rejected": -4.95482063293457, "loss": 3.9533, "rewards/accuracies": 0.5, "rewards/chosen": -50.331199645996094, "rewards/margins": -0.7829933166503906, "rewards/rejected": -49.5482063293457, "step": 5515 }, { "epoch": 0.7510893246187363, "grad_norm": 45.251436013405154, "learning_rate": 1.4174577000592546e-07, "logits/chosen": 13.981222152709961, "logits/rejected": 13.35019302368164, "logps/chosen": -4.170258522033691, "logps/rejected": -4.16209602355957, "loss": 4.2683, "rewards/accuracies": 0.5, "rewards/chosen": -41.70259094238281, "rewards/margins": -0.08162784576416016, "rewards/rejected": -41.6209602355957, "step": 5516 }, { "epoch": 0.7512254901960784, "grad_norm": 41.43899760824405, "learning_rate": 1.4160059939351165e-07, "logits/chosen": 13.669443130493164, "logits/rejected": 13.840868949890137, "logps/chosen": -4.043665885925293, "logps/rejected": -4.454957008361816, "loss": 3.6395, "rewards/accuracies": 0.75, "rewards/chosen": -40.43666076660156, "rewards/margins": 4.112913131713867, "rewards/rejected": -44.5495719909668, "step": 5517 }, { "epoch": 0.7513616557734205, "grad_norm": 40.01458731282063, "learning_rate": 1.4145548716857826e-07, "logits/chosen": 13.419600486755371, "logits/rejected": 14.367902755737305, "logps/chosen": -4.243551731109619, "logps/rejected": -4.611195087432861, "loss": 4.1406, "rewards/accuracies": 1.0, "rewards/chosen": -42.435516357421875, "rewards/margins": 3.6764326095581055, "rewards/rejected": -46.11195373535156, "step": 5518 }, { "epoch": 0.7514978213507625, "grad_norm": 40.34209871278283, "learning_rate": 1.4131043336391462e-07, "logits/chosen": 14.339675903320312, "logits/rejected": 14.100015640258789, "logps/chosen": -4.749282360076904, "logps/rejected": -4.705453395843506, "loss": 4.19, "rewards/accuracies": 0.5, "rewards/chosen": -47.49282455444336, "rewards/margins": -0.4382896423339844, "rewards/rejected": -47.054534912109375, "step": 5519 }, { "epoch": 0.7516339869281046, "grad_norm": 48.13520921311684, "learning_rate": 1.4116543801229707e-07, "logits/chosen": 13.41596794128418, "logits/rejected": 13.200089454650879, "logps/chosen": -4.030849933624268, "logps/rejected": -4.090539932250977, "loss": 4.3128, "rewards/accuracies": 0.75, "rewards/chosen": -40.308502197265625, "rewards/margins": 0.5968990325927734, "rewards/rejected": -40.905399322509766, "step": 5520 }, { "epoch": 0.7517701525054467, "grad_norm": 44.82152971770262, "learning_rate": 1.4102050114648823e-07, "logits/chosen": 14.522710800170898, "logits/rejected": 14.401717185974121, "logps/chosen": -4.394153118133545, "logps/rejected": -4.250373840332031, "loss": 4.268, "rewards/accuracies": 0.25, "rewards/chosen": -43.9415283203125, "rewards/margins": -1.4377899169921875, "rewards/rejected": -42.50373840332031, "step": 5521 }, { "epoch": 0.7519063180827886, "grad_norm": 49.27011452364519, "learning_rate": 1.408756227992379e-07, "logits/chosen": 13.030550003051758, "logits/rejected": 14.270729064941406, "logps/chosen": -4.067095756530762, "logps/rejected": -4.714885234832764, "loss": 3.7218, "rewards/accuracies": 1.0, "rewards/chosen": -40.670955657958984, "rewards/margins": 6.477893829345703, "rewards/rejected": -47.14884948730469, "step": 5522 }, { "epoch": 0.7520424836601307, "grad_norm": 40.51125002095429, "learning_rate": 1.407308030032827e-07, "logits/chosen": 14.763809204101562, "logits/rejected": 14.779123306274414, "logps/chosen": -4.56633996963501, "logps/rejected": -5.046351909637451, "loss": 4.299, "rewards/accuracies": 0.75, "rewards/chosen": -45.66340255737305, "rewards/margins": 4.800117492675781, "rewards/rejected": -50.46352005004883, "step": 5523 }, { "epoch": 0.7521786492374728, "grad_norm": 44.014852999417876, "learning_rate": 1.405860417913455e-07, "logits/chosen": 13.829591751098633, "logits/rejected": 14.402521133422852, "logps/chosen": -4.158078193664551, "logps/rejected": -4.460324287414551, "loss": 4.1493, "rewards/accuracies": 0.75, "rewards/chosen": -41.58077621459961, "rewards/margins": 3.022462844848633, "rewards/rejected": -44.603240966796875, "step": 5524 }, { "epoch": 0.7523148148148148, "grad_norm": 40.548891145834354, "learning_rate": 1.4044133919613653e-07, "logits/chosen": 13.478446960449219, "logits/rejected": 14.146510124206543, "logps/chosen": -4.298864364624023, "logps/rejected": -4.559712886810303, "loss": 4.3909, "rewards/accuracies": 0.75, "rewards/chosen": -42.98863983154297, "rewards/margins": 2.608487129211426, "rewards/rejected": -45.59712600708008, "step": 5525 }, { "epoch": 0.7524509803921569, "grad_norm": 41.969178149372866, "learning_rate": 1.4029669525035264e-07, "logits/chosen": 13.324382781982422, "logits/rejected": 13.478702545166016, "logps/chosen": -4.488621711730957, "logps/rejected": -4.458793640136719, "loss": 4.199, "rewards/accuracies": 0.5, "rewards/chosen": -44.8862190246582, "rewards/margins": -0.29827880859375, "rewards/rejected": -44.58794021606445, "step": 5526 }, { "epoch": 0.7525871459694989, "grad_norm": 41.21269242429313, "learning_rate": 1.4015210998667707e-07, "logits/chosen": 13.805713653564453, "logits/rejected": 14.648866653442383, "logps/chosen": -4.310519695281982, "logps/rejected": -4.782685279846191, "loss": 3.7809, "rewards/accuracies": 1.0, "rewards/chosen": -43.10519790649414, "rewards/margins": 4.721654891967773, "rewards/rejected": -47.82685089111328, "step": 5527 }, { "epoch": 0.7527233115468409, "grad_norm": 42.06798647118134, "learning_rate": 1.4000758343778015e-07, "logits/chosen": 14.527145385742188, "logits/rejected": 14.533132553100586, "logps/chosen": -4.503209114074707, "logps/rejected": -4.693978786468506, "loss": 4.0443, "rewards/accuracies": 0.75, "rewards/chosen": -45.03208923339844, "rewards/margins": 1.9076976776123047, "rewards/rejected": -46.939788818359375, "step": 5528 }, { "epoch": 0.752859477124183, "grad_norm": 39.914526423843704, "learning_rate": 1.3986311563631903e-07, "logits/chosen": 14.599308013916016, "logits/rejected": 14.459157943725586, "logps/chosen": -4.130771636962891, "logps/rejected": -4.511298179626465, "loss": 4.1648, "rewards/accuracies": 1.0, "rewards/chosen": -41.307716369628906, "rewards/margins": 3.8052635192871094, "rewards/rejected": -45.112979888916016, "step": 5529 }, { "epoch": 0.7529956427015251, "grad_norm": 39.1861271029016, "learning_rate": 1.397187066149371e-07, "logits/chosen": 14.900701522827148, "logits/rejected": 14.668539047241211, "logps/chosen": -4.706977844238281, "logps/rejected": -4.71621561050415, "loss": 4.155, "rewards/accuracies": 0.25, "rewards/chosen": -47.06977844238281, "rewards/margins": 0.09238338470458984, "rewards/rejected": -47.16215896606445, "step": 5530 }, { "epoch": 0.753131808278867, "grad_norm": 44.61646818003401, "learning_rate": 1.395743564062649e-07, "logits/chosen": 13.567609786987305, "logits/rejected": 14.532634735107422, "logps/chosen": -4.201444625854492, "logps/rejected": -4.8788933753967285, "loss": 4.0792, "rewards/accuracies": 0.75, "rewards/chosen": -42.01444625854492, "rewards/margins": 6.774489402770996, "rewards/rejected": -48.78893280029297, "step": 5531 }, { "epoch": 0.7532679738562091, "grad_norm": 44.48630528214738, "learning_rate": 1.3943006504291968e-07, "logits/chosen": 14.59353256225586, "logits/rejected": 14.8155517578125, "logps/chosen": -4.594498634338379, "logps/rejected": -4.648322105407715, "loss": 4.6042, "rewards/accuracies": 0.75, "rewards/chosen": -45.94498825073242, "rewards/margins": 0.5382299423217773, "rewards/rejected": -46.48321533203125, "step": 5532 }, { "epoch": 0.7534041394335512, "grad_norm": 41.90002792653474, "learning_rate": 1.392858325575051e-07, "logits/chosen": 13.075841903686523, "logits/rejected": 14.074271202087402, "logps/chosen": -4.140815258026123, "logps/rejected": -4.489591121673584, "loss": 3.9356, "rewards/accuracies": 1.0, "rewards/chosen": -41.40814971923828, "rewards/margins": 3.4877614974975586, "rewards/rejected": -44.895912170410156, "step": 5533 }, { "epoch": 0.7535403050108932, "grad_norm": 45.67392007314181, "learning_rate": 1.3914165898261168e-07, "logits/chosen": 13.994925498962402, "logits/rejected": 14.565788269042969, "logps/chosen": -4.580573081970215, "logps/rejected": -4.455043792724609, "loss": 3.6597, "rewards/accuracies": 0.25, "rewards/chosen": -45.80573272705078, "rewards/margins": -1.2552976608276367, "rewards/rejected": -44.550437927246094, "step": 5534 }, { "epoch": 0.7536764705882353, "grad_norm": 38.820957070856025, "learning_rate": 1.3899754435081685e-07, "logits/chosen": 13.696724891662598, "logits/rejected": 13.811270713806152, "logps/chosen": -4.304543495178223, "logps/rejected": -4.583115100860596, "loss": 4.0496, "rewards/accuracies": 0.5, "rewards/chosen": -43.045433044433594, "rewards/margins": 2.785717010498047, "rewards/rejected": -45.831153869628906, "step": 5535 }, { "epoch": 0.7538126361655774, "grad_norm": 50.29835180132801, "learning_rate": 1.388534886946842e-07, "logits/chosen": 13.450796127319336, "logits/rejected": 14.01968002319336, "logps/chosen": -4.278957843780518, "logps/rejected": -4.649343490600586, "loss": 4.2793, "rewards/accuracies": 0.75, "rewards/chosen": -42.789581298828125, "rewards/margins": 3.7038564682006836, "rewards/rejected": -46.49343490600586, "step": 5536 }, { "epoch": 0.7539488017429193, "grad_norm": 49.92556650293691, "learning_rate": 1.387094920467644e-07, "logits/chosen": 14.577566146850586, "logits/rejected": 15.032485961914062, "logps/chosen": -4.312596321105957, "logps/rejected": -4.669480323791504, "loss": 3.4875, "rewards/accuracies": 0.75, "rewards/chosen": -43.12596130371094, "rewards/margins": 3.568840980529785, "rewards/rejected": -46.694801330566406, "step": 5537 }, { "epoch": 0.7540849673202614, "grad_norm": 41.18417887757861, "learning_rate": 1.385655544395949e-07, "logits/chosen": 14.291805267333984, "logits/rejected": 15.422126770019531, "logps/chosen": -4.380693435668945, "logps/rejected": -4.833593368530273, "loss": 3.5119, "rewards/accuracies": 0.75, "rewards/chosen": -43.80693435668945, "rewards/margins": 4.528995513916016, "rewards/rejected": -48.33592987060547, "step": 5538 }, { "epoch": 0.7542211328976035, "grad_norm": 39.341798410433555, "learning_rate": 1.384216759056993e-07, "logits/chosen": 13.514202117919922, "logits/rejected": 14.290010452270508, "logps/chosen": -4.531782627105713, "logps/rejected": -4.693309307098389, "loss": 3.6823, "rewards/accuracies": 0.75, "rewards/chosen": -45.31782531738281, "rewards/margins": 1.6152687072753906, "rewards/rejected": -46.9330940246582, "step": 5539 }, { "epoch": 0.7543572984749455, "grad_norm": 39.43038007730318, "learning_rate": 1.382778564775882e-07, "logits/chosen": 14.829580307006836, "logits/rejected": 14.912120819091797, "logps/chosen": -4.617015361785889, "logps/rejected": -4.8244218826293945, "loss": 4.215, "rewards/accuracies": 0.75, "rewards/chosen": -46.17015075683594, "rewards/margins": 2.074069023132324, "rewards/rejected": -48.24422073364258, "step": 5540 }, { "epoch": 0.7544934640522876, "grad_norm": 39.815089100024736, "learning_rate": 1.3813409618775903e-07, "logits/chosen": 14.039210319519043, "logits/rejected": 13.516908645629883, "logps/chosen": -4.756442070007324, "logps/rejected": -4.5077314376831055, "loss": 4.0687, "rewards/accuracies": 0.5, "rewards/chosen": -47.564422607421875, "rewards/margins": -2.4871082305908203, "rewards/rejected": -45.07731246948242, "step": 5541 }, { "epoch": 0.7546296296296297, "grad_norm": 44.82432342742092, "learning_rate": 1.3799039506869528e-07, "logits/chosen": 14.218276023864746, "logits/rejected": 15.008646011352539, "logps/chosen": -4.711996078491211, "logps/rejected": -4.952773094177246, "loss": 3.5334, "rewards/accuracies": 0.75, "rewards/chosen": -47.119956970214844, "rewards/margins": 2.407771110534668, "rewards/rejected": -49.52772903442383, "step": 5542 }, { "epoch": 0.7547657952069716, "grad_norm": 42.02705363399158, "learning_rate": 1.3784675315286754e-07, "logits/chosen": 14.440174102783203, "logits/rejected": 14.594016075134277, "logps/chosen": -4.57149076461792, "logps/rejected": -4.471323013305664, "loss": 4.3426, "rewards/accuracies": 0.5, "rewards/chosen": -45.71490478515625, "rewards/margins": -1.0016746520996094, "rewards/rejected": -44.71323013305664, "step": 5543 }, { "epoch": 0.7549019607843137, "grad_norm": 41.28772449248608, "learning_rate": 1.3770317047273307e-07, "logits/chosen": 14.110576629638672, "logits/rejected": 14.328420639038086, "logps/chosen": -4.412291526794434, "logps/rejected": -4.3697662353515625, "loss": 3.7217, "rewards/accuracies": 0.5, "rewards/chosen": -44.12291717529297, "rewards/margins": -0.4252490997314453, "rewards/rejected": -43.697662353515625, "step": 5544 }, { "epoch": 0.7550381263616558, "grad_norm": 39.55792207260617, "learning_rate": 1.3755964706073524e-07, "logits/chosen": 13.775358200073242, "logits/rejected": 14.317270278930664, "logps/chosen": -4.371593952178955, "logps/rejected": -4.632741451263428, "loss": 3.524, "rewards/accuracies": 0.75, "rewards/chosen": -43.715938568115234, "rewards/margins": 2.6114768981933594, "rewards/rejected": -46.327415466308594, "step": 5545 }, { "epoch": 0.7551742919389978, "grad_norm": 42.45849215356798, "learning_rate": 1.3741618294930452e-07, "logits/chosen": 14.758277893066406, "logits/rejected": 14.704977035522461, "logps/chosen": -4.83332633972168, "logps/rejected": -4.840344429016113, "loss": 3.6776, "rewards/accuracies": 0.75, "rewards/chosen": -48.33326721191406, "rewards/margins": 0.0701742172241211, "rewards/rejected": -48.4034423828125, "step": 5546 }, { "epoch": 0.7553104575163399, "grad_norm": 42.076941448908485, "learning_rate": 1.3727277817085793e-07, "logits/chosen": 13.814952850341797, "logits/rejected": 13.911540985107422, "logps/chosen": -4.485646724700928, "logps/rejected": -4.608532905578613, "loss": 4.1747, "rewards/accuracies": 0.5, "rewards/chosen": -44.856468200683594, "rewards/margins": 1.2288637161254883, "rewards/rejected": -46.085330963134766, "step": 5547 }, { "epoch": 0.7554466230936819, "grad_norm": 44.245845167247936, "learning_rate": 1.371294327577987e-07, "logits/chosen": 14.113645553588867, "logits/rejected": 13.836030006408691, "logps/chosen": -4.318646430969238, "logps/rejected": -4.2735395431518555, "loss": 3.9023, "rewards/accuracies": 0.25, "rewards/chosen": -43.18646240234375, "rewards/margins": -0.4510679244995117, "rewards/rejected": -42.73539352416992, "step": 5548 }, { "epoch": 0.755582788671024, "grad_norm": 41.047470708505664, "learning_rate": 1.3698614674251708e-07, "logits/chosen": 13.48277473449707, "logits/rejected": 14.443180084228516, "logps/chosen": -4.159125328063965, "logps/rejected": -4.9028825759887695, "loss": 3.8347, "rewards/accuracies": 0.75, "rewards/chosen": -41.591251373291016, "rewards/margins": 7.4375715255737305, "rewards/rejected": -49.02882385253906, "step": 5549 }, { "epoch": 0.755718954248366, "grad_norm": 42.74898320321637, "learning_rate": 1.3684292015738982e-07, "logits/chosen": 13.741240501403809, "logits/rejected": 14.269129753112793, "logps/chosen": -4.3905487060546875, "logps/rejected": -4.67183780670166, "loss": 3.5766, "rewards/accuracies": 0.5, "rewards/chosen": -43.90549087524414, "rewards/margins": 2.812887191772461, "rewards/rejected": -46.71837615966797, "step": 5550 }, { "epoch": 0.7558551198257081, "grad_norm": 44.231090608999935, "learning_rate": 1.366997530347799e-07, "logits/chosen": 14.725746154785156, "logits/rejected": 15.030574798583984, "logps/chosen": -4.751028060913086, "logps/rejected": -4.844991207122803, "loss": 4.2101, "rewards/accuracies": 0.5, "rewards/chosen": -47.51028060913086, "rewards/margins": 0.9396295547485352, "rewards/rejected": -48.449913024902344, "step": 5551 }, { "epoch": 0.7559912854030502, "grad_norm": 42.16271940584438, "learning_rate": 1.3655664540703722e-07, "logits/chosen": 14.467252731323242, "logits/rejected": 14.464853286743164, "logps/chosen": -5.00205135345459, "logps/rejected": -4.912487506866455, "loss": 3.9958, "rewards/accuracies": 0.75, "rewards/chosen": -50.02051544189453, "rewards/margins": -0.8956403732299805, "rewards/rejected": -49.1248779296875, "step": 5552 }, { "epoch": 0.7561274509803921, "grad_norm": 41.47690322179677, "learning_rate": 1.3641359730649828e-07, "logits/chosen": 13.859472274780273, "logits/rejected": 13.784817695617676, "logps/chosen": -4.395262241363525, "logps/rejected": -4.389647960662842, "loss": 3.9048, "rewards/accuracies": 0.5, "rewards/chosen": -43.95262145996094, "rewards/margins": -0.056145668029785156, "rewards/rejected": -43.89647674560547, "step": 5553 }, { "epoch": 0.7562636165577342, "grad_norm": 42.617131578439555, "learning_rate": 1.3627060876548572e-07, "logits/chosen": 14.470951080322266, "logits/rejected": 14.233072280883789, "logps/chosen": -4.5037455558776855, "logps/rejected": -4.6287150382995605, "loss": 3.9881, "rewards/accuracies": 0.5, "rewards/chosen": -45.037452697753906, "rewards/margins": 1.2496967315673828, "rewards/rejected": -46.287147521972656, "step": 5554 }, { "epoch": 0.7563997821350763, "grad_norm": 41.75077328115764, "learning_rate": 1.3612767981630917e-07, "logits/chosen": 13.997503280639648, "logits/rejected": 14.292686462402344, "logps/chosen": -4.694275856018066, "logps/rejected": -4.803765773773193, "loss": 4.0827, "rewards/accuracies": 0.5, "rewards/chosen": -46.94275665283203, "rewards/margins": 1.0948991775512695, "rewards/rejected": -48.03765869140625, "step": 5555 }, { "epoch": 0.7565359477124183, "grad_norm": 43.161677938921734, "learning_rate": 1.3598481049126464e-07, "logits/chosen": 14.088600158691406, "logits/rejected": 13.897632598876953, "logps/chosen": -4.529682159423828, "logps/rejected": -4.5366387367248535, "loss": 3.8957, "rewards/accuracies": 0.5, "rewards/chosen": -45.29682159423828, "rewards/margins": 0.06956672668457031, "rewards/rejected": -45.36638641357422, "step": 5556 }, { "epoch": 0.7566721132897604, "grad_norm": 40.4227390784484, "learning_rate": 1.3584200082263446e-07, "logits/chosen": 13.663954734802246, "logits/rejected": 14.313382148742676, "logps/chosen": -4.183111190795898, "logps/rejected": -4.455984115600586, "loss": 4.3793, "rewards/accuracies": 0.75, "rewards/chosen": -41.83111572265625, "rewards/margins": 2.728724479675293, "rewards/rejected": -44.55984115600586, "step": 5557 }, { "epoch": 0.7568082788671024, "grad_norm": 40.49195106394726, "learning_rate": 1.356992508426877e-07, "logits/chosen": 14.249777793884277, "logits/rejected": 13.963323593139648, "logps/chosen": -4.633727073669434, "logps/rejected": -4.315730571746826, "loss": 3.4209, "rewards/accuracies": 0.25, "rewards/chosen": -46.33727264404297, "rewards/margins": -3.179966926574707, "rewards/rejected": -43.15730285644531, "step": 5558 }, { "epoch": 0.7569444444444444, "grad_norm": 42.81108249272878, "learning_rate": 1.355565605836801e-07, "logits/chosen": 13.213699340820312, "logits/rejected": 14.588726043701172, "logps/chosen": -4.124914169311523, "logps/rejected": -4.629990100860596, "loss": 3.528, "rewards/accuracies": 1.0, "rewards/chosen": -41.2491455078125, "rewards/margins": 5.050753593444824, "rewards/rejected": -46.29990005493164, "step": 5559 }, { "epoch": 0.7570806100217865, "grad_norm": 48.196629769515866, "learning_rate": 1.3541393007785345e-07, "logits/chosen": 13.629071235656738, "logits/rejected": 14.22797966003418, "logps/chosen": -4.182558059692383, "logps/rejected": -4.6095991134643555, "loss": 3.9766, "rewards/accuracies": 1.0, "rewards/chosen": -41.82557678222656, "rewards/margins": 4.270414352416992, "rewards/rejected": -46.09599304199219, "step": 5560 }, { "epoch": 0.7572167755991286, "grad_norm": 43.476772552700275, "learning_rate": 1.3527135935743634e-07, "logits/chosen": 13.718541145324707, "logits/rejected": 14.130115509033203, "logps/chosen": -4.449687480926514, "logps/rejected": -4.701428413391113, "loss": 3.7677, "rewards/accuracies": 1.0, "rewards/chosen": -44.49687194824219, "rewards/margins": 2.517411231994629, "rewards/rejected": -47.0142822265625, "step": 5561 }, { "epoch": 0.7573529411764706, "grad_norm": 40.28626492298696, "learning_rate": 1.35128848454644e-07, "logits/chosen": 13.383079528808594, "logits/rejected": 14.45180892944336, "logps/chosen": -4.275566577911377, "logps/rejected": -4.517387390136719, "loss": 3.6933, "rewards/accuracies": 0.75, "rewards/chosen": -42.75566482543945, "rewards/margins": 2.418210983276367, "rewards/rejected": -45.17387771606445, "step": 5562 }, { "epoch": 0.7574891067538126, "grad_norm": 39.688771779619806, "learning_rate": 1.3498639740167766e-07, "logits/chosen": 13.72462272644043, "logits/rejected": 14.809206008911133, "logps/chosen": -4.56251859664917, "logps/rejected": -5.158735752105713, "loss": 3.387, "rewards/accuracies": 1.0, "rewards/chosen": -45.62518310546875, "rewards/margins": 5.962170600891113, "rewards/rejected": -51.58735656738281, "step": 5563 }, { "epoch": 0.7576252723311547, "grad_norm": 44.033675392062015, "learning_rate": 1.3484400623072546e-07, "logits/chosen": 14.588956832885742, "logits/rejected": 15.52437973022461, "logps/chosen": -4.699716567993164, "logps/rejected": -4.8023481369018555, "loss": 3.6407, "rewards/accuracies": 0.5, "rewards/chosen": -46.99716567993164, "rewards/margins": 1.026315689086914, "rewards/rejected": -48.02347946166992, "step": 5564 }, { "epoch": 0.7577614379084967, "grad_norm": 39.52639349094488, "learning_rate": 1.3470167497396197e-07, "logits/chosen": 13.97315788269043, "logits/rejected": 13.949405670166016, "logps/chosen": -4.157608985900879, "logps/rejected": -4.426187515258789, "loss": 4.2975, "rewards/accuracies": 1.0, "rewards/chosen": -41.576087951660156, "rewards/margins": 2.685792922973633, "rewards/rejected": -44.261878967285156, "step": 5565 }, { "epoch": 0.7578976034858388, "grad_norm": 40.49030879654368, "learning_rate": 1.345594036635479e-07, "logits/chosen": 13.789031028747559, "logits/rejected": 14.166814804077148, "logps/chosen": -4.4823527336120605, "logps/rejected": -4.458518028259277, "loss": 4.0653, "rewards/accuracies": 0.5, "rewards/chosen": -44.82352828979492, "rewards/margins": -0.23835086822509766, "rewards/rejected": -44.585174560546875, "step": 5566 }, { "epoch": 0.7580337690631809, "grad_norm": 44.792426714470416, "learning_rate": 1.3441719233163072e-07, "logits/chosen": 13.879232406616211, "logits/rejected": 13.783025741577148, "logps/chosen": -4.164422035217285, "logps/rejected": -4.179773807525635, "loss": 3.8082, "rewards/accuracies": 0.5, "rewards/chosen": -41.64421844482422, "rewards/margins": 0.1535177230834961, "rewards/rejected": -41.79773712158203, "step": 5567 }, { "epoch": 0.7581699346405228, "grad_norm": 49.98480408603627, "learning_rate": 1.3427504101034438e-07, "logits/chosen": 14.294620513916016, "logits/rejected": 14.012395858764648, "logps/chosen": -4.466250896453857, "logps/rejected": -4.221957683563232, "loss": 4.3291, "rewards/accuracies": 0.25, "rewards/chosen": -44.66250991821289, "rewards/margins": -2.4429311752319336, "rewards/rejected": -42.21957778930664, "step": 5568 }, { "epoch": 0.7583061002178649, "grad_norm": 41.26302558045254, "learning_rate": 1.3413294973180889e-07, "logits/chosen": 14.316823959350586, "logits/rejected": 14.737076759338379, "logps/chosen": -4.425189018249512, "logps/rejected": -4.866055011749268, "loss": 4.1376, "rewards/accuracies": 1.0, "rewards/chosen": -44.25189208984375, "rewards/margins": 4.408658027648926, "rewards/rejected": -48.66054916381836, "step": 5569 }, { "epoch": 0.758442265795207, "grad_norm": 37.83768549703198, "learning_rate": 1.3399091852813107e-07, "logits/chosen": 13.517889022827148, "logits/rejected": 13.537101745605469, "logps/chosen": -4.122624397277832, "logps/rejected": -4.223992347717285, "loss": 3.5571, "rewards/accuracies": 0.5, "rewards/chosen": -41.22624206542969, "rewards/margins": 1.0136852264404297, "rewards/rejected": -42.23992919921875, "step": 5570 }, { "epoch": 0.758578431372549, "grad_norm": 39.23169374094413, "learning_rate": 1.3384894743140422e-07, "logits/chosen": 13.436315536499023, "logits/rejected": 14.293760299682617, "logps/chosen": -4.2656354904174805, "logps/rejected": -4.565885543823242, "loss": 4.178, "rewards/accuracies": 0.5, "rewards/chosen": -42.65635681152344, "rewards/margins": 3.002497673034668, "rewards/rejected": -45.65885543823242, "step": 5571 }, { "epoch": 0.7587145969498911, "grad_norm": 41.41526576398391, "learning_rate": 1.3370703647370762e-07, "logits/chosen": 14.230853080749512, "logits/rejected": 14.439048767089844, "logps/chosen": -4.519214630126953, "logps/rejected": -4.688609600067139, "loss": 3.5964, "rewards/accuracies": 0.5, "rewards/chosen": -45.19214630126953, "rewards/margins": 1.6939506530761719, "rewards/rejected": -46.88610076904297, "step": 5572 }, { "epoch": 0.7588507625272332, "grad_norm": 41.590112902759245, "learning_rate": 1.3356518568710725e-07, "logits/chosen": 13.952515602111816, "logits/rejected": 14.03103256225586, "logps/chosen": -4.386834621429443, "logps/rejected": -4.5055060386657715, "loss": 3.8474, "rewards/accuracies": 0.75, "rewards/chosen": -43.86834716796875, "rewards/margins": 1.186716079711914, "rewards/rejected": -45.05506134033203, "step": 5573 }, { "epoch": 0.7589869281045751, "grad_norm": 43.45447792748419, "learning_rate": 1.3342339510365576e-07, "logits/chosen": 13.978490829467773, "logits/rejected": 14.652473449707031, "logps/chosen": -4.532675743103027, "logps/rejected": -4.779000759124756, "loss": 4.2705, "rewards/accuracies": 0.75, "rewards/chosen": -45.326759338378906, "rewards/margins": 2.4632530212402344, "rewards/rejected": -47.790008544921875, "step": 5574 }, { "epoch": 0.7591230936819172, "grad_norm": 41.735438273152525, "learning_rate": 1.3328166475539151e-07, "logits/chosen": 13.554965019226074, "logits/rejected": 14.426733016967773, "logps/chosen": -4.418816089630127, "logps/rejected": -4.587621688842773, "loss": 3.8373, "rewards/accuracies": 0.5, "rewards/chosen": -44.18815994262695, "rewards/margins": 1.6880569458007812, "rewards/rejected": -45.876216888427734, "step": 5575 }, { "epoch": 0.7592592592592593, "grad_norm": 53.01458114047868, "learning_rate": 1.3313999467433986e-07, "logits/chosen": 13.135763168334961, "logits/rejected": 13.868206024169922, "logps/chosen": -4.374478816986084, "logps/rejected": -4.665089130401611, "loss": 4.3077, "rewards/accuracies": 0.75, "rewards/chosen": -43.744789123535156, "rewards/margins": 2.9061012268066406, "rewards/rejected": -46.6508903503418, "step": 5576 }, { "epoch": 0.7593954248366013, "grad_norm": 42.75078500721332, "learning_rate": 1.3299838489251242e-07, "logits/chosen": 13.609634399414062, "logits/rejected": 14.289934158325195, "logps/chosen": -4.626708030700684, "logps/rejected": -4.762004852294922, "loss": 3.2481, "rewards/accuracies": 0.75, "rewards/chosen": -46.26707458496094, "rewards/margins": 1.3529748916625977, "rewards/rejected": -47.62004852294922, "step": 5577 }, { "epoch": 0.7595315904139434, "grad_norm": 39.79710903493364, "learning_rate": 1.3285683544190685e-07, "logits/chosen": 14.353224754333496, "logits/rejected": 14.535440444946289, "logps/chosen": -4.22037410736084, "logps/rejected": -4.606081008911133, "loss": 4.0553, "rewards/accuracies": 1.0, "rewards/chosen": -42.20374298095703, "rewards/margins": 3.857071876525879, "rewards/rejected": -46.060813903808594, "step": 5578 }, { "epoch": 0.7596677559912854, "grad_norm": 40.357974323177615, "learning_rate": 1.3271534635450753e-07, "logits/chosen": 14.073156356811523, "logits/rejected": 15.057186126708984, "logps/chosen": -4.253283500671387, "logps/rejected": -4.775078773498535, "loss": 4.1856, "rewards/accuracies": 1.0, "rewards/chosen": -42.5328369140625, "rewards/margins": 5.217947006225586, "rewards/rejected": -47.75078582763672, "step": 5579 }, { "epoch": 0.7598039215686274, "grad_norm": 49.00484747316815, "learning_rate": 1.3257391766228518e-07, "logits/chosen": 14.482736587524414, "logits/rejected": 14.0166015625, "logps/chosen": -4.477783203125, "logps/rejected": -4.564569473266602, "loss": 4.1555, "rewards/accuracies": 0.75, "rewards/chosen": -44.77783203125, "rewards/margins": 0.867863655090332, "rewards/rejected": -45.645694732666016, "step": 5580 }, { "epoch": 0.7599400871459695, "grad_norm": 37.78147146581926, "learning_rate": 1.3243254939719663e-07, "logits/chosen": 14.051815032958984, "logits/rejected": 15.303726196289062, "logps/chosen": -4.63697624206543, "logps/rejected": -4.912269592285156, "loss": 3.6333, "rewards/accuracies": 0.75, "rewards/chosen": -46.36976623535156, "rewards/margins": 2.752932548522949, "rewards/rejected": -49.12269592285156, "step": 5581 }, { "epoch": 0.7600762527233116, "grad_norm": 38.0045104202935, "learning_rate": 1.3229124159118525e-07, "logits/chosen": 13.265775680541992, "logits/rejected": 13.68514633178711, "logps/chosen": -4.058690547943115, "logps/rejected": -4.30084228515625, "loss": 3.7104, "rewards/accuracies": 0.5, "rewards/chosen": -40.58690643310547, "rewards/margins": 2.421520233154297, "rewards/rejected": -43.0084228515625, "step": 5582 }, { "epoch": 0.7602124183006536, "grad_norm": 42.130633874116654, "learning_rate": 1.3214999427618087e-07, "logits/chosen": 14.1602783203125, "logits/rejected": 14.447883605957031, "logps/chosen": -4.401533603668213, "logps/rejected": -4.60531759262085, "loss": 3.916, "rewards/accuracies": 0.75, "rewards/chosen": -44.01533508300781, "rewards/margins": 2.0378408432006836, "rewards/rejected": -46.05317687988281, "step": 5583 }, { "epoch": 0.7603485838779956, "grad_norm": 41.54230676786046, "learning_rate": 1.3200880748409925e-07, "logits/chosen": 14.883788108825684, "logits/rejected": 15.026708602905273, "logps/chosen": -4.495079517364502, "logps/rejected": -5.068079948425293, "loss": 3.8208, "rewards/accuracies": 1.0, "rewards/chosen": -44.95079803466797, "rewards/margins": 5.730002403259277, "rewards/rejected": -50.68080139160156, "step": 5584 }, { "epoch": 0.7604847494553377, "grad_norm": 40.82494428518494, "learning_rate": 1.318676812468428e-07, "logits/chosen": 14.01111888885498, "logits/rejected": 13.729742050170898, "logps/chosen": -4.464338779449463, "logps/rejected": -4.517466068267822, "loss": 4.1026, "rewards/accuracies": 0.75, "rewards/chosen": -44.64338684082031, "rewards/margins": 0.5312747955322266, "rewards/rejected": -45.174659729003906, "step": 5585 }, { "epoch": 0.7606209150326797, "grad_norm": 43.68936917454539, "learning_rate": 1.317266155963003e-07, "logits/chosen": 14.1529541015625, "logits/rejected": 14.243432998657227, "logps/chosen": -4.440757751464844, "logps/rejected": -4.528176307678223, "loss": 3.9663, "rewards/accuracies": 0.75, "rewards/chosen": -44.40757751464844, "rewards/margins": 0.8741827011108398, "rewards/rejected": -45.281761169433594, "step": 5586 }, { "epoch": 0.7607570806100218, "grad_norm": 39.56074101602798, "learning_rate": 1.3158561056434646e-07, "logits/chosen": 13.387516021728516, "logits/rejected": 14.133081436157227, "logps/chosen": -4.574097156524658, "logps/rejected": -4.579924583435059, "loss": 3.768, "rewards/accuracies": 0.5, "rewards/chosen": -45.74097442626953, "rewards/margins": 0.05827522277832031, "rewards/rejected": -45.79924774169922, "step": 5587 }, { "epoch": 0.7608932461873639, "grad_norm": 39.383490035832956, "learning_rate": 1.3144466618284265e-07, "logits/chosen": 13.937667846679688, "logits/rejected": 14.743670463562012, "logps/chosen": -4.368569374084473, "logps/rejected": -4.673495769500732, "loss": 3.6464, "rewards/accuracies": 0.75, "rewards/chosen": -43.685691833496094, "rewards/margins": 3.0492639541625977, "rewards/rejected": -46.73495864868164, "step": 5588 }, { "epoch": 0.7610294117647058, "grad_norm": 42.116333491084575, "learning_rate": 1.313037824836364e-07, "logits/chosen": 13.256841659545898, "logits/rejected": 13.854265213012695, "logps/chosen": -4.402843475341797, "logps/rejected": -4.725452899932861, "loss": 3.8531, "rewards/accuracies": 0.75, "rewards/chosen": -44.028438568115234, "rewards/margins": 3.226088523864746, "rewards/rejected": -47.2545280456543, "step": 5589 }, { "epoch": 0.7611655773420479, "grad_norm": 41.97542243880971, "learning_rate": 1.3116295949856172e-07, "logits/chosen": 14.599891662597656, "logits/rejected": 14.074596405029297, "logps/chosen": -4.439149856567383, "logps/rejected": -4.474949359893799, "loss": 4.0567, "rewards/accuracies": 0.5, "rewards/chosen": -44.391502380371094, "rewards/margins": 0.35799121856689453, "rewards/rejected": -44.749488830566406, "step": 5590 }, { "epoch": 0.76130174291939, "grad_norm": 41.052397099709616, "learning_rate": 1.310221972594384e-07, "logits/chosen": 14.510905265808105, "logits/rejected": 14.095845222473145, "logps/chosen": -4.298779487609863, "logps/rejected": -4.208462715148926, "loss": 4.3462, "rewards/accuracies": 0.75, "rewards/chosen": -42.98779296875, "rewards/margins": -0.9031667709350586, "rewards/rejected": -42.084625244140625, "step": 5591 }, { "epoch": 0.761437908496732, "grad_norm": 43.28110580011345, "learning_rate": 1.3088149579807303e-07, "logits/chosen": 13.86397647857666, "logits/rejected": 13.340740203857422, "logps/chosen": -4.226231098175049, "logps/rejected": -4.207057476043701, "loss": 3.7418, "rewards/accuracies": 0.5, "rewards/chosen": -42.262306213378906, "rewards/margins": -0.19173431396484375, "rewards/rejected": -42.07057189941406, "step": 5592 }, { "epoch": 0.7615740740740741, "grad_norm": 40.5691607529996, "learning_rate": 1.3074085514625837e-07, "logits/chosen": 13.98277473449707, "logits/rejected": 14.330348014831543, "logps/chosen": -4.185193061828613, "logps/rejected": -4.376017093658447, "loss": 3.9129, "rewards/accuracies": 0.75, "rewards/chosen": -41.8519287109375, "rewards/margins": 1.9082393646240234, "rewards/rejected": -43.760169982910156, "step": 5593 }, { "epoch": 0.7617102396514162, "grad_norm": 38.96863364055038, "learning_rate": 1.3060027533577308e-07, "logits/chosen": 14.455591201782227, "logits/rejected": 14.240503311157227, "logps/chosen": -4.785421371459961, "logps/rejected": -4.947352409362793, "loss": 3.6652, "rewards/accuracies": 0.75, "rewards/chosen": -47.854209899902344, "rewards/margins": 1.6193113327026367, "rewards/rejected": -49.4735221862793, "step": 5594 }, { "epoch": 0.7618464052287581, "grad_norm": 39.88099557961048, "learning_rate": 1.3045975639838242e-07, "logits/chosen": 14.573225975036621, "logits/rejected": 14.334527969360352, "logps/chosen": -4.456069469451904, "logps/rejected": -4.670243263244629, "loss": 4.3948, "rewards/accuracies": 1.0, "rewards/chosen": -44.560691833496094, "rewards/margins": 2.141739845275879, "rewards/rejected": -46.70243453979492, "step": 5595 }, { "epoch": 0.7619825708061002, "grad_norm": 36.660920163077066, "learning_rate": 1.3031929836583788e-07, "logits/chosen": 14.814334869384766, "logits/rejected": 14.265743255615234, "logps/chosen": -4.433374881744385, "logps/rejected": -4.642743110656738, "loss": 3.6304, "rewards/accuracies": 0.75, "rewards/chosen": -44.3337516784668, "rewards/margins": 2.0936803817749023, "rewards/rejected": -46.427433013916016, "step": 5596 }, { "epoch": 0.7621187363834423, "grad_norm": 36.2831498796408, "learning_rate": 1.301789012698772e-07, "logits/chosen": 13.154335021972656, "logits/rejected": 14.073078155517578, "logps/chosen": -4.236164093017578, "logps/rejected": -4.711325168609619, "loss": 3.7028, "rewards/accuracies": 1.0, "rewards/chosen": -42.361637115478516, "rewards/margins": 4.751616477966309, "rewards/rejected": -47.11325454711914, "step": 5597 }, { "epoch": 0.7622549019607843, "grad_norm": 40.32138360749457, "learning_rate": 1.3003856514222403e-07, "logits/chosen": 13.936281204223633, "logits/rejected": 14.725857734680176, "logps/chosen": -4.535392761230469, "logps/rejected": -4.8045973777771, "loss": 3.5053, "rewards/accuracies": 0.75, "rewards/chosen": -45.35393142700195, "rewards/margins": 2.692044258117676, "rewards/rejected": -48.04597473144531, "step": 5598 }, { "epoch": 0.7623910675381264, "grad_norm": 49.79056671394359, "learning_rate": 1.298982900145886e-07, "logits/chosen": 14.409370422363281, "logits/rejected": 13.981151580810547, "logps/chosen": -4.683990001678467, "logps/rejected": -4.482827663421631, "loss": 3.8616, "rewards/accuracies": 0.25, "rewards/chosen": -46.839900970458984, "rewards/margins": -2.0116233825683594, "rewards/rejected": -44.828277587890625, "step": 5599 }, { "epoch": 0.7625272331154684, "grad_norm": 41.87314814763456, "learning_rate": 1.297580759186673e-07, "logits/chosen": 13.418587684631348, "logits/rejected": 13.938777923583984, "logps/chosen": -4.241260528564453, "logps/rejected": -4.5701584815979, "loss": 4.4513, "rewards/accuracies": 0.75, "rewards/chosen": -42.41260528564453, "rewards/margins": 3.2889766693115234, "rewards/rejected": -45.70158386230469, "step": 5600 }, { "epoch": 0.7626633986928104, "grad_norm": 40.95906967313701, "learning_rate": 1.296179228861425e-07, "logits/chosen": 14.476632118225098, "logits/rejected": 13.969186782836914, "logps/chosen": -4.470344543457031, "logps/rejected": -4.490347862243652, "loss": 3.9209, "rewards/accuracies": 0.25, "rewards/chosen": -44.70344543457031, "rewards/margins": 0.2000293731689453, "rewards/rejected": -44.90347671508789, "step": 5601 }, { "epoch": 0.7627995642701525, "grad_norm": 42.02997787232447, "learning_rate": 1.29477830948683e-07, "logits/chosen": 14.304803848266602, "logits/rejected": 14.706371307373047, "logps/chosen": -4.689340591430664, "logps/rejected": -5.077398777008057, "loss": 3.522, "rewards/accuracies": 0.75, "rewards/chosen": -46.893402099609375, "rewards/margins": 3.880581855773926, "rewards/rejected": -50.77398681640625, "step": 5602 }, { "epoch": 0.7629357298474946, "grad_norm": 45.83813916752046, "learning_rate": 1.2933780013794374e-07, "logits/chosen": 13.551237106323242, "logits/rejected": 14.108598709106445, "logps/chosen": -4.064940452575684, "logps/rejected": -4.364930152893066, "loss": 4.1188, "rewards/accuracies": 1.0, "rewards/chosen": -40.6494026184082, "rewards/margins": 2.99990177154541, "rewards/rejected": -43.64930725097656, "step": 5603 }, { "epoch": 0.7630718954248366, "grad_norm": 41.89169303586651, "learning_rate": 1.2919783048556604e-07, "logits/chosen": 14.464057922363281, "logits/rejected": 14.581073760986328, "logps/chosen": -4.870028495788574, "logps/rejected": -4.72328519821167, "loss": 3.6482, "rewards/accuracies": 0.5, "rewards/chosen": -48.700286865234375, "rewards/margins": -1.4674367904663086, "rewards/rejected": -47.23284912109375, "step": 5604 }, { "epoch": 0.7632080610021786, "grad_norm": 43.34370746876474, "learning_rate": 1.2905792202317686e-07, "logits/chosen": 14.238763809204102, "logits/rejected": 13.787904739379883, "logps/chosen": -4.400156021118164, "logps/rejected": -4.486821174621582, "loss": 4.1094, "rewards/accuracies": 0.5, "rewards/chosen": -44.001564025878906, "rewards/margins": 0.8666477203369141, "rewards/rejected": -44.86820983886719, "step": 5605 }, { "epoch": 0.7633442265795207, "grad_norm": 42.21141307392907, "learning_rate": 1.2891807478238982e-07, "logits/chosen": 14.361248016357422, "logits/rejected": 14.595046997070312, "logps/chosen": -4.23731803894043, "logps/rejected": -4.704666614532471, "loss": 3.8129, "rewards/accuracies": 1.0, "rewards/chosen": -42.37317657470703, "rewards/margins": 4.673490524291992, "rewards/rejected": -47.04666519165039, "step": 5606 }, { "epoch": 0.7634803921568627, "grad_norm": 41.20381637614528, "learning_rate": 1.287782887948047e-07, "logits/chosen": 13.814693450927734, "logits/rejected": 13.581525802612305, "logps/chosen": -4.265946865081787, "logps/rejected": -4.474427223205566, "loss": 4.3666, "rewards/accuracies": 0.75, "rewards/chosen": -42.65946960449219, "rewards/margins": 2.0848045349121094, "rewards/rejected": -44.7442741394043, "step": 5607 }, { "epoch": 0.7636165577342048, "grad_norm": 40.8792823807763, "learning_rate": 1.2863856409200707e-07, "logits/chosen": 14.038164138793945, "logits/rejected": 14.229936599731445, "logps/chosen": -4.5653204917907715, "logps/rejected": -4.57739782333374, "loss": 3.8489, "rewards/accuracies": 0.75, "rewards/chosen": -45.65320587158203, "rewards/margins": 0.12077617645263672, "rewards/rejected": -45.77397918701172, "step": 5608 }, { "epoch": 0.7637527233115469, "grad_norm": 40.10715923586805, "learning_rate": 1.2849890070556897e-07, "logits/chosen": 14.647672653198242, "logits/rejected": 15.044269561767578, "logps/chosen": -4.758636951446533, "logps/rejected": -4.8868303298950195, "loss": 4.0211, "rewards/accuracies": 0.75, "rewards/chosen": -47.58637237548828, "rewards/margins": 1.281930923461914, "rewards/rejected": -48.86830139160156, "step": 5609 }, { "epoch": 0.7638888888888888, "grad_norm": 41.34980068681793, "learning_rate": 1.2835929866704862e-07, "logits/chosen": 13.408557891845703, "logits/rejected": 14.413864135742188, "logps/chosen": -4.401411056518555, "logps/rejected": -4.732456207275391, "loss": 4.167, "rewards/accuracies": 0.75, "rewards/chosen": -44.01410675048828, "rewards/margins": 3.310455322265625, "rewards/rejected": -47.324562072753906, "step": 5610 }, { "epoch": 0.7640250544662309, "grad_norm": 39.38945868623215, "learning_rate": 1.2821975800799e-07, "logits/chosen": 14.143041610717773, "logits/rejected": 14.042216300964355, "logps/chosen": -4.186159610748291, "logps/rejected": -4.11129903793335, "loss": 4.2021, "rewards/accuracies": 0.5, "rewards/chosen": -41.86159896850586, "rewards/margins": -0.7486085891723633, "rewards/rejected": -41.11298751831055, "step": 5611 }, { "epoch": 0.764161220043573, "grad_norm": 42.64939248419397, "learning_rate": 1.2808027875992365e-07, "logits/chosen": 14.10827922821045, "logits/rejected": 14.189544677734375, "logps/chosen": -4.66845703125, "logps/rejected": -4.539208889007568, "loss": 4.1935, "rewards/accuracies": 0.25, "rewards/chosen": -46.6845703125, "rewards/margins": -1.2924814224243164, "rewards/rejected": -45.39208984375, "step": 5612 }, { "epoch": 0.764297385620915, "grad_norm": 48.462941994399515, "learning_rate": 1.279408609543661e-07, "logits/chosen": 13.874241828918457, "logits/rejected": 14.381914138793945, "logps/chosen": -4.354961395263672, "logps/rejected": -4.659631252288818, "loss": 3.8827, "rewards/accuracies": 0.75, "rewards/chosen": -43.549617767333984, "rewards/margins": 3.046696662902832, "rewards/rejected": -46.5963134765625, "step": 5613 }, { "epoch": 0.7644335511982571, "grad_norm": 39.12767911553224, "learning_rate": 1.278015046228198e-07, "logits/chosen": 14.378304481506348, "logits/rejected": 14.62944221496582, "logps/chosen": -4.860635280609131, "logps/rejected": -5.094488143920898, "loss": 3.859, "rewards/accuracies": 0.75, "rewards/chosen": -48.606353759765625, "rewards/margins": 2.338526725769043, "rewards/rejected": -50.944881439208984, "step": 5614 }, { "epoch": 0.7645697167755992, "grad_norm": 47.372406572259784, "learning_rate": 1.2766220979677354e-07, "logits/chosen": 13.343940734863281, "logits/rejected": 13.308979034423828, "logps/chosen": -4.063128471374512, "logps/rejected": -4.108930587768555, "loss": 3.7516, "rewards/accuracies": 0.5, "rewards/chosen": -40.63128662109375, "rewards/margins": 0.45801734924316406, "rewards/rejected": -41.08930206298828, "step": 5615 }, { "epoch": 0.7647058823529411, "grad_norm": 39.52514177719193, "learning_rate": 1.2752297650770225e-07, "logits/chosen": 14.127867698669434, "logits/rejected": 14.14223861694336, "logps/chosen": -4.632158279418945, "logps/rejected": -4.6634368896484375, "loss": 3.5897, "rewards/accuracies": 0.5, "rewards/chosen": -46.32157897949219, "rewards/margins": 0.31278324127197266, "rewards/rejected": -46.634361267089844, "step": 5616 }, { "epoch": 0.7648420479302832, "grad_norm": 39.70858967852571, "learning_rate": 1.2738380478706662e-07, "logits/chosen": 13.74879264831543, "logits/rejected": 14.761741638183594, "logps/chosen": -4.162672996520996, "logps/rejected": -4.623592376708984, "loss": 3.6304, "rewards/accuracies": 1.0, "rewards/chosen": -41.62672805786133, "rewards/margins": 4.609193801879883, "rewards/rejected": -46.235923767089844, "step": 5617 }, { "epoch": 0.7649782135076253, "grad_norm": 40.015630305671436, "learning_rate": 1.2724469466631376e-07, "logits/chosen": 14.933820724487305, "logits/rejected": 14.585860252380371, "logps/chosen": -4.697493553161621, "logps/rejected": -4.7899980545043945, "loss": 4.0991, "rewards/accuracies": 0.5, "rewards/chosen": -46.974937438964844, "rewards/margins": 0.9250450134277344, "rewards/rejected": -47.89998245239258, "step": 5618 }, { "epoch": 0.7651143790849673, "grad_norm": 39.91467827010586, "learning_rate": 1.271056461768769e-07, "logits/chosen": 13.283475875854492, "logits/rejected": 14.35708999633789, "logps/chosen": -4.250293731689453, "logps/rejected": -4.626067161560059, "loss": 4.0505, "rewards/accuracies": 1.0, "rewards/chosen": -42.5029411315918, "rewards/margins": 3.7577285766601562, "rewards/rejected": -46.26066970825195, "step": 5619 }, { "epoch": 0.7652505446623094, "grad_norm": 41.44428694058491, "learning_rate": 1.2696665935017494e-07, "logits/chosen": 14.911229133605957, "logits/rejected": 14.985544204711914, "logps/chosen": -4.411308288574219, "logps/rejected": -4.6550612449646, "loss": 4.1979, "rewards/accuracies": 1.0, "rewards/chosen": -44.11307907104492, "rewards/margins": 2.437533378601074, "rewards/rejected": -46.55061340332031, "step": 5620 }, { "epoch": 0.7653867102396514, "grad_norm": 48.260685119442215, "learning_rate": 1.2682773421761316e-07, "logits/chosen": 13.120269775390625, "logits/rejected": 13.295063018798828, "logps/chosen": -4.029850006103516, "logps/rejected": -4.241095542907715, "loss": 4.1475, "rewards/accuracies": 0.75, "rewards/chosen": -40.298500061035156, "rewards/margins": 2.112452507019043, "rewards/rejected": -42.41094970703125, "step": 5621 }, { "epoch": 0.7655228758169934, "grad_norm": 41.72861379658854, "learning_rate": 1.2668887081058312e-07, "logits/chosen": 13.645746231079102, "logits/rejected": 13.925283432006836, "logps/chosen": -4.259903907775879, "logps/rejected": -4.428030967712402, "loss": 3.9833, "rewards/accuracies": 0.75, "rewards/chosen": -42.59904479980469, "rewards/margins": 1.6812677383422852, "rewards/rejected": -44.280311584472656, "step": 5622 }, { "epoch": 0.7656590413943355, "grad_norm": 40.99401891370311, "learning_rate": 1.2655006916046173e-07, "logits/chosen": 13.30444622039795, "logits/rejected": 14.140836715698242, "logps/chosen": -4.249845504760742, "logps/rejected": -4.689548492431641, "loss": 4.0775, "rewards/accuracies": 1.0, "rewards/chosen": -42.49845886230469, "rewards/margins": 4.397025108337402, "rewards/rejected": -46.895484924316406, "step": 5623 }, { "epoch": 0.7657952069716776, "grad_norm": 39.98590196867821, "learning_rate": 1.264113292986126e-07, "logits/chosen": 13.926485061645508, "logits/rejected": 14.23373031616211, "logps/chosen": -4.5238847732543945, "logps/rejected": -4.769758224487305, "loss": 3.6594, "rewards/accuracies": 0.75, "rewards/chosen": -45.23884963989258, "rewards/margins": 2.4587345123291016, "rewards/rejected": -47.69758224487305, "step": 5624 }, { "epoch": 0.7659313725490197, "grad_norm": 52.71898433572521, "learning_rate": 1.2627265125638524e-07, "logits/chosen": 14.525175094604492, "logits/rejected": 14.406707763671875, "logps/chosen": -4.60606575012207, "logps/rejected": -4.6883344650268555, "loss": 3.6889, "rewards/accuracies": 0.75, "rewards/chosen": -46.06065368652344, "rewards/margins": 0.8226900100708008, "rewards/rejected": -46.88334274291992, "step": 5625 }, { "epoch": 0.7660675381263616, "grad_norm": 41.70112862011298, "learning_rate": 1.2613403506511482e-07, "logits/chosen": 13.084614753723145, "logits/rejected": 13.644088745117188, "logps/chosen": -4.2111334800720215, "logps/rejected": -4.637945652008057, "loss": 3.5973, "rewards/accuracies": 0.75, "rewards/chosen": -42.11133575439453, "rewards/margins": 4.268121719360352, "rewards/rejected": -46.37945556640625, "step": 5626 }, { "epoch": 0.7662037037037037, "grad_norm": 42.191969692581324, "learning_rate": 1.2599548075612302e-07, "logits/chosen": 13.683023452758789, "logits/rejected": 13.786699295043945, "logps/chosen": -4.387679576873779, "logps/rejected": -4.343453407287598, "loss": 4.1471, "rewards/accuracies": 0.5, "rewards/chosen": -43.876792907714844, "rewards/margins": -0.4422616958618164, "rewards/rejected": -43.434532165527344, "step": 5627 }, { "epoch": 0.7663398692810458, "grad_norm": 40.467253595601484, "learning_rate": 1.2585698836071742e-07, "logits/chosen": 13.699423789978027, "logits/rejected": 13.993692398071289, "logps/chosen": -4.410794258117676, "logps/rejected": -4.670768737792969, "loss": 3.4213, "rewards/accuracies": 0.75, "rewards/chosen": -44.10794448852539, "rewards/margins": 2.599742889404297, "rewards/rejected": -46.70768737792969, "step": 5628 }, { "epoch": 0.7664760348583878, "grad_norm": 42.08836663156898, "learning_rate": 1.2571855791019124e-07, "logits/chosen": 14.85936164855957, "logits/rejected": 13.95050048828125, "logps/chosen": -4.444587230682373, "logps/rejected": -4.466065406799316, "loss": 3.7799, "rewards/accuracies": 0.5, "rewards/chosen": -44.44587326049805, "rewards/margins": 0.2147808074951172, "rewards/rejected": -44.66065216064453, "step": 5629 }, { "epoch": 0.7666122004357299, "grad_norm": 40.81583652363125, "learning_rate": 1.2558018943582417e-07, "logits/chosen": 13.613626480102539, "logits/rejected": 14.468994140625, "logps/chosen": -4.235846519470215, "logps/rejected": -4.581851959228516, "loss": 3.7274, "rewards/accuracies": 0.75, "rewards/chosen": -42.35846710205078, "rewards/margins": 3.460049629211426, "rewards/rejected": -45.81851577758789, "step": 5630 }, { "epoch": 0.766748366013072, "grad_norm": 42.87955145380509, "learning_rate": 1.2544188296888175e-07, "logits/chosen": 14.328386306762695, "logits/rejected": 15.28891658782959, "logps/chosen": -4.580227375030518, "logps/rejected": -4.883717060089111, "loss": 3.6542, "rewards/accuracies": 1.0, "rewards/chosen": -45.80227279663086, "rewards/margins": 3.0348987579345703, "rewards/rejected": -48.83717346191406, "step": 5631 }, { "epoch": 0.7668845315904139, "grad_norm": 36.329166107772686, "learning_rate": 1.253036385406153e-07, "logits/chosen": 13.555564880371094, "logits/rejected": 14.931228637695312, "logps/chosen": -4.360238075256348, "logps/rejected": -4.907019138336182, "loss": 3.8271, "rewards/accuracies": 0.75, "rewards/chosen": -43.602378845214844, "rewards/margins": 5.467812538146973, "rewards/rejected": -49.0701904296875, "step": 5632 }, { "epoch": 0.767020697167756, "grad_norm": 43.17491038744796, "learning_rate": 1.2516545618226236e-07, "logits/chosen": 14.03497314453125, "logits/rejected": 13.989219665527344, "logps/chosen": -4.7449116706848145, "logps/rejected": -4.713796138763428, "loss": 4.1299, "rewards/accuracies": 0.5, "rewards/chosen": -47.449119567871094, "rewards/margins": -0.3111581802368164, "rewards/rejected": -47.137962341308594, "step": 5633 }, { "epoch": 0.7671568627450981, "grad_norm": 42.05017613665766, "learning_rate": 1.2502733592504658e-07, "logits/chosen": 15.06230354309082, "logits/rejected": 14.667583465576172, "logps/chosen": -4.752456188201904, "logps/rejected": -4.710753440856934, "loss": 4.4382, "rewards/accuracies": 0.25, "rewards/chosen": -47.524559020996094, "rewards/margins": -0.4170246124267578, "rewards/rejected": -47.10753631591797, "step": 5634 }, { "epoch": 0.7672930283224401, "grad_norm": 44.918068082807956, "learning_rate": 1.2488927780017702e-07, "logits/chosen": 14.98669719696045, "logits/rejected": 14.175054550170898, "logps/chosen": -4.774992942810059, "logps/rejected": -4.533790588378906, "loss": 4.3556, "rewards/accuracies": 0.0, "rewards/chosen": -47.74993133544922, "rewards/margins": -2.412022590637207, "rewards/rejected": -45.33790588378906, "step": 5635 }, { "epoch": 0.7674291938997821, "grad_norm": 42.275207205285945, "learning_rate": 1.2475128183884925e-07, "logits/chosen": 14.255631446838379, "logits/rejected": 14.339729309082031, "logps/chosen": -4.377425193786621, "logps/rejected": -4.802002906799316, "loss": 3.7915, "rewards/accuracies": 0.75, "rewards/chosen": -43.774253845214844, "rewards/margins": 4.24577522277832, "rewards/rejected": -48.02002716064453, "step": 5636 }, { "epoch": 0.7675653594771242, "grad_norm": 37.7390977971578, "learning_rate": 1.2461334807224466e-07, "logits/chosen": 14.019834518432617, "logits/rejected": 14.393976211547852, "logps/chosen": -4.457916736602783, "logps/rejected": -4.611820697784424, "loss": 3.4157, "rewards/accuracies": 0.5, "rewards/chosen": -44.57917022705078, "rewards/margins": 1.5390377044677734, "rewards/rejected": -46.11820983886719, "step": 5637 }, { "epoch": 0.7677015250544662, "grad_norm": 43.542966529192036, "learning_rate": 1.2447547653153034e-07, "logits/chosen": 13.909200668334961, "logits/rejected": 14.384763717651367, "logps/chosen": -4.4299468994140625, "logps/rejected": -4.636326789855957, "loss": 3.5428, "rewards/accuracies": 0.75, "rewards/chosen": -44.299468994140625, "rewards/margins": 2.063800811767578, "rewards/rejected": -46.36326599121094, "step": 5638 }, { "epoch": 0.7678376906318083, "grad_norm": 44.79388211561875, "learning_rate": 1.2433766724785962e-07, "logits/chosen": 13.866828918457031, "logits/rejected": 13.759068489074707, "logps/chosen": -4.211003303527832, "logps/rejected": -4.58066987991333, "loss": 4.3916, "rewards/accuracies": 1.0, "rewards/chosen": -42.11003494262695, "rewards/margins": 3.696664810180664, "rewards/rejected": -45.80670166015625, "step": 5639 }, { "epoch": 0.7679738562091504, "grad_norm": 43.03321200337896, "learning_rate": 1.2419992025237176e-07, "logits/chosen": 13.93881607055664, "logits/rejected": 12.965377807617188, "logps/chosen": -4.16544771194458, "logps/rejected": -4.118755340576172, "loss": 4.3299, "rewards/accuracies": 0.5, "rewards/chosen": -41.654476165771484, "rewards/margins": -0.4669179916381836, "rewards/rejected": -41.18756103515625, "step": 5640 }, { "epoch": 0.7681100217864923, "grad_norm": 44.83439828233508, "learning_rate": 1.240622355761916e-07, "logits/chosen": 13.320943832397461, "logits/rejected": 13.935966491699219, "logps/chosen": -4.310884475708008, "logps/rejected": -4.444409370422363, "loss": 3.9277, "rewards/accuracies": 0.75, "rewards/chosen": -43.10884094238281, "rewards/margins": 1.3352489471435547, "rewards/rejected": -44.444091796875, "step": 5641 }, { "epoch": 0.7682461873638344, "grad_norm": 47.18416558584605, "learning_rate": 1.2392461325043018e-07, "logits/chosen": 13.576600074768066, "logits/rejected": 13.645232200622559, "logps/chosen": -4.34440803527832, "logps/rejected": -4.230434417724609, "loss": 4.3332, "rewards/accuracies": 0.5, "rewards/chosen": -43.44407653808594, "rewards/margins": -1.1397342681884766, "rewards/rejected": -42.304344177246094, "step": 5642 }, { "epoch": 0.7683823529411765, "grad_norm": 44.126048248306404, "learning_rate": 1.2378705330618463e-07, "logits/chosen": 14.846391677856445, "logits/rejected": 14.636514663696289, "logps/chosen": -4.337986946105957, "logps/rejected": -4.416732311248779, "loss": 4.5937, "rewards/accuracies": 0.75, "rewards/chosen": -43.37986755371094, "rewards/margins": 0.7874565124511719, "rewards/rejected": -44.167327880859375, "step": 5643 }, { "epoch": 0.7685185185185185, "grad_norm": 40.12915844750821, "learning_rate": 1.2364955577453743e-07, "logits/chosen": 13.864561080932617, "logits/rejected": 13.899205207824707, "logps/chosen": -4.161460876464844, "logps/rejected": -4.2997846603393555, "loss": 4.0423, "rewards/accuracies": 0.5, "rewards/chosen": -41.61460876464844, "rewards/margins": 1.3832340240478516, "rewards/rejected": -42.997840881347656, "step": 5644 }, { "epoch": 0.7686546840958606, "grad_norm": 40.63784260949148, "learning_rate": 1.2351212068655749e-07, "logits/chosen": 14.647042274475098, "logits/rejected": 14.49615478515625, "logps/chosen": -4.3567352294921875, "logps/rejected": -4.543537139892578, "loss": 4.0464, "rewards/accuracies": 0.75, "rewards/chosen": -43.567352294921875, "rewards/margins": 1.8680181503295898, "rewards/rejected": -45.43537139892578, "step": 5645 }, { "epoch": 0.7687908496732027, "grad_norm": 45.23087192940981, "learning_rate": 1.2337474807329944e-07, "logits/chosen": 14.105939865112305, "logits/rejected": 14.176532745361328, "logps/chosen": -4.404862403869629, "logps/rejected": -4.541579246520996, "loss": 3.8676, "rewards/accuracies": 0.5, "rewards/chosen": -44.04862594604492, "rewards/margins": 1.3671684265136719, "rewards/rejected": -45.415794372558594, "step": 5646 }, { "epoch": 0.7689270152505446, "grad_norm": 42.3323426111562, "learning_rate": 1.2323743796580351e-07, "logits/chosen": 14.404902458190918, "logits/rejected": 14.473363876342773, "logps/chosen": -4.422205924987793, "logps/rejected": -4.628695487976074, "loss": 3.857, "rewards/accuracies": 0.75, "rewards/chosen": -44.22206497192383, "rewards/margins": 2.064891815185547, "rewards/rejected": -46.286956787109375, "step": 5647 }, { "epoch": 0.7690631808278867, "grad_norm": 39.985570063030444, "learning_rate": 1.2310019039509628e-07, "logits/chosen": 14.052427291870117, "logits/rejected": 14.151311874389648, "logps/chosen": -4.228884696960449, "logps/rejected": -4.365232467651367, "loss": 3.5259, "rewards/accuracies": 0.75, "rewards/chosen": -42.28884506225586, "rewards/margins": 1.3634777069091797, "rewards/rejected": -43.652320861816406, "step": 5648 }, { "epoch": 0.7691993464052288, "grad_norm": 41.31078853886153, "learning_rate": 1.2296300539219e-07, "logits/chosen": 13.90641975402832, "logits/rejected": 14.380120277404785, "logps/chosen": -4.321135520935059, "logps/rejected": -4.611459732055664, "loss": 4.1272, "rewards/accuracies": 0.5, "rewards/chosen": -43.21135711669922, "rewards/margins": 2.9032421112060547, "rewards/rejected": -46.114601135253906, "step": 5649 }, { "epoch": 0.7693355119825708, "grad_norm": 41.248214173799084, "learning_rate": 1.2282588298808255e-07, "logits/chosen": 14.30303955078125, "logits/rejected": 14.654942512512207, "logps/chosen": -4.227038860321045, "logps/rejected": -4.524433612823486, "loss": 4.0819, "rewards/accuracies": 1.0, "rewards/chosen": -42.270389556884766, "rewards/margins": 2.9739465713500977, "rewards/rejected": -45.24433135986328, "step": 5650 }, { "epoch": 0.7694716775599129, "grad_norm": 46.59135908157513, "learning_rate": 1.2268882321375796e-07, "logits/chosen": 14.145143508911133, "logits/rejected": 14.2822904586792, "logps/chosen": -4.100475311279297, "logps/rejected": -4.458850383758545, "loss": 3.9425, "rewards/accuracies": 1.0, "rewards/chosen": -41.00475311279297, "rewards/margins": 3.583751678466797, "rewards/rejected": -44.5885009765625, "step": 5651 }, { "epoch": 0.7696078431372549, "grad_norm": 42.395520350028356, "learning_rate": 1.2255182610018619e-07, "logits/chosen": 13.386468887329102, "logits/rejected": 12.59324836730957, "logps/chosen": -4.026891708374023, "logps/rejected": -3.982208728790283, "loss": 3.8519, "rewards/accuracies": 0.5, "rewards/chosen": -40.26891326904297, "rewards/margins": -0.4468269348144531, "rewards/rejected": -39.82209014892578, "step": 5652 }, { "epoch": 0.7697440087145969, "grad_norm": 47.24011993996782, "learning_rate": 1.2241489167832257e-07, "logits/chosen": 14.385832786560059, "logits/rejected": 14.886117935180664, "logps/chosen": -4.506251335144043, "logps/rejected": -4.767737865447998, "loss": 3.8434, "rewards/accuracies": 0.75, "rewards/chosen": -45.0625114440918, "rewards/margins": 2.6148672103881836, "rewards/rejected": -47.67737579345703, "step": 5653 }, { "epoch": 0.769880174291939, "grad_norm": 39.77831361566817, "learning_rate": 1.2227801997910872e-07, "logits/chosen": 14.240499496459961, "logits/rejected": 15.235664367675781, "logps/chosen": -4.798697471618652, "logps/rejected": -4.907192230224609, "loss": 3.7877, "rewards/accuracies": 0.5, "rewards/chosen": -47.98697280883789, "rewards/margins": 1.0849456787109375, "rewards/rejected": -49.071922302246094, "step": 5654 }, { "epoch": 0.7700163398692811, "grad_norm": 39.64904100067631, "learning_rate": 1.2214121103347213e-07, "logits/chosen": 13.851646423339844, "logits/rejected": 13.616968154907227, "logps/chosen": -4.32585334777832, "logps/rejected": -4.288924217224121, "loss": 3.6263, "rewards/accuracies": 0.75, "rewards/chosen": -43.25852966308594, "rewards/margins": -0.3692922592163086, "rewards/rejected": -42.88924026489258, "step": 5655 }, { "epoch": 0.7701525054466231, "grad_norm": 39.9438529743761, "learning_rate": 1.220044648723255e-07, "logits/chosen": 14.082311630249023, "logits/rejected": 13.708826065063477, "logps/chosen": -4.361999988555908, "logps/rejected": -4.302597999572754, "loss": 4.0675, "rewards/accuracies": 0.5, "rewards/chosen": -43.619998931884766, "rewards/margins": -0.594019889831543, "rewards/rejected": -43.025978088378906, "step": 5656 }, { "epoch": 0.7702886710239651, "grad_norm": 37.6369585767073, "learning_rate": 1.2186778152656797e-07, "logits/chosen": 13.525115966796875, "logits/rejected": 14.271722793579102, "logps/chosen": -4.201494216918945, "logps/rejected": -4.524020195007324, "loss": 3.5007, "rewards/accuracies": 0.75, "rewards/chosen": -42.01494598388672, "rewards/margins": 3.2252578735351562, "rewards/rejected": -45.240203857421875, "step": 5657 }, { "epoch": 0.7704248366013072, "grad_norm": 80.13292250269295, "learning_rate": 1.2173116102708446e-07, "logits/chosen": 14.628410339355469, "logits/rejected": 15.399127960205078, "logps/chosen": -4.837557315826416, "logps/rejected": -4.795261383056641, "loss": 4.1767, "rewards/accuracies": 0.5, "rewards/chosen": -48.375572204589844, "rewards/margins": -0.4229612350463867, "rewards/rejected": -47.952613830566406, "step": 5658 }, { "epoch": 0.7705610021786492, "grad_norm": 44.70332132717878, "learning_rate": 1.2159460340474513e-07, "logits/chosen": 14.783337593078613, "logits/rejected": 15.0413818359375, "logps/chosen": -4.8753557205200195, "logps/rejected": -4.872748374938965, "loss": 4.4569, "rewards/accuracies": 0.75, "rewards/chosen": -48.75355529785156, "rewards/margins": -0.02607440948486328, "rewards/rejected": -48.727481842041016, "step": 5659 }, { "epoch": 0.7706971677559913, "grad_norm": 40.436244602500274, "learning_rate": 1.2145810869040652e-07, "logits/chosen": 14.525466918945312, "logits/rejected": 13.980846405029297, "logps/chosen": -4.589019298553467, "logps/rejected": -4.707140922546387, "loss": 3.6134, "rewards/accuracies": 0.5, "rewards/chosen": -45.89019012451172, "rewards/margins": 1.1812152862548828, "rewards/rejected": -47.07140350341797, "step": 5660 }, { "epoch": 0.7708333333333334, "grad_norm": 41.71165179994488, "learning_rate": 1.213216769149108e-07, "logits/chosen": 14.508601188659668, "logits/rejected": 14.987470626831055, "logps/chosen": -4.6747307777404785, "logps/rejected": -4.735253810882568, "loss": 3.4399, "rewards/accuracies": 0.5, "rewards/chosen": -46.747314453125, "rewards/margins": 0.6052265167236328, "rewards/rejected": -47.3525390625, "step": 5661 }, { "epoch": 0.7709694989106753, "grad_norm": 39.969565821870944, "learning_rate": 1.2118530810908563e-07, "logits/chosen": 13.906371116638184, "logits/rejected": 14.615489959716797, "logps/chosen": -4.508868217468262, "logps/rejected": -4.7875075340271, "loss": 3.6753, "rewards/accuracies": 0.75, "rewards/chosen": -45.08868408203125, "rewards/margins": 2.7863903045654297, "rewards/rejected": -47.87507247924805, "step": 5662 }, { "epoch": 0.7711056644880174, "grad_norm": 39.61552602543267, "learning_rate": 1.210490023037448e-07, "logits/chosen": 14.218585968017578, "logits/rejected": 14.380573272705078, "logps/chosen": -4.395339488983154, "logps/rejected": -4.4905290603637695, "loss": 3.7876, "rewards/accuracies": 0.5, "rewards/chosen": -43.95339584350586, "rewards/margins": 0.9518985748291016, "rewards/rejected": -44.905296325683594, "step": 5663 }, { "epoch": 0.7712418300653595, "grad_norm": 116.6629793640908, "learning_rate": 1.2091275952968784e-07, "logits/chosen": 14.515998840332031, "logits/rejected": 14.42518424987793, "logps/chosen": -4.499455451965332, "logps/rejected": -4.4135308265686035, "loss": 3.8684, "rewards/accuracies": 0.5, "rewards/chosen": -44.99455261230469, "rewards/margins": -0.8592453002929688, "rewards/rejected": -44.13530731201172, "step": 5664 }, { "epoch": 0.7713779956427015, "grad_norm": 43.21633325352946, "learning_rate": 1.207765798176997e-07, "logits/chosen": 14.325922012329102, "logits/rejected": 14.620697021484375, "logps/chosen": -4.544427394866943, "logps/rejected": -4.50360107421875, "loss": 3.9514, "rewards/accuracies": 0.5, "rewards/chosen": -45.44427490234375, "rewards/margins": -0.4082603454589844, "rewards/rejected": -45.036014556884766, "step": 5665 }, { "epoch": 0.7715141612200436, "grad_norm": 37.726779051645586, "learning_rate": 1.206404631985515e-07, "logits/chosen": 13.506645202636719, "logits/rejected": 13.399307250976562, "logps/chosen": -4.211953163146973, "logps/rejected": -4.322627544403076, "loss": 3.5572, "rewards/accuracies": 0.25, "rewards/chosen": -42.119529724121094, "rewards/margins": 1.1067476272583008, "rewards/rejected": -43.22627639770508, "step": 5666 }, { "epoch": 0.7716503267973857, "grad_norm": 35.59369705279676, "learning_rate": 1.205044097029999e-07, "logits/chosen": 13.136253356933594, "logits/rejected": 13.6572265625, "logps/chosen": -4.290779113769531, "logps/rejected": -4.5101118087768555, "loss": 3.8496, "rewards/accuracies": 0.75, "rewards/chosen": -42.90779113769531, "rewards/margins": 2.1933250427246094, "rewards/rejected": -45.10111999511719, "step": 5667 }, { "epoch": 0.7717864923747276, "grad_norm": 39.63158788141444, "learning_rate": 1.203684193617872e-07, "logits/chosen": 13.108474731445312, "logits/rejected": 14.367462158203125, "logps/chosen": -4.317785739898682, "logps/rejected": -4.6940436363220215, "loss": 4.1222, "rewards/accuracies": 0.75, "rewards/chosen": -43.1778564453125, "rewards/margins": 3.7625789642333984, "rewards/rejected": -46.94043731689453, "step": 5668 }, { "epoch": 0.7719226579520697, "grad_norm": 39.33143407574819, "learning_rate": 1.2023249220564155e-07, "logits/chosen": 15.305675506591797, "logits/rejected": 14.690610885620117, "logps/chosen": -4.86954402923584, "logps/rejected": -4.631196022033691, "loss": 3.9634, "rewards/accuracies": 0.5, "rewards/chosen": -48.69544219970703, "rewards/margins": -2.383481025695801, "rewards/rejected": -46.31195831298828, "step": 5669 }, { "epoch": 0.7720588235294118, "grad_norm": 44.59869079783898, "learning_rate": 1.2009662826527703e-07, "logits/chosen": 14.183016777038574, "logits/rejected": 14.270567893981934, "logps/chosen": -4.433130741119385, "logps/rejected": -4.543577194213867, "loss": 3.8905, "rewards/accuracies": 0.25, "rewards/chosen": -44.3313102722168, "rewards/margins": 1.1044645309448242, "rewards/rejected": -45.43577194213867, "step": 5670 }, { "epoch": 0.7721949891067538, "grad_norm": 38.22093773891881, "learning_rate": 1.199608275713929e-07, "logits/chosen": 15.263174057006836, "logits/rejected": 14.78154182434082, "logps/chosen": -4.512564659118652, "logps/rejected": -4.741361141204834, "loss": 3.8926, "rewards/accuracies": 0.75, "rewards/chosen": -45.125648498535156, "rewards/margins": 2.2879676818847656, "rewards/rejected": -47.41361618041992, "step": 5671 }, { "epoch": 0.7723311546840959, "grad_norm": 43.93849470748342, "learning_rate": 1.1982509015467458e-07, "logits/chosen": 13.9170503616333, "logits/rejected": 14.886747360229492, "logps/chosen": -4.482402801513672, "logps/rejected": -4.8372039794921875, "loss": 3.7915, "rewards/accuracies": 0.75, "rewards/chosen": -44.82403564453125, "rewards/margins": 3.548007011413574, "rewards/rejected": -48.372039794921875, "step": 5672 }, { "epoch": 0.7724673202614379, "grad_norm": 41.181487924293755, "learning_rate": 1.196894160457933e-07, "logits/chosen": 14.331244468688965, "logits/rejected": 13.6880464553833, "logps/chosen": -4.549295425415039, "logps/rejected": -4.597367286682129, "loss": 3.9788, "rewards/accuracies": 0.25, "rewards/chosen": -45.49295425415039, "rewards/margins": 0.48071956634521484, "rewards/rejected": -45.973670959472656, "step": 5673 }, { "epoch": 0.7726034858387799, "grad_norm": 46.4819873393638, "learning_rate": 1.1955380527540535e-07, "logits/chosen": 13.906778335571289, "logits/rejected": 14.764759063720703, "logps/chosen": -4.217162132263184, "logps/rejected": -4.932872772216797, "loss": 4.49, "rewards/accuracies": 1.0, "rewards/chosen": -42.17161560058594, "rewards/margins": 7.157110214233398, "rewards/rejected": -49.32872772216797, "step": 5674 }, { "epoch": 0.772739651416122, "grad_norm": 42.09880123316299, "learning_rate": 1.194182578741533e-07, "logits/chosen": 13.995290756225586, "logits/rejected": 14.673079490661621, "logps/chosen": -4.140018939971924, "logps/rejected": -4.561016082763672, "loss": 3.8314, "rewards/accuracies": 1.0, "rewards/chosen": -41.40018844604492, "rewards/margins": 4.209973335266113, "rewards/rejected": -45.61016082763672, "step": 5675 }, { "epoch": 0.7728758169934641, "grad_norm": 65.2847689946128, "learning_rate": 1.1928277387266535e-07, "logits/chosen": 14.333638191223145, "logits/rejected": 14.222858428955078, "logps/chosen": -4.457125663757324, "logps/rejected": -4.441315650939941, "loss": 4.261, "rewards/accuracies": 0.5, "rewards/chosen": -44.571258544921875, "rewards/margins": -0.15810489654541016, "rewards/rejected": -44.41315460205078, "step": 5676 }, { "epoch": 0.773011982570806, "grad_norm": 41.978958959456484, "learning_rate": 1.1914735330155492e-07, "logits/chosen": 13.961217880249023, "logits/rejected": 14.740087509155273, "logps/chosen": -4.3932342529296875, "logps/rejected": -4.547213554382324, "loss": 4.1325, "rewards/accuracies": 0.5, "rewards/chosen": -43.932342529296875, "rewards/margins": 1.5397958755493164, "rewards/rejected": -45.472137451171875, "step": 5677 }, { "epoch": 0.7731481481481481, "grad_norm": 40.44980436859049, "learning_rate": 1.1901199619142155e-07, "logits/chosen": 14.283344268798828, "logits/rejected": 14.21382999420166, "logps/chosen": -4.695973873138428, "logps/rejected": -4.842920780181885, "loss": 4.0222, "rewards/accuracies": 0.5, "rewards/chosen": -46.959739685058594, "rewards/margins": 1.4694671630859375, "rewards/rejected": -48.42920684814453, "step": 5678 }, { "epoch": 0.7732843137254902, "grad_norm": 43.150082587230465, "learning_rate": 1.1887670257285045e-07, "logits/chosen": 13.719717025756836, "logits/rejected": 13.836639404296875, "logps/chosen": -4.342741012573242, "logps/rejected": -4.3821702003479, "loss": 4.0646, "rewards/accuracies": 0.5, "rewards/chosen": -43.42741012573242, "rewards/margins": 0.3942909240722656, "rewards/rejected": -43.82170104980469, "step": 5679 }, { "epoch": 0.7734204793028322, "grad_norm": 38.926231724526794, "learning_rate": 1.187414724764121e-07, "logits/chosen": 13.412907600402832, "logits/rejected": 14.41122055053711, "logps/chosen": -4.176108360290527, "logps/rejected": -4.576488971710205, "loss": 3.9897, "rewards/accuracies": 0.75, "rewards/chosen": -41.76108169555664, "rewards/margins": 4.003808975219727, "rewards/rejected": -45.764892578125, "step": 5680 }, { "epoch": 0.7735566448801743, "grad_norm": 43.39334809649282, "learning_rate": 1.1860630593266291e-07, "logits/chosen": 13.849331855773926, "logits/rejected": 15.380186080932617, "logps/chosen": -4.355501174926758, "logps/rejected": -4.551612377166748, "loss": 3.7282, "rewards/accuracies": 0.5, "rewards/chosen": -43.55501174926758, "rewards/margins": 1.9611148834228516, "rewards/rejected": -45.5161247253418, "step": 5681 }, { "epoch": 0.7736928104575164, "grad_norm": 43.079066649383705, "learning_rate": 1.1847120297214508e-07, "logits/chosen": 13.546074867248535, "logits/rejected": 14.333030700683594, "logps/chosen": -4.062237739562988, "logps/rejected": -4.470753192901611, "loss": 3.9654, "rewards/accuracies": 0.75, "rewards/chosen": -40.62237548828125, "rewards/margins": 4.0851545333862305, "rewards/rejected": -44.7075309753418, "step": 5682 }, { "epoch": 0.7738289760348583, "grad_norm": 40.56473211411172, "learning_rate": 1.183361636253859e-07, "logits/chosen": 14.05689811706543, "logits/rejected": 14.946590423583984, "logps/chosen": -4.263110160827637, "logps/rejected": -4.397441387176514, "loss": 3.9896, "rewards/accuracies": 0.75, "rewards/chosen": -42.631103515625, "rewards/margins": 1.343313217163086, "rewards/rejected": -43.97441482543945, "step": 5683 }, { "epoch": 0.7739651416122004, "grad_norm": 42.799419151828616, "learning_rate": 1.1820118792289883e-07, "logits/chosen": 13.688438415527344, "logits/rejected": 13.409326553344727, "logps/chosen": -4.1670074462890625, "logps/rejected": -4.241394996643066, "loss": 3.5654, "rewards/accuracies": 0.75, "rewards/chosen": -41.67007064819336, "rewards/margins": 0.7438793182373047, "rewards/rejected": -42.4139518737793, "step": 5684 }, { "epoch": 0.7741013071895425, "grad_norm": 41.00299147126762, "learning_rate": 1.1806627589518288e-07, "logits/chosen": 13.65673828125, "logits/rejected": 14.082903861999512, "logps/chosen": -4.071779251098633, "logps/rejected": -4.570281028747559, "loss": 3.8686, "rewards/accuracies": 0.75, "rewards/chosen": -40.717796325683594, "rewards/margins": 4.985014915466309, "rewards/rejected": -45.70281219482422, "step": 5685 }, { "epoch": 0.7742374727668845, "grad_norm": 41.279224997565024, "learning_rate": 1.1793142757272221e-07, "logits/chosen": 13.925582885742188, "logits/rejected": 14.070514678955078, "logps/chosen": -3.9840216636657715, "logps/rejected": -4.522198677062988, "loss": 4.2501, "rewards/accuracies": 1.0, "rewards/chosen": -39.84021759033203, "rewards/margins": 5.381769180297852, "rewards/rejected": -45.22198486328125, "step": 5686 }, { "epoch": 0.7743736383442266, "grad_norm": 45.271323496295516, "learning_rate": 1.1779664298598713e-07, "logits/chosen": 14.626835823059082, "logits/rejected": 14.757250785827637, "logps/chosen": -4.28220272064209, "logps/rejected": -4.3454413414001465, "loss": 4.1578, "rewards/accuracies": 0.5, "rewards/chosen": -42.82202911376953, "rewards/margins": 0.6323862075805664, "rewards/rejected": -43.45441436767578, "step": 5687 }, { "epoch": 0.7745098039215687, "grad_norm": 38.48218632818991, "learning_rate": 1.1766192216543323e-07, "logits/chosen": 14.911399841308594, "logits/rejected": 14.5601806640625, "logps/chosen": -4.75572395324707, "logps/rejected": -4.8162946701049805, "loss": 3.3718, "rewards/accuracies": 0.5, "rewards/chosen": -47.5572395324707, "rewards/margins": 0.605708122253418, "rewards/rejected": -48.16294860839844, "step": 5688 }, { "epoch": 0.7746459694989106, "grad_norm": 52.112888090198986, "learning_rate": 1.1752726514150201e-07, "logits/chosen": 14.160943984985352, "logits/rejected": 14.820343971252441, "logps/chosen": -4.28152322769165, "logps/rejected": -4.796125411987305, "loss": 4.0167, "rewards/accuracies": 1.0, "rewards/chosen": -42.81523132324219, "rewards/margins": 5.146022796630859, "rewards/rejected": -47.96125030517578, "step": 5689 }, { "epoch": 0.7747821350762527, "grad_norm": 52.5844680978169, "learning_rate": 1.1739267194462002e-07, "logits/chosen": 14.573501586914062, "logits/rejected": 15.593764305114746, "logps/chosen": -4.591320991516113, "logps/rejected": -5.128318786621094, "loss": 3.5643, "rewards/accuracies": 1.0, "rewards/chosen": -45.913211822509766, "rewards/margins": 5.3699750900268555, "rewards/rejected": -51.28318786621094, "step": 5690 }, { "epoch": 0.7749183006535948, "grad_norm": 44.63462463970008, "learning_rate": 1.1725814260519986e-07, "logits/chosen": 14.5996732711792, "logits/rejected": 14.885522842407227, "logps/chosen": -4.626029014587402, "logps/rejected": -4.756152629852295, "loss": 3.9281, "rewards/accuracies": 0.75, "rewards/chosen": -46.260292053222656, "rewards/margins": 1.3012313842773438, "rewards/rejected": -47.5615234375, "step": 5691 }, { "epoch": 0.7750544662309368, "grad_norm": 36.932855842371964, "learning_rate": 1.1712367715363968e-07, "logits/chosen": 14.3254976272583, "logits/rejected": 14.930150985717773, "logps/chosen": -4.61143684387207, "logps/rejected": -5.025634288787842, "loss": 3.5879, "rewards/accuracies": 0.75, "rewards/chosen": -46.11437225341797, "rewards/margins": 4.141974449157715, "rewards/rejected": -50.256343841552734, "step": 5692 }, { "epoch": 0.7751906318082789, "grad_norm": 41.010838347339906, "learning_rate": 1.1698927562032284e-07, "logits/chosen": 14.773797035217285, "logits/rejected": 15.708207130432129, "logps/chosen": -4.559237480163574, "logps/rejected": -4.824350833892822, "loss": 4.0625, "rewards/accuracies": 0.5, "rewards/chosen": -45.592376708984375, "rewards/margins": 2.651132583618164, "rewards/rejected": -48.243507385253906, "step": 5693 }, { "epoch": 0.7753267973856209, "grad_norm": 36.94046883704878, "learning_rate": 1.1685493803561853e-07, "logits/chosen": 14.413921356201172, "logits/rejected": 14.79977798461914, "logps/chosen": -4.736065864562988, "logps/rejected": -4.96608304977417, "loss": 3.814, "rewards/accuracies": 0.75, "rewards/chosen": -47.36065673828125, "rewards/margins": 2.3001708984375, "rewards/rejected": -49.660831451416016, "step": 5694 }, { "epoch": 0.7754629629629629, "grad_norm": 38.115767034752714, "learning_rate": 1.1672066442988149e-07, "logits/chosen": 13.388559341430664, "logits/rejected": 13.805061340332031, "logps/chosen": -4.263913154602051, "logps/rejected": -4.679022789001465, "loss": 4.0264, "rewards/accuracies": 1.0, "rewards/chosen": -42.639129638671875, "rewards/margins": 4.15109920501709, "rewards/rejected": -46.79022979736328, "step": 5695 }, { "epoch": 0.775599128540305, "grad_norm": 46.60577472534176, "learning_rate": 1.1658645483345205e-07, "logits/chosen": 13.98300552368164, "logits/rejected": 14.115312576293945, "logps/chosen": -4.650900840759277, "logps/rejected": -4.665579319000244, "loss": 3.5594, "rewards/accuracies": 0.5, "rewards/chosen": -46.509010314941406, "rewards/margins": 0.14678573608398438, "rewards/rejected": -46.655792236328125, "step": 5696 }, { "epoch": 0.7757352941176471, "grad_norm": 56.35399803410073, "learning_rate": 1.164523092766557e-07, "logits/chosen": 14.687457084655762, "logits/rejected": 14.457620620727539, "logps/chosen": -4.559153079986572, "logps/rejected": -4.719328880310059, "loss": 3.5868, "rewards/accuracies": 0.75, "rewards/chosen": -45.591529846191406, "rewards/margins": 1.601755142211914, "rewards/rejected": -47.19328308105469, "step": 5697 }, { "epoch": 0.775871459694989, "grad_norm": 55.08220311115866, "learning_rate": 1.1631822778980392e-07, "logits/chosen": 14.585792541503906, "logits/rejected": 14.135374069213867, "logps/chosen": -4.049800872802734, "logps/rejected": -4.086446762084961, "loss": 4.2824, "rewards/accuracies": 0.5, "rewards/chosen": -40.49800491333008, "rewards/margins": 0.36646175384521484, "rewards/rejected": -40.86446762084961, "step": 5698 }, { "epoch": 0.7760076252723311, "grad_norm": 42.4074048070778, "learning_rate": 1.1618421040319364e-07, "logits/chosen": 14.365682601928711, "logits/rejected": 14.54888916015625, "logps/chosen": -4.466105937957764, "logps/rejected": -4.554695129394531, "loss": 3.6292, "rewards/accuracies": 0.5, "rewards/chosen": -44.66105651855469, "rewards/margins": 0.8858957290649414, "rewards/rejected": -45.54695510864258, "step": 5699 }, { "epoch": 0.7761437908496732, "grad_norm": 46.58638070403914, "learning_rate": 1.1605025714710697e-07, "logits/chosen": 14.261676788330078, "logits/rejected": 14.764127731323242, "logps/chosen": -4.122602939605713, "logps/rejected": -4.283634185791016, "loss": 4.0993, "rewards/accuracies": 0.5, "rewards/chosen": -41.22602844238281, "rewards/margins": 1.6103172302246094, "rewards/rejected": -42.83634567260742, "step": 5700 }, { "epoch": 0.7762799564270153, "grad_norm": 47.97965879889312, "learning_rate": 1.1591636805181178e-07, "logits/chosen": 14.859926223754883, "logits/rejected": 14.844644546508789, "logps/chosen": -4.687281131744385, "logps/rejected": -4.7942070960998535, "loss": 4.4011, "rewards/accuracies": 0.5, "rewards/chosen": -46.8728141784668, "rewards/margins": 1.0692577362060547, "rewards/rejected": -47.94207000732422, "step": 5701 }, { "epoch": 0.7764161220043573, "grad_norm": 39.88879287123574, "learning_rate": 1.1578254314756155e-07, "logits/chosen": 13.636595726013184, "logits/rejected": 14.727169036865234, "logps/chosen": -4.263379096984863, "logps/rejected": -4.720537185668945, "loss": 3.8511, "rewards/accuracies": 1.0, "rewards/chosen": -42.6337890625, "rewards/margins": 4.571582794189453, "rewards/rejected": -47.20537185668945, "step": 5702 }, { "epoch": 0.7765522875816994, "grad_norm": 41.50100803210923, "learning_rate": 1.1564878246459517e-07, "logits/chosen": 13.425531387329102, "logits/rejected": 14.387104988098145, "logps/chosen": -4.474323272705078, "logps/rejected": -4.388017654418945, "loss": 3.9234, "rewards/accuracies": 0.5, "rewards/chosen": -44.74323272705078, "rewards/margins": -0.8630533218383789, "rewards/rejected": -43.88018035888672, "step": 5703 }, { "epoch": 0.7766884531590414, "grad_norm": 39.45246274101592, "learning_rate": 1.1551508603313673e-07, "logits/chosen": 14.617220878601074, "logits/rejected": 14.801229476928711, "logps/chosen": -4.66109561920166, "logps/rejected": -4.96789026260376, "loss": 3.99, "rewards/accuracies": 1.0, "rewards/chosen": -46.61095428466797, "rewards/margins": 3.067948341369629, "rewards/rejected": -49.67890548706055, "step": 5704 }, { "epoch": 0.7768246187363834, "grad_norm": 41.114895138106704, "learning_rate": 1.1538145388339615e-07, "logits/chosen": 14.250480651855469, "logits/rejected": 13.62600326538086, "logps/chosen": -4.451478004455566, "logps/rejected": -4.393481254577637, "loss": 3.8515, "rewards/accuracies": 0.5, "rewards/chosen": -44.5147819519043, "rewards/margins": -0.5799646377563477, "rewards/rejected": -43.934814453125, "step": 5705 }, { "epoch": 0.7769607843137255, "grad_norm": 42.18809342938434, "learning_rate": 1.1524788604556891e-07, "logits/chosen": 12.98169994354248, "logits/rejected": 13.916166305541992, "logps/chosen": -4.085987091064453, "logps/rejected": -4.556550979614258, "loss": 3.8657, "rewards/accuracies": 0.75, "rewards/chosen": -40.85987091064453, "rewards/margins": 4.705641746520996, "rewards/rejected": -45.56550979614258, "step": 5706 }, { "epoch": 0.7770969498910676, "grad_norm": 48.80644971253672, "learning_rate": 1.1511438254983548e-07, "logits/chosen": 14.579161643981934, "logits/rejected": 14.555434226989746, "logps/chosen": -4.582319259643555, "logps/rejected": -4.789997577667236, "loss": 4.0824, "rewards/accuracies": 0.75, "rewards/chosen": -45.82319259643555, "rewards/margins": 2.076785087585449, "rewards/rejected": -47.89997863769531, "step": 5707 }, { "epoch": 0.7772331154684096, "grad_norm": 40.49763460219379, "learning_rate": 1.1498094342636218e-07, "logits/chosen": 14.708762168884277, "logits/rejected": 13.62089729309082, "logps/chosen": -4.763035774230957, "logps/rejected": -4.466211318969727, "loss": 3.7908, "rewards/accuracies": 0.25, "rewards/chosen": -47.63035583496094, "rewards/margins": -2.9682464599609375, "rewards/rejected": -44.662109375, "step": 5708 }, { "epoch": 0.7773692810457516, "grad_norm": 46.84838527249773, "learning_rate": 1.1484756870530081e-07, "logits/chosen": 14.124427795410156, "logits/rejected": 14.6181640625, "logps/chosen": -4.529351234436035, "logps/rejected": -4.7296342849731445, "loss": 3.1901, "rewards/accuracies": 0.75, "rewards/chosen": -45.29351043701172, "rewards/margins": 2.0028295516967773, "rewards/rejected": -47.29634094238281, "step": 5709 }, { "epoch": 0.7775054466230937, "grad_norm": 46.14202140450544, "learning_rate": 1.147142584167883e-07, "logits/chosen": 14.973241806030273, "logits/rejected": 15.194039344787598, "logps/chosen": -4.51629638671875, "logps/rejected": -4.716601371765137, "loss": 4.1544, "rewards/accuracies": 0.5, "rewards/chosen": -45.162960052490234, "rewards/margins": 2.003056526184082, "rewards/rejected": -47.166015625, "step": 5710 }, { "epoch": 0.7776416122004357, "grad_norm": 40.989160549594686, "learning_rate": 1.145810125909473e-07, "logits/chosen": 15.327491760253906, "logits/rejected": 15.409074783325195, "logps/chosen": -4.659866809844971, "logps/rejected": -4.7573723793029785, "loss": 4.3026, "rewards/accuracies": 0.5, "rewards/chosen": -46.59866714477539, "rewards/margins": 0.9750556945800781, "rewards/rejected": -47.57372283935547, "step": 5711 }, { "epoch": 0.7777777777777778, "grad_norm": 41.52315845598564, "learning_rate": 1.1444783125788591e-07, "logits/chosen": 13.510480880737305, "logits/rejected": 13.67972183227539, "logps/chosen": -4.231331825256348, "logps/rejected": -4.322965145111084, "loss": 4.5937, "rewards/accuracies": 0.25, "rewards/chosen": -42.31332015991211, "rewards/margins": 0.9163351058959961, "rewards/rejected": -43.229652404785156, "step": 5712 }, { "epoch": 0.7779139433551199, "grad_norm": 39.17034333609134, "learning_rate": 1.1431471444769734e-07, "logits/chosen": 14.664850234985352, "logits/rejected": 13.963096618652344, "logps/chosen": -4.385096549987793, "logps/rejected": -4.197924613952637, "loss": 4.1681, "rewards/accuracies": 0.5, "rewards/chosen": -43.85096740722656, "rewards/margins": -1.871720314025879, "rewards/rejected": -41.979248046875, "step": 5713 }, { "epoch": 0.7780501089324618, "grad_norm": 41.71162486094128, "learning_rate": 1.1418166219046051e-07, "logits/chosen": 13.33161735534668, "logits/rejected": 13.658212661743164, "logps/chosen": -4.290576457977295, "logps/rejected": -4.462393760681152, "loss": 3.9413, "rewards/accuracies": 0.75, "rewards/chosen": -42.90576171875, "rewards/margins": 1.7181692123413086, "rewards/rejected": -44.623931884765625, "step": 5714 }, { "epoch": 0.7781862745098039, "grad_norm": 42.732587612712905, "learning_rate": 1.1404867451623981e-07, "logits/chosen": 14.017486572265625, "logits/rejected": 15.261343002319336, "logps/chosen": -4.373835563659668, "logps/rejected": -4.589162826538086, "loss": 4.2378, "rewards/accuracies": 0.75, "rewards/chosen": -43.73835372924805, "rewards/margins": 2.153273582458496, "rewards/rejected": -45.891624450683594, "step": 5715 }, { "epoch": 0.778322440087146, "grad_norm": 44.585585209901446, "learning_rate": 1.1391575145508471e-07, "logits/chosen": 14.064114570617676, "logits/rejected": 14.607210159301758, "logps/chosen": -4.3112473487854, "logps/rejected": -4.4873247146606445, "loss": 4.2285, "rewards/accuracies": 0.75, "rewards/chosen": -43.11247253417969, "rewards/margins": 1.7607765197753906, "rewards/rejected": -44.87324905395508, "step": 5716 }, { "epoch": 0.778458605664488, "grad_norm": 40.253153200053625, "learning_rate": 1.1378289303703036e-07, "logits/chosen": 13.221879005432129, "logits/rejected": 14.241886138916016, "logps/chosen": -4.458128929138184, "logps/rejected": -4.646636009216309, "loss": 4.2892, "rewards/accuracies": 0.5, "rewards/chosen": -44.58129119873047, "rewards/margins": 1.8850688934326172, "rewards/rejected": -46.46635818481445, "step": 5717 }, { "epoch": 0.7785947712418301, "grad_norm": 41.78687784988185, "learning_rate": 1.1365009929209737e-07, "logits/chosen": 14.8536376953125, "logits/rejected": 14.354246139526367, "logps/chosen": -4.417634010314941, "logps/rejected": -4.49578332901001, "loss": 3.9945, "rewards/accuracies": 0.5, "rewards/chosen": -44.17634582519531, "rewards/margins": 0.7814884185791016, "rewards/rejected": -44.95783233642578, "step": 5718 }, { "epoch": 0.7787309368191722, "grad_norm": 37.24515054784663, "learning_rate": 1.1351737025029132e-07, "logits/chosen": 15.168783187866211, "logits/rejected": 15.11921501159668, "logps/chosen": -4.998826026916504, "logps/rejected": -4.641671180725098, "loss": 3.865, "rewards/accuracies": 0.25, "rewards/chosen": -49.988258361816406, "rewards/margins": -3.5715513229370117, "rewards/rejected": -46.416709899902344, "step": 5719 }, { "epoch": 0.7788671023965141, "grad_norm": 35.84831978040492, "learning_rate": 1.133847059416035e-07, "logits/chosen": 14.151695251464844, "logits/rejected": 15.306830406188965, "logps/chosen": -4.72099494934082, "logps/rejected": -4.785649299621582, "loss": 3.7008, "rewards/accuracies": 0.25, "rewards/chosen": -47.20995330810547, "rewards/margins": 0.6465387344360352, "rewards/rejected": -47.85649108886719, "step": 5720 }, { "epoch": 0.7790032679738562, "grad_norm": 48.128251406666436, "learning_rate": 1.1325210639601071e-07, "logits/chosen": 14.73727798461914, "logits/rejected": 15.096261978149414, "logps/chosen": -4.529991626739502, "logps/rejected": -4.62490177154541, "loss": 3.4649, "rewards/accuracies": 0.5, "rewards/chosen": -45.2999153137207, "rewards/margins": 0.9491024017333984, "rewards/rejected": -46.24901580810547, "step": 5721 }, { "epoch": 0.7791394335511983, "grad_norm": 39.505541111704304, "learning_rate": 1.1311957164347461e-07, "logits/chosen": 14.744146347045898, "logits/rejected": 15.252642631530762, "logps/chosen": -4.654301643371582, "logps/rejected": -4.930692672729492, "loss": 3.7237, "rewards/accuracies": 0.5, "rewards/chosen": -46.54301452636719, "rewards/margins": 2.763913154602051, "rewards/rejected": -49.30693054199219, "step": 5722 }, { "epoch": 0.7792755991285403, "grad_norm": 43.427355840394, "learning_rate": 1.1298710171394272e-07, "logits/chosen": 14.588968276977539, "logits/rejected": 14.609735488891602, "logps/chosen": -4.584214210510254, "logps/rejected": -4.580142021179199, "loss": 4.6396, "rewards/accuracies": 0.5, "rewards/chosen": -45.84214401245117, "rewards/margins": -0.04072856903076172, "rewards/rejected": -45.801414489746094, "step": 5723 }, { "epoch": 0.7794117647058824, "grad_norm": 44.64340762668646, "learning_rate": 1.1285469663734777e-07, "logits/chosen": 14.379899978637695, "logits/rejected": 14.64660930633545, "logps/chosen": -4.4757890701293945, "logps/rejected": -4.63734769821167, "loss": 4.2454, "rewards/accuracies": 0.5, "rewards/chosen": -44.75788879394531, "rewards/margins": 1.615591049194336, "rewards/rejected": -46.373477935791016, "step": 5724 }, { "epoch": 0.7795479302832244, "grad_norm": 47.71287109333099, "learning_rate": 1.1272235644360754e-07, "logits/chosen": 13.438385963439941, "logits/rejected": 15.220085144042969, "logps/chosen": -4.378769397735596, "logps/rejected": -4.8629536628723145, "loss": 4.1889, "rewards/accuracies": 1.0, "rewards/chosen": -43.78769302368164, "rewards/margins": 4.841843605041504, "rewards/rejected": -48.629539489746094, "step": 5725 }, { "epoch": 0.7796840958605664, "grad_norm": 41.552221760841206, "learning_rate": 1.1259008116262556e-07, "logits/chosen": 14.173004150390625, "logits/rejected": 14.737257957458496, "logps/chosen": -4.6011738777160645, "logps/rejected": -4.764719009399414, "loss": 3.6344, "rewards/accuracies": 0.5, "rewards/chosen": -46.011741638183594, "rewards/margins": 1.635451316833496, "rewards/rejected": -47.64719009399414, "step": 5726 }, { "epoch": 0.7798202614379085, "grad_norm": 39.75364312204043, "learning_rate": 1.1245787082429061e-07, "logits/chosen": 15.176968574523926, "logits/rejected": 14.79627513885498, "logps/chosen": -4.339303970336914, "logps/rejected": -4.319155693054199, "loss": 3.6855, "rewards/accuracies": 0.25, "rewards/chosen": -43.39303970336914, "rewards/margins": -0.20148086547851562, "rewards/rejected": -43.19156265258789, "step": 5727 }, { "epoch": 0.7799564270152506, "grad_norm": 38.329246582354436, "learning_rate": 1.1232572545847649e-07, "logits/chosen": 14.527814865112305, "logits/rejected": 14.101299285888672, "logps/chosen": -4.3845534324646, "logps/rejected": -4.43358039855957, "loss": 4.0804, "rewards/accuracies": 0.75, "rewards/chosen": -43.84553146362305, "rewards/margins": 0.49026966094970703, "rewards/rejected": -44.3358039855957, "step": 5728 }, { "epoch": 0.7800925925925926, "grad_norm": 48.97519603930735, "learning_rate": 1.1219364509504266e-07, "logits/chosen": 13.943666458129883, "logits/rejected": 14.430700302124023, "logps/chosen": -4.138979434967041, "logps/rejected": -4.570021152496338, "loss": 3.8391, "rewards/accuracies": 1.0, "rewards/chosen": -41.389793395996094, "rewards/margins": 4.310418128967285, "rewards/rejected": -45.70021057128906, "step": 5729 }, { "epoch": 0.7802287581699346, "grad_norm": 42.58932064583495, "learning_rate": 1.1206162976383384e-07, "logits/chosen": 14.703010559082031, "logits/rejected": 15.10689926147461, "logps/chosen": -4.459738731384277, "logps/rejected": -4.625769138336182, "loss": 4.068, "rewards/accuracies": 0.5, "rewards/chosen": -44.597389221191406, "rewards/margins": 1.6603031158447266, "rewards/rejected": -46.2576904296875, "step": 5730 }, { "epoch": 0.7803649237472767, "grad_norm": 41.24584407179607, "learning_rate": 1.1192967949467975e-07, "logits/chosen": 14.156227111816406, "logits/rejected": 14.790729522705078, "logps/chosen": -4.460042953491211, "logps/rejected": -4.873664379119873, "loss": 4.136, "rewards/accuracies": 1.0, "rewards/chosen": -44.600433349609375, "rewards/margins": 4.136211395263672, "rewards/rejected": -48.73664474487305, "step": 5731 }, { "epoch": 0.7805010893246187, "grad_norm": 44.36454238867817, "learning_rate": 1.1179779431739582e-07, "logits/chosen": 13.684106826782227, "logits/rejected": 14.02099895477295, "logps/chosen": -4.080076694488525, "logps/rejected": -4.0231781005859375, "loss": 3.8337, "rewards/accuracies": 0.25, "rewards/chosen": -40.8007698059082, "rewards/margins": -0.5689907073974609, "rewards/rejected": -40.23177719116211, "step": 5732 }, { "epoch": 0.7806372549019608, "grad_norm": 40.35651752506872, "learning_rate": 1.116659742617827e-07, "logits/chosen": 14.400495529174805, "logits/rejected": 14.074275970458984, "logps/chosen": -4.571379661560059, "logps/rejected": -4.521347999572754, "loss": 3.9526, "rewards/accuracies": 0.5, "rewards/chosen": -45.71379470825195, "rewards/margins": -0.5003118515014648, "rewards/rejected": -45.21348571777344, "step": 5733 }, { "epoch": 0.7807734204793029, "grad_norm": 48.77792019573043, "learning_rate": 1.1153421935762595e-07, "logits/chosen": 13.335931777954102, "logits/rejected": 13.289924621582031, "logps/chosen": -4.124673843383789, "logps/rejected": -4.112269878387451, "loss": 4.5196, "rewards/accuracies": 0.5, "rewards/chosen": -41.24673843383789, "rewards/margins": -0.1240386962890625, "rewards/rejected": -41.12269973754883, "step": 5734 }, { "epoch": 0.7809095860566448, "grad_norm": 41.954550489863294, "learning_rate": 1.1140252963469686e-07, "logits/chosen": 13.517648696899414, "logits/rejected": 14.12293815612793, "logps/chosen": -4.311713695526123, "logps/rejected": -4.74933385848999, "loss": 3.8485, "rewards/accuracies": 1.0, "rewards/chosen": -43.11713790893555, "rewards/margins": 4.376199722290039, "rewards/rejected": -47.49333572387695, "step": 5735 }, { "epoch": 0.7810457516339869, "grad_norm": 44.07395437374711, "learning_rate": 1.1127090512275183e-07, "logits/chosen": 14.199657440185547, "logits/rejected": 14.350865364074707, "logps/chosen": -4.647537708282471, "logps/rejected": -4.750777244567871, "loss": 3.7529, "rewards/accuracies": 0.75, "rewards/chosen": -46.475379943847656, "rewards/margins": 1.0323944091796875, "rewards/rejected": -47.50777053833008, "step": 5736 }, { "epoch": 0.781181917211329, "grad_norm": 39.95980662604833, "learning_rate": 1.1113934585153235e-07, "logits/chosen": 13.831731796264648, "logits/rejected": 13.976198196411133, "logps/chosen": -4.477655410766602, "logps/rejected": -4.535170555114746, "loss": 3.8006, "rewards/accuracies": 0.5, "rewards/chosen": -44.77655029296875, "rewards/margins": 0.5751571655273438, "rewards/rejected": -45.351707458496094, "step": 5737 }, { "epoch": 0.781318082788671, "grad_norm": 41.5403990668052, "learning_rate": 1.1100785185076542e-07, "logits/chosen": 14.01840591430664, "logits/rejected": 14.268973350524902, "logps/chosen": -4.336379051208496, "logps/rejected": -4.447337627410889, "loss": 3.9006, "rewards/accuracies": 0.75, "rewards/chosen": -43.36378860473633, "rewards/margins": 1.1095876693725586, "rewards/rejected": -44.4733772277832, "step": 5738 }, { "epoch": 0.7814542483660131, "grad_norm": 37.72659444286237, "learning_rate": 1.1087642315016332e-07, "logits/chosen": 13.62081241607666, "logits/rejected": 14.317758560180664, "logps/chosen": -4.430208206176758, "logps/rejected": -4.739917755126953, "loss": 3.5327, "rewards/accuracies": 0.75, "rewards/chosen": -44.302085876464844, "rewards/margins": 3.0970888137817383, "rewards/rejected": -47.399173736572266, "step": 5739 }, { "epoch": 0.7815904139433552, "grad_norm": 41.6848569494517, "learning_rate": 1.1074505977942323e-07, "logits/chosen": 13.379613876342773, "logits/rejected": 14.36421012878418, "logps/chosen": -4.337096691131592, "logps/rejected": -4.643477439880371, "loss": 4.1003, "rewards/accuracies": 0.75, "rewards/chosen": -43.370967864990234, "rewards/margins": 3.0638046264648438, "rewards/rejected": -46.43476867675781, "step": 5740 }, { "epoch": 0.7817265795206971, "grad_norm": 44.04412015635273, "learning_rate": 1.1061376176822785e-07, "logits/chosen": 13.909666061401367, "logits/rejected": 13.305002212524414, "logps/chosen": -4.294626235961914, "logps/rejected": -4.202571392059326, "loss": 3.7685, "rewards/accuracies": 0.25, "rewards/chosen": -42.94626235961914, "rewards/margins": -0.9205455780029297, "rewards/rejected": -42.02571487426758, "step": 5741 }, { "epoch": 0.7818627450980392, "grad_norm": 38.23154022295865, "learning_rate": 1.1048252914624522e-07, "logits/chosen": 14.195707321166992, "logits/rejected": 14.380483627319336, "logps/chosen": -4.30949592590332, "logps/rejected": -4.534857273101807, "loss": 4.1028, "rewards/accuracies": 0.75, "rewards/chosen": -43.0949592590332, "rewards/margins": 2.2536115646362305, "rewards/rejected": -45.34857177734375, "step": 5742 }, { "epoch": 0.7819989106753813, "grad_norm": 43.68347572077742, "learning_rate": 1.1035136194312822e-07, "logits/chosen": 14.506571769714355, "logits/rejected": 14.445826530456543, "logps/chosen": -4.453420639038086, "logps/rejected": -4.696730613708496, "loss": 4.4143, "rewards/accuracies": 0.75, "rewards/chosen": -44.53420639038086, "rewards/margins": 2.4330968856811523, "rewards/rejected": -46.96730041503906, "step": 5743 }, { "epoch": 0.7821350762527233, "grad_norm": 42.147137753654036, "learning_rate": 1.102202601885152e-07, "logits/chosen": 14.670330047607422, "logits/rejected": 14.672040939331055, "logps/chosen": -4.247585296630859, "logps/rejected": -4.452824592590332, "loss": 3.9363, "rewards/accuracies": 0.75, "rewards/chosen": -42.475852966308594, "rewards/margins": 2.0523910522460938, "rewards/rejected": -44.52824401855469, "step": 5744 }, { "epoch": 0.7822712418300654, "grad_norm": 40.83716910422459, "learning_rate": 1.1008922391202986e-07, "logits/chosen": 14.802080154418945, "logits/rejected": 14.72708511352539, "logps/chosen": -4.71654748916626, "logps/rejected": -4.815120220184326, "loss": 3.6639, "rewards/accuracies": 0.75, "rewards/chosen": -47.16547393798828, "rewards/margins": 0.9857301712036133, "rewards/rejected": -48.151206970214844, "step": 5745 }, { "epoch": 0.7824074074074074, "grad_norm": 39.75790503876636, "learning_rate": 1.0995825314328073e-07, "logits/chosen": 14.223238945007324, "logits/rejected": 15.062887191772461, "logps/chosen": -4.425508975982666, "logps/rejected": -4.842348575592041, "loss": 4.0891, "rewards/accuracies": 0.75, "rewards/chosen": -44.25509262084961, "rewards/margins": 4.16839599609375, "rewards/rejected": -48.423484802246094, "step": 5746 }, { "epoch": 0.7825435729847494, "grad_norm": 38.323684269903346, "learning_rate": 1.0982734791186179e-07, "logits/chosen": 13.881839752197266, "logits/rejected": 15.039798736572266, "logps/chosen": -4.143191337585449, "logps/rejected": -4.629172325134277, "loss": 4.106, "rewards/accuracies": 1.0, "rewards/chosen": -41.431915283203125, "rewards/margins": 4.859806060791016, "rewards/rejected": -46.29172134399414, "step": 5747 }, { "epoch": 0.7826797385620915, "grad_norm": 39.492911906511544, "learning_rate": 1.0969650824735226e-07, "logits/chosen": 14.412433624267578, "logits/rejected": 14.207963943481445, "logps/chosen": -4.3942341804504395, "logps/rejected": -4.601016521453857, "loss": 3.8604, "rewards/accuracies": 1.0, "rewards/chosen": -43.94234085083008, "rewards/margins": 2.0678224563598633, "rewards/rejected": -46.010162353515625, "step": 5748 }, { "epoch": 0.7828159041394336, "grad_norm": 37.302947708613566, "learning_rate": 1.0956573417931627e-07, "logits/chosen": 13.528779983520508, "logits/rejected": 14.162731170654297, "logps/chosen": -4.566032886505127, "logps/rejected": -4.591060638427734, "loss": 3.6762, "rewards/accuracies": 0.75, "rewards/chosen": -45.66033172607422, "rewards/margins": 0.2502784729003906, "rewards/rejected": -45.910606384277344, "step": 5749 }, { "epoch": 0.7829520697167756, "grad_norm": 65.46263537030521, "learning_rate": 1.0943502573730343e-07, "logits/chosen": 14.339929580688477, "logits/rejected": 14.443016052246094, "logps/chosen": -4.335482597351074, "logps/rejected": -4.529886722564697, "loss": 4.031, "rewards/accuracies": 0.75, "rewards/chosen": -43.354827880859375, "rewards/margins": 1.9440431594848633, "rewards/rejected": -45.298866271972656, "step": 5750 }, { "epoch": 0.7830882352941176, "grad_norm": 40.20683040487868, "learning_rate": 1.0930438295084842e-07, "logits/chosen": 14.129920959472656, "logits/rejected": 14.268514633178711, "logps/chosen": -4.371793746948242, "logps/rejected": -4.842840194702148, "loss": 3.7898, "rewards/accuracies": 1.0, "rewards/chosen": -43.71793746948242, "rewards/margins": 4.710460662841797, "rewards/rejected": -48.42839813232422, "step": 5751 }, { "epoch": 0.7832244008714597, "grad_norm": 40.805518426947, "learning_rate": 1.0917380584947094e-07, "logits/chosen": 13.802576065063477, "logits/rejected": 14.38352108001709, "logps/chosen": -4.276608943939209, "logps/rejected": -4.659600257873535, "loss": 3.8814, "rewards/accuracies": 0.75, "rewards/chosen": -42.76608657836914, "rewards/margins": 3.8299179077148438, "rewards/rejected": -46.596004486083984, "step": 5752 }, { "epoch": 0.7833605664488017, "grad_norm": 42.02156811019805, "learning_rate": 1.0904329446267597e-07, "logits/chosen": 14.180366516113281, "logits/rejected": 15.731727600097656, "logps/chosen": -4.393924236297607, "logps/rejected": -4.819442272186279, "loss": 3.6626, "rewards/accuracies": 1.0, "rewards/chosen": -43.93924331665039, "rewards/margins": 4.255183219909668, "rewards/rejected": -48.19442367553711, "step": 5753 }, { "epoch": 0.7834967320261438, "grad_norm": 42.223540124733645, "learning_rate": 1.0891284881995387e-07, "logits/chosen": 14.528534889221191, "logits/rejected": 14.884241104125977, "logps/chosen": -4.49918270111084, "logps/rejected": -4.857220649719238, "loss": 3.814, "rewards/accuracies": 0.75, "rewards/chosen": -44.991825103759766, "rewards/margins": 3.580381393432617, "rewards/rejected": -48.57220458984375, "step": 5754 }, { "epoch": 0.7836328976034859, "grad_norm": 44.31716137476857, "learning_rate": 1.0878246895077956e-07, "logits/chosen": 14.904544830322266, "logits/rejected": 15.04063606262207, "logps/chosen": -4.579428672790527, "logps/rejected": -4.744627475738525, "loss": 4.0525, "rewards/accuracies": 0.5, "rewards/chosen": -45.794288635253906, "rewards/margins": 1.6519861221313477, "rewards/rejected": -47.44627380371094, "step": 5755 }, { "epoch": 0.7837690631808278, "grad_norm": 39.816023600311915, "learning_rate": 1.0865215488461359e-07, "logits/chosen": 14.006404876708984, "logits/rejected": 14.6627779006958, "logps/chosen": -4.4566497802734375, "logps/rejected": -4.890233039855957, "loss": 3.5674, "rewards/accuracies": 1.0, "rewards/chosen": -44.566497802734375, "rewards/margins": 4.335831642150879, "rewards/rejected": -48.90232849121094, "step": 5756 }, { "epoch": 0.7839052287581699, "grad_norm": 40.97448777054677, "learning_rate": 1.0852190665090173e-07, "logits/chosen": 14.375102996826172, "logits/rejected": 14.963837623596191, "logps/chosen": -4.539394378662109, "logps/rejected": -4.588841438293457, "loss": 4.4148, "rewards/accuracies": 0.5, "rewards/chosen": -45.393943786621094, "rewards/margins": 0.4944734573364258, "rewards/rejected": -45.8884162902832, "step": 5757 }, { "epoch": 0.784041394335512, "grad_norm": 47.40584555070949, "learning_rate": 1.0839172427907426e-07, "logits/chosen": 14.841525077819824, "logits/rejected": 14.771783828735352, "logps/chosen": -4.595337867736816, "logps/rejected": -4.617476463317871, "loss": 4.1469, "rewards/accuracies": 0.5, "rewards/chosen": -45.95337677001953, "rewards/margins": 0.2213907241821289, "rewards/rejected": -46.17477035522461, "step": 5758 }, { "epoch": 0.784177559912854, "grad_norm": 47.13298860819584, "learning_rate": 1.0826160779854716e-07, "logits/chosen": 14.044607162475586, "logits/rejected": 14.282369613647461, "logps/chosen": -4.5481157302856445, "logps/rejected": -4.5952277183532715, "loss": 3.6895, "rewards/accuracies": 0.75, "rewards/chosen": -45.48115539550781, "rewards/margins": 0.47111988067626953, "rewards/rejected": -45.95227813720703, "step": 5759 }, { "epoch": 0.7843137254901961, "grad_norm": 42.69194685159979, "learning_rate": 1.0813155723872145e-07, "logits/chosen": 14.329687118530273, "logits/rejected": 14.330738067626953, "logps/chosen": -4.604330539703369, "logps/rejected": -4.601813316345215, "loss": 4.0275, "rewards/accuracies": 0.5, "rewards/chosen": -46.043304443359375, "rewards/margins": -0.02516937255859375, "rewards/rejected": -46.01813507080078, "step": 5760 }, { "epoch": 0.7844498910675382, "grad_norm": 40.994686089684414, "learning_rate": 1.0800157262898286e-07, "logits/chosen": 13.957002639770508, "logits/rejected": 14.712787628173828, "logps/chosen": -4.704572677612305, "logps/rejected": -4.883768081665039, "loss": 4.0069, "rewards/accuracies": 0.75, "rewards/chosen": -47.04572296142578, "rewards/margins": 1.7919530868530273, "rewards/rejected": -48.837677001953125, "step": 5761 }, { "epoch": 0.7845860566448801, "grad_norm": 39.29297111685255, "learning_rate": 1.0787165399870261e-07, "logits/chosen": 14.825925827026367, "logits/rejected": 15.068451881408691, "logps/chosen": -4.759599685668945, "logps/rejected": -4.981096267700195, "loss": 3.8969, "rewards/accuracies": 0.75, "rewards/chosen": -47.59599304199219, "rewards/margins": 2.214970588684082, "rewards/rejected": -49.81096649169922, "step": 5762 }, { "epoch": 0.7847222222222222, "grad_norm": 40.04869193280334, "learning_rate": 1.0774180137723705e-07, "logits/chosen": 13.703424453735352, "logits/rejected": 14.419602394104004, "logps/chosen": -4.147397041320801, "logps/rejected": -4.693138599395752, "loss": 4.3373, "rewards/accuracies": 0.75, "rewards/chosen": -41.473968505859375, "rewards/margins": 5.457415580749512, "rewards/rejected": -46.93138122558594, "step": 5763 }, { "epoch": 0.7848583877995643, "grad_norm": 44.27344406407919, "learning_rate": 1.0761201479392714e-07, "logits/chosen": 14.592873573303223, "logits/rejected": 15.027097702026367, "logps/chosen": -4.455079078674316, "logps/rejected": -4.997397422790527, "loss": 4.4041, "rewards/accuracies": 1.0, "rewards/chosen": -44.55079650878906, "rewards/margins": 5.423175811767578, "rewards/rejected": -49.973968505859375, "step": 5764 }, { "epoch": 0.7849945533769063, "grad_norm": 42.83630345242901, "learning_rate": 1.0748229427809942e-07, "logits/chosen": 14.93637466430664, "logits/rejected": 14.478360176086426, "logps/chosen": -4.687731742858887, "logps/rejected": -4.406613349914551, "loss": 4.1356, "rewards/accuracies": 0.25, "rewards/chosen": -46.8773193359375, "rewards/margins": -2.8111839294433594, "rewards/rejected": -44.06613540649414, "step": 5765 }, { "epoch": 0.7851307189542484, "grad_norm": 48.74560439871963, "learning_rate": 1.073526398590654e-07, "logits/chosen": 13.879218101501465, "logits/rejected": 13.889533996582031, "logps/chosen": -4.454747676849365, "logps/rejected": -4.520247459411621, "loss": 4.067, "rewards/accuracies": 0.75, "rewards/chosen": -44.54747772216797, "rewards/margins": 0.654998779296875, "rewards/rejected": -45.202476501464844, "step": 5766 }, { "epoch": 0.7852668845315904, "grad_norm": 45.959975404540934, "learning_rate": 1.072230515661213e-07, "logits/chosen": 14.966288566589355, "logits/rejected": 14.617223739624023, "logps/chosen": -4.855362892150879, "logps/rejected": -4.723209381103516, "loss": 4.5723, "rewards/accuracies": 0.5, "rewards/chosen": -48.553627014160156, "rewards/margins": -1.3215360641479492, "rewards/rejected": -47.232093811035156, "step": 5767 }, { "epoch": 0.7854030501089324, "grad_norm": 45.6903809891143, "learning_rate": 1.0709352942854887e-07, "logits/chosen": 14.243404388427734, "logits/rejected": 14.18995475769043, "logps/chosen": -4.639768600463867, "logps/rejected": -4.529009819030762, "loss": 3.8974, "rewards/accuracies": 0.5, "rewards/chosen": -46.39768600463867, "rewards/margins": -1.1075878143310547, "rewards/rejected": -45.29010009765625, "step": 5768 }, { "epoch": 0.7855392156862745, "grad_norm": 41.26812689848795, "learning_rate": 1.0696407347561471e-07, "logits/chosen": 13.725976943969727, "logits/rejected": 13.911979675292969, "logps/chosen": -4.1888017654418945, "logps/rejected": -4.441679000854492, "loss": 4.339, "rewards/accuracies": 0.75, "rewards/chosen": -41.88801956176758, "rewards/margins": 2.5287694931030273, "rewards/rejected": -44.416786193847656, "step": 5769 }, { "epoch": 0.7856753812636166, "grad_norm": 39.442906574782874, "learning_rate": 1.0683468373657034e-07, "logits/chosen": 14.025245666503906, "logits/rejected": 14.401772499084473, "logps/chosen": -4.441434860229492, "logps/rejected": -4.623291015625, "loss": 3.9429, "rewards/accuracies": 0.5, "rewards/chosen": -44.41434860229492, "rewards/margins": 1.8185596466064453, "rewards/rejected": -46.232906341552734, "step": 5770 }, { "epoch": 0.7858115468409586, "grad_norm": 40.64718508986996, "learning_rate": 1.0670536024065251e-07, "logits/chosen": 14.56795883178711, "logits/rejected": 14.106674194335938, "logps/chosen": -4.596139907836914, "logps/rejected": -4.655835151672363, "loss": 3.7162, "rewards/accuracies": 0.5, "rewards/chosen": -45.961395263671875, "rewards/margins": 0.5969562530517578, "rewards/rejected": -46.558349609375, "step": 5771 }, { "epoch": 0.7859477124183006, "grad_norm": 41.68229437334024, "learning_rate": 1.0657610301708304e-07, "logits/chosen": 14.763118743896484, "logits/rejected": 14.928094863891602, "logps/chosen": -4.480954170227051, "logps/rejected": -4.905332565307617, "loss": 3.9593, "rewards/accuracies": 0.75, "rewards/chosen": -44.809547424316406, "rewards/margins": 4.243779182434082, "rewards/rejected": -49.05332565307617, "step": 5772 }, { "epoch": 0.7860838779956427, "grad_norm": 45.56898012754738, "learning_rate": 1.064469120950684e-07, "logits/chosen": 14.530340194702148, "logits/rejected": 13.689722061157227, "logps/chosen": -4.750710487365723, "logps/rejected": -4.593179702758789, "loss": 4.0367, "rewards/accuracies": 0.5, "rewards/chosen": -47.50710678100586, "rewards/margins": -1.5753097534179688, "rewards/rejected": -45.93179702758789, "step": 5773 }, { "epoch": 0.7862200435729847, "grad_norm": 38.76107374779565, "learning_rate": 1.0631778750380055e-07, "logits/chosen": 14.93018913269043, "logits/rejected": 14.848550796508789, "logps/chosen": -4.902925968170166, "logps/rejected": -4.763419151306152, "loss": 3.8675, "rewards/accuracies": 0.5, "rewards/chosen": -49.029258728027344, "rewards/margins": -1.3950719833374023, "rewards/rejected": -47.634185791015625, "step": 5774 }, { "epoch": 0.7863562091503268, "grad_norm": 41.6276986789704, "learning_rate": 1.0618872927245632e-07, "logits/chosen": 13.966131210327148, "logits/rejected": 14.005071640014648, "logps/chosen": -4.492687225341797, "logps/rejected": -4.408544540405273, "loss": 3.7768, "rewards/accuracies": 0.25, "rewards/chosen": -44.926876068115234, "rewards/margins": -0.8414316177368164, "rewards/rejected": -44.08544158935547, "step": 5775 }, { "epoch": 0.7864923747276689, "grad_norm": 41.38347238835857, "learning_rate": 1.060597374301973e-07, "logits/chosen": 13.299511909484863, "logits/rejected": 13.74019718170166, "logps/chosen": -4.20411491394043, "logps/rejected": -4.579581260681152, "loss": 3.8163, "rewards/accuracies": 1.0, "rewards/chosen": -42.0411491394043, "rewards/margins": 3.75466251373291, "rewards/rejected": -45.79581069946289, "step": 5776 }, { "epoch": 0.786628540305011, "grad_norm": 38.20733072500204, "learning_rate": 1.059308120061703e-07, "logits/chosen": 14.754831314086914, "logits/rejected": 14.613121032714844, "logps/chosen": -4.5690717697143555, "logps/rejected": -4.760715961456299, "loss": 3.88, "rewards/accuracies": 0.75, "rewards/chosen": -45.69071960449219, "rewards/margins": 1.9164390563964844, "rewards/rejected": -47.60715866088867, "step": 5777 }, { "epoch": 0.7867647058823529, "grad_norm": 39.68260621667445, "learning_rate": 1.0580195302950725e-07, "logits/chosen": 13.591264724731445, "logits/rejected": 14.486525535583496, "logps/chosen": -4.343635082244873, "logps/rejected": -4.3299479484558105, "loss": 4.1208, "rewards/accuracies": 0.5, "rewards/chosen": -43.43635177612305, "rewards/margins": -0.136871337890625, "rewards/rejected": -43.29948043823242, "step": 5778 }, { "epoch": 0.786900871459695, "grad_norm": 39.66365280589273, "learning_rate": 1.0567316052932467e-07, "logits/chosen": 14.463376998901367, "logits/rejected": 15.066787719726562, "logps/chosen": -4.678951263427734, "logps/rejected": -4.756556510925293, "loss": 3.7561, "rewards/accuracies": 0.5, "rewards/chosen": -46.789512634277344, "rewards/margins": 0.7760486602783203, "rewards/rejected": -47.56555938720703, "step": 5779 }, { "epoch": 0.7870370370370371, "grad_norm": 46.123631121315555, "learning_rate": 1.0554443453472436e-07, "logits/chosen": 14.011551856994629, "logits/rejected": 14.48808479309082, "logps/chosen": -4.408458709716797, "logps/rejected": -4.584649085998535, "loss": 3.4434, "rewards/accuracies": 0.75, "rewards/chosen": -44.0845832824707, "rewards/margins": 1.761906623840332, "rewards/rejected": -45.84648895263672, "step": 5780 }, { "epoch": 0.7871732026143791, "grad_norm": 41.600991261166996, "learning_rate": 1.0541577507479322e-07, "logits/chosen": 14.281571388244629, "logits/rejected": 14.611520767211914, "logps/chosen": -4.7178449630737305, "logps/rejected": -4.944541931152344, "loss": 4.0471, "rewards/accuracies": 0.75, "rewards/chosen": -47.17844772338867, "rewards/margins": 2.2669715881347656, "rewards/rejected": -49.44541931152344, "step": 5781 }, { "epoch": 0.7873093681917211, "grad_norm": 45.055118788047224, "learning_rate": 1.0528718217860263e-07, "logits/chosen": 14.479272842407227, "logits/rejected": 14.164938926696777, "logps/chosen": -4.554056167602539, "logps/rejected": -4.713210105895996, "loss": 4.9359, "rewards/accuracies": 0.5, "rewards/chosen": -45.540565490722656, "rewards/margins": 1.591538429260254, "rewards/rejected": -47.132102966308594, "step": 5782 }, { "epoch": 0.7874455337690632, "grad_norm": 37.8613003717448, "learning_rate": 1.0515865587520938e-07, "logits/chosen": 14.78016471862793, "logits/rejected": 14.855768203735352, "logps/chosen": -4.661127090454102, "logps/rejected": -4.619752407073975, "loss": 3.9111, "rewards/accuracies": 0.5, "rewards/chosen": -46.61127471923828, "rewards/margins": -0.41374969482421875, "rewards/rejected": -46.19752502441406, "step": 5783 }, { "epoch": 0.7875816993464052, "grad_norm": 38.49455185475536, "learning_rate": 1.050301961936551e-07, "logits/chosen": 13.90068244934082, "logits/rejected": 14.138617515563965, "logps/chosen": -4.179279327392578, "logps/rejected": -4.656304836273193, "loss": 3.5955, "rewards/accuracies": 1.0, "rewards/chosen": -41.79279327392578, "rewards/margins": 4.770256042480469, "rewards/rejected": -46.56304931640625, "step": 5784 }, { "epoch": 0.7877178649237473, "grad_norm": 38.296853307899546, "learning_rate": 1.0490180316296613e-07, "logits/chosen": 14.69919490814209, "logits/rejected": 14.944193840026855, "logps/chosen": -4.808440208435059, "logps/rejected": -4.8366265296936035, "loss": 3.8111, "rewards/accuracies": 0.5, "rewards/chosen": -48.08440017700195, "rewards/margins": 0.28186511993408203, "rewards/rejected": -48.36626434326172, "step": 5785 }, { "epoch": 0.7878540305010894, "grad_norm": 40.525020103996695, "learning_rate": 1.0477347681215402e-07, "logits/chosen": 13.52310562133789, "logits/rejected": 13.493261337280273, "logps/chosen": -4.210048198699951, "logps/rejected": -4.47447395324707, "loss": 4.3915, "rewards/accuracies": 0.75, "rewards/chosen": -42.10048294067383, "rewards/margins": 2.6442575454711914, "rewards/rejected": -44.7447395324707, "step": 5786 }, { "epoch": 0.7879901960784313, "grad_norm": 42.83202388248674, "learning_rate": 1.0464521717021524e-07, "logits/chosen": 14.643061637878418, "logits/rejected": 15.265146255493164, "logps/chosen": -4.747842788696289, "logps/rejected": -4.803824424743652, "loss": 3.9033, "rewards/accuracies": 0.5, "rewards/chosen": -47.47842788696289, "rewards/margins": 0.5598134994506836, "rewards/rejected": -48.03824234008789, "step": 5787 }, { "epoch": 0.7881263616557734, "grad_norm": 39.92459491646469, "learning_rate": 1.0451702426613116e-07, "logits/chosen": 13.863190650939941, "logits/rejected": 13.924163818359375, "logps/chosen": -4.071383953094482, "logps/rejected": -4.455746173858643, "loss": 3.5038, "rewards/accuracies": 1.0, "rewards/chosen": -40.713836669921875, "rewards/margins": 3.843623161315918, "rewards/rejected": -44.55746078491211, "step": 5788 }, { "epoch": 0.7882625272331155, "grad_norm": 39.00381175530067, "learning_rate": 1.0438889812886777e-07, "logits/chosen": 14.057146072387695, "logits/rejected": 14.322091102600098, "logps/chosen": -3.814120054244995, "logps/rejected": -4.352484226226807, "loss": 3.9499, "rewards/accuracies": 1.0, "rewards/chosen": -38.141197204589844, "rewards/margins": 5.38364315032959, "rewards/rejected": -43.52484130859375, "step": 5789 }, { "epoch": 0.7883986928104575, "grad_norm": 50.970315316441734, "learning_rate": 1.0426083878737646e-07, "logits/chosen": 14.18736457824707, "logits/rejected": 14.924881935119629, "logps/chosen": -4.503000259399414, "logps/rejected": -4.932685852050781, "loss": 4.2091, "rewards/accuracies": 0.75, "rewards/chosen": -45.030006408691406, "rewards/margins": 4.296856880187988, "rewards/rejected": -49.32685852050781, "step": 5790 }, { "epoch": 0.7885348583877996, "grad_norm": 42.47862374352082, "learning_rate": 1.0413284627059331e-07, "logits/chosen": 13.776820182800293, "logits/rejected": 13.911083221435547, "logps/chosen": -4.552273750305176, "logps/rejected": -4.658480644226074, "loss": 4.141, "rewards/accuracies": 1.0, "rewards/chosen": -45.52273941040039, "rewards/margins": 1.0620660781860352, "rewards/rejected": -46.58480453491211, "step": 5791 }, { "epoch": 0.7886710239651417, "grad_norm": 37.70836618848202, "learning_rate": 1.040049206074391e-07, "logits/chosen": 14.198823928833008, "logits/rejected": 14.611043930053711, "logps/chosen": -4.559943199157715, "logps/rejected": -4.529068470001221, "loss": 3.3371, "rewards/accuracies": 0.5, "rewards/chosen": -45.599430084228516, "rewards/margins": -0.3087434768676758, "rewards/rejected": -45.290687561035156, "step": 5792 }, { "epoch": 0.7888071895424836, "grad_norm": 35.09787552848249, "learning_rate": 1.0387706182681984e-07, "logits/chosen": 14.339714050292969, "logits/rejected": 14.642908096313477, "logps/chosen": -4.544496536254883, "logps/rejected": -4.945732116699219, "loss": 3.7506, "rewards/accuracies": 0.75, "rewards/chosen": -45.44496536254883, "rewards/margins": 4.012353897094727, "rewards/rejected": -49.45732116699219, "step": 5793 }, { "epoch": 0.7889433551198257, "grad_norm": 40.47085492583201, "learning_rate": 1.0374926995762616e-07, "logits/chosen": 13.945158958435059, "logits/rejected": 13.68371868133545, "logps/chosen": -4.611728668212891, "logps/rejected": -4.386756420135498, "loss": 4.1778, "rewards/accuracies": 0.25, "rewards/chosen": -46.11728286743164, "rewards/margins": -2.2497196197509766, "rewards/rejected": -43.8675651550293, "step": 5794 }, { "epoch": 0.7890795206971678, "grad_norm": 39.41015464718567, "learning_rate": 1.0362154502873393e-07, "logits/chosen": 14.018692016601562, "logits/rejected": 14.31936264038086, "logps/chosen": -4.291994571685791, "logps/rejected": -4.6011152267456055, "loss": 3.6664, "rewards/accuracies": 0.75, "rewards/chosen": -42.919944763183594, "rewards/margins": 3.0912046432495117, "rewards/rejected": -46.011146545410156, "step": 5795 }, { "epoch": 0.7892156862745098, "grad_norm": 40.418270395579135, "learning_rate": 1.0349388706900337e-07, "logits/chosen": 14.240791320800781, "logits/rejected": 14.43313217163086, "logps/chosen": -4.199864387512207, "logps/rejected": -4.529167175292969, "loss": 4.3775, "rewards/accuracies": 0.75, "rewards/chosen": -41.99864196777344, "rewards/margins": 3.2930288314819336, "rewards/rejected": -45.29167175292969, "step": 5796 }, { "epoch": 0.7893518518518519, "grad_norm": 40.96758176599042, "learning_rate": 1.0336629610727995e-07, "logits/chosen": 14.00782585144043, "logits/rejected": 15.38679313659668, "logps/chosen": -4.541437149047852, "logps/rejected": -4.8277177810668945, "loss": 3.9613, "rewards/accuracies": 0.75, "rewards/chosen": -45.41436767578125, "rewards/margins": 2.862812042236328, "rewards/rejected": -48.277183532714844, "step": 5797 }, { "epoch": 0.789488017429194, "grad_norm": 42.39348066684436, "learning_rate": 1.032387721723941e-07, "logits/chosen": 14.579242706298828, "logits/rejected": 14.256845474243164, "logps/chosen": -4.795101642608643, "logps/rejected": -4.700182914733887, "loss": 4.2328, "rewards/accuracies": 0.5, "rewards/chosen": -47.951019287109375, "rewards/margins": -0.9491853713989258, "rewards/rejected": -47.0018310546875, "step": 5798 }, { "epoch": 0.7896241830065359, "grad_norm": 38.830056480569034, "learning_rate": 1.0311131529316055e-07, "logits/chosen": 14.20687484741211, "logits/rejected": 14.885711669921875, "logps/chosen": -4.547031402587891, "logps/rejected": -4.546588897705078, "loss": 3.6234, "rewards/accuracies": 0.5, "rewards/chosen": -45.470314025878906, "rewards/margins": -0.004420280456542969, "rewards/rejected": -45.46588897705078, "step": 5799 }, { "epoch": 0.789760348583878, "grad_norm": 44.469617277226355, "learning_rate": 1.0298392549837944e-07, "logits/chosen": 14.246052742004395, "logits/rejected": 15.090824127197266, "logps/chosen": -4.5352783203125, "logps/rejected": -4.751448631286621, "loss": 4.1636, "rewards/accuracies": 0.5, "rewards/chosen": -45.352783203125, "rewards/margins": 2.161701202392578, "rewards/rejected": -47.51448440551758, "step": 5800 }, { "epoch": 0.7898965141612201, "grad_norm": 42.03849025529932, "learning_rate": 1.0285660281683553e-07, "logits/chosen": 13.050131797790527, "logits/rejected": 14.184206008911133, "logps/chosen": -4.060085296630859, "logps/rejected": -4.456652641296387, "loss": 4.0785, "rewards/accuracies": 1.0, "rewards/chosen": -40.600852966308594, "rewards/margins": 3.9656782150268555, "rewards/rejected": -44.5665283203125, "step": 5801 }, { "epoch": 0.7900326797385621, "grad_norm": 39.63082216296997, "learning_rate": 1.0272934727729854e-07, "logits/chosen": 13.94675064086914, "logits/rejected": 15.227439880371094, "logps/chosen": -4.106241226196289, "logps/rejected": -4.566038131713867, "loss": 3.6908, "rewards/accuracies": 1.0, "rewards/chosen": -41.062408447265625, "rewards/margins": 4.597969055175781, "rewards/rejected": -45.660377502441406, "step": 5802 }, { "epoch": 0.7901688453159041, "grad_norm": 39.88675572002847, "learning_rate": 1.0260215890852268e-07, "logits/chosen": 14.721221923828125, "logits/rejected": 14.735990524291992, "logps/chosen": -4.280930519104004, "logps/rejected": -4.5864057540893555, "loss": 4.0997, "rewards/accuracies": 0.75, "rewards/chosen": -42.809303283691406, "rewards/margins": 3.0547542572021484, "rewards/rejected": -45.86405944824219, "step": 5803 }, { "epoch": 0.7903050108932462, "grad_norm": 57.85129508122199, "learning_rate": 1.0247503773924733e-07, "logits/chosen": 14.300247192382812, "logits/rejected": 14.084145545959473, "logps/chosen": -4.490939617156982, "logps/rejected": -4.182397842407227, "loss": 4.4799, "rewards/accuracies": 0.25, "rewards/chosen": -44.909393310546875, "rewards/margins": -3.085420608520508, "rewards/rejected": -41.823974609375, "step": 5804 }, { "epoch": 0.7904411764705882, "grad_norm": 40.695380413014824, "learning_rate": 1.0234798379819664e-07, "logits/chosen": 14.158851623535156, "logits/rejected": 14.843473434448242, "logps/chosen": -4.392067909240723, "logps/rejected": -4.840124130249023, "loss": 3.6529, "rewards/accuracies": 1.0, "rewards/chosen": -43.92068099975586, "rewards/margins": 4.480562210083008, "rewards/rejected": -48.401241302490234, "step": 5805 }, { "epoch": 0.7905773420479303, "grad_norm": 39.938257008712995, "learning_rate": 1.0222099711407937e-07, "logits/chosen": 14.252684593200684, "logits/rejected": 14.741884231567383, "logps/chosen": -4.746723651885986, "logps/rejected": -5.114449977874756, "loss": 3.7579, "rewards/accuracies": 0.75, "rewards/chosen": -47.46723556518555, "rewards/margins": 3.6772642135620117, "rewards/rejected": -51.144500732421875, "step": 5806 }, { "epoch": 0.7907135076252724, "grad_norm": 39.618826406646136, "learning_rate": 1.0209407771558924e-07, "logits/chosen": 15.038471221923828, "logits/rejected": 14.609299659729004, "logps/chosen": -4.588553428649902, "logps/rejected": -4.750272274017334, "loss": 3.5653, "rewards/accuracies": 0.75, "rewards/chosen": -45.885536193847656, "rewards/margins": 1.6171855926513672, "rewards/rejected": -47.502716064453125, "step": 5807 }, { "epoch": 0.7908496732026143, "grad_norm": 40.42278129568004, "learning_rate": 1.0196722563140489e-07, "logits/chosen": 14.632879257202148, "logits/rejected": 14.233685493469238, "logps/chosen": -4.470579624176025, "logps/rejected": -4.462182998657227, "loss": 4.0819, "rewards/accuracies": 0.25, "rewards/chosen": -44.70579147338867, "rewards/margins": -0.08396530151367188, "rewards/rejected": -44.621826171875, "step": 5808 }, { "epoch": 0.7909858387799564, "grad_norm": 41.09999200293522, "learning_rate": 1.018404408901894e-07, "logits/chosen": 14.203634262084961, "logits/rejected": 14.87432861328125, "logps/chosen": -4.418307304382324, "logps/rejected": -4.586682319641113, "loss": 3.9375, "rewards/accuracies": 1.0, "rewards/chosen": -44.18307113647461, "rewards/margins": 1.683751106262207, "rewards/rejected": -45.8668212890625, "step": 5809 }, { "epoch": 0.7911220043572985, "grad_norm": 79.88191352664883, "learning_rate": 1.0171372352059084e-07, "logits/chosen": 15.243535995483398, "logits/rejected": 15.338539123535156, "logps/chosen": -4.655713081359863, "logps/rejected": -4.770907878875732, "loss": 4.2712, "rewards/accuracies": 0.75, "rewards/chosen": -46.55712890625, "rewards/margins": 1.151951789855957, "rewards/rejected": -47.70907974243164, "step": 5810 }, { "epoch": 0.7912581699346405, "grad_norm": 41.61219803901829, "learning_rate": 1.0158707355124225e-07, "logits/chosen": 13.568239212036133, "logits/rejected": 14.516918182373047, "logps/chosen": -4.158367156982422, "logps/rejected": -4.355015754699707, "loss": 4.3105, "rewards/accuracies": 0.75, "rewards/chosen": -41.58367156982422, "rewards/margins": 1.9664859771728516, "rewards/rejected": -43.5501594543457, "step": 5811 }, { "epoch": 0.7913943355119826, "grad_norm": 42.069606598331596, "learning_rate": 1.0146049101076095e-07, "logits/chosen": 15.067956924438477, "logits/rejected": 14.784811019897461, "logps/chosen": -4.8585205078125, "logps/rejected": -4.712027549743652, "loss": 4.2983, "rewards/accuracies": 0.25, "rewards/chosen": -48.585205078125, "rewards/margins": -1.4649314880371094, "rewards/rejected": -47.12027359008789, "step": 5812 }, { "epoch": 0.7915305010893247, "grad_norm": 42.82082826845427, "learning_rate": 1.0133397592774952e-07, "logits/chosen": 13.967920303344727, "logits/rejected": 13.617916107177734, "logps/chosen": -4.186651706695557, "logps/rejected": -4.385491371154785, "loss": 3.7741, "rewards/accuracies": 0.75, "rewards/chosen": -41.86651611328125, "rewards/margins": 1.9883966445922852, "rewards/rejected": -43.85491180419922, "step": 5813 }, { "epoch": 0.7916666666666666, "grad_norm": 39.144173794610175, "learning_rate": 1.0120752833079511e-07, "logits/chosen": 14.228242874145508, "logits/rejected": 13.68808650970459, "logps/chosen": -4.550027370452881, "logps/rejected": -4.353416442871094, "loss": 3.882, "rewards/accuracies": 0.25, "rewards/chosen": -45.500274658203125, "rewards/margins": -1.9661083221435547, "rewards/rejected": -43.5341682434082, "step": 5814 }, { "epoch": 0.7918028322440087, "grad_norm": 43.807971629413224, "learning_rate": 1.0108114824846938e-07, "logits/chosen": 14.449573516845703, "logits/rejected": 15.302412986755371, "logps/chosen": -4.512302875518799, "logps/rejected": -4.944786071777344, "loss": 3.2668, "rewards/accuracies": 0.75, "rewards/chosen": -45.12303161621094, "rewards/margins": 4.324832916259766, "rewards/rejected": -49.44786071777344, "step": 5815 }, { "epoch": 0.7919389978213508, "grad_norm": 41.925216227257316, "learning_rate": 1.0095483570932915e-07, "logits/chosen": 14.593536376953125, "logits/rejected": 14.833736419677734, "logps/chosen": -4.325716972351074, "logps/rejected": -4.663734436035156, "loss": 4.1175, "rewards/accuracies": 0.75, "rewards/chosen": -43.25716781616211, "rewards/margins": 3.3801746368408203, "rewards/rejected": -46.63734436035156, "step": 5816 }, { "epoch": 0.7920751633986928, "grad_norm": 45.343043131254944, "learning_rate": 1.0082859074191579e-07, "logits/chosen": 14.196985244750977, "logits/rejected": 14.897207260131836, "logps/chosen": -4.363272666931152, "logps/rejected": -5.211804389953613, "loss": 4.3215, "rewards/accuracies": 1.0, "rewards/chosen": -43.632728576660156, "rewards/margins": 8.485316276550293, "rewards/rejected": -52.1180419921875, "step": 5817 }, { "epoch": 0.7922113289760349, "grad_norm": 41.98209375628557, "learning_rate": 1.007024133747552e-07, "logits/chosen": 13.408537864685059, "logits/rejected": 14.612205505371094, "logps/chosen": -4.276849269866943, "logps/rejected": -4.701637268066406, "loss": 3.9799, "rewards/accuracies": 0.5, "rewards/chosen": -42.76849365234375, "rewards/margins": 4.2478790283203125, "rewards/rejected": -47.01637268066406, "step": 5818 }, { "epoch": 0.7923474945533769, "grad_norm": 39.57967539019255, "learning_rate": 1.0057630363635836e-07, "logits/chosen": 13.995147705078125, "logits/rejected": 14.306150436401367, "logps/chosen": -4.614133834838867, "logps/rejected": -4.783923149108887, "loss": 3.6583, "rewards/accuracies": 0.5, "rewards/chosen": -46.14134216308594, "rewards/margins": 1.6978912353515625, "rewards/rejected": -47.8392333984375, "step": 5819 }, { "epoch": 0.7924836601307189, "grad_norm": 46.086957639550405, "learning_rate": 1.0045026155522087e-07, "logits/chosen": 14.862449645996094, "logits/rejected": 14.80240249633789, "logps/chosen": -4.641191482543945, "logps/rejected": -4.592751502990723, "loss": 4.1325, "rewards/accuracies": 0.5, "rewards/chosen": -46.41191864013672, "rewards/margins": -0.4844036102294922, "rewards/rejected": -45.927513122558594, "step": 5820 }, { "epoch": 0.792619825708061, "grad_norm": 49.6733647423429, "learning_rate": 1.003242871598228e-07, "logits/chosen": 14.010271072387695, "logits/rejected": 14.209579467773438, "logps/chosen": -4.547964096069336, "logps/rejected": -4.71238899230957, "loss": 3.8483, "rewards/accuracies": 0.75, "rewards/chosen": -45.479644775390625, "rewards/margins": 1.6442451477050781, "rewards/rejected": -47.1238899230957, "step": 5821 }, { "epoch": 0.7927559912854031, "grad_norm": 42.31863109531816, "learning_rate": 1.0019838047862919e-07, "logits/chosen": 13.569938659667969, "logits/rejected": 14.54088020324707, "logps/chosen": -4.159456729888916, "logps/rejected": -4.645138263702393, "loss": 4.2584, "rewards/accuracies": 0.75, "rewards/chosen": -41.594566345214844, "rewards/margins": 4.856816291809082, "rewards/rejected": -46.451385498046875, "step": 5822 }, { "epoch": 0.7928921568627451, "grad_norm": 40.54331855945013, "learning_rate": 1.0007254154008977e-07, "logits/chosen": 13.83265209197998, "logits/rejected": 14.627998352050781, "logps/chosen": -4.294297695159912, "logps/rejected": -4.634242057800293, "loss": 3.4526, "rewards/accuracies": 0.75, "rewards/chosen": -42.94297790527344, "rewards/margins": 3.399444580078125, "rewards/rejected": -46.34242248535156, "step": 5823 }, { "epoch": 0.7930283224400871, "grad_norm": 44.092415233204946, "learning_rate": 9.994677037263874e-08, "logits/chosen": 14.460371017456055, "logits/rejected": 14.259401321411133, "logps/chosen": -4.330967903137207, "logps/rejected": -4.3543806076049805, "loss": 3.7604, "rewards/accuracies": 0.25, "rewards/chosen": -43.30967712402344, "rewards/margins": 0.2341289520263672, "rewards/rejected": -43.54380798339844, "step": 5824 }, { "epoch": 0.7931644880174292, "grad_norm": 42.700840065393834, "learning_rate": 9.982106700469519e-08, "logits/chosen": 13.992910385131836, "logits/rejected": 14.789518356323242, "logps/chosen": -4.280214309692383, "logps/rejected": -4.664754867553711, "loss": 3.7729, "rewards/accuracies": 0.75, "rewards/chosen": -42.802146911621094, "rewards/margins": 3.845407485961914, "rewards/rejected": -46.647552490234375, "step": 5825 }, { "epoch": 0.7933006535947712, "grad_norm": 39.09346914324886, "learning_rate": 9.969543146466297e-08, "logits/chosen": 14.08149528503418, "logits/rejected": 14.720149040222168, "logps/chosen": -4.652085304260254, "logps/rejected": -4.592024803161621, "loss": 3.8293, "rewards/accuracies": 0.5, "rewards/chosen": -46.52085494995117, "rewards/margins": -0.6006088256835938, "rewards/rejected": -45.92024612426758, "step": 5826 }, { "epoch": 0.7934368191721133, "grad_norm": 40.29479157634522, "learning_rate": 9.956986378093022e-08, "logits/chosen": 15.299986839294434, "logits/rejected": 15.417306900024414, "logps/chosen": -5.039063453674316, "logps/rejected": -5.150848865509033, "loss": 3.9259, "rewards/accuracies": 0.5, "rewards/chosen": -50.3906364440918, "rewards/margins": 1.117849349975586, "rewards/rejected": -51.50848388671875, "step": 5827 }, { "epoch": 0.7935729847494554, "grad_norm": 41.258927061554544, "learning_rate": 9.944436398187014e-08, "logits/chosen": 14.016481399536133, "logits/rejected": 14.233833312988281, "logps/chosen": -4.167896270751953, "logps/rejected": -4.471177577972412, "loss": 3.2549, "rewards/accuracies": 1.0, "rewards/chosen": -41.6789665222168, "rewards/margins": 3.0328102111816406, "rewards/rejected": -44.71177673339844, "step": 5828 }, { "epoch": 0.7937091503267973, "grad_norm": 46.150159850393045, "learning_rate": 9.931893209584061e-08, "logits/chosen": 13.848094940185547, "logits/rejected": 14.365188598632812, "logps/chosen": -4.38284969329834, "logps/rejected": -4.706478118896484, "loss": 4.6306, "rewards/accuracies": 0.75, "rewards/chosen": -43.8284912109375, "rewards/margins": 3.236283302307129, "rewards/rejected": -47.06477355957031, "step": 5829 }, { "epoch": 0.7938453159041394, "grad_norm": 40.05914481124227, "learning_rate": 9.919356815118364e-08, "logits/chosen": 14.938777923583984, "logits/rejected": 15.253774642944336, "logps/chosen": -4.452754497528076, "logps/rejected": -4.7733988761901855, "loss": 3.838, "rewards/accuracies": 0.75, "rewards/chosen": -44.52754592895508, "rewards/margins": 3.206441879272461, "rewards/rejected": -47.73398971557617, "step": 5830 }, { "epoch": 0.7939814814814815, "grad_norm": 41.99560430789122, "learning_rate": 9.906827217622647e-08, "logits/chosen": 13.325267791748047, "logits/rejected": 14.189290046691895, "logps/chosen": -3.943863868713379, "logps/rejected": -4.4481377601623535, "loss": 3.7514, "rewards/accuracies": 1.0, "rewards/chosen": -39.438636779785156, "rewards/margins": 5.042740821838379, "rewards/rejected": -44.48137664794922, "step": 5831 }, { "epoch": 0.7941176470588235, "grad_norm": 41.665974984432566, "learning_rate": 9.894304419928086e-08, "logits/chosen": 14.324846267700195, "logits/rejected": 14.49236011505127, "logps/chosen": -4.8837056159973145, "logps/rejected": -4.816578388214111, "loss": 3.9076, "rewards/accuracies": 0.5, "rewards/chosen": -48.83705520629883, "rewards/margins": -0.6712703704833984, "rewards/rejected": -48.1657829284668, "step": 5832 }, { "epoch": 0.7942538126361656, "grad_norm": 41.75920270092453, "learning_rate": 9.881788424864281e-08, "logits/chosen": 14.738384246826172, "logits/rejected": 14.837425231933594, "logps/chosen": -4.5990400314331055, "logps/rejected": -4.740792274475098, "loss": 3.8393, "rewards/accuracies": 0.75, "rewards/chosen": -45.99040222167969, "rewards/margins": 1.417520523071289, "rewards/rejected": -47.407920837402344, "step": 5833 }, { "epoch": 0.7943899782135077, "grad_norm": 41.22191543951036, "learning_rate": 9.869279235259345e-08, "logits/chosen": 13.874187469482422, "logits/rejected": 14.532974243164062, "logps/chosen": -4.522237777709961, "logps/rejected": -4.7148847579956055, "loss": 4.2664, "rewards/accuracies": 0.75, "rewards/chosen": -45.222373962402344, "rewards/margins": 1.9264717102050781, "rewards/rejected": -47.14884567260742, "step": 5834 }, { "epoch": 0.7945261437908496, "grad_norm": 43.71417743410763, "learning_rate": 9.856776853939837e-08, "logits/chosen": 14.355623245239258, "logits/rejected": 14.387359619140625, "logps/chosen": -4.272115707397461, "logps/rejected": -4.615034103393555, "loss": 3.6715, "rewards/accuracies": 1.0, "rewards/chosen": -42.721160888671875, "rewards/margins": 3.4291820526123047, "rewards/rejected": -46.15034484863281, "step": 5835 }, { "epoch": 0.7946623093681917, "grad_norm": 45.95160457112131, "learning_rate": 9.844281283730755e-08, "logits/chosen": 14.757912635803223, "logits/rejected": 14.963712692260742, "logps/chosen": -4.754575729370117, "logps/rejected": -4.731060028076172, "loss": 4.1175, "rewards/accuracies": 0.5, "rewards/chosen": -47.54575729370117, "rewards/margins": -0.23515796661376953, "rewards/rejected": -47.31060028076172, "step": 5836 }, { "epoch": 0.7947984749455338, "grad_norm": 45.06834372493336, "learning_rate": 9.83179252745559e-08, "logits/chosen": 13.482870101928711, "logits/rejected": 14.115768432617188, "logps/chosen": -4.439128398895264, "logps/rejected": -4.680212497711182, "loss": 4.5997, "rewards/accuracies": 0.75, "rewards/chosen": -44.39128875732422, "rewards/margins": 2.410834312438965, "rewards/rejected": -46.802120208740234, "step": 5837 }, { "epoch": 0.7949346405228758, "grad_norm": 40.798999067922445, "learning_rate": 9.819310587936285e-08, "logits/chosen": 14.60239028930664, "logits/rejected": 14.909235000610352, "logps/chosen": -4.419053077697754, "logps/rejected": -4.456652641296387, "loss": 3.9594, "rewards/accuracies": 0.25, "rewards/chosen": -44.190528869628906, "rewards/margins": 0.37600040435791016, "rewards/rejected": -44.5665283203125, "step": 5838 }, { "epoch": 0.7950708061002179, "grad_norm": 39.764225689744414, "learning_rate": 9.806835467993217e-08, "logits/chosen": 14.080318450927734, "logits/rejected": 14.836584091186523, "logps/chosen": -4.503899574279785, "logps/rejected": -4.6735968589782715, "loss": 3.3008, "rewards/accuracies": 0.75, "rewards/chosen": -45.03899383544922, "rewards/margins": 1.6969738006591797, "rewards/rejected": -46.73596954345703, "step": 5839 }, { "epoch": 0.7952069716775599, "grad_norm": 41.98978905320982, "learning_rate": 9.794367170445257e-08, "logits/chosen": 13.795818328857422, "logits/rejected": 15.03414535522461, "logps/chosen": -4.2529401779174805, "logps/rejected": -4.7895402908325195, "loss": 4.0123, "rewards/accuracies": 0.75, "rewards/chosen": -42.52939987182617, "rewards/margins": 5.366002082824707, "rewards/rejected": -47.89540100097656, "step": 5840 }, { "epoch": 0.7953431372549019, "grad_norm": 40.72217509651744, "learning_rate": 9.781905698109722e-08, "logits/chosen": 14.381330490112305, "logits/rejected": 15.219263076782227, "logps/chosen": -4.358331680297852, "logps/rejected": -4.909947872161865, "loss": 4.0766, "rewards/accuracies": 1.0, "rewards/chosen": -43.583316802978516, "rewards/margins": 5.516160011291504, "rewards/rejected": -49.0994758605957, "step": 5841 }, { "epoch": 0.795479302832244, "grad_norm": 40.15867983527642, "learning_rate": 9.769451053802376e-08, "logits/chosen": 13.763043403625488, "logits/rejected": 14.267634391784668, "logps/chosen": -4.449834823608398, "logps/rejected": -4.637916088104248, "loss": 3.8497, "rewards/accuracies": 0.75, "rewards/chosen": -44.498348236083984, "rewards/margins": 1.8808135986328125, "rewards/rejected": -46.3791618347168, "step": 5842 }, { "epoch": 0.7956154684095861, "grad_norm": 38.176710240912016, "learning_rate": 9.75700324033745e-08, "logits/chosen": 13.760004043579102, "logits/rejected": 13.971495628356934, "logps/chosen": -4.29315710067749, "logps/rejected": -4.412778854370117, "loss": 3.6478, "rewards/accuracies": 0.75, "rewards/chosen": -42.93157196044922, "rewards/margins": 1.196213722229004, "rewards/rejected": -44.12778854370117, "step": 5843 }, { "epoch": 0.795751633986928, "grad_norm": 41.88432134373929, "learning_rate": 9.744562260527645e-08, "logits/chosen": 14.103582382202148, "logits/rejected": 14.142251968383789, "logps/chosen": -4.274663925170898, "logps/rejected": -4.451421737670898, "loss": 4.2301, "rewards/accuracies": 0.75, "rewards/chosen": -42.746639251708984, "rewards/margins": 1.7675714492797852, "rewards/rejected": -44.51421356201172, "step": 5844 }, { "epoch": 0.7958877995642701, "grad_norm": 42.62908706689904, "learning_rate": 9.732128117184078e-08, "logits/chosen": 15.160669326782227, "logits/rejected": 16.017305374145508, "logps/chosen": -4.144685745239258, "logps/rejected": -4.525174140930176, "loss": 3.9692, "rewards/accuracies": 0.75, "rewards/chosen": -41.446861267089844, "rewards/margins": 3.804884910583496, "rewards/rejected": -45.25174331665039, "step": 5845 }, { "epoch": 0.7960239651416122, "grad_norm": 47.24470382874557, "learning_rate": 9.719700813116358e-08, "logits/chosen": 14.18325138092041, "logits/rejected": 14.6276273727417, "logps/chosen": -4.66911506652832, "logps/rejected": -4.535199165344238, "loss": 4.5513, "rewards/accuracies": 0.25, "rewards/chosen": -46.69114685058594, "rewards/margins": -1.339156150817871, "rewards/rejected": -45.35198974609375, "step": 5846 }, { "epoch": 0.7961601307189542, "grad_norm": 41.978121470488475, "learning_rate": 9.707280351132552e-08, "logits/chosen": 14.111148834228516, "logits/rejected": 14.136831283569336, "logps/chosen": -4.8723978996276855, "logps/rejected": -5.105135440826416, "loss": 4.0694, "rewards/accuracies": 0.5, "rewards/chosen": -48.723976135253906, "rewards/margins": 2.3273754119873047, "rewards/rejected": -51.051353454589844, "step": 5847 }, { "epoch": 0.7962962962962963, "grad_norm": 38.70962241274074, "learning_rate": 9.694866734039143e-08, "logits/chosen": 13.561935424804688, "logits/rejected": 13.925008773803711, "logps/chosen": -4.171435832977295, "logps/rejected": -4.593183994293213, "loss": 3.5259, "rewards/accuracies": 0.75, "rewards/chosen": -41.71435546875, "rewards/margins": 4.217482566833496, "rewards/rejected": -45.93183898925781, "step": 5848 }, { "epoch": 0.7964324618736384, "grad_norm": 40.1832139578432, "learning_rate": 9.682459964641099e-08, "logits/chosen": 13.915558815002441, "logits/rejected": 14.069074630737305, "logps/chosen": -4.427026748657227, "logps/rejected": -4.588312149047852, "loss": 3.9929, "rewards/accuracies": 0.75, "rewards/chosen": -44.270267486572266, "rewards/margins": 1.6128501892089844, "rewards/rejected": -45.88311767578125, "step": 5849 }, { "epoch": 0.7965686274509803, "grad_norm": 42.21207604642969, "learning_rate": 9.670060045741846e-08, "logits/chosen": 14.462503433227539, "logits/rejected": 15.283041000366211, "logps/chosen": -4.673435688018799, "logps/rejected": -4.829197883605957, "loss": 4.1647, "rewards/accuracies": 0.75, "rewards/chosen": -46.734352111816406, "rewards/margins": 1.5576276779174805, "rewards/rejected": -48.29198455810547, "step": 5850 }, { "epoch": 0.7967047930283224, "grad_norm": 45.566706150070466, "learning_rate": 9.657666980143222e-08, "logits/chosen": 14.39228630065918, "logits/rejected": 14.338949203491211, "logps/chosen": -4.696768760681152, "logps/rejected": -4.707321643829346, "loss": 3.6956, "rewards/accuracies": 0.5, "rewards/chosen": -46.967689514160156, "rewards/margins": 0.10552406311035156, "rewards/rejected": -47.07321548461914, "step": 5851 }, { "epoch": 0.7968409586056645, "grad_norm": 41.87879155647783, "learning_rate": 9.645280770645556e-08, "logits/chosen": 14.684293746948242, "logits/rejected": 14.619462966918945, "logps/chosen": -4.854608058929443, "logps/rejected": -4.742032051086426, "loss": 3.7756, "rewards/accuracies": 0.5, "rewards/chosen": -48.546077728271484, "rewards/margins": -1.1257591247558594, "rewards/rejected": -47.420318603515625, "step": 5852 }, { "epoch": 0.7969771241830066, "grad_norm": 47.974728086000646, "learning_rate": 9.632901420047627e-08, "logits/chosen": 14.461532592773438, "logits/rejected": 14.811254501342773, "logps/chosen": -4.517449378967285, "logps/rejected": -4.873772621154785, "loss": 4.3113, "rewards/accuracies": 0.75, "rewards/chosen": -45.17449188232422, "rewards/margins": 3.563230514526367, "rewards/rejected": -48.73772430419922, "step": 5853 }, { "epoch": 0.7971132897603486, "grad_norm": 43.304274279039596, "learning_rate": 9.620528931146629e-08, "logits/chosen": 13.782112121582031, "logits/rejected": 13.878549575805664, "logps/chosen": -4.281286239624023, "logps/rejected": -4.140617847442627, "loss": 3.9091, "rewards/accuracies": 0.5, "rewards/chosen": -42.81285858154297, "rewards/margins": -1.406682014465332, "rewards/rejected": -41.40618133544922, "step": 5854 }, { "epoch": 0.7972494553376906, "grad_norm": 45.70944162772079, "learning_rate": 9.608163306738238e-08, "logits/chosen": 14.657562255859375, "logits/rejected": 14.502824783325195, "logps/chosen": -4.921667098999023, "logps/rejected": -4.947256565093994, "loss": 4.1483, "rewards/accuracies": 0.75, "rewards/chosen": -49.216670989990234, "rewards/margins": 0.25589466094970703, "rewards/rejected": -49.472564697265625, "step": 5855 }, { "epoch": 0.7973856209150327, "grad_norm": 41.30211079865235, "learning_rate": 9.59580454961658e-08, "logits/chosen": 14.38147258758545, "logits/rejected": 15.232626914978027, "logps/chosen": -4.634242057800293, "logps/rejected": -4.9128546714782715, "loss": 4.0804, "rewards/accuracies": 0.75, "rewards/chosen": -46.34242248535156, "rewards/margins": 2.7861242294311523, "rewards/rejected": -49.12854766845703, "step": 5856 }, { "epoch": 0.7975217864923747, "grad_norm": 60.258186853030864, "learning_rate": 9.583452662574196e-08, "logits/chosen": 14.6474609375, "logits/rejected": 14.088603973388672, "logps/chosen": -4.807809829711914, "logps/rejected": -4.683182716369629, "loss": 4.4116, "rewards/accuracies": 0.75, "rewards/chosen": -48.07809829711914, "rewards/margins": -1.246272087097168, "rewards/rejected": -46.831825256347656, "step": 5857 }, { "epoch": 0.7976579520697168, "grad_norm": 40.264663890692724, "learning_rate": 9.571107648402108e-08, "logits/chosen": 15.22846794128418, "logits/rejected": 15.348053932189941, "logps/chosen": -4.906325340270996, "logps/rejected": -5.051329135894775, "loss": 3.9333, "rewards/accuracies": 0.75, "rewards/chosen": -49.063255310058594, "rewards/margins": 1.4500360488891602, "rewards/rejected": -50.51329040527344, "step": 5858 }, { "epoch": 0.7977941176470589, "grad_norm": 39.59451690250948, "learning_rate": 9.558769509889786e-08, "logits/chosen": 12.827703475952148, "logits/rejected": 13.881481170654297, "logps/chosen": -4.073131084442139, "logps/rejected": -4.510995864868164, "loss": 4.1422, "rewards/accuracies": 0.75, "rewards/chosen": -40.73130798339844, "rewards/margins": 4.378650665283203, "rewards/rejected": -45.10995864868164, "step": 5859 }, { "epoch": 0.7979302832244008, "grad_norm": 39.606398923454385, "learning_rate": 9.54643824982511e-08, "logits/chosen": 14.244580268859863, "logits/rejected": 14.190776824951172, "logps/chosen": -4.408082962036133, "logps/rejected": -4.843160629272461, "loss": 3.7196, "rewards/accuracies": 0.75, "rewards/chosen": -44.08082962036133, "rewards/margins": 4.350773811340332, "rewards/rejected": -48.431602478027344, "step": 5860 }, { "epoch": 0.7980664488017429, "grad_norm": 47.59178558836488, "learning_rate": 9.53411387099445e-08, "logits/chosen": 13.39842414855957, "logits/rejected": 13.218693733215332, "logps/chosen": -4.180980682373047, "logps/rejected": -4.421832084655762, "loss": 4.166, "rewards/accuracies": 0.75, "rewards/chosen": -41.80980682373047, "rewards/margins": 2.4085121154785156, "rewards/rejected": -44.21832275390625, "step": 5861 }, { "epoch": 0.798202614379085, "grad_norm": 43.18481238283056, "learning_rate": 9.521796376182601e-08, "logits/chosen": 14.022619247436523, "logits/rejected": 14.451014518737793, "logps/chosen": -4.330627918243408, "logps/rejected": -4.737526893615723, "loss": 4.4293, "rewards/accuracies": 1.0, "rewards/chosen": -43.3062744140625, "rewards/margins": 4.06899356842041, "rewards/rejected": -47.37527084350586, "step": 5862 }, { "epoch": 0.798338779956427, "grad_norm": 43.30877697814459, "learning_rate": 9.509485768172783e-08, "logits/chosen": 14.41099739074707, "logits/rejected": 14.23404312133789, "logps/chosen": -4.663674354553223, "logps/rejected": -4.555194854736328, "loss": 4.3564, "rewards/accuracies": 0.5, "rewards/chosen": -46.636741638183594, "rewards/margins": -1.0847949981689453, "rewards/rejected": -45.55194854736328, "step": 5863 }, { "epoch": 0.7984749455337691, "grad_norm": 40.148642964806406, "learning_rate": 9.497182049746694e-08, "logits/chosen": 14.236711502075195, "logits/rejected": 14.927173614501953, "logps/chosen": -4.485424995422363, "logps/rejected": -4.8047566413879395, "loss": 4.0976, "rewards/accuracies": 0.75, "rewards/chosen": -44.854248046875, "rewards/margins": 3.1933202743530273, "rewards/rejected": -48.04756546020508, "step": 5864 }, { "epoch": 0.7986111111111112, "grad_norm": 47.051564148101896, "learning_rate": 9.484885223684473e-08, "logits/chosen": 13.887377738952637, "logits/rejected": 14.389601707458496, "logps/chosen": -4.424234390258789, "logps/rejected": -4.6499128341674805, "loss": 4.258, "rewards/accuracies": 0.5, "rewards/chosen": -44.242347717285156, "rewards/margins": 2.256784439086914, "rewards/rejected": -46.49913024902344, "step": 5865 }, { "epoch": 0.7987472766884531, "grad_norm": 39.13338952397825, "learning_rate": 9.472595292764664e-08, "logits/chosen": 14.824082374572754, "logits/rejected": 14.894094467163086, "logps/chosen": -4.71682596206665, "logps/rejected": -4.86060094833374, "loss": 3.9826, "rewards/accuracies": 0.5, "rewards/chosen": -47.16825866699219, "rewards/margins": 1.4377517700195312, "rewards/rejected": -48.60600662231445, "step": 5866 }, { "epoch": 0.7988834422657952, "grad_norm": 42.10231801127253, "learning_rate": 9.460312259764291e-08, "logits/chosen": 14.07841968536377, "logits/rejected": 14.089132308959961, "logps/chosen": -4.359786033630371, "logps/rejected": -4.3961005210876465, "loss": 4.2018, "rewards/accuracies": 0.5, "rewards/chosen": -43.597862243652344, "rewards/margins": 0.3631448745727539, "rewards/rejected": -43.96100616455078, "step": 5867 }, { "epoch": 0.7990196078431373, "grad_norm": 42.33236179412646, "learning_rate": 9.448036127458813e-08, "logits/chosen": 14.059869766235352, "logits/rejected": 13.849750518798828, "logps/chosen": -4.37969446182251, "logps/rejected": -4.297983646392822, "loss": 4.2329, "rewards/accuracies": 0.25, "rewards/chosen": -43.79694366455078, "rewards/margins": -0.8171072006225586, "rewards/rejected": -42.979835510253906, "step": 5868 }, { "epoch": 0.7991557734204793, "grad_norm": 43.25594977950732, "learning_rate": 9.435766898622115e-08, "logits/chosen": 13.908271789550781, "logits/rejected": 14.497381210327148, "logps/chosen": -4.074107646942139, "logps/rejected": -4.347954273223877, "loss": 3.9179, "rewards/accuracies": 0.75, "rewards/chosen": -40.74107360839844, "rewards/margins": 2.738466262817383, "rewards/rejected": -43.47954177856445, "step": 5869 }, { "epoch": 0.7992919389978214, "grad_norm": 40.00754795889246, "learning_rate": 9.423504576026524e-08, "logits/chosen": 14.1631441116333, "logits/rejected": 14.538533210754395, "logps/chosen": -3.920400619506836, "logps/rejected": -4.4335036277771, "loss": 3.3591, "rewards/accuracies": 1.0, "rewards/chosen": -39.204010009765625, "rewards/margins": 5.1310272216796875, "rewards/rejected": -44.33503723144531, "step": 5870 }, { "epoch": 0.7994281045751634, "grad_norm": 46.20629896278893, "learning_rate": 9.411249162442838e-08, "logits/chosen": 13.750717163085938, "logits/rejected": 13.417506217956543, "logps/chosen": -4.445435523986816, "logps/rejected": -4.2606201171875, "loss": 3.4517, "rewards/accuracies": 0.25, "rewards/chosen": -44.45435333251953, "rewards/margins": -1.848154067993164, "rewards/rejected": -42.606201171875, "step": 5871 }, { "epoch": 0.7995642701525054, "grad_norm": 43.428789320433985, "learning_rate": 9.399000660640242e-08, "logits/chosen": 14.45480728149414, "logits/rejected": 14.680410385131836, "logps/chosen": -4.38938045501709, "logps/rejected": -4.384267807006836, "loss": 3.8685, "rewards/accuracies": 0.5, "rewards/chosen": -43.89380645751953, "rewards/margins": -0.05112743377685547, "rewards/rejected": -43.84267807006836, "step": 5872 }, { "epoch": 0.7997004357298475, "grad_norm": 41.44795843806219, "learning_rate": 9.386759073386397e-08, "logits/chosen": 14.640140533447266, "logits/rejected": 13.746828079223633, "logps/chosen": -4.386067867279053, "logps/rejected": -4.622615814208984, "loss": 3.6888, "rewards/accuracies": 0.5, "rewards/chosen": -43.860679626464844, "rewards/margins": 2.3654823303222656, "rewards/rejected": -46.226158142089844, "step": 5873 }, { "epoch": 0.7998366013071896, "grad_norm": 39.585840647639905, "learning_rate": 9.374524403447401e-08, "logits/chosen": 13.940763473510742, "logits/rejected": 13.441804885864258, "logps/chosen": -4.557040214538574, "logps/rejected": -4.641885280609131, "loss": 3.9849, "rewards/accuracies": 0.5, "rewards/chosen": -45.57040023803711, "rewards/margins": 0.8484506607055664, "rewards/rejected": -46.418853759765625, "step": 5874 }, { "epoch": 0.7999727668845316, "grad_norm": 41.241869911397444, "learning_rate": 9.362296653587755e-08, "logits/chosen": 13.703487396240234, "logits/rejected": 13.711240768432617, "logps/chosen": -4.341367244720459, "logps/rejected": -4.328768730163574, "loss": 4.2899, "rewards/accuracies": 0.5, "rewards/chosen": -43.413673400878906, "rewards/margins": -0.12598609924316406, "rewards/rejected": -43.287689208984375, "step": 5875 }, { "epoch": 0.8001089324618736, "grad_norm": 43.54716917769656, "learning_rate": 9.350075826570436e-08, "logits/chosen": 14.045003890991211, "logits/rejected": 13.868230819702148, "logps/chosen": -4.34404993057251, "logps/rejected": -4.411654472351074, "loss": 4.4204, "rewards/accuracies": 0.75, "rewards/chosen": -43.44049835205078, "rewards/margins": 0.6760463714599609, "rewards/rejected": -44.116546630859375, "step": 5876 }, { "epoch": 0.8002450980392157, "grad_norm": 51.81276959481478, "learning_rate": 9.337861925156851e-08, "logits/chosen": 14.097991943359375, "logits/rejected": 14.104701042175293, "logps/chosen": -4.566659450531006, "logps/rejected": -4.64647102355957, "loss": 4.4141, "rewards/accuracies": 0.5, "rewards/chosen": -45.666595458984375, "rewards/margins": 0.7981138229370117, "rewards/rejected": -46.46470642089844, "step": 5877 }, { "epoch": 0.8003812636165577, "grad_norm": 40.77693287192237, "learning_rate": 9.325654952106811e-08, "logits/chosen": 14.245879173278809, "logits/rejected": 14.911070823669434, "logps/chosen": -4.483136177062988, "logps/rejected": -4.688880920410156, "loss": 3.9283, "rewards/accuracies": 1.0, "rewards/chosen": -44.83135986328125, "rewards/margins": 2.0574493408203125, "rewards/rejected": -46.88880920410156, "step": 5878 }, { "epoch": 0.8005174291938998, "grad_norm": 48.5038798413984, "learning_rate": 9.313454910178587e-08, "logits/chosen": 14.119874000549316, "logits/rejected": 14.838523864746094, "logps/chosen": -4.173157691955566, "logps/rejected": -4.384952545166016, "loss": 3.836, "rewards/accuracies": 0.75, "rewards/chosen": -41.7315788269043, "rewards/margins": 2.1179428100585938, "rewards/rejected": -43.84952163696289, "step": 5879 }, { "epoch": 0.8006535947712419, "grad_norm": 40.249374684399235, "learning_rate": 9.301261802128895e-08, "logits/chosen": 13.740829467773438, "logits/rejected": 14.189647674560547, "logps/chosen": -4.232101917266846, "logps/rejected": -4.594647407531738, "loss": 3.9398, "rewards/accuracies": 0.75, "rewards/chosen": -42.321022033691406, "rewards/margins": 3.62545108795166, "rewards/rejected": -45.94647216796875, "step": 5880 }, { "epoch": 0.8007897603485838, "grad_norm": 38.88817306528085, "learning_rate": 9.289075630712848e-08, "logits/chosen": 14.939886093139648, "logits/rejected": 15.320884704589844, "logps/chosen": -4.80498743057251, "logps/rejected": -4.940533638000488, "loss": 3.9549, "rewards/accuracies": 0.75, "rewards/chosen": -48.049869537353516, "rewards/margins": 1.3554620742797852, "rewards/rejected": -49.40533447265625, "step": 5881 }, { "epoch": 0.8009259259259259, "grad_norm": 50.98803362317862, "learning_rate": 9.276896398684022e-08, "logits/chosen": 14.234184265136719, "logits/rejected": 14.176738739013672, "logps/chosen": -4.778128623962402, "logps/rejected": -4.758403301239014, "loss": 4.2007, "rewards/accuracies": 0.5, "rewards/chosen": -47.781288146972656, "rewards/margins": -0.19725418090820312, "rewards/rejected": -47.58403396606445, "step": 5882 }, { "epoch": 0.801062091503268, "grad_norm": 39.81294705398105, "learning_rate": 9.264724108794424e-08, "logits/chosen": 13.830936431884766, "logits/rejected": 14.31184196472168, "logps/chosen": -4.3965888023376465, "logps/rejected": -4.730284690856934, "loss": 3.5185, "rewards/accuracies": 1.0, "rewards/chosen": -43.96588897705078, "rewards/margins": 3.3369569778442383, "rewards/rejected": -47.30284881591797, "step": 5883 }, { "epoch": 0.80119825708061, "grad_norm": 39.66254448023616, "learning_rate": 9.252558763794463e-08, "logits/chosen": 14.59377384185791, "logits/rejected": 15.034994125366211, "logps/chosen": -4.272372245788574, "logps/rejected": -4.743697166442871, "loss": 4.0804, "rewards/accuracies": 1.0, "rewards/chosen": -42.72372055053711, "rewards/margins": 4.713253974914551, "rewards/rejected": -47.436973571777344, "step": 5884 }, { "epoch": 0.8013344226579521, "grad_norm": 41.79760616351485, "learning_rate": 9.24040036643301e-08, "logits/chosen": 14.158601760864258, "logits/rejected": 13.201183319091797, "logps/chosen": -4.528960227966309, "logps/rejected": -4.3081254959106445, "loss": 3.8908, "rewards/accuracies": 0.25, "rewards/chosen": -45.28959655761719, "rewards/margins": -2.2083425521850586, "rewards/rejected": -43.08125305175781, "step": 5885 }, { "epoch": 0.8014705882352942, "grad_norm": 36.581900839745956, "learning_rate": 9.228248919457357e-08, "logits/chosen": 14.523725509643555, "logits/rejected": 14.565214157104492, "logps/chosen": -4.6722917556762695, "logps/rejected": -4.453011512756348, "loss": 3.7119, "rewards/accuracies": 0.25, "rewards/chosen": -46.72291564941406, "rewards/margins": -2.192800521850586, "rewards/rejected": -44.53011703491211, "step": 5886 }, { "epoch": 0.8016067538126361, "grad_norm": 38.385685907761086, "learning_rate": 9.216104425613234e-08, "logits/chosen": 14.764259338378906, "logits/rejected": 15.431962966918945, "logps/chosen": -4.933107376098633, "logps/rejected": -5.25380802154541, "loss": 3.4895, "rewards/accuracies": 0.5, "rewards/chosen": -49.33106994628906, "rewards/margins": 3.2070093154907227, "rewards/rejected": -52.53807830810547, "step": 5887 }, { "epoch": 0.8017429193899782, "grad_norm": 46.7134089612267, "learning_rate": 9.203966887644763e-08, "logits/chosen": 14.613940238952637, "logits/rejected": 14.424110412597656, "logps/chosen": -4.528806686401367, "logps/rejected": -4.590219497680664, "loss": 3.6982, "rewards/accuracies": 0.5, "rewards/chosen": -45.28806686401367, "rewards/margins": 0.6141300201416016, "rewards/rejected": -45.902198791503906, "step": 5888 }, { "epoch": 0.8018790849673203, "grad_norm": 39.43053643245024, "learning_rate": 9.191836308294538e-08, "logits/chosen": 13.981010437011719, "logits/rejected": 14.526283264160156, "logps/chosen": -4.278280258178711, "logps/rejected": -4.249289035797119, "loss": 3.5419, "rewards/accuracies": 0.5, "rewards/chosen": -42.782806396484375, "rewards/margins": -0.2899169921875, "rewards/rejected": -42.492889404296875, "step": 5889 }, { "epoch": 0.8020152505446623, "grad_norm": 37.791618216129585, "learning_rate": 9.179712690303575e-08, "logits/chosen": 14.503440856933594, "logits/rejected": 15.041749000549316, "logps/chosen": -4.168287754058838, "logps/rejected": -4.611891746520996, "loss": 3.9535, "rewards/accuracies": 1.0, "rewards/chosen": -41.68287658691406, "rewards/margins": 4.436038970947266, "rewards/rejected": -46.11891174316406, "step": 5890 }, { "epoch": 0.8021514161220044, "grad_norm": 93.54981397879011, "learning_rate": 9.16759603641128e-08, "logits/chosen": 13.95733642578125, "logits/rejected": 13.630873680114746, "logps/chosen": -4.451082229614258, "logps/rejected": -4.261716365814209, "loss": 3.6743, "rewards/accuracies": 0.25, "rewards/chosen": -44.51082229614258, "rewards/margins": -1.8936576843261719, "rewards/rejected": -42.617164611816406, "step": 5891 }, { "epoch": 0.8022875816993464, "grad_norm": 41.94320703373887, "learning_rate": 9.155486349355528e-08, "logits/chosen": 14.50808334350586, "logits/rejected": 14.226665496826172, "logps/chosen": -4.681435585021973, "logps/rejected": -4.795151710510254, "loss": 4.1788, "rewards/accuracies": 0.5, "rewards/chosen": -46.814353942871094, "rewards/margins": 1.1371650695800781, "rewards/rejected": -47.95151901245117, "step": 5892 }, { "epoch": 0.8024237472766884, "grad_norm": 37.993033141372855, "learning_rate": 9.143383631872592e-08, "logits/chosen": 14.225959777832031, "logits/rejected": 14.3366060256958, "logps/chosen": -4.320219993591309, "logps/rejected": -4.646768569946289, "loss": 4.0661, "rewards/accuracies": 0.75, "rewards/chosen": -43.20220184326172, "rewards/margins": 3.265481948852539, "rewards/rejected": -46.467681884765625, "step": 5893 }, { "epoch": 0.8025599128540305, "grad_norm": 45.559909292368374, "learning_rate": 9.131287886697205e-08, "logits/chosen": 14.259624481201172, "logits/rejected": 14.048907279968262, "logps/chosen": -4.261091232299805, "logps/rejected": -4.390798091888428, "loss": 3.7599, "rewards/accuracies": 0.5, "rewards/chosen": -42.61091613769531, "rewards/margins": 1.2970647811889648, "rewards/rejected": -43.90797805786133, "step": 5894 }, { "epoch": 0.8026960784313726, "grad_norm": 42.99337408752431, "learning_rate": 9.11919911656247e-08, "logits/chosen": 14.419146537780762, "logits/rejected": 14.817428588867188, "logps/chosen": -4.62207555770874, "logps/rejected": -4.926056861877441, "loss": 4.3766, "rewards/accuracies": 1.0, "rewards/chosen": -46.22075653076172, "rewards/margins": 3.0398101806640625, "rewards/rejected": -49.26056671142578, "step": 5895 }, { "epoch": 0.8028322440087146, "grad_norm": 39.575731016972945, "learning_rate": 9.107117324199958e-08, "logits/chosen": 13.853058815002441, "logits/rejected": 14.285483360290527, "logps/chosen": -4.310698986053467, "logps/rejected": -4.503696918487549, "loss": 3.8661, "rewards/accuracies": 0.75, "rewards/chosen": -43.106990814208984, "rewards/margins": 1.9299802780151367, "rewards/rejected": -45.03697204589844, "step": 5896 }, { "epoch": 0.8029684095860566, "grad_norm": 38.149705208019526, "learning_rate": 9.095042512339656e-08, "logits/chosen": 14.137539863586426, "logits/rejected": 14.92572021484375, "logps/chosen": -4.474298000335693, "logps/rejected": -4.887063980102539, "loss": 3.9851, "rewards/accuracies": 1.0, "rewards/chosen": -44.742977142333984, "rewards/margins": 4.127660751342773, "rewards/rejected": -48.870635986328125, "step": 5897 }, { "epoch": 0.8031045751633987, "grad_norm": 41.119852979431734, "learning_rate": 9.082974683709959e-08, "logits/chosen": 14.489933013916016, "logits/rejected": 15.238363265991211, "logps/chosen": -4.879515171051025, "logps/rejected": -5.1400628089904785, "loss": 3.9608, "rewards/accuracies": 0.75, "rewards/chosen": -48.79515075683594, "rewards/margins": 2.6054725646972656, "rewards/rejected": -51.40062713623047, "step": 5898 }, { "epoch": 0.8032407407407407, "grad_norm": 51.30209650259746, "learning_rate": 9.070913841037691e-08, "logits/chosen": 13.982696533203125, "logits/rejected": 14.490854263305664, "logps/chosen": -4.646198272705078, "logps/rejected": -4.727566719055176, "loss": 3.7085, "rewards/accuracies": 0.5, "rewards/chosen": -46.46198272705078, "rewards/margins": 0.8136835098266602, "rewards/rejected": -47.275665283203125, "step": 5899 }, { "epoch": 0.8033769063180828, "grad_norm": 40.41761973260275, "learning_rate": 9.0588599870481e-08, "logits/chosen": 13.528457641601562, "logits/rejected": 14.267852783203125, "logps/chosen": -4.2795186042785645, "logps/rejected": -4.734250068664551, "loss": 3.457, "rewards/accuracies": 0.75, "rewards/chosen": -42.79518127441406, "rewards/margins": 4.547319412231445, "rewards/rejected": -47.34250259399414, "step": 5900 }, { "epoch": 0.8035130718954249, "grad_norm": 39.952607395068924, "learning_rate": 9.04681312446487e-08, "logits/chosen": 13.80346965789795, "logits/rejected": 14.086241722106934, "logps/chosen": -4.246281623840332, "logps/rejected": -4.464783668518066, "loss": 3.7501, "rewards/accuracies": 0.75, "rewards/chosen": -42.46281814575195, "rewards/margins": 2.1850194931030273, "rewards/rejected": -44.64783477783203, "step": 5901 }, { "epoch": 0.8036492374727668, "grad_norm": 43.03534362006479, "learning_rate": 9.034773256010066e-08, "logits/chosen": 14.105146408081055, "logits/rejected": 13.980669021606445, "logps/chosen": -4.6425395011901855, "logps/rejected": -4.47258996963501, "loss": 3.4821, "rewards/accuracies": 0.5, "rewards/chosen": -46.425392150878906, "rewards/margins": -1.6994962692260742, "rewards/rejected": -44.72589874267578, "step": 5902 }, { "epoch": 0.8037854030501089, "grad_norm": 43.70741685723255, "learning_rate": 9.022740384404204e-08, "logits/chosen": 14.973796844482422, "logits/rejected": 13.780191421508789, "logps/chosen": -4.222107887268066, "logps/rejected": -4.41612434387207, "loss": 4.4739, "rewards/accuracies": 0.75, "rewards/chosen": -42.22107696533203, "rewards/margins": 1.9401626586914062, "rewards/rejected": -44.16123962402344, "step": 5903 }, { "epoch": 0.803921568627451, "grad_norm": 41.090996832595934, "learning_rate": 9.010714512366227e-08, "logits/chosen": 14.800214767456055, "logits/rejected": 14.34197998046875, "logps/chosen": -4.88081169128418, "logps/rejected": -4.734127044677734, "loss": 4.2879, "rewards/accuracies": 0.0, "rewards/chosen": -48.80811309814453, "rewards/margins": -1.4668426513671875, "rewards/rejected": -47.34127426147461, "step": 5904 }, { "epoch": 0.804057734204793, "grad_norm": 56.294664583733194, "learning_rate": 8.998695642613454e-08, "logits/chosen": 13.713788986206055, "logits/rejected": 13.69351863861084, "logps/chosen": -4.173512935638428, "logps/rejected": -4.227329254150391, "loss": 3.9472, "rewards/accuracies": 0.5, "rewards/chosen": -41.735130310058594, "rewards/margins": 0.5381612777709961, "rewards/rejected": -42.273292541503906, "step": 5905 }, { "epoch": 0.8041938997821351, "grad_norm": 41.50395745732129, "learning_rate": 8.986683777861657e-08, "logits/chosen": 14.752376556396484, "logits/rejected": 15.211393356323242, "logps/chosen": -4.667254447937012, "logps/rejected": -4.951290130615234, "loss": 3.6535, "rewards/accuracies": 1.0, "rewards/chosen": -46.67254638671875, "rewards/margins": 2.8403568267822266, "rewards/rejected": -49.512901306152344, "step": 5906 }, { "epoch": 0.8043300653594772, "grad_norm": 41.238630255036675, "learning_rate": 8.974678920825036e-08, "logits/chosen": 13.402165412902832, "logits/rejected": 15.199012756347656, "logps/chosen": -4.164893627166748, "logps/rejected": -4.614080429077148, "loss": 3.7276, "rewards/accuracies": 1.0, "rewards/chosen": -41.6489372253418, "rewards/margins": 4.491865158081055, "rewards/rejected": -46.14080047607422, "step": 5907 }, { "epoch": 0.8044662309368191, "grad_norm": 49.17307536162598, "learning_rate": 8.962681074216156e-08, "logits/chosen": 14.309605598449707, "logits/rejected": 14.349336624145508, "logps/chosen": -4.20830774307251, "logps/rejected": -4.244446754455566, "loss": 4.1099, "rewards/accuracies": 0.5, "rewards/chosen": -42.08307647705078, "rewards/margins": 0.3613929748535156, "rewards/rejected": -42.4444694519043, "step": 5908 }, { "epoch": 0.8046023965141612, "grad_norm": 44.91818986274369, "learning_rate": 8.950690240746043e-08, "logits/chosen": 14.31242561340332, "logits/rejected": 14.407354354858398, "logps/chosen": -4.232799053192139, "logps/rejected": -4.215839862823486, "loss": 4.5513, "rewards/accuracies": 0.75, "rewards/chosen": -42.3279914855957, "rewards/margins": -0.16959095001220703, "rewards/rejected": -42.15840148925781, "step": 5909 }, { "epoch": 0.8047385620915033, "grad_norm": 39.83510060190788, "learning_rate": 8.938706423124141e-08, "logits/chosen": 14.010478973388672, "logits/rejected": 15.280900955200195, "logps/chosen": -4.4892578125, "logps/rejected": -5.258326053619385, "loss": 3.6532, "rewards/accuracies": 1.0, "rewards/chosen": -44.892578125, "rewards/margins": 7.690681457519531, "rewards/rejected": -52.58325958251953, "step": 5910 }, { "epoch": 0.8048747276688453, "grad_norm": 41.43076241494502, "learning_rate": 8.926729624058263e-08, "logits/chosen": 14.01037883758545, "logits/rejected": 15.273910522460938, "logps/chosen": -4.457847595214844, "logps/rejected": -4.886510848999023, "loss": 4.2592, "rewards/accuracies": 1.0, "rewards/chosen": -44.57847595214844, "rewards/margins": 4.2866315841674805, "rewards/rejected": -48.86510467529297, "step": 5911 }, { "epoch": 0.8050108932461874, "grad_norm": 42.579696636566545, "learning_rate": 8.914759846254681e-08, "logits/chosen": 14.88503646850586, "logits/rejected": 14.535564422607422, "logps/chosen": -4.73592472076416, "logps/rejected": -4.665835380554199, "loss": 4.246, "rewards/accuracies": 0.5, "rewards/chosen": -47.35924530029297, "rewards/margins": -0.7008943557739258, "rewards/rejected": -46.65835189819336, "step": 5912 }, { "epoch": 0.8051470588235294, "grad_norm": 38.396759828162736, "learning_rate": 8.902797092418079e-08, "logits/chosen": 14.367447853088379, "logits/rejected": 15.213071823120117, "logps/chosen": -4.344668388366699, "logps/rejected": -4.69624137878418, "loss": 3.9391, "rewards/accuracies": 0.75, "rewards/chosen": -43.446685791015625, "rewards/margins": 3.5157251358032227, "rewards/rejected": -46.9624137878418, "step": 5913 }, { "epoch": 0.8052832244008714, "grad_norm": 41.33240635182948, "learning_rate": 8.890841365251511e-08, "logits/chosen": 14.142995834350586, "logits/rejected": 14.196924209594727, "logps/chosen": -4.2080206871032715, "logps/rejected": -4.480422496795654, "loss": 4.262, "rewards/accuracies": 0.75, "rewards/chosen": -42.08020782470703, "rewards/margins": 2.724015235900879, "rewards/rejected": -44.804222106933594, "step": 5914 }, { "epoch": 0.8054193899782135, "grad_norm": 40.26037897240133, "learning_rate": 8.87889266745649e-08, "logits/chosen": 13.194770812988281, "logits/rejected": 13.844263076782227, "logps/chosen": -4.321121692657471, "logps/rejected": -4.33000373840332, "loss": 3.9319, "rewards/accuracies": 0.75, "rewards/chosen": -43.211219787597656, "rewards/margins": 0.0888223648071289, "rewards/rejected": -43.30004119873047, "step": 5915 }, { "epoch": 0.8055555555555556, "grad_norm": 40.69325989346752, "learning_rate": 8.866951001732932e-08, "logits/chosen": 14.643529891967773, "logits/rejected": 13.760448455810547, "logps/chosen": -4.35964298248291, "logps/rejected": -3.996056318283081, "loss": 3.6014, "rewards/accuracies": 0.25, "rewards/chosen": -43.59642791748047, "rewards/margins": -3.635862350463867, "rewards/rejected": -39.96056365966797, "step": 5916 }, { "epoch": 0.8056917211328976, "grad_norm": 39.51759394322339, "learning_rate": 8.855016370779131e-08, "logits/chosen": 14.663419723510742, "logits/rejected": 14.836862564086914, "logps/chosen": -4.761508941650391, "logps/rejected": -4.983806610107422, "loss": 4.2004, "rewards/accuracies": 0.5, "rewards/chosen": -47.615089416503906, "rewards/margins": 2.2229795455932617, "rewards/rejected": -49.83806610107422, "step": 5917 }, { "epoch": 0.8058278867102396, "grad_norm": 52.02116801626462, "learning_rate": 8.84308877729183e-08, "logits/chosen": 13.379884719848633, "logits/rejected": 14.106103897094727, "logps/chosen": -4.2440409660339355, "logps/rejected": -4.670069694519043, "loss": 4.0635, "rewards/accuracies": 1.0, "rewards/chosen": -42.440406799316406, "rewards/margins": 4.260289192199707, "rewards/rejected": -46.70069885253906, "step": 5918 }, { "epoch": 0.8059640522875817, "grad_norm": 53.33561287468673, "learning_rate": 8.831168223966177e-08, "logits/chosen": 14.235499382019043, "logits/rejected": 15.22288703918457, "logps/chosen": -4.497483253479004, "logps/rejected": -4.91334342956543, "loss": 4.6124, "rewards/accuracies": 0.75, "rewards/chosen": -44.974830627441406, "rewards/margins": 4.158602714538574, "rewards/rejected": -49.1334342956543, "step": 5919 }, { "epoch": 0.8061002178649237, "grad_norm": 38.18867635174764, "learning_rate": 8.819254713495694e-08, "logits/chosen": 13.400089263916016, "logits/rejected": 14.271728515625, "logps/chosen": -4.089032173156738, "logps/rejected": -4.279209136962891, "loss": 3.7004, "rewards/accuracies": 0.5, "rewards/chosen": -40.89031982421875, "rewards/margins": 1.9017696380615234, "rewards/rejected": -42.792091369628906, "step": 5920 }, { "epoch": 0.8062363834422658, "grad_norm": 43.665868485516214, "learning_rate": 8.807348248572352e-08, "logits/chosen": 13.879545211791992, "logits/rejected": 14.631256103515625, "logps/chosen": -4.199756145477295, "logps/rejected": -4.440397262573242, "loss": 3.8159, "rewards/accuracies": 1.0, "rewards/chosen": -41.99755859375, "rewards/margins": 2.4064102172851562, "rewards/rejected": -44.403968811035156, "step": 5921 }, { "epoch": 0.8063725490196079, "grad_norm": 41.2513333510067, "learning_rate": 8.795448831886525e-08, "logits/chosen": 14.039323806762695, "logits/rejected": 14.715323448181152, "logps/chosen": -4.300083637237549, "logps/rejected": -4.681111812591553, "loss": 3.3703, "rewards/accuracies": 1.0, "rewards/chosen": -43.00083541870117, "rewards/margins": 3.8102807998657227, "rewards/rejected": -46.811119079589844, "step": 5922 }, { "epoch": 0.8065087145969498, "grad_norm": 42.32785413836211, "learning_rate": 8.783556466126966e-08, "logits/chosen": 15.516059875488281, "logits/rejected": 15.019821166992188, "logps/chosen": -4.827819347381592, "logps/rejected": -4.916753768920898, "loss": 4.2325, "rewards/accuracies": 0.5, "rewards/chosen": -48.27819061279297, "rewards/margins": 0.8893451690673828, "rewards/rejected": -49.16754150390625, "step": 5923 }, { "epoch": 0.8066448801742919, "grad_norm": 42.65424592954796, "learning_rate": 8.771671153980858e-08, "logits/chosen": 14.087620735168457, "logits/rejected": 14.383111000061035, "logps/chosen": -4.57108736038208, "logps/rejected": -4.524674415588379, "loss": 4.0229, "rewards/accuracies": 0.25, "rewards/chosen": -45.71087646484375, "rewards/margins": -0.4641275405883789, "rewards/rejected": -45.24674606323242, "step": 5924 }, { "epoch": 0.806781045751634, "grad_norm": 45.40411667412553, "learning_rate": 8.759792898133799e-08, "logits/chosen": 15.233255386352539, "logits/rejected": 14.677831649780273, "logps/chosen": -4.839053153991699, "logps/rejected": -4.587489604949951, "loss": 4.3063, "rewards/accuracies": 0.25, "rewards/chosen": -48.390533447265625, "rewards/margins": -2.5156326293945312, "rewards/rejected": -45.87489700317383, "step": 5925 }, { "epoch": 0.806917211328976, "grad_norm": 39.163588437721046, "learning_rate": 8.747921701269762e-08, "logits/chosen": 13.931429862976074, "logits/rejected": 14.82305908203125, "logps/chosen": -4.630059242248535, "logps/rejected": -4.640105247497559, "loss": 4.3027, "rewards/accuracies": 0.75, "rewards/chosen": -46.30059051513672, "rewards/margins": 0.10046100616455078, "rewards/rejected": -46.40105056762695, "step": 5926 }, { "epoch": 0.8070533769063181, "grad_norm": 38.46257931496249, "learning_rate": 8.736057566071147e-08, "logits/chosen": 14.508342742919922, "logits/rejected": 14.506799697875977, "logps/chosen": -4.469101905822754, "logps/rejected": -4.325448036193848, "loss": 4.0537, "rewards/accuracies": 0.25, "rewards/chosen": -44.691017150878906, "rewards/margins": -1.4365358352661133, "rewards/rejected": -43.254478454589844, "step": 5927 }, { "epoch": 0.8071895424836601, "grad_norm": 41.93322110405425, "learning_rate": 8.724200495218764e-08, "logits/chosen": 13.218781471252441, "logits/rejected": 14.400884628295898, "logps/chosen": -4.261327743530273, "logps/rejected": -4.575641632080078, "loss": 4.4604, "rewards/accuracies": 0.75, "rewards/chosen": -42.61328125, "rewards/margins": 3.143136978149414, "rewards/rejected": -45.75642013549805, "step": 5928 }, { "epoch": 0.8073257080610022, "grad_norm": 41.25742750778276, "learning_rate": 8.712350491391797e-08, "logits/chosen": 14.219657897949219, "logits/rejected": 14.055192947387695, "logps/chosen": -4.456846237182617, "logps/rejected": -4.540789604187012, "loss": 4.4963, "rewards/accuracies": 0.5, "rewards/chosen": -44.568458557128906, "rewards/margins": 0.8394393920898438, "rewards/rejected": -45.40789794921875, "step": 5929 }, { "epoch": 0.8074618736383442, "grad_norm": 39.056620578404754, "learning_rate": 8.700507557267864e-08, "logits/chosen": 14.599773406982422, "logits/rejected": 14.507240295410156, "logps/chosen": -4.626766681671143, "logps/rejected": -4.767254829406738, "loss": 3.6906, "rewards/accuracies": 0.75, "rewards/chosen": -46.267669677734375, "rewards/margins": 1.4048805236816406, "rewards/rejected": -47.67254638671875, "step": 5930 }, { "epoch": 0.8075980392156863, "grad_norm": 44.2434131210961, "learning_rate": 8.688671695522987e-08, "logits/chosen": 14.988896369934082, "logits/rejected": 15.121325492858887, "logps/chosen": -4.505374431610107, "logps/rejected": -4.723690509796143, "loss": 4.3165, "rewards/accuracies": 0.75, "rewards/chosen": -45.053741455078125, "rewards/margins": 2.183161735534668, "rewards/rejected": -47.23690414428711, "step": 5931 }, { "epoch": 0.8077342047930284, "grad_norm": 44.16266251430491, "learning_rate": 8.676842908831545e-08, "logits/chosen": 14.877467155456543, "logits/rejected": 14.604485511779785, "logps/chosen": -4.906266212463379, "logps/rejected": -4.524449348449707, "loss": 3.8276, "rewards/accuracies": 0.0, "rewards/chosen": -49.06266403198242, "rewards/margins": -3.8181686401367188, "rewards/rejected": -45.24449157714844, "step": 5932 }, { "epoch": 0.8078703703703703, "grad_norm": 46.90748824707289, "learning_rate": 8.665021199866371e-08, "logits/chosen": 13.565508842468262, "logits/rejected": 13.724245071411133, "logps/chosen": -4.313859939575195, "logps/rejected": -4.369580268859863, "loss": 4.2996, "rewards/accuracies": 0.5, "rewards/chosen": -43.13859558105469, "rewards/margins": 0.5572052001953125, "rewards/rejected": -43.69580078125, "step": 5933 }, { "epoch": 0.8080065359477124, "grad_norm": 42.50139735236597, "learning_rate": 8.653206571298688e-08, "logits/chosen": 14.315666198730469, "logits/rejected": 14.676191329956055, "logps/chosen": -4.584670543670654, "logps/rejected": -4.7820844650268555, "loss": 3.3971, "rewards/accuracies": 0.5, "rewards/chosen": -45.846702575683594, "rewards/margins": 1.9741439819335938, "rewards/rejected": -47.82084655761719, "step": 5934 }, { "epoch": 0.8081427015250545, "grad_norm": 40.67213914522233, "learning_rate": 8.64139902579808e-08, "logits/chosen": 14.157434463500977, "logits/rejected": 14.728346824645996, "logps/chosen": -4.664104461669922, "logps/rejected": -4.931413650512695, "loss": 3.8199, "rewards/accuracies": 1.0, "rewards/chosen": -46.641048431396484, "rewards/margins": 2.673089027404785, "rewards/rejected": -49.31413650512695, "step": 5935 }, { "epoch": 0.8082788671023965, "grad_norm": 40.919136352561985, "learning_rate": 8.629598566032577e-08, "logits/chosen": 14.508504867553711, "logits/rejected": 14.431845664978027, "logps/chosen": -4.271664619445801, "logps/rejected": -4.5221734046936035, "loss": 3.5667, "rewards/accuracies": 0.5, "rewards/chosen": -42.716644287109375, "rewards/margins": 2.5050907135009766, "rewards/rejected": -45.22173309326172, "step": 5936 }, { "epoch": 0.8084150326797386, "grad_norm": 44.39076239993167, "learning_rate": 8.617805194668597e-08, "logits/chosen": 13.856755256652832, "logits/rejected": 14.646408081054688, "logps/chosen": -4.593601226806641, "logps/rejected": -4.871076583862305, "loss": 3.8537, "rewards/accuracies": 0.75, "rewards/chosen": -45.93600845336914, "rewards/margins": 2.7747554779052734, "rewards/rejected": -48.71076202392578, "step": 5937 }, { "epoch": 0.8085511982570807, "grad_norm": 41.51510907346023, "learning_rate": 8.606018914370933e-08, "logits/chosen": 14.998172760009766, "logits/rejected": 14.89842414855957, "logps/chosen": -4.938782691955566, "logps/rejected": -5.054908275604248, "loss": 3.8584, "rewards/accuracies": 0.25, "rewards/chosen": -49.38782501220703, "rewards/margins": 1.1612567901611328, "rewards/rejected": -50.5490837097168, "step": 5938 }, { "epoch": 0.8086873638344226, "grad_norm": 41.91788469968352, "learning_rate": 8.594239727802799e-08, "logits/chosen": 13.484465599060059, "logits/rejected": 14.23367691040039, "logps/chosen": -4.4099931716918945, "logps/rejected": -4.480853080749512, "loss": 3.6715, "rewards/accuracies": 0.5, "rewards/chosen": -44.099937438964844, "rewards/margins": 0.7085981369018555, "rewards/rejected": -44.80853271484375, "step": 5939 }, { "epoch": 0.8088235294117647, "grad_norm": 44.550742936938285, "learning_rate": 8.582467637625814e-08, "logits/chosen": 15.022247314453125, "logits/rejected": 14.806325912475586, "logps/chosen": -4.491458415985107, "logps/rejected": -4.727115154266357, "loss": 4.1865, "rewards/accuracies": 0.75, "rewards/chosen": -44.914581298828125, "rewards/margins": 2.3565683364868164, "rewards/rejected": -47.271148681640625, "step": 5940 }, { "epoch": 0.8089596949891068, "grad_norm": 41.68732036158336, "learning_rate": 8.570702646499954e-08, "logits/chosen": 13.587590217590332, "logits/rejected": 13.307703018188477, "logps/chosen": -4.3520708084106445, "logps/rejected": -4.4802751541137695, "loss": 3.742, "rewards/accuracies": 0.5, "rewards/chosen": -43.52070617675781, "rewards/margins": 1.2820444107055664, "rewards/rejected": -44.80274963378906, "step": 5941 }, { "epoch": 0.8090958605664488, "grad_norm": 57.94845692733753, "learning_rate": 8.55894475708363e-08, "logits/chosen": 13.529499053955078, "logits/rejected": 14.49492073059082, "logps/chosen": -4.229226112365723, "logps/rejected": -4.572157382965088, "loss": 4.8184, "rewards/accuracies": 0.75, "rewards/chosen": -42.292266845703125, "rewards/margins": 3.4293079376220703, "rewards/rejected": -45.72157287597656, "step": 5942 }, { "epoch": 0.8092320261437909, "grad_norm": 41.98938157621314, "learning_rate": 8.547193972033642e-08, "logits/chosen": 14.521891593933105, "logits/rejected": 15.49371337890625, "logps/chosen": -4.383334159851074, "logps/rejected": -4.712128639221191, "loss": 3.8932, "rewards/accuracies": 0.75, "rewards/chosen": -43.833343505859375, "rewards/margins": 3.2879457473754883, "rewards/rejected": -47.12128829956055, "step": 5943 }, { "epoch": 0.809368191721133, "grad_norm": 39.9132088091777, "learning_rate": 8.535450294005153e-08, "logits/chosen": 15.381708145141602, "logits/rejected": 14.946929931640625, "logps/chosen": -4.716368675231934, "logps/rejected": -4.8487091064453125, "loss": 4.1068, "rewards/accuracies": 0.75, "rewards/chosen": -47.16368865966797, "rewards/margins": 1.3233985900878906, "rewards/rejected": -48.487083435058594, "step": 5944 }, { "epoch": 0.8095043572984749, "grad_norm": 47.533075119034734, "learning_rate": 8.523713725651762e-08, "logits/chosen": 13.800086975097656, "logits/rejected": 14.57917594909668, "logps/chosen": -4.203454494476318, "logps/rejected": -4.348208427429199, "loss": 4.5267, "rewards/accuracies": 0.75, "rewards/chosen": -42.0345458984375, "rewards/margins": 1.4475364685058594, "rewards/rejected": -43.48208236694336, "step": 5945 }, { "epoch": 0.809640522875817, "grad_norm": 42.025959390115666, "learning_rate": 8.511984269625454e-08, "logits/chosen": 13.883447647094727, "logits/rejected": 14.441256523132324, "logps/chosen": -4.353708267211914, "logps/rejected": -4.599067687988281, "loss": 4.0295, "rewards/accuracies": 0.75, "rewards/chosen": -43.53708267211914, "rewards/margins": 2.4535961151123047, "rewards/rejected": -45.99068069458008, "step": 5946 }, { "epoch": 0.8097766884531591, "grad_norm": 42.43236061108158, "learning_rate": 8.500261928576567e-08, "logits/chosen": 14.55978775024414, "logits/rejected": 14.354430198669434, "logps/chosen": -4.761685848236084, "logps/rejected": -4.405493259429932, "loss": 3.9546, "rewards/accuracies": 0.5, "rewards/chosen": -47.616859436035156, "rewards/margins": -3.5619258880615234, "rewards/rejected": -44.054931640625, "step": 5947 }, { "epoch": 0.8099128540305011, "grad_norm": 49.71695294030474, "learning_rate": 8.488546705153879e-08, "logits/chosen": 13.554740905761719, "logits/rejected": 14.963157653808594, "logps/chosen": -4.619544982910156, "logps/rejected": -5.19053840637207, "loss": 3.8911, "rewards/accuracies": 1.0, "rewards/chosen": -46.19545364379883, "rewards/margins": 5.709927558898926, "rewards/rejected": -51.90538024902344, "step": 5948 }, { "epoch": 0.8100490196078431, "grad_norm": 43.941979593165264, "learning_rate": 8.476838602004553e-08, "logits/chosen": 14.611705780029297, "logits/rejected": 14.8195161819458, "logps/chosen": -4.818171501159668, "logps/rejected": -4.702637195587158, "loss": 4.307, "rewards/accuracies": 0.5, "rewards/chosen": -48.18171310424805, "rewards/margins": -1.1553430557250977, "rewards/rejected": -47.026371002197266, "step": 5949 }, { "epoch": 0.8101851851851852, "grad_norm": 38.92974160734983, "learning_rate": 8.465137621774104e-08, "logits/chosen": 14.149181365966797, "logits/rejected": 14.452544212341309, "logps/chosen": -4.465573310852051, "logps/rejected": -4.695736885070801, "loss": 3.8722, "rewards/accuracies": 0.75, "rewards/chosen": -44.655731201171875, "rewards/margins": 2.3016433715820312, "rewards/rejected": -46.95737075805664, "step": 5950 }, { "epoch": 0.8103213507625272, "grad_norm": 40.65011958170259, "learning_rate": 8.453443767106478e-08, "logits/chosen": 15.162039756774902, "logits/rejected": 15.187006950378418, "logps/chosen": -4.936456680297852, "logps/rejected": -4.896845817565918, "loss": 4.3762, "rewards/accuracies": 0.25, "rewards/chosen": -49.364566802978516, "rewards/margins": -0.39610767364501953, "rewards/rejected": -48.96845626831055, "step": 5951 }, { "epoch": 0.8104575163398693, "grad_norm": 44.15334575321218, "learning_rate": 8.441757040644013e-08, "logits/chosen": 13.785030364990234, "logits/rejected": 13.286388397216797, "logps/chosen": -4.371310710906982, "logps/rejected": -4.142215728759766, "loss": 4.2581, "rewards/accuracies": 0.25, "rewards/chosen": -43.713104248046875, "rewards/margins": -2.2909536361694336, "rewards/rejected": -41.422149658203125, "step": 5952 }, { "epoch": 0.8105936819172114, "grad_norm": 40.58127463158539, "learning_rate": 8.430077445027395e-08, "logits/chosen": 13.727751731872559, "logits/rejected": 14.613454818725586, "logps/chosen": -4.648050785064697, "logps/rejected": -5.016287803649902, "loss": 4.0639, "rewards/accuracies": 0.75, "rewards/chosen": -46.480506896972656, "rewards/margins": 3.6823692321777344, "rewards/rejected": -50.162879943847656, "step": 5953 }, { "epoch": 0.8107298474945533, "grad_norm": 42.76875911466819, "learning_rate": 8.41840498289574e-08, "logits/chosen": 14.819219589233398, "logits/rejected": 14.873779296875, "logps/chosen": -4.5978899002075195, "logps/rejected": -4.9073710441589355, "loss": 3.4261, "rewards/accuracies": 0.75, "rewards/chosen": -45.97890090942383, "rewards/margins": 3.094808578491211, "rewards/rejected": -49.073707580566406, "step": 5954 }, { "epoch": 0.8108660130718954, "grad_norm": 47.709405428692584, "learning_rate": 8.406739656886541e-08, "logits/chosen": 14.07209587097168, "logits/rejected": 14.186256408691406, "logps/chosen": -4.442794322967529, "logps/rejected": -4.4008283615112305, "loss": 4.2683, "rewards/accuracies": 0.75, "rewards/chosen": -44.427947998046875, "rewards/margins": -0.4196643829345703, "rewards/rejected": -44.008277893066406, "step": 5955 }, { "epoch": 0.8110021786492375, "grad_norm": 42.66910164573637, "learning_rate": 8.395081469635661e-08, "logits/chosen": 14.31757926940918, "logits/rejected": 13.143779754638672, "logps/chosen": -4.522915840148926, "logps/rejected": -4.359552383422852, "loss": 4.3165, "rewards/accuracies": 0.5, "rewards/chosen": -45.229156494140625, "rewards/margins": -1.6336345672607422, "rewards/rejected": -43.595523834228516, "step": 5956 }, { "epoch": 0.8111383442265795, "grad_norm": 40.68735969118424, "learning_rate": 8.383430423777373e-08, "logits/chosen": 14.072591781616211, "logits/rejected": 14.603723526000977, "logps/chosen": -4.609507083892822, "logps/rejected": -4.770083427429199, "loss": 4.1548, "rewards/accuracies": 0.75, "rewards/chosen": -46.095069885253906, "rewards/margins": 1.6057586669921875, "rewards/rejected": -47.700828552246094, "step": 5957 }, { "epoch": 0.8112745098039216, "grad_norm": 43.150226448408645, "learning_rate": 8.371786521944338e-08, "logits/chosen": 13.995052337646484, "logits/rejected": 13.982261657714844, "logps/chosen": -4.6023969650268555, "logps/rejected": -4.491677761077881, "loss": 3.7806, "rewards/accuracies": 0.25, "rewards/chosen": -46.02397155761719, "rewards/margins": -1.1071929931640625, "rewards/rejected": -44.91677474975586, "step": 5958 }, { "epoch": 0.8114106753812637, "grad_norm": 41.009372441613266, "learning_rate": 8.360149766767568e-08, "logits/chosen": 14.492705345153809, "logits/rejected": 14.37446403503418, "logps/chosen": -4.492565155029297, "logps/rejected": -4.798015117645264, "loss": 3.8784, "rewards/accuracies": 0.75, "rewards/chosen": -44.92565155029297, "rewards/margins": 3.0544958114624023, "rewards/rejected": -47.98014831542969, "step": 5959 }, { "epoch": 0.8115468409586056, "grad_norm": 44.3207834022469, "learning_rate": 8.348520160876496e-08, "logits/chosen": 13.916202545166016, "logits/rejected": 14.954580307006836, "logps/chosen": -4.611257553100586, "logps/rejected": -5.2124152183532715, "loss": 4.1145, "rewards/accuracies": 0.75, "rewards/chosen": -46.11257553100586, "rewards/margins": 6.011578559875488, "rewards/rejected": -52.12415313720703, "step": 5960 }, { "epoch": 0.8116830065359477, "grad_norm": 41.91987469857962, "learning_rate": 8.336897706898937e-08, "logits/chosen": 13.566904067993164, "logits/rejected": 14.5546875, "logps/chosen": -4.5158796310424805, "logps/rejected": -4.607900142669678, "loss": 4.0426, "rewards/accuracies": 0.5, "rewards/chosen": -45.15879440307617, "rewards/margins": 0.9202070236206055, "rewards/rejected": -46.079002380371094, "step": 5961 }, { "epoch": 0.8118191721132898, "grad_norm": 42.30053606206448, "learning_rate": 8.32528240746106e-08, "logits/chosen": 14.598686218261719, "logits/rejected": 15.27320671081543, "logps/chosen": -4.646725654602051, "logps/rejected": -5.073060512542725, "loss": 4.2184, "rewards/accuracies": 1.0, "rewards/chosen": -46.467262268066406, "rewards/margins": 4.263343811035156, "rewards/rejected": -50.73060607910156, "step": 5962 }, { "epoch": 0.8119553376906318, "grad_norm": 44.495272469707984, "learning_rate": 8.31367426518745e-08, "logits/chosen": 13.988483428955078, "logits/rejected": 14.775766372680664, "logps/chosen": -3.9924960136413574, "logps/rejected": -4.293076038360596, "loss": 3.7729, "rewards/accuracies": 0.75, "rewards/chosen": -39.92496109008789, "rewards/margins": 3.005801200866699, "rewards/rejected": -42.930763244628906, "step": 5963 }, { "epoch": 0.8120915032679739, "grad_norm": 44.32888839945566, "learning_rate": 8.302073282701072e-08, "logits/chosen": 14.03309154510498, "logits/rejected": 13.916045188903809, "logps/chosen": -4.345166206359863, "logps/rejected": -4.285349369049072, "loss": 4.1447, "rewards/accuracies": 0.5, "rewards/chosen": -43.45166015625, "rewards/margins": -0.5981655120849609, "rewards/rejected": -42.853492736816406, "step": 5964 }, { "epoch": 0.8122276688453159, "grad_norm": 38.70841002606347, "learning_rate": 8.290479462623242e-08, "logits/chosen": 13.80821418762207, "logits/rejected": 14.79959487915039, "logps/chosen": -4.544618606567383, "logps/rejected": -4.586846828460693, "loss": 3.6666, "rewards/accuracies": 0.5, "rewards/chosen": -45.44618606567383, "rewards/margins": 0.42228221893310547, "rewards/rejected": -45.86846923828125, "step": 5965 }, { "epoch": 0.8123638344226579, "grad_norm": 40.72805711562287, "learning_rate": 8.278892807573691e-08, "logits/chosen": 14.377710342407227, "logits/rejected": 14.955526351928711, "logps/chosen": -4.445988178253174, "logps/rejected": -4.868358612060547, "loss": 3.8474, "rewards/accuracies": 1.0, "rewards/chosen": -44.45988464355469, "rewards/margins": 4.223705291748047, "rewards/rejected": -48.68358612060547, "step": 5966 }, { "epoch": 0.8125, "grad_norm": 41.708963030680906, "learning_rate": 8.26731332017053e-08, "logits/chosen": 13.863191604614258, "logits/rejected": 13.688737869262695, "logps/chosen": -4.615052223205566, "logps/rejected": -4.466191291809082, "loss": 4.0058, "rewards/accuracies": 0.5, "rewards/chosen": -46.15052032470703, "rewards/margins": -1.488607406616211, "rewards/rejected": -44.66191101074219, "step": 5967 }, { "epoch": 0.8126361655773421, "grad_norm": 41.25086350328575, "learning_rate": 8.255741003030219e-08, "logits/chosen": 13.375391006469727, "logits/rejected": 14.055009841918945, "logps/chosen": -4.24243688583374, "logps/rejected": -4.5365095138549805, "loss": 3.6349, "rewards/accuracies": 0.75, "rewards/chosen": -42.42436981201172, "rewards/margins": 2.9407262802124023, "rewards/rejected": -45.36509704589844, "step": 5968 }, { "epoch": 0.8127723311546841, "grad_norm": 41.52110308023375, "learning_rate": 8.24417585876763e-08, "logits/chosen": 14.42355728149414, "logits/rejected": 14.179288864135742, "logps/chosen": -4.704919338226318, "logps/rejected": -4.806907653808594, "loss": 4.2045, "rewards/accuracies": 0.5, "rewards/chosen": -47.0491943359375, "rewards/margins": 1.0198774337768555, "rewards/rejected": -48.06907272338867, "step": 5969 }, { "epoch": 0.8129084967320261, "grad_norm": 44.585184576295944, "learning_rate": 8.232617889996012e-08, "logits/chosen": 13.662569046020508, "logits/rejected": 13.551270484924316, "logps/chosen": -4.237071990966797, "logps/rejected": -4.54141092300415, "loss": 3.6836, "rewards/accuracies": 0.75, "rewards/chosen": -42.37071990966797, "rewards/margins": 3.043391227722168, "rewards/rejected": -45.41411209106445, "step": 5970 }, { "epoch": 0.8130446623093682, "grad_norm": 40.20509965026805, "learning_rate": 8.221067099326964e-08, "logits/chosen": 14.448923110961914, "logits/rejected": 14.979394912719727, "logps/chosen": -4.507142066955566, "logps/rejected": -4.728066444396973, "loss": 3.9364, "rewards/accuracies": 0.5, "rewards/chosen": -45.07141876220703, "rewards/margins": 2.2092437744140625, "rewards/rejected": -47.280662536621094, "step": 5971 }, { "epoch": 0.8131808278867102, "grad_norm": 41.99385266955709, "learning_rate": 8.209523489370491e-08, "logits/chosen": 14.993748664855957, "logits/rejected": 15.143325805664062, "logps/chosen": -4.733598709106445, "logps/rejected": -4.748268127441406, "loss": 3.9685, "rewards/accuracies": 0.5, "rewards/chosen": -47.33598327636719, "rewards/margins": 0.14669513702392578, "rewards/rejected": -47.48268127441406, "step": 5972 }, { "epoch": 0.8133169934640523, "grad_norm": 43.93758739041588, "learning_rate": 8.197987062734979e-08, "logits/chosen": 14.555898666381836, "logits/rejected": 13.80015754699707, "logps/chosen": -4.567080497741699, "logps/rejected": -4.191227436065674, "loss": 3.6883, "rewards/accuracies": 0.25, "rewards/chosen": -45.670806884765625, "rewards/margins": -3.7585344314575195, "rewards/rejected": -41.91227340698242, "step": 5973 }, { "epoch": 0.8134531590413944, "grad_norm": 39.802407643975904, "learning_rate": 8.18645782202716e-08, "logits/chosen": 14.639259338378906, "logits/rejected": 14.844980239868164, "logps/chosen": -4.68572473526001, "logps/rejected": -4.9780168533325195, "loss": 4.0891, "rewards/accuracies": 0.75, "rewards/chosen": -46.85724639892578, "rewards/margins": 2.9229211807250977, "rewards/rejected": -49.78017044067383, "step": 5974 }, { "epoch": 0.8135893246187363, "grad_norm": 42.47741004727329, "learning_rate": 8.174935769852167e-08, "logits/chosen": 13.987333297729492, "logits/rejected": 13.472171783447266, "logps/chosen": -4.444664001464844, "logps/rejected": -4.536813735961914, "loss": 3.5073, "rewards/accuracies": 0.75, "rewards/chosen": -44.44664001464844, "rewards/margins": 0.9215002059936523, "rewards/rejected": -45.368141174316406, "step": 5975 }, { "epoch": 0.8137254901960784, "grad_norm": 43.73858173884827, "learning_rate": 8.163420908813519e-08, "logits/chosen": 13.78921890258789, "logits/rejected": 13.070932388305664, "logps/chosen": -4.330052375793457, "logps/rejected": -4.24038553237915, "loss": 4.1441, "rewards/accuracies": 0.5, "rewards/chosen": -43.30052185058594, "rewards/margins": -0.8966665267944336, "rewards/rejected": -42.40385437011719, "step": 5976 }, { "epoch": 0.8138616557734205, "grad_norm": 42.76237400425438, "learning_rate": 8.151913241513067e-08, "logits/chosen": 14.429931640625, "logits/rejected": 15.064927101135254, "logps/chosen": -4.484277725219727, "logps/rejected": -4.878414154052734, "loss": 3.7901, "rewards/accuracies": 1.0, "rewards/chosen": -44.8427734375, "rewards/margins": 3.9413700103759766, "rewards/rejected": -48.784141540527344, "step": 5977 }, { "epoch": 0.8139978213507625, "grad_norm": 37.41650063116128, "learning_rate": 8.140412770551078e-08, "logits/chosen": 14.899995803833008, "logits/rejected": 15.089670181274414, "logps/chosen": -4.984487533569336, "logps/rejected": -4.87336540222168, "loss": 3.6927, "rewards/accuracies": 0.5, "rewards/chosen": -49.844871520996094, "rewards/margins": -1.1112213134765625, "rewards/rejected": -48.73365020751953, "step": 5978 }, { "epoch": 0.8141339869281046, "grad_norm": 38.32417610835026, "learning_rate": 8.128919498526188e-08, "logits/chosen": 14.125056266784668, "logits/rejected": 14.34646224975586, "logps/chosen": -4.499873161315918, "logps/rejected": -4.497167587280273, "loss": 3.5424, "rewards/accuracies": 0.25, "rewards/chosen": -44.99873352050781, "rewards/margins": -0.02705669403076172, "rewards/rejected": -44.971675872802734, "step": 5979 }, { "epoch": 0.8142701525054467, "grad_norm": 40.399769517164685, "learning_rate": 8.117433428035373e-08, "logits/chosen": 14.028594017028809, "logits/rejected": 14.191659927368164, "logps/chosen": -4.899530410766602, "logps/rejected": -4.916959285736084, "loss": 4.0203, "rewards/accuracies": 0.5, "rewards/chosen": -48.99530792236328, "rewards/margins": 0.17428970336914062, "rewards/rejected": -49.169593811035156, "step": 5980 }, { "epoch": 0.8144063180827886, "grad_norm": 54.82578540843106, "learning_rate": 8.105954561674022e-08, "logits/chosen": 13.883306503295898, "logits/rejected": 14.67312240600586, "logps/chosen": -4.235384941101074, "logps/rejected": -4.8904008865356445, "loss": 3.6063, "rewards/accuracies": 1.0, "rewards/chosen": -42.353851318359375, "rewards/margins": 6.550158500671387, "rewards/rejected": -48.90401077270508, "step": 5981 }, { "epoch": 0.8145424836601307, "grad_norm": 40.558799963427646, "learning_rate": 8.094482902035884e-08, "logits/chosen": 13.959650993347168, "logits/rejected": 14.841827392578125, "logps/chosen": -4.572166442871094, "logps/rejected": -4.550827980041504, "loss": 3.7782, "rewards/accuracies": 0.5, "rewards/chosen": -45.7216682434082, "rewards/margins": -0.21339130401611328, "rewards/rejected": -45.508277893066406, "step": 5982 }, { "epoch": 0.8146786492374728, "grad_norm": 48.548898937786326, "learning_rate": 8.083018451713064e-08, "logits/chosen": 14.561759948730469, "logits/rejected": 14.353836059570312, "logps/chosen": -4.572809219360352, "logps/rejected": -4.601370811462402, "loss": 4.6302, "rewards/accuracies": 0.5, "rewards/chosen": -45.728092193603516, "rewards/margins": 0.2856149673461914, "rewards/rejected": -46.01370620727539, "step": 5983 }, { "epoch": 0.8148148148148148, "grad_norm": 41.285242842900885, "learning_rate": 8.071561213296046e-08, "logits/chosen": 14.251708984375, "logits/rejected": 13.844715118408203, "logps/chosen": -4.583173751831055, "logps/rejected": -4.369844436645508, "loss": 4.1223, "rewards/accuracies": 0.25, "rewards/chosen": -45.83173370361328, "rewards/margins": -2.133294105529785, "rewards/rejected": -43.69844055175781, "step": 5984 }, { "epoch": 0.8149509803921569, "grad_norm": 42.9395357830762, "learning_rate": 8.0601111893737e-08, "logits/chosen": 13.995842933654785, "logits/rejected": 15.044763565063477, "logps/chosen": -4.556888580322266, "logps/rejected": -4.894972801208496, "loss": 4.1911, "rewards/accuracies": 0.75, "rewards/chosen": -45.56888961791992, "rewards/margins": 3.3808374404907227, "rewards/rejected": -48.94972610473633, "step": 5985 }, { "epoch": 0.8150871459694989, "grad_norm": 38.28149171638672, "learning_rate": 8.048668382533255e-08, "logits/chosen": 14.942787170410156, "logits/rejected": 15.221500396728516, "logps/chosen": -4.999497413635254, "logps/rejected": -5.4166717529296875, "loss": 3.6808, "rewards/accuracies": 1.0, "rewards/chosen": -49.99497604370117, "rewards/margins": 4.171746253967285, "rewards/rejected": -54.16672134399414, "step": 5986 }, { "epoch": 0.8152233115468409, "grad_norm": 42.24738466777406, "learning_rate": 8.037232795360296e-08, "logits/chosen": 14.30633544921875, "logits/rejected": 14.932873725891113, "logps/chosen": -4.576933860778809, "logps/rejected": -5.048507213592529, "loss": 3.81, "rewards/accuracies": 0.75, "rewards/chosen": -45.76934051513672, "rewards/margins": 4.715726852416992, "rewards/rejected": -50.485069274902344, "step": 5987 }, { "epoch": 0.815359477124183, "grad_norm": 46.848125787755876, "learning_rate": 8.025804430438791e-08, "logits/chosen": 14.281978607177734, "logits/rejected": 13.967626571655273, "logps/chosen": -4.624711036682129, "logps/rejected": -4.617627143859863, "loss": 4.4535, "rewards/accuracies": 0.5, "rewards/chosen": -46.247108459472656, "rewards/margins": -0.07083606719970703, "rewards/rejected": -46.17626953125, "step": 5988 }, { "epoch": 0.8154956427015251, "grad_norm": 44.10237875481287, "learning_rate": 8.014383290351086e-08, "logits/chosen": 13.72697639465332, "logits/rejected": 13.852288246154785, "logps/chosen": -4.42156457901001, "logps/rejected": -4.422795295715332, "loss": 3.6826, "rewards/accuracies": 0.5, "rewards/chosen": -44.21564483642578, "rewards/margins": 0.012304306030273438, "rewards/rejected": -44.22795104980469, "step": 5989 }, { "epoch": 0.815631808278867, "grad_norm": 40.04298263990506, "learning_rate": 8.002969377677864e-08, "logits/chosen": 15.069334030151367, "logits/rejected": 14.985847473144531, "logps/chosen": -4.819337844848633, "logps/rejected": -4.898970603942871, "loss": 4.2484, "rewards/accuracies": 0.5, "rewards/chosen": -48.19337844848633, "rewards/margins": 0.7963275909423828, "rewards/rejected": -48.98970413208008, "step": 5990 }, { "epoch": 0.8157679738562091, "grad_norm": 45.789486201253645, "learning_rate": 7.991562694998197e-08, "logits/chosen": 13.170194625854492, "logits/rejected": 13.432563781738281, "logps/chosen": -4.081509113311768, "logps/rejected": -4.291446685791016, "loss": 3.6711, "rewards/accuracies": 0.75, "rewards/chosen": -40.81509017944336, "rewards/margins": 2.0993757247924805, "rewards/rejected": -42.914466857910156, "step": 5991 }, { "epoch": 0.8159041394335512, "grad_norm": 40.773679205327085, "learning_rate": 7.980163244889527e-08, "logits/chosen": 14.339488983154297, "logits/rejected": 15.266176223754883, "logps/chosen": -4.377316474914551, "logps/rejected": -4.753547668457031, "loss": 4.104, "rewards/accuracies": 0.75, "rewards/chosen": -43.773170471191406, "rewards/margins": 3.762308120727539, "rewards/rejected": -47.53547668457031, "step": 5992 }, { "epoch": 0.8160403050108932, "grad_norm": 42.3995247475114, "learning_rate": 7.968771029927662e-08, "logits/chosen": 13.708053588867188, "logits/rejected": 13.943464279174805, "logps/chosen": -4.131507396697998, "logps/rejected": -4.442996978759766, "loss": 3.912, "rewards/accuracies": 0.75, "rewards/chosen": -41.3150749206543, "rewards/margins": 3.114893913269043, "rewards/rejected": -44.42996597290039, "step": 5993 }, { "epoch": 0.8161764705882353, "grad_norm": 66.64701137188845, "learning_rate": 7.957386052686743e-08, "logits/chosen": 13.018975257873535, "logits/rejected": 13.913990020751953, "logps/chosen": -4.026711463928223, "logps/rejected": -4.380043029785156, "loss": 3.6279, "rewards/accuracies": 0.5, "rewards/chosen": -40.26710891723633, "rewards/margins": 3.533320426940918, "rewards/rejected": -43.80043029785156, "step": 5994 }, { "epoch": 0.8163126361655774, "grad_norm": 50.0061813904923, "learning_rate": 7.946008315739314e-08, "logits/chosen": 13.782017707824707, "logits/rejected": 14.402687072753906, "logps/chosen": -4.219215393066406, "logps/rejected": -4.602672100067139, "loss": 4.8461, "rewards/accuracies": 1.0, "rewards/chosen": -42.19215393066406, "rewards/margins": 3.8345651626586914, "rewards/rejected": -46.02671813964844, "step": 5995 }, { "epoch": 0.8164488017429193, "grad_norm": 43.226292087777765, "learning_rate": 7.934637821656274e-08, "logits/chosen": 13.948070526123047, "logits/rejected": 14.377225875854492, "logps/chosen": -4.499520301818848, "logps/rejected": -4.493375778198242, "loss": 3.9078, "rewards/accuracies": 0.25, "rewards/chosen": -44.995208740234375, "rewards/margins": -0.06145000457763672, "rewards/rejected": -44.933753967285156, "step": 5996 }, { "epoch": 0.8165849673202614, "grad_norm": 40.711065484802425, "learning_rate": 7.923274573006864e-08, "logits/chosen": 14.290773391723633, "logits/rejected": 14.238618850708008, "logps/chosen": -4.471385955810547, "logps/rejected": -4.789969444274902, "loss": 3.4221, "rewards/accuracies": 1.0, "rewards/chosen": -44.71385955810547, "rewards/margins": 3.1858348846435547, "rewards/rejected": -47.89969253540039, "step": 5997 }, { "epoch": 0.8167211328976035, "grad_norm": 40.71887842150009, "learning_rate": 7.911918572358715e-08, "logits/chosen": 15.395668029785156, "logits/rejected": 15.093093872070312, "logps/chosen": -4.556219100952148, "logps/rejected": -4.661343574523926, "loss": 3.6558, "rewards/accuracies": 0.75, "rewards/chosen": -45.56218719482422, "rewards/margins": 1.0512456893920898, "rewards/rejected": -46.613433837890625, "step": 5998 }, { "epoch": 0.8168572984749455, "grad_norm": 45.373985876160766, "learning_rate": 7.900569822277807e-08, "logits/chosen": 13.808727264404297, "logits/rejected": 14.85534954071045, "logps/chosen": -4.21833610534668, "logps/rejected": -4.537600994110107, "loss": 4.1436, "rewards/accuracies": 0.5, "rewards/chosen": -42.18335723876953, "rewards/margins": 3.1926498413085938, "rewards/rejected": -45.376007080078125, "step": 5999 }, { "epoch": 0.8169934640522876, "grad_norm": 41.395692855567845, "learning_rate": 7.889228325328496e-08, "logits/chosen": 13.174192428588867, "logits/rejected": 14.175813674926758, "logps/chosen": -4.171985149383545, "logps/rejected": -4.67363166809082, "loss": 3.9243, "rewards/accuracies": 0.75, "rewards/chosen": -41.719852447509766, "rewards/margins": 5.016463279724121, "rewards/rejected": -46.73631286621094, "step": 6000 }, { "epoch": 0.8171296296296297, "grad_norm": 52.560918481547745, "learning_rate": 7.877894084073462e-08, "logits/chosen": 14.149629592895508, "logits/rejected": 15.067869186401367, "logps/chosen": -4.618329048156738, "logps/rejected": -4.714743614196777, "loss": 4.6868, "rewards/accuracies": 0.5, "rewards/chosen": -46.18328857421875, "rewards/margins": 0.9641485214233398, "rewards/rejected": -47.147438049316406, "step": 6001 }, { "epoch": 0.8172657952069716, "grad_norm": 41.68736090778014, "learning_rate": 7.866567101073785e-08, "logits/chosen": 14.619134902954102, "logits/rejected": 14.030467987060547, "logps/chosen": -4.849210262298584, "logps/rejected": -4.794585704803467, "loss": 3.982, "rewards/accuracies": 0.5, "rewards/chosen": -48.492103576660156, "rewards/margins": -0.5462446212768555, "rewards/rejected": -47.94585418701172, "step": 6002 }, { "epoch": 0.8174019607843137, "grad_norm": 44.52133794255285, "learning_rate": 7.8552473788889e-08, "logits/chosen": 14.390113830566406, "logits/rejected": 14.841157913208008, "logps/chosen": -4.738980770111084, "logps/rejected": -4.909950256347656, "loss": 3.98, "rewards/accuracies": 0.75, "rewards/chosen": -47.389808654785156, "rewards/margins": 1.7096967697143555, "rewards/rejected": -49.09950256347656, "step": 6003 }, { "epoch": 0.8175381263616558, "grad_norm": 41.71450492059725, "learning_rate": 7.84393492007657e-08, "logits/chosen": 15.11579704284668, "logits/rejected": 15.262287139892578, "logps/chosen": -4.425032615661621, "logps/rejected": -4.714179039001465, "loss": 3.8736, "rewards/accuracies": 1.0, "rewards/chosen": -44.25032043457031, "rewards/margins": 2.8914670944213867, "rewards/rejected": -47.141788482666016, "step": 6004 }, { "epoch": 0.8176742919389978, "grad_norm": 41.717983581000006, "learning_rate": 7.83262972719295e-08, "logits/chosen": 13.844987869262695, "logits/rejected": 13.373100280761719, "logps/chosen": -4.323237419128418, "logps/rejected": -4.514610767364502, "loss": 4.5774, "rewards/accuracies": 0.5, "rewards/chosen": -43.23237609863281, "rewards/margins": 1.9137296676635742, "rewards/rejected": -45.1461067199707, "step": 6005 }, { "epoch": 0.8178104575163399, "grad_norm": 37.96002197944764, "learning_rate": 7.82133180279255e-08, "logits/chosen": 13.80588436126709, "logits/rejected": 14.56706428527832, "logps/chosen": -4.232714653015137, "logps/rejected": -4.481884956359863, "loss": 3.4497, "rewards/accuracies": 0.75, "rewards/chosen": -42.32714080810547, "rewards/margins": 2.4917097091674805, "rewards/rejected": -44.81885528564453, "step": 6006 }, { "epoch": 0.8179466230936819, "grad_norm": 45.175672424305425, "learning_rate": 7.810041149428213e-08, "logits/chosen": 14.69601058959961, "logits/rejected": 14.528158187866211, "logps/chosen": -4.858743667602539, "logps/rejected": -4.815365791320801, "loss": 4.1426, "rewards/accuracies": 0.25, "rewards/chosen": -48.58743667602539, "rewards/margins": -0.43378162384033203, "rewards/rejected": -48.153656005859375, "step": 6007 }, { "epoch": 0.818082788671024, "grad_norm": 44.52346138788648, "learning_rate": 7.798757769651159e-08, "logits/chosen": 14.017135620117188, "logits/rejected": 14.656637191772461, "logps/chosen": -4.269618034362793, "logps/rejected": -4.730622291564941, "loss": 4.1954, "rewards/accuracies": 0.75, "rewards/chosen": -42.69618225097656, "rewards/margins": 4.610044479370117, "rewards/rejected": -47.30622863769531, "step": 6008 }, { "epoch": 0.818218954248366, "grad_norm": 42.295952888728124, "learning_rate": 7.78748166601098e-08, "logits/chosen": 13.683147430419922, "logits/rejected": 14.114256858825684, "logps/chosen": -4.111961841583252, "logps/rejected": -4.494727611541748, "loss": 3.7769, "rewards/accuracies": 0.75, "rewards/chosen": -41.11961364746094, "rewards/margins": 3.82766056060791, "rewards/rejected": -44.9472770690918, "step": 6009 }, { "epoch": 0.8183551198257081, "grad_norm": 45.80446166340541, "learning_rate": 7.776212841055576e-08, "logits/chosen": 14.476516723632812, "logits/rejected": 14.86734676361084, "logps/chosen": -4.459993362426758, "logps/rejected": -4.743666172027588, "loss": 3.985, "rewards/accuracies": 0.75, "rewards/chosen": -44.59992980957031, "rewards/margins": 2.8367271423339844, "rewards/rejected": -47.4366569519043, "step": 6010 }, { "epoch": 0.8184912854030502, "grad_norm": 41.620442898638835, "learning_rate": 7.764951297331248e-08, "logits/chosen": 13.836191177368164, "logits/rejected": 14.474963188171387, "logps/chosen": -4.326783180236816, "logps/rejected": -4.820408344268799, "loss": 3.7316, "rewards/accuracies": 0.75, "rewards/chosen": -43.26782989501953, "rewards/margins": 4.936253547668457, "rewards/rejected": -48.20408248901367, "step": 6011 }, { "epoch": 0.8186274509803921, "grad_norm": 42.70476060381954, "learning_rate": 7.753697037382641e-08, "logits/chosen": 14.37193489074707, "logits/rejected": 14.73953914642334, "logps/chosen": -4.558926582336426, "logps/rejected": -4.874338626861572, "loss": 3.478, "rewards/accuracies": 0.75, "rewards/chosen": -45.58926773071289, "rewards/margins": 3.1541194915771484, "rewards/rejected": -48.743385314941406, "step": 6012 }, { "epoch": 0.8187636165577342, "grad_norm": 40.508285797756194, "learning_rate": 7.742450063752728e-08, "logits/chosen": 13.541839599609375, "logits/rejected": 14.457833290100098, "logps/chosen": -4.248723030090332, "logps/rejected": -4.56154727935791, "loss": 3.841, "rewards/accuracies": 0.5, "rewards/chosen": -42.48723220825195, "rewards/margins": 3.1282424926757812, "rewards/rejected": -45.61547088623047, "step": 6013 }, { "epoch": 0.8188997821350763, "grad_norm": 40.35371790891467, "learning_rate": 7.731210378982868e-08, "logits/chosen": 15.145612716674805, "logits/rejected": 15.298017501831055, "logps/chosen": -4.545915603637695, "logps/rejected": -4.695441246032715, "loss": 3.5223, "rewards/accuracies": 0.75, "rewards/chosen": -45.45915603637695, "rewards/margins": 1.4952564239501953, "rewards/rejected": -46.954410552978516, "step": 6014 }, { "epoch": 0.8190359477124183, "grad_norm": 48.3823707398096, "learning_rate": 7.71997798561277e-08, "logits/chosen": 14.343029022216797, "logits/rejected": 14.557390213012695, "logps/chosen": -4.488602638244629, "logps/rejected": -5.038233757019043, "loss": 4.1338, "rewards/accuracies": 1.0, "rewards/chosen": -44.88602066040039, "rewards/margins": 5.496316909790039, "rewards/rejected": -50.38233947753906, "step": 6015 }, { "epoch": 0.8191721132897604, "grad_norm": 39.81007741175804, "learning_rate": 7.708752886180465e-08, "logits/chosen": 13.277530670166016, "logits/rejected": 13.212728500366211, "logps/chosen": -4.083587646484375, "logps/rejected": -4.275666236877441, "loss": 3.3892, "rewards/accuracies": 0.5, "rewards/chosen": -40.835880279541016, "rewards/margins": 1.9207849502563477, "rewards/rejected": -42.75666046142578, "step": 6016 }, { "epoch": 0.8193082788671024, "grad_norm": 46.94050557645067, "learning_rate": 7.697535083222363e-08, "logits/chosen": 14.522079467773438, "logits/rejected": 13.661188125610352, "logps/chosen": -4.520146369934082, "logps/rejected": -4.376888275146484, "loss": 4.2473, "rewards/accuracies": 0.25, "rewards/chosen": -45.20146179199219, "rewards/margins": -1.4325790405273438, "rewards/rejected": -43.768882751464844, "step": 6017 }, { "epoch": 0.8194444444444444, "grad_norm": 41.33373774150663, "learning_rate": 7.686324579273242e-08, "logits/chosen": 13.687393188476562, "logits/rejected": 14.607616424560547, "logps/chosen": -4.0246052742004395, "logps/rejected": -4.404346466064453, "loss": 3.8597, "rewards/accuracies": 1.0, "rewards/chosen": -40.246055603027344, "rewards/margins": 3.7974119186401367, "rewards/rejected": -44.04346466064453, "step": 6018 }, { "epoch": 0.8195806100217865, "grad_norm": 46.29398145446134, "learning_rate": 7.675121376866176e-08, "logits/chosen": 13.909353256225586, "logits/rejected": 14.743000030517578, "logps/chosen": -4.234498977661133, "logps/rejected": -4.8199920654296875, "loss": 4.2961, "rewards/accuracies": 1.0, "rewards/chosen": -42.344993591308594, "rewards/margins": 5.854931831359863, "rewards/rejected": -48.19992446899414, "step": 6019 }, { "epoch": 0.8197167755991286, "grad_norm": 45.30658229849743, "learning_rate": 7.663925478532633e-08, "logits/chosen": 13.139104843139648, "logits/rejected": 14.43150520324707, "logps/chosen": -4.158872604370117, "logps/rejected": -4.73300838470459, "loss": 4.1785, "rewards/accuracies": 1.0, "rewards/chosen": -41.58872604370117, "rewards/margins": 5.74135684967041, "rewards/rejected": -47.33008575439453, "step": 6020 }, { "epoch": 0.8198529411764706, "grad_norm": 64.94179534870095, "learning_rate": 7.652736886802431e-08, "logits/chosen": 14.810487747192383, "logits/rejected": 14.804001808166504, "logps/chosen": -4.804013252258301, "logps/rejected": -4.776101112365723, "loss": 3.8156, "rewards/accuracies": 0.5, "rewards/chosen": -48.040130615234375, "rewards/margins": -0.27912235260009766, "rewards/rejected": -47.761009216308594, "step": 6021 }, { "epoch": 0.8199891067538126, "grad_norm": 36.76124050098644, "learning_rate": 7.641555604203707e-08, "logits/chosen": 13.646617889404297, "logits/rejected": 15.06253433227539, "logps/chosen": -4.481683731079102, "logps/rejected": -4.68145751953125, "loss": 3.7383, "rewards/accuracies": 0.5, "rewards/chosen": -44.81683349609375, "rewards/margins": 1.9977388381958008, "rewards/rejected": -46.8145751953125, "step": 6022 }, { "epoch": 0.8201252723311547, "grad_norm": 47.71563582148405, "learning_rate": 7.630381633262972e-08, "logits/chosen": 13.924763679504395, "logits/rejected": 14.887331008911133, "logps/chosen": -4.263131141662598, "logps/rejected": -4.780566215515137, "loss": 4.1106, "rewards/accuracies": 0.75, "rewards/chosen": -42.631317138671875, "rewards/margins": 5.174346923828125, "rewards/rejected": -47.8056640625, "step": 6023 }, { "epoch": 0.8202614379084967, "grad_norm": 37.76884784163231, "learning_rate": 7.619214976505089e-08, "logits/chosen": 14.070272445678711, "logits/rejected": 14.59512710571289, "logps/chosen": -4.5568037033081055, "logps/rejected": -4.905632019042969, "loss": 3.6981, "rewards/accuracies": 0.5, "rewards/chosen": -45.56803512573242, "rewards/margins": 3.4882850646972656, "rewards/rejected": -49.05632019042969, "step": 6024 }, { "epoch": 0.8203976034858388, "grad_norm": 41.02925983398805, "learning_rate": 7.608055636453228e-08, "logits/chosen": 13.705631256103516, "logits/rejected": 13.923505783081055, "logps/chosen": -4.357579231262207, "logps/rejected": -4.740560531616211, "loss": 4.0341, "rewards/accuracies": 1.0, "rewards/chosen": -43.57579040527344, "rewards/margins": 3.829813003540039, "rewards/rejected": -47.405601501464844, "step": 6025 }, { "epoch": 0.8205337690631809, "grad_norm": 42.25169712297809, "learning_rate": 7.596903615628955e-08, "logits/chosen": 13.977289199829102, "logits/rejected": 15.193534851074219, "logps/chosen": -4.420406818389893, "logps/rejected": -4.933014869689941, "loss": 3.9749, "rewards/accuracies": 0.75, "rewards/chosen": -44.20406723022461, "rewards/margins": 5.126080513000488, "rewards/rejected": -49.33015060424805, "step": 6026 }, { "epoch": 0.8206699346405228, "grad_norm": 39.71756153468852, "learning_rate": 7.585758916552167e-08, "logits/chosen": 14.305583000183105, "logits/rejected": 14.18179702758789, "logps/chosen": -4.320496559143066, "logps/rejected": -4.4118852615356445, "loss": 3.9903, "rewards/accuracies": 0.5, "rewards/chosen": -43.20496368408203, "rewards/margins": 0.9138917922973633, "rewards/rejected": -44.11885452270508, "step": 6027 }, { "epoch": 0.8208061002178649, "grad_norm": 48.47243095860361, "learning_rate": 7.57462154174108e-08, "logits/chosen": 12.86971664428711, "logits/rejected": 13.524547576904297, "logps/chosen": -4.151023864746094, "logps/rejected": -4.387146472930908, "loss": 4.1195, "rewards/accuracies": 0.75, "rewards/chosen": -41.51023864746094, "rewards/margins": 2.361227035522461, "rewards/rejected": -43.87146759033203, "step": 6028 }, { "epoch": 0.820942265795207, "grad_norm": 43.61405729092713, "learning_rate": 7.563491493712284e-08, "logits/chosen": 14.43122673034668, "logits/rejected": 14.6258544921875, "logps/chosen": -4.670180320739746, "logps/rejected": -4.600507736206055, "loss": 4.3286, "rewards/accuracies": 0.5, "rewards/chosen": -46.70180130004883, "rewards/margins": -0.6967229843139648, "rewards/rejected": -46.00507354736328, "step": 6029 }, { "epoch": 0.821078431372549, "grad_norm": 42.54825301467318, "learning_rate": 7.55236877498072e-08, "logits/chosen": 14.126518249511719, "logits/rejected": 14.731502532958984, "logps/chosen": -4.362906455993652, "logps/rejected": -4.586627006530762, "loss": 4.1154, "rewards/accuracies": 0.5, "rewards/chosen": -43.629066467285156, "rewards/margins": 2.237203598022461, "rewards/rejected": -45.86627197265625, "step": 6030 }, { "epoch": 0.8212145969498911, "grad_norm": 46.0008587346091, "learning_rate": 7.541253388059634e-08, "logits/chosen": 14.792364120483398, "logits/rejected": 14.395086288452148, "logps/chosen": -4.771799087524414, "logps/rejected": -4.600842475891113, "loss": 3.7071, "rewards/accuracies": 0.25, "rewards/chosen": -47.717994689941406, "rewards/margins": -1.709568977355957, "rewards/rejected": -46.0084228515625, "step": 6031 }, { "epoch": 0.8213507625272332, "grad_norm": 43.19450881897328, "learning_rate": 7.530145335460654e-08, "logits/chosen": 14.43800163269043, "logits/rejected": 15.811010360717773, "logps/chosen": -4.391613006591797, "logps/rejected": -5.06016731262207, "loss": 3.8025, "rewards/accuracies": 1.0, "rewards/chosen": -43.91613006591797, "rewards/margins": 6.685535430908203, "rewards/rejected": -50.60166931152344, "step": 6032 }, { "epoch": 0.8214869281045751, "grad_norm": 44.60403672029883, "learning_rate": 7.519044619693744e-08, "logits/chosen": 14.472103118896484, "logits/rejected": 14.085406303405762, "logps/chosen": -4.647116184234619, "logps/rejected": -4.52299690246582, "loss": 4.4515, "rewards/accuracies": 0.25, "rewards/chosen": -46.47116470336914, "rewards/margins": -1.241196632385254, "rewards/rejected": -45.22996520996094, "step": 6033 }, { "epoch": 0.8216230936819172, "grad_norm": 41.87358613833732, "learning_rate": 7.507951243267183e-08, "logits/chosen": 13.995009422302246, "logits/rejected": 13.810298919677734, "logps/chosen": -4.4439167976379395, "logps/rejected": -4.552236557006836, "loss": 4.1387, "rewards/accuracies": 0.75, "rewards/chosen": -44.439170837402344, "rewards/margins": 1.0831966400146484, "rewards/rejected": -45.52236557006836, "step": 6034 }, { "epoch": 0.8217592592592593, "grad_norm": 46.39558894267889, "learning_rate": 7.49686520868762e-08, "logits/chosen": 14.61752700805664, "logits/rejected": 14.897412300109863, "logps/chosen": -4.6092119216918945, "logps/rejected": -4.650601387023926, "loss": 4.2813, "rewards/accuracies": 0.5, "rewards/chosen": -46.09211730957031, "rewards/margins": 0.4138965606689453, "rewards/rejected": -46.506011962890625, "step": 6035 }, { "epoch": 0.8218954248366013, "grad_norm": 40.88380297689759, "learning_rate": 7.485786518460045e-08, "logits/chosen": 14.463798522949219, "logits/rejected": 14.840081214904785, "logps/chosen": -4.738067626953125, "logps/rejected": -5.0094733238220215, "loss": 3.5567, "rewards/accuracies": 0.75, "rewards/chosen": -47.38067626953125, "rewards/margins": 2.714056968688965, "rewards/rejected": -50.09473419189453, "step": 6036 }, { "epoch": 0.8220315904139434, "grad_norm": 38.4131514902211, "learning_rate": 7.474715175087763e-08, "logits/chosen": 13.705766677856445, "logits/rejected": 15.062323570251465, "logps/chosen": -4.399770736694336, "logps/rejected": -4.782490253448486, "loss": 3.7585, "rewards/accuracies": 0.75, "rewards/chosen": -43.997711181640625, "rewards/margins": 3.8271923065185547, "rewards/rejected": -47.82490158081055, "step": 6037 }, { "epoch": 0.8221677559912854, "grad_norm": 42.80629999610187, "learning_rate": 7.463651181072444e-08, "logits/chosen": 13.84878158569336, "logits/rejected": 14.277839660644531, "logps/chosen": -4.4122138023376465, "logps/rejected": -4.769591331481934, "loss": 3.8749, "rewards/accuracies": 0.75, "rewards/chosen": -44.122135162353516, "rewards/margins": 3.5737762451171875, "rewards/rejected": -47.6959114074707, "step": 6038 }, { "epoch": 0.8223039215686274, "grad_norm": 41.40164948942345, "learning_rate": 7.4525945389141e-08, "logits/chosen": 13.506304740905762, "logits/rejected": 13.898422241210938, "logps/chosen": -4.612300395965576, "logps/rejected": -4.6077656745910645, "loss": 3.7561, "rewards/accuracies": 0.5, "rewards/chosen": -46.12300491333008, "rewards/margins": -0.04534626007080078, "rewards/rejected": -46.077659606933594, "step": 6039 }, { "epoch": 0.8224400871459695, "grad_norm": 42.91798675699595, "learning_rate": 7.441545251111047e-08, "logits/chosen": 14.026148796081543, "logits/rejected": 14.907052040100098, "logps/chosen": -4.68317985534668, "logps/rejected": -4.705562591552734, "loss": 4.1726, "rewards/accuracies": 0.75, "rewards/chosen": -46.8317985534668, "rewards/margins": 0.22382545471191406, "rewards/rejected": -47.055625915527344, "step": 6040 }, { "epoch": 0.8225762527233116, "grad_norm": 42.661377900566485, "learning_rate": 7.430503320159975e-08, "logits/chosen": 14.474313735961914, "logits/rejected": 14.959654808044434, "logps/chosen": -4.6750807762146, "logps/rejected": -4.967467308044434, "loss": 4.0109, "rewards/accuracies": 0.5, "rewards/chosen": -46.75080871582031, "rewards/margins": 2.9238710403442383, "rewards/rejected": -49.67467498779297, "step": 6041 }, { "epoch": 0.8227124183006536, "grad_norm": 42.70744807041955, "learning_rate": 7.419468748555915e-08, "logits/chosen": 14.778543472290039, "logits/rejected": 15.544974327087402, "logps/chosen": -4.706404685974121, "logps/rejected": -5.171978950500488, "loss": 3.8347, "rewards/accuracies": 0.75, "rewards/chosen": -47.064048767089844, "rewards/margins": 4.655745506286621, "rewards/rejected": -51.71979522705078, "step": 6042 }, { "epoch": 0.8228485838779956, "grad_norm": 43.37021251664664, "learning_rate": 7.408441538792187e-08, "logits/chosen": 13.737449645996094, "logits/rejected": 13.688862800598145, "logps/chosen": -4.333797931671143, "logps/rejected": -4.594644069671631, "loss": 3.7045, "rewards/accuracies": 1.0, "rewards/chosen": -43.337982177734375, "rewards/margins": 2.608461380004883, "rewards/rejected": -45.946441650390625, "step": 6043 }, { "epoch": 0.8229847494553377, "grad_norm": 41.510737955014505, "learning_rate": 7.397421693360506e-08, "logits/chosen": 14.085724830627441, "logits/rejected": 14.74032211303711, "logps/chosen": -4.27551794052124, "logps/rejected": -4.683884620666504, "loss": 3.4037, "rewards/accuracies": 1.0, "rewards/chosen": -42.75518035888672, "rewards/margins": 4.083666801452637, "rewards/rejected": -46.838844299316406, "step": 6044 }, { "epoch": 0.8231209150326797, "grad_norm": 43.73370153343419, "learning_rate": 7.386409214750893e-08, "logits/chosen": 14.352783203125, "logits/rejected": 15.019609451293945, "logps/chosen": -4.337759971618652, "logps/rejected": -4.970550537109375, "loss": 3.9255, "rewards/accuracies": 1.0, "rewards/chosen": -43.37759780883789, "rewards/margins": 6.327907562255859, "rewards/rejected": -49.70550537109375, "step": 6045 }, { "epoch": 0.8232570806100218, "grad_norm": 44.477535045930274, "learning_rate": 7.375404105451699e-08, "logits/chosen": 14.23565673828125, "logits/rejected": 14.224596977233887, "logps/chosen": -4.614267349243164, "logps/rejected": -4.735851287841797, "loss": 4.421, "rewards/accuracies": 0.75, "rewards/chosen": -46.142669677734375, "rewards/margins": 1.2158432006835938, "rewards/rejected": -47.35851287841797, "step": 6046 }, { "epoch": 0.8233932461873639, "grad_norm": 40.10498041134358, "learning_rate": 7.364406367949621e-08, "logits/chosen": 14.633441925048828, "logits/rejected": 14.050064086914062, "logps/chosen": -4.772497177124023, "logps/rejected": -4.824873447418213, "loss": 3.589, "rewards/accuracies": 0.75, "rewards/chosen": -47.7249755859375, "rewards/margins": 0.5237627029418945, "rewards/rejected": -48.24873352050781, "step": 6047 }, { "epoch": 0.8235294117647058, "grad_norm": 44.80500598457292, "learning_rate": 7.353416004729705e-08, "logits/chosen": 13.779814720153809, "logits/rejected": 13.69771957397461, "logps/chosen": -4.376428604125977, "logps/rejected": -4.439965724945068, "loss": 3.7283, "rewards/accuracies": 0.25, "rewards/chosen": -43.7642822265625, "rewards/margins": 0.6353740692138672, "rewards/rejected": -44.399658203125, "step": 6048 }, { "epoch": 0.8236655773420479, "grad_norm": 48.75296672972339, "learning_rate": 7.342433018275289e-08, "logits/chosen": 14.541112899780273, "logits/rejected": 14.638211250305176, "logps/chosen": -4.5426225662231445, "logps/rejected": -4.8063154220581055, "loss": 3.8156, "rewards/accuracies": 0.75, "rewards/chosen": -45.42622756958008, "rewards/margins": 2.6369314193725586, "rewards/rejected": -48.06315612792969, "step": 6049 }, { "epoch": 0.82380174291939, "grad_norm": 40.52716419266731, "learning_rate": 7.331457411068088e-08, "logits/chosen": 13.863149642944336, "logits/rejected": 14.214122772216797, "logps/chosen": -4.426251411437988, "logps/rejected": -4.648735046386719, "loss": 4.1616, "rewards/accuracies": 0.5, "rewards/chosen": -44.26251220703125, "rewards/margins": 2.224837303161621, "rewards/rejected": -46.48735046386719, "step": 6050 }, { "epoch": 0.823937908496732, "grad_norm": 41.90357123786133, "learning_rate": 7.320489185588132e-08, "logits/chosen": 14.095892906188965, "logits/rejected": 13.511802673339844, "logps/chosen": -4.556178569793701, "logps/rejected": -4.539336204528809, "loss": 3.7328, "rewards/accuracies": 0.5, "rewards/chosen": -45.56178665161133, "rewards/margins": -0.168426513671875, "rewards/rejected": -45.39336395263672, "step": 6051 }, { "epoch": 0.8240740740740741, "grad_norm": 40.49105407763186, "learning_rate": 7.309528344313766e-08, "logits/chosen": 13.828752517700195, "logits/rejected": 14.605684280395508, "logps/chosen": -4.568808078765869, "logps/rejected": -4.782588005065918, "loss": 4.2278, "rewards/accuracies": 0.75, "rewards/chosen": -45.688079833984375, "rewards/margins": 2.1377992630004883, "rewards/rejected": -47.82588195800781, "step": 6052 }, { "epoch": 0.8242102396514162, "grad_norm": 41.06172502249719, "learning_rate": 7.298574889721694e-08, "logits/chosen": 14.272636413574219, "logits/rejected": 14.966309547424316, "logps/chosen": -4.384452819824219, "logps/rejected": -4.985260963439941, "loss": 3.5987, "rewards/accuracies": 1.0, "rewards/chosen": -43.84452819824219, "rewards/margins": 6.0080766677856445, "rewards/rejected": -49.85260772705078, "step": 6053 }, { "epoch": 0.8243464052287581, "grad_norm": 40.72042898465925, "learning_rate": 7.287628824286951e-08, "logits/chosen": 13.810052871704102, "logits/rejected": 14.29554557800293, "logps/chosen": -4.406274318695068, "logps/rejected": -4.4993062019348145, "loss": 4.0931, "rewards/accuracies": 0.75, "rewards/chosen": -44.062744140625, "rewards/margins": 0.9303178787231445, "rewards/rejected": -44.99306106567383, "step": 6054 }, { "epoch": 0.8244825708061002, "grad_norm": 41.45007353564835, "learning_rate": 7.276690150482862e-08, "logits/chosen": 14.013740539550781, "logits/rejected": 14.195833206176758, "logps/chosen": -4.54670524597168, "logps/rejected": -4.615396499633789, "loss": 4.1103, "rewards/accuracies": 0.25, "rewards/chosen": -45.46705627441406, "rewards/margins": 0.6869134902954102, "rewards/rejected": -46.153968811035156, "step": 6055 }, { "epoch": 0.8246187363834423, "grad_norm": 46.184391974148944, "learning_rate": 7.265758870781132e-08, "logits/chosen": 14.793947219848633, "logits/rejected": 14.932818412780762, "logps/chosen": -4.814327239990234, "logps/rejected": -4.6902594566345215, "loss": 3.9007, "rewards/accuracies": 0.25, "rewards/chosen": -48.14327621459961, "rewards/margins": -1.2406787872314453, "rewards/rejected": -46.90259552001953, "step": 6056 }, { "epoch": 0.8247549019607843, "grad_norm": 43.90153860432903, "learning_rate": 7.254834987651781e-08, "logits/chosen": 15.259958267211914, "logits/rejected": 14.595428466796875, "logps/chosen": -5.048829078674316, "logps/rejected": -4.680179119110107, "loss": 3.9415, "rewards/accuracies": 0.25, "rewards/chosen": -50.48828887939453, "rewards/margins": -3.6865005493164062, "rewards/rejected": -46.801788330078125, "step": 6057 }, { "epoch": 0.8248910675381264, "grad_norm": 44.04248125831955, "learning_rate": 7.243918503563122e-08, "logits/chosen": 14.443925857543945, "logits/rejected": 14.04746150970459, "logps/chosen": -4.983578205108643, "logps/rejected": -4.769674301147461, "loss": 4.2569, "rewards/accuracies": 0.5, "rewards/chosen": -49.835784912109375, "rewards/margins": -2.1390380859375, "rewards/rejected": -47.69674301147461, "step": 6058 }, { "epoch": 0.8250272331154684, "grad_norm": 42.99161822291927, "learning_rate": 7.233009420981849e-08, "logits/chosen": 14.25444221496582, "logits/rejected": 14.925470352172852, "logps/chosen": -4.619034767150879, "logps/rejected": -4.813818454742432, "loss": 3.8423, "rewards/accuracies": 0.75, "rewards/chosen": -46.190345764160156, "rewards/margins": 1.947835922241211, "rewards/rejected": -48.13818359375, "step": 6059 }, { "epoch": 0.8251633986928104, "grad_norm": 43.98487650258626, "learning_rate": 7.222107742372957e-08, "logits/chosen": 13.697402954101562, "logits/rejected": 14.223770141601562, "logps/chosen": -4.443617820739746, "logps/rejected": -4.565984725952148, "loss": 4.2075, "rewards/accuracies": 0.75, "rewards/chosen": -44.43617248535156, "rewards/margins": 1.223672866821289, "rewards/rejected": -45.65985107421875, "step": 6060 }, { "epoch": 0.8252995642701525, "grad_norm": 39.482699196126966, "learning_rate": 7.211213470199755e-08, "logits/chosen": 14.013307571411133, "logits/rejected": 14.24814224243164, "logps/chosen": -4.6708269119262695, "logps/rejected": -4.589913368225098, "loss": 3.6514, "rewards/accuracies": 0.5, "rewards/chosen": -46.70826721191406, "rewards/margins": -0.8091335296630859, "rewards/rejected": -45.899131774902344, "step": 6061 }, { "epoch": 0.8254357298474946, "grad_norm": 46.95719772133083, "learning_rate": 7.200326606923908e-08, "logits/chosen": 13.653738975524902, "logits/rejected": 14.844185829162598, "logps/chosen": -4.329349517822266, "logps/rejected": -4.930691242218018, "loss": 3.707, "rewards/accuracies": 1.0, "rewards/chosen": -43.29349136352539, "rewards/margins": 6.013421058654785, "rewards/rejected": -49.306915283203125, "step": 6062 }, { "epoch": 0.8255718954248366, "grad_norm": 39.880439095648356, "learning_rate": 7.189447155005397e-08, "logits/chosen": 14.303956985473633, "logits/rejected": 14.390336990356445, "logps/chosen": -4.3622236251831055, "logps/rejected": -4.606077194213867, "loss": 3.7368, "rewards/accuracies": 0.5, "rewards/chosen": -43.62223434448242, "rewards/margins": 2.438535690307617, "rewards/rejected": -46.060768127441406, "step": 6063 }, { "epoch": 0.8257080610021786, "grad_norm": 42.12269787381658, "learning_rate": 7.178575116902506e-08, "logits/chosen": 14.854129791259766, "logits/rejected": 14.448607444763184, "logps/chosen": -4.570369720458984, "logps/rejected": -4.644229888916016, "loss": 3.4084, "rewards/accuracies": 0.75, "rewards/chosen": -45.703697204589844, "rewards/margins": 0.7385978698730469, "rewards/rejected": -46.442298889160156, "step": 6064 }, { "epoch": 0.8258442265795207, "grad_norm": 38.37495913221766, "learning_rate": 7.167710495071872e-08, "logits/chosen": 13.854917526245117, "logits/rejected": 14.278461456298828, "logps/chosen": -4.487420082092285, "logps/rejected": -4.506735324859619, "loss": 3.533, "rewards/accuracies": 0.25, "rewards/chosen": -44.874202728271484, "rewards/margins": 0.1931476593017578, "rewards/rejected": -45.06734848022461, "step": 6065 }, { "epoch": 0.8259803921568627, "grad_norm": 41.32835523329439, "learning_rate": 7.156853291968458e-08, "logits/chosen": 13.961372375488281, "logits/rejected": 14.031390190124512, "logps/chosen": -4.43761682510376, "logps/rejected": -4.386665344238281, "loss": 3.8233, "rewards/accuracies": 0.75, "rewards/chosen": -44.37616729736328, "rewards/margins": -0.5095129013061523, "rewards/rejected": -43.86665344238281, "step": 6066 }, { "epoch": 0.8261165577342048, "grad_norm": 44.86120112240614, "learning_rate": 7.146003510045516e-08, "logits/chosen": 14.613605499267578, "logits/rejected": 14.734315872192383, "logps/chosen": -4.477174758911133, "logps/rejected": -4.562650203704834, "loss": 4.0831, "rewards/accuracies": 0.75, "rewards/chosen": -44.77174377441406, "rewards/margins": 0.8547563552856445, "rewards/rejected": -45.626502990722656, "step": 6067 }, { "epoch": 0.8262527233115469, "grad_norm": 52.256021046124616, "learning_rate": 7.135161151754654e-08, "logits/chosen": 14.224308013916016, "logits/rejected": 14.315027236938477, "logps/chosen": -4.5298967361450195, "logps/rejected": -4.577735900878906, "loss": 4.2148, "rewards/accuracies": 0.75, "rewards/chosen": -45.29896545410156, "rewards/margins": 0.4783926010131836, "rewards/rejected": -45.77735900878906, "step": 6068 }, { "epoch": 0.8263888888888888, "grad_norm": 48.79951966799887, "learning_rate": 7.124326219545804e-08, "logits/chosen": 14.549543380737305, "logits/rejected": 14.825225830078125, "logps/chosen": -4.44003963470459, "logps/rejected": -4.540858268737793, "loss": 4.0424, "rewards/accuracies": 0.5, "rewards/chosen": -44.40039825439453, "rewards/margins": 1.0081796646118164, "rewards/rejected": -45.40857696533203, "step": 6069 }, { "epoch": 0.8265250544662309, "grad_norm": 42.29593887715269, "learning_rate": 7.113498715867185e-08, "logits/chosen": 13.911077499389648, "logits/rejected": 14.265509605407715, "logps/chosen": -4.547335147857666, "logps/rejected": -4.798932075500488, "loss": 3.8431, "rewards/accuracies": 0.5, "rewards/chosen": -45.473350524902344, "rewards/margins": 2.5159683227539062, "rewards/rejected": -47.98931884765625, "step": 6070 }, { "epoch": 0.826661220043573, "grad_norm": 42.516188867083024, "learning_rate": 7.102678643165378e-08, "logits/chosen": 14.862205505371094, "logits/rejected": 15.131847381591797, "logps/chosen": -4.801314353942871, "logps/rejected": -5.049325942993164, "loss": 3.6029, "rewards/accuracies": 1.0, "rewards/chosen": -48.01314163208008, "rewards/margins": 2.4801177978515625, "rewards/rejected": -50.49325942993164, "step": 6071 }, { "epoch": 0.826797385620915, "grad_norm": 39.89062094569761, "learning_rate": 7.091866003885271e-08, "logits/chosen": 15.26628303527832, "logits/rejected": 14.712328910827637, "logps/chosen": -4.857829570770264, "logps/rejected": -4.582584381103516, "loss": 4.2939, "rewards/accuracies": 0.25, "rewards/chosen": -48.57829284667969, "rewards/margins": -2.752452850341797, "rewards/rejected": -45.825843811035156, "step": 6072 }, { "epoch": 0.8269335511982571, "grad_norm": 44.30089840842812, "learning_rate": 7.08106080047005e-08, "logits/chosen": 14.05163288116455, "logits/rejected": 14.831768035888672, "logps/chosen": -4.420868873596191, "logps/rejected": -4.862396240234375, "loss": 3.5182, "rewards/accuracies": 0.75, "rewards/chosen": -44.20868682861328, "rewards/margins": 4.415277481079102, "rewards/rejected": -48.62396240234375, "step": 6073 }, { "epoch": 0.8270697167755992, "grad_norm": 40.06966180878832, "learning_rate": 7.070263035361254e-08, "logits/chosen": 13.873525619506836, "logits/rejected": 14.460525512695312, "logps/chosen": -4.603402137756348, "logps/rejected": -4.819301605224609, "loss": 4.1051, "rewards/accuracies": 0.75, "rewards/chosen": -46.034019470214844, "rewards/margins": 2.158998489379883, "rewards/rejected": -48.193016052246094, "step": 6074 }, { "epoch": 0.8272058823529411, "grad_norm": 40.35784157810186, "learning_rate": 7.059472710998737e-08, "logits/chosen": 13.99992561340332, "logits/rejected": 14.034769058227539, "logps/chosen": -4.498339653015137, "logps/rejected": -4.568355560302734, "loss": 4.1148, "rewards/accuracies": 0.25, "rewards/chosen": -44.983394622802734, "rewards/margins": 0.7001638412475586, "rewards/rejected": -45.68355941772461, "step": 6075 }, { "epoch": 0.8273420479302832, "grad_norm": 45.89084551777359, "learning_rate": 7.04868982982064e-08, "logits/chosen": 14.417797088623047, "logits/rejected": 14.930318832397461, "logps/chosen": -4.718477249145508, "logps/rejected": -4.863948822021484, "loss": 4.3476, "rewards/accuracies": 0.75, "rewards/chosen": -47.18477249145508, "rewards/margins": 1.4547147750854492, "rewards/rejected": -48.639488220214844, "step": 6076 }, { "epoch": 0.8274782135076253, "grad_norm": 40.572313093356314, "learning_rate": 7.037914394263449e-08, "logits/chosen": 15.099807739257812, "logits/rejected": 15.233970642089844, "logps/chosen": -4.7059326171875, "logps/rejected": -4.700485706329346, "loss": 4.1506, "rewards/accuracies": 0.5, "rewards/chosen": -47.059329986572266, "rewards/margins": -0.05447196960449219, "rewards/rejected": -47.00485610961914, "step": 6077 }, { "epoch": 0.8276143790849673, "grad_norm": 45.346993892872085, "learning_rate": 7.027146406761981e-08, "logits/chosen": 14.112348556518555, "logits/rejected": 14.823265075683594, "logps/chosen": -4.420815944671631, "logps/rejected": -4.859475135803223, "loss": 3.8052, "rewards/accuracies": 0.75, "rewards/chosen": -44.20815658569336, "rewards/margins": 4.386592864990234, "rewards/rejected": -48.594749450683594, "step": 6078 }, { "epoch": 0.8277505446623094, "grad_norm": 39.721279492488065, "learning_rate": 7.016385869749331e-08, "logits/chosen": 14.21021842956543, "logits/rejected": 14.89492416381836, "logps/chosen": -4.636488914489746, "logps/rejected": -5.059628486633301, "loss": 3.5393, "rewards/accuracies": 1.0, "rewards/chosen": -46.364891052246094, "rewards/margins": 4.231396675109863, "rewards/rejected": -50.59628677368164, "step": 6079 }, { "epoch": 0.8278867102396514, "grad_norm": 40.885137774661914, "learning_rate": 7.005632785656938e-08, "logits/chosen": 14.651555061340332, "logits/rejected": 14.886837005615234, "logps/chosen": -4.971344947814941, "logps/rejected": -5.098850250244141, "loss": 4.0222, "rewards/accuracies": 0.75, "rewards/chosen": -49.71344757080078, "rewards/margins": 1.275054931640625, "rewards/rejected": -50.988502502441406, "step": 6080 }, { "epoch": 0.8280228758169934, "grad_norm": 43.062082568165096, "learning_rate": 6.99488715691456e-08, "logits/chosen": 14.543900489807129, "logits/rejected": 15.179203033447266, "logps/chosen": -4.846244812011719, "logps/rejected": -5.169633865356445, "loss": 3.8351, "rewards/accuracies": 0.75, "rewards/chosen": -48.46244430541992, "rewards/margins": 3.233896255493164, "rewards/rejected": -51.69634246826172, "step": 6081 }, { "epoch": 0.8281590413943355, "grad_norm": 42.76943183214596, "learning_rate": 6.984148985950242e-08, "logits/chosen": 14.30626106262207, "logits/rejected": 15.014322280883789, "logps/chosen": -4.696127891540527, "logps/rejected": -4.754158973693848, "loss": 3.5275, "rewards/accuracies": 0.5, "rewards/chosen": -46.96127700805664, "rewards/margins": 0.5803070068359375, "rewards/rejected": -47.54158401489258, "step": 6082 }, { "epoch": 0.8282952069716776, "grad_norm": 44.89128366069269, "learning_rate": 6.973418275190374e-08, "logits/chosen": 15.528919219970703, "logits/rejected": 15.33469009399414, "logps/chosen": -4.889500617980957, "logps/rejected": -5.128881454467773, "loss": 3.9264, "rewards/accuracies": 0.75, "rewards/chosen": -48.89500427246094, "rewards/margins": 2.3938121795654297, "rewards/rejected": -51.288814544677734, "step": 6083 }, { "epoch": 0.8284313725490197, "grad_norm": 43.3031070445453, "learning_rate": 6.962695027059649e-08, "logits/chosen": 14.214765548706055, "logits/rejected": 13.2991304397583, "logps/chosen": -4.66153621673584, "logps/rejected": -4.400608062744141, "loss": 3.9071, "rewards/accuracies": 0.25, "rewards/chosen": -46.61536407470703, "rewards/margins": -2.609281539916992, "rewards/rejected": -44.00608444213867, "step": 6084 }, { "epoch": 0.8285675381263616, "grad_norm": 45.70740347114995, "learning_rate": 6.951979243981077e-08, "logits/chosen": 14.116584777832031, "logits/rejected": 14.857118606567383, "logps/chosen": -4.259733200073242, "logps/rejected": -4.844461917877197, "loss": 4.1151, "rewards/accuracies": 1.0, "rewards/chosen": -42.597328186035156, "rewards/margins": 5.847288131713867, "rewards/rejected": -48.444618225097656, "step": 6085 }, { "epoch": 0.8287037037037037, "grad_norm": 37.03333453568293, "learning_rate": 6.941270928375967e-08, "logits/chosen": 14.260360717773438, "logits/rejected": 15.192967414855957, "logps/chosen": -4.315303802490234, "logps/rejected": -4.666909694671631, "loss": 3.7204, "rewards/accuracies": 0.75, "rewards/chosen": -43.153038024902344, "rewards/margins": 3.516057014465332, "rewards/rejected": -46.66909408569336, "step": 6086 }, { "epoch": 0.8288398692810458, "grad_norm": 40.81401861588423, "learning_rate": 6.930570082663951e-08, "logits/chosen": 14.214668273925781, "logits/rejected": 14.12022590637207, "logps/chosen": -4.493093013763428, "logps/rejected": -4.466318607330322, "loss": 3.834, "rewards/accuracies": 0.5, "rewards/chosen": -44.930931091308594, "rewards/margins": -0.2677459716796875, "rewards/rejected": -44.663185119628906, "step": 6087 }, { "epoch": 0.8289760348583878, "grad_norm": 52.433079233690435, "learning_rate": 6.919876709262995e-08, "logits/chosen": 14.081655502319336, "logits/rejected": 13.902118682861328, "logps/chosen": -4.447174072265625, "logps/rejected": -4.773488998413086, "loss": 3.4569, "rewards/accuracies": 1.0, "rewards/chosen": -44.47174072265625, "rewards/margins": 3.2631492614746094, "rewards/rejected": -47.734893798828125, "step": 6088 }, { "epoch": 0.8291122004357299, "grad_norm": 40.898825666189325, "learning_rate": 6.909190810589324e-08, "logits/chosen": 14.837434768676758, "logits/rejected": 14.756318092346191, "logps/chosen": -4.712863922119141, "logps/rejected": -4.889360427856445, "loss": 3.6973, "rewards/accuracies": 0.75, "rewards/chosen": -47.128639221191406, "rewards/margins": 1.764969825744629, "rewards/rejected": -48.89360809326172, "step": 6089 }, { "epoch": 0.829248366013072, "grad_norm": 45.84828552857745, "learning_rate": 6.898512389057529e-08, "logits/chosen": 14.345945358276367, "logits/rejected": 14.266180038452148, "logps/chosen": -4.582828998565674, "logps/rejected": -4.562997817993164, "loss": 4.2934, "rewards/accuracies": 0.25, "rewards/chosen": -45.82829284667969, "rewards/margins": -0.1983165740966797, "rewards/rejected": -45.629974365234375, "step": 6090 }, { "epoch": 0.8293845315904139, "grad_norm": 40.66012788888462, "learning_rate": 6.887841447080473e-08, "logits/chosen": 13.74519157409668, "logits/rejected": 14.897139549255371, "logps/chosen": -4.2462568283081055, "logps/rejected": -4.603601455688477, "loss": 3.5789, "rewards/accuracies": 1.0, "rewards/chosen": -42.462562561035156, "rewards/margins": 3.573451042175293, "rewards/rejected": -46.03601837158203, "step": 6091 }, { "epoch": 0.829520697167756, "grad_norm": 43.53952320199978, "learning_rate": 6.877177987069363e-08, "logits/chosen": 14.121017456054688, "logits/rejected": 15.05500316619873, "logps/chosen": -4.617104530334473, "logps/rejected": -5.343650817871094, "loss": 4.0352, "rewards/accuracies": 0.75, "rewards/chosen": -46.17104721069336, "rewards/margins": 7.265462875366211, "rewards/rejected": -53.4365119934082, "step": 6092 }, { "epoch": 0.8296568627450981, "grad_norm": 39.914869187724065, "learning_rate": 6.866522011433668e-08, "logits/chosen": 13.417402267456055, "logits/rejected": 14.076986312866211, "logps/chosen": -4.569502830505371, "logps/rejected": -4.874088287353516, "loss": 3.4851, "rewards/accuracies": 1.0, "rewards/chosen": -45.695030212402344, "rewards/margins": 3.0458545684814453, "rewards/rejected": -48.740882873535156, "step": 6093 }, { "epoch": 0.8297930283224401, "grad_norm": 42.10558223197076, "learning_rate": 6.855873522581213e-08, "logits/chosen": 14.123369216918945, "logits/rejected": 14.528175354003906, "logps/chosen": -4.6730451583862305, "logps/rejected": -4.97477912902832, "loss": 4.1525, "rewards/accuracies": 1.0, "rewards/chosen": -46.73045349121094, "rewards/margins": 3.0173397064208984, "rewards/rejected": -49.74779510498047, "step": 6094 }, { "epoch": 0.8299291938997821, "grad_norm": 45.80028915380794, "learning_rate": 6.845232522918119e-08, "logits/chosen": 14.984077453613281, "logits/rejected": 14.859975814819336, "logps/chosen": -4.595654010772705, "logps/rejected": -4.686800956726074, "loss": 3.9888, "rewards/accuracies": 0.75, "rewards/chosen": -45.95654296875, "rewards/margins": 0.9114704132080078, "rewards/rejected": -46.868011474609375, "step": 6095 }, { "epoch": 0.8300653594771242, "grad_norm": 40.04862555542022, "learning_rate": 6.834599014848783e-08, "logits/chosen": 13.986462593078613, "logits/rejected": 13.84444522857666, "logps/chosen": -4.463362693786621, "logps/rejected": -4.448735237121582, "loss": 3.7484, "rewards/accuracies": 0.5, "rewards/chosen": -44.63362503051758, "rewards/margins": -0.14627361297607422, "rewards/rejected": -44.48735046386719, "step": 6096 }, { "epoch": 0.8302015250544662, "grad_norm": 48.47388128421832, "learning_rate": 6.82397300077595e-08, "logits/chosen": 13.683891296386719, "logits/rejected": 13.55765438079834, "logps/chosen": -4.347008228302002, "logps/rejected": -4.463242530822754, "loss": 4.1831, "rewards/accuracies": 0.75, "rewards/chosen": -43.4700813293457, "rewards/margins": 1.1623401641845703, "rewards/rejected": -44.63241958618164, "step": 6097 }, { "epoch": 0.8303376906318083, "grad_norm": 83.53695567011663, "learning_rate": 6.813354483100653e-08, "logits/chosen": 13.643176078796387, "logits/rejected": 14.310261726379395, "logps/chosen": -4.43190860748291, "logps/rejected": -4.609288215637207, "loss": 3.7023, "rewards/accuracies": 0.5, "rewards/chosen": -44.319087982177734, "rewards/margins": 1.7737932205200195, "rewards/rejected": -46.09288024902344, "step": 6098 }, { "epoch": 0.8304738562091504, "grad_norm": 41.424103390453155, "learning_rate": 6.802743464222241e-08, "logits/chosen": 14.351569175720215, "logits/rejected": 14.917470932006836, "logps/chosen": -4.437436103820801, "logps/rejected": -4.783222198486328, "loss": 3.5782, "rewards/accuracies": 0.75, "rewards/chosen": -44.374359130859375, "rewards/margins": 3.457859992980957, "rewards/rejected": -47.83222198486328, "step": 6099 }, { "epoch": 0.8306100217864923, "grad_norm": 43.549695496199156, "learning_rate": 6.792139946538347e-08, "logits/chosen": 14.371826171875, "logits/rejected": 14.472829818725586, "logps/chosen": -4.5725603103637695, "logps/rejected": -4.796960830688477, "loss": 4.2102, "rewards/accuracies": 0.5, "rewards/chosen": -45.72560119628906, "rewards/margins": 2.244004249572754, "rewards/rejected": -47.9696044921875, "step": 6100 }, { "epoch": 0.8307461873638344, "grad_norm": 39.85152026207773, "learning_rate": 6.78154393244493e-08, "logits/chosen": 14.54790210723877, "logits/rejected": 14.66722297668457, "logps/chosen": -4.7647552490234375, "logps/rejected": -5.307422161102295, "loss": 3.5769, "rewards/accuracies": 0.75, "rewards/chosen": -47.64755630493164, "rewards/margins": 5.426663398742676, "rewards/rejected": -53.07421875, "step": 6101 }, { "epoch": 0.8308823529411765, "grad_norm": 40.360852049917256, "learning_rate": 6.77095542433626e-08, "logits/chosen": 14.18756103515625, "logits/rejected": 13.841379165649414, "logps/chosen": -4.614589214324951, "logps/rejected": -4.4305925369262695, "loss": 3.5829, "rewards/accuracies": 0.25, "rewards/chosen": -46.14589309692383, "rewards/margins": -1.8399715423583984, "rewards/rejected": -44.3059196472168, "step": 6102 }, { "epoch": 0.8310185185185185, "grad_norm": 45.380395677172366, "learning_rate": 6.760374424604878e-08, "logits/chosen": 14.334905624389648, "logits/rejected": 14.502923965454102, "logps/chosen": -4.512916564941406, "logps/rejected": -4.619512557983398, "loss": 3.9802, "rewards/accuracies": 0.5, "rewards/chosen": -45.1291618347168, "rewards/margins": 1.0659599304199219, "rewards/rejected": -46.19512176513672, "step": 6103 }, { "epoch": 0.8311546840958606, "grad_norm": 42.399785496083105, "learning_rate": 6.74980093564165e-08, "logits/chosen": 13.29300594329834, "logits/rejected": 14.491476058959961, "logps/chosen": -4.2838873863220215, "logps/rejected": -4.5612945556640625, "loss": 3.5533, "rewards/accuracies": 1.0, "rewards/chosen": -42.83887481689453, "rewards/margins": 2.774066925048828, "rewards/rejected": -45.612945556640625, "step": 6104 }, { "epoch": 0.8312908496732027, "grad_norm": 41.49799901825871, "learning_rate": 6.739234959835762e-08, "logits/chosen": 14.038209915161133, "logits/rejected": 14.951814651489258, "logps/chosen": -4.4712910652160645, "logps/rejected": -5.016790866851807, "loss": 3.9746, "rewards/accuracies": 1.0, "rewards/chosen": -44.71290969848633, "rewards/margins": 5.4549970626831055, "rewards/rejected": -50.16790771484375, "step": 6105 }, { "epoch": 0.8314270152505446, "grad_norm": 39.45588218261924, "learning_rate": 6.728676499574666e-08, "logits/chosen": 14.448272705078125, "logits/rejected": 14.5, "logps/chosen": -4.636481761932373, "logps/rejected": -4.752635955810547, "loss": 3.6685, "rewards/accuracies": 0.5, "rewards/chosen": -46.36481857299805, "rewards/margins": 1.1615428924560547, "rewards/rejected": -47.526363372802734, "step": 6106 }, { "epoch": 0.8315631808278867, "grad_norm": 44.36645880974374, "learning_rate": 6.718125557244133e-08, "logits/chosen": 14.521526336669922, "logits/rejected": 14.653047561645508, "logps/chosen": -4.842743873596191, "logps/rejected": -4.784938812255859, "loss": 4.3807, "rewards/accuracies": 0.25, "rewards/chosen": -48.42744064331055, "rewards/margins": -0.5780525207519531, "rewards/rejected": -47.849388122558594, "step": 6107 }, { "epoch": 0.8316993464052288, "grad_norm": 42.525823169635984, "learning_rate": 6.707582135228254e-08, "logits/chosen": 14.774666786193848, "logits/rejected": 14.01301097869873, "logps/chosen": -4.859303951263428, "logps/rejected": -4.4493279457092285, "loss": 4.1438, "rewards/accuracies": 0.0, "rewards/chosen": -48.593040466308594, "rewards/margins": -4.099761962890625, "rewards/rejected": -44.49327850341797, "step": 6108 }, { "epoch": 0.8318355119825708, "grad_norm": 40.423068296429165, "learning_rate": 6.697046235909379e-08, "logits/chosen": 14.427631378173828, "logits/rejected": 15.094568252563477, "logps/chosen": -4.494986534118652, "logps/rejected": -4.958319664001465, "loss": 3.2076, "rewards/accuracies": 1.0, "rewards/chosen": -44.94986343383789, "rewards/margins": 4.633336067199707, "rewards/rejected": -49.58319854736328, "step": 6109 }, { "epoch": 0.8319716775599129, "grad_norm": 43.08978444066192, "learning_rate": 6.686517861668188e-08, "logits/chosen": 14.62700080871582, "logits/rejected": 14.445833206176758, "logps/chosen": -4.384319305419922, "logps/rejected": -4.40177059173584, "loss": 3.7678, "rewards/accuracies": 0.25, "rewards/chosen": -43.84319305419922, "rewards/margins": 0.17451190948486328, "rewards/rejected": -44.01770782470703, "step": 6110 }, { "epoch": 0.8321078431372549, "grad_norm": 43.643920639918015, "learning_rate": 6.675997014883669e-08, "logits/chosen": 14.358978271484375, "logits/rejected": 14.464056015014648, "logps/chosen": -4.45405912399292, "logps/rejected": -4.515989780426025, "loss": 3.8893, "rewards/accuracies": 0.5, "rewards/chosen": -44.540592193603516, "rewards/margins": 0.6193056106567383, "rewards/rejected": -45.15989685058594, "step": 6111 }, { "epoch": 0.8322440087145969, "grad_norm": 47.285707703069335, "learning_rate": 6.665483697933077e-08, "logits/chosen": 14.55178451538086, "logits/rejected": 14.400928497314453, "logps/chosen": -4.2088422775268555, "logps/rejected": -4.791296005249023, "loss": 4.0632, "rewards/accuracies": 1.0, "rewards/chosen": -42.08842086791992, "rewards/margins": 5.824536323547363, "rewards/rejected": -47.91295623779297, "step": 6112 }, { "epoch": 0.832380174291939, "grad_norm": 43.76146275973204, "learning_rate": 6.654977913191988e-08, "logits/chosen": 13.915486335754395, "logits/rejected": 13.935562133789062, "logps/chosen": -4.331501007080078, "logps/rejected": -4.46814489364624, "loss": 3.5715, "rewards/accuracies": 0.25, "rewards/chosen": -43.31501388549805, "rewards/margins": 1.3664350509643555, "rewards/rejected": -44.68144989013672, "step": 6113 }, { "epoch": 0.8325163398692811, "grad_norm": 43.85794894068353, "learning_rate": 6.644479663034283e-08, "logits/chosen": 15.098387718200684, "logits/rejected": 15.075302124023438, "logps/chosen": -4.784763336181641, "logps/rejected": -4.711294651031494, "loss": 3.1656, "rewards/accuracies": 0.5, "rewards/chosen": -47.84762954711914, "rewards/margins": -0.734684944152832, "rewards/rejected": -47.112945556640625, "step": 6114 }, { "epoch": 0.8326525054466231, "grad_norm": 40.97604302394682, "learning_rate": 6.633988949832105e-08, "logits/chosen": 15.014986991882324, "logits/rejected": 15.747922897338867, "logps/chosen": -4.656776428222656, "logps/rejected": -5.234889984130859, "loss": 3.5897, "rewards/accuracies": 1.0, "rewards/chosen": -46.56776809692383, "rewards/margins": 5.781133651733398, "rewards/rejected": -52.34890365600586, "step": 6115 }, { "epoch": 0.8327886710239651, "grad_norm": 49.11452275990805, "learning_rate": 6.623505775955936e-08, "logits/chosen": 14.052789688110352, "logits/rejected": 14.440380096435547, "logps/chosen": -4.468685626983643, "logps/rejected": -4.568350791931152, "loss": 3.8312, "rewards/accuracies": 0.75, "rewards/chosen": -44.686859130859375, "rewards/margins": 0.996647834777832, "rewards/rejected": -45.683502197265625, "step": 6116 }, { "epoch": 0.8329248366013072, "grad_norm": 47.31847828096604, "learning_rate": 6.613030143774536e-08, "logits/chosen": 14.001862525939941, "logits/rejected": 15.082275390625, "logps/chosen": -4.459283828735352, "logps/rejected": -5.0361127853393555, "loss": 4.0333, "rewards/accuracies": 1.0, "rewards/chosen": -44.592838287353516, "rewards/margins": 5.768289566040039, "rewards/rejected": -50.36112976074219, "step": 6117 }, { "epoch": 0.8330610021786492, "grad_norm": 42.268103243096945, "learning_rate": 6.602562055654943e-08, "logits/chosen": 14.536033630371094, "logits/rejected": 15.248395919799805, "logps/chosen": -4.458534240722656, "logps/rejected": -4.625010013580322, "loss": 4.3024, "rewards/accuracies": 0.75, "rewards/chosen": -44.58534240722656, "rewards/margins": 1.664754867553711, "rewards/rejected": -46.250099182128906, "step": 6118 }, { "epoch": 0.8331971677559913, "grad_norm": 44.32203346366435, "learning_rate": 6.592101513962523e-08, "logits/chosen": 14.903671264648438, "logits/rejected": 14.789854049682617, "logps/chosen": -4.6624884605407715, "logps/rejected": -4.414203643798828, "loss": 4.097, "rewards/accuracies": 0.25, "rewards/chosen": -46.62488555908203, "rewards/margins": -2.4828481674194336, "rewards/rejected": -44.14203643798828, "step": 6119 }, { "epoch": 0.8333333333333334, "grad_norm": 42.985194422812, "learning_rate": 6.581648521060925e-08, "logits/chosen": 13.804893493652344, "logits/rejected": 14.453415870666504, "logps/chosen": -4.407199859619141, "logps/rejected": -4.584736347198486, "loss": 4.4391, "rewards/accuracies": 0.5, "rewards/chosen": -44.071998596191406, "rewards/margins": 1.775364875793457, "rewards/rejected": -45.84736251831055, "step": 6120 }, { "epoch": 0.8334694989106753, "grad_norm": 48.73869990202893, "learning_rate": 6.57120307931207e-08, "logits/chosen": 14.089008331298828, "logits/rejected": 15.127090454101562, "logps/chosen": -4.477149486541748, "logps/rejected": -4.933053493499756, "loss": 3.6529, "rewards/accuracies": 0.75, "rewards/chosen": -44.7714958190918, "rewards/margins": 4.5590410232543945, "rewards/rejected": -49.330535888671875, "step": 6121 }, { "epoch": 0.8336056644880174, "grad_norm": 45.32886848432048, "learning_rate": 6.56076519107621e-08, "logits/chosen": 14.027435302734375, "logits/rejected": 14.689157485961914, "logps/chosen": -4.580099105834961, "logps/rejected": -4.726335048675537, "loss": 4.2798, "rewards/accuracies": 0.75, "rewards/chosen": -45.80099105834961, "rewards/margins": 1.462357521057129, "rewards/rejected": -47.26335144042969, "step": 6122 }, { "epoch": 0.8337418300653595, "grad_norm": 42.35981595122469, "learning_rate": 6.550334858711876e-08, "logits/chosen": 13.707992553710938, "logits/rejected": 14.600269317626953, "logps/chosen": -4.488898277282715, "logps/rejected": -4.859711647033691, "loss": 4.1803, "rewards/accuracies": 1.0, "rewards/chosen": -44.88898468017578, "rewards/margins": 3.708131790161133, "rewards/rejected": -48.59711837768555, "step": 6123 }, { "epoch": 0.8338779956427015, "grad_norm": 42.2339288388915, "learning_rate": 6.539912084575867e-08, "logits/chosen": 13.512540817260742, "logits/rejected": 14.427936553955078, "logps/chosen": -4.153434753417969, "logps/rejected": -4.655799865722656, "loss": 4.2369, "rewards/accuracies": 1.0, "rewards/chosen": -41.53435134887695, "rewards/margins": 5.023648262023926, "rewards/rejected": -46.55799865722656, "step": 6124 }, { "epoch": 0.8340141612200436, "grad_norm": 41.31921111842466, "learning_rate": 6.529496871023306e-08, "logits/chosen": 14.331377029418945, "logits/rejected": 14.535574913024902, "logps/chosen": -4.4718546867370605, "logps/rejected": -4.345283031463623, "loss": 3.7614, "rewards/accuracies": 0.5, "rewards/chosen": -44.71854782104492, "rewards/margins": -1.265716552734375, "rewards/rejected": -43.45283508300781, "step": 6125 }, { "epoch": 0.8341503267973857, "grad_norm": 42.88832408092106, "learning_rate": 6.519089220407608e-08, "logits/chosen": 14.583187103271484, "logits/rejected": 15.199056625366211, "logps/chosen": -4.538436412811279, "logps/rejected": -4.76739501953125, "loss": 4.1675, "rewards/accuracies": 0.75, "rewards/chosen": -45.38436508178711, "rewards/margins": 2.289584159851074, "rewards/rejected": -47.6739501953125, "step": 6126 }, { "epoch": 0.8342864923747276, "grad_norm": 43.98912520228231, "learning_rate": 6.508689135080447e-08, "logits/chosen": 14.774314880371094, "logits/rejected": 14.931795120239258, "logps/chosen": -4.391279220581055, "logps/rejected": -4.677399635314941, "loss": 4.4413, "rewards/accuracies": 0.75, "rewards/chosen": -43.91279602050781, "rewards/margins": 2.8612003326416016, "rewards/rejected": -46.77399444580078, "step": 6127 }, { "epoch": 0.8344226579520697, "grad_norm": 43.5971232018078, "learning_rate": 6.498296617391817e-08, "logits/chosen": 14.332961082458496, "logits/rejected": 14.418508529663086, "logps/chosen": -4.781241416931152, "logps/rejected": -4.8458662033081055, "loss": 3.5749, "rewards/accuracies": 0.75, "rewards/chosen": -47.812416076660156, "rewards/margins": 0.6462430953979492, "rewards/rejected": -48.45866012573242, "step": 6128 }, { "epoch": 0.8345588235294118, "grad_norm": 39.79786809703895, "learning_rate": 6.487911669690006e-08, "logits/chosen": 14.969010353088379, "logits/rejected": 14.94645881652832, "logps/chosen": -4.631986618041992, "logps/rejected": -4.999041557312012, "loss": 3.6496, "rewards/accuracies": 0.75, "rewards/chosen": -46.31986999511719, "rewards/margins": 3.670546531677246, "rewards/rejected": -49.99041748046875, "step": 6129 }, { "epoch": 0.8346949891067538, "grad_norm": 44.77782817901429, "learning_rate": 6.477534294321554e-08, "logits/chosen": 14.914674758911133, "logits/rejected": 15.111852645874023, "logps/chosen": -4.817696571350098, "logps/rejected": -4.933455944061279, "loss": 3.8723, "rewards/accuracies": 0.75, "rewards/chosen": -48.176963806152344, "rewards/margins": 1.1575956344604492, "rewards/rejected": -49.33456039428711, "step": 6130 }, { "epoch": 0.8348311546840959, "grad_norm": 40.821660231505945, "learning_rate": 6.46716449363133e-08, "logits/chosen": 14.695040702819824, "logits/rejected": 14.928022384643555, "logps/chosen": -4.720222473144531, "logps/rejected": -4.976797103881836, "loss": 3.9323, "rewards/accuracies": 0.75, "rewards/chosen": -47.20222473144531, "rewards/margins": 2.5657472610473633, "rewards/rejected": -49.767974853515625, "step": 6131 }, { "epoch": 0.8349673202614379, "grad_norm": 47.89037538667811, "learning_rate": 6.45680226996248e-08, "logits/chosen": 13.87470817565918, "logits/rejected": 14.629241943359375, "logps/chosen": -4.468764781951904, "logps/rejected": -4.601442337036133, "loss": 3.7595, "rewards/accuracies": 0.75, "rewards/chosen": -44.687644958496094, "rewards/margins": 1.3267765045166016, "rewards/rejected": -46.01442337036133, "step": 6132 }, { "epoch": 0.8351034858387799, "grad_norm": 41.010171048833556, "learning_rate": 6.446447625656421e-08, "logits/chosen": 15.136863708496094, "logits/rejected": 15.213127136230469, "logps/chosen": -4.788043022155762, "logps/rejected": -5.076482772827148, "loss": 3.6575, "rewards/accuracies": 0.5, "rewards/chosen": -47.88043212890625, "rewards/margins": 2.8843955993652344, "rewards/rejected": -50.76482391357422, "step": 6133 }, { "epoch": 0.835239651416122, "grad_norm": 42.26973091527386, "learning_rate": 6.436100563052882e-08, "logits/chosen": 13.620711326599121, "logits/rejected": 14.76095962524414, "logps/chosen": -4.187464237213135, "logps/rejected": -4.715559959411621, "loss": 4.1682, "rewards/accuracies": 0.5, "rewards/chosen": -41.87464141845703, "rewards/margins": 5.280961990356445, "rewards/rejected": -47.15560531616211, "step": 6134 }, { "epoch": 0.8353758169934641, "grad_norm": 43.04569988240668, "learning_rate": 6.425761084489867e-08, "logits/chosen": 15.00963020324707, "logits/rejected": 14.733663558959961, "logps/chosen": -4.758084774017334, "logps/rejected": -4.814685821533203, "loss": 3.4167, "rewards/accuracies": 0.5, "rewards/chosen": -47.580848693847656, "rewards/margins": 0.566009521484375, "rewards/rejected": -48.14685821533203, "step": 6135 }, { "epoch": 0.835511982570806, "grad_norm": 44.57033151906582, "learning_rate": 6.415429192303654e-08, "logits/chosen": 14.911996841430664, "logits/rejected": 14.90323257446289, "logps/chosen": -4.594383239746094, "logps/rejected": -4.450860500335693, "loss": 4.5411, "rewards/accuracies": 0.5, "rewards/chosen": -45.94383239746094, "rewards/margins": -1.4352264404296875, "rewards/rejected": -44.50860595703125, "step": 6136 }, { "epoch": 0.8356481481481481, "grad_norm": 45.63959426743002, "learning_rate": 6.405104888828825e-08, "logits/chosen": 14.57387924194336, "logits/rejected": 14.93979263305664, "logps/chosen": -4.5041632652282715, "logps/rejected": -4.796629905700684, "loss": 3.7339, "rewards/accuracies": 0.75, "rewards/chosen": -45.04163360595703, "rewards/margins": 2.9246692657470703, "rewards/rejected": -47.96630096435547, "step": 6137 }, { "epoch": 0.8357843137254902, "grad_norm": 43.9619504629322, "learning_rate": 6.39478817639826e-08, "logits/chosen": 14.124460220336914, "logits/rejected": 14.288473129272461, "logps/chosen": -4.182819843292236, "logps/rejected": -4.301627159118652, "loss": 3.8098, "rewards/accuracies": 0.75, "rewards/chosen": -41.82819366455078, "rewards/margins": 1.1880760192871094, "rewards/rejected": -43.016273498535156, "step": 6138 }, { "epoch": 0.8359204793028322, "grad_norm": 40.6932674388067, "learning_rate": 6.384479057343078e-08, "logits/chosen": 13.976842880249023, "logits/rejected": 14.318231582641602, "logps/chosen": -4.4180521965026855, "logps/rejected": -4.8339128494262695, "loss": 4.1229, "rewards/accuracies": 1.0, "rewards/chosen": -44.18052291870117, "rewards/margins": 4.158608436584473, "rewards/rejected": -48.33913040161133, "step": 6139 }, { "epoch": 0.8360566448801743, "grad_norm": 46.100805807508635, "learning_rate": 6.374177533992719e-08, "logits/chosen": 14.639457702636719, "logits/rejected": 14.641408920288086, "logps/chosen": -4.890402317047119, "logps/rejected": -4.774507522583008, "loss": 4.1515, "rewards/accuracies": 0.25, "rewards/chosen": -48.904022216796875, "rewards/margins": -1.1589479446411133, "rewards/rejected": -47.745079040527344, "step": 6140 }, { "epoch": 0.8361928104575164, "grad_norm": 40.55036572492544, "learning_rate": 6.363883608674911e-08, "logits/chosen": 13.980506896972656, "logits/rejected": 15.150175094604492, "logps/chosen": -4.183509826660156, "logps/rejected": -4.376813888549805, "loss": 4.0075, "rewards/accuracies": 0.5, "rewards/chosen": -41.835105895996094, "rewards/margins": 1.933039665222168, "rewards/rejected": -43.76814270019531, "step": 6141 }, { "epoch": 0.8363289760348583, "grad_norm": 43.23125815908061, "learning_rate": 6.353597283715633e-08, "logits/chosen": 13.66677474975586, "logits/rejected": 14.071264266967773, "logps/chosen": -4.068900108337402, "logps/rejected": -4.373232364654541, "loss": 4.5273, "rewards/accuracies": 1.0, "rewards/chosen": -40.68899917602539, "rewards/margins": 3.0433244705200195, "rewards/rejected": -43.732322692871094, "step": 6142 }, { "epoch": 0.8364651416122004, "grad_norm": 41.41548170460914, "learning_rate": 6.34331856143917e-08, "logits/chosen": 13.724721908569336, "logits/rejected": 14.809320449829102, "logps/chosen": -4.581521034240723, "logps/rejected": -5.036867618560791, "loss": 4.0166, "rewards/accuracies": 1.0, "rewards/chosen": -45.815208435058594, "rewards/margins": 4.553465843200684, "rewards/rejected": -50.368675231933594, "step": 6143 }, { "epoch": 0.8366013071895425, "grad_norm": 41.6823608027207, "learning_rate": 6.333047444168099e-08, "logits/chosen": 14.387989044189453, "logits/rejected": 14.896089553833008, "logps/chosen": -4.822771072387695, "logps/rejected": -4.781948566436768, "loss": 3.6968, "rewards/accuracies": 0.25, "rewards/chosen": -48.22770690917969, "rewards/margins": -0.4082193374633789, "rewards/rejected": -47.819488525390625, "step": 6144 }, { "epoch": 0.8367374727668845, "grad_norm": 46.92857758799555, "learning_rate": 6.322783934223239e-08, "logits/chosen": 13.694687843322754, "logits/rejected": 14.8466215133667, "logps/chosen": -4.079318046569824, "logps/rejected": -4.512469291687012, "loss": 3.889, "rewards/accuracies": 0.75, "rewards/chosen": -40.793182373046875, "rewards/margins": 4.331513404846191, "rewards/rejected": -45.12469482421875, "step": 6145 }, { "epoch": 0.8368736383442266, "grad_norm": 46.359559108345614, "learning_rate": 6.312528033923734e-08, "logits/chosen": 14.686901092529297, "logits/rejected": 14.62751293182373, "logps/chosen": -4.4833831787109375, "logps/rejected": -4.526606559753418, "loss": 4.3786, "rewards/accuracies": 0.5, "rewards/chosen": -44.833831787109375, "rewards/margins": 0.4322338104248047, "rewards/rejected": -45.26606750488281, "step": 6146 }, { "epoch": 0.8370098039215687, "grad_norm": 41.48175499647031, "learning_rate": 6.302279745586988e-08, "logits/chosen": 14.760845184326172, "logits/rejected": 14.634923934936523, "logps/chosen": -4.803130149841309, "logps/rejected": -4.9867353439331055, "loss": 4.1703, "rewards/accuracies": 0.75, "rewards/chosen": -48.03130340576172, "rewards/margins": 1.8360509872436523, "rewards/rejected": -49.86735534667969, "step": 6147 }, { "epoch": 0.8371459694989106, "grad_norm": 47.20630134770123, "learning_rate": 6.292039071528675e-08, "logits/chosen": 14.80915355682373, "logits/rejected": 15.135675430297852, "logps/chosen": -4.734670162200928, "logps/rejected": -5.153237819671631, "loss": 3.3361, "rewards/accuracies": 1.0, "rewards/chosen": -47.346702575683594, "rewards/margins": 4.185676574707031, "rewards/rejected": -51.532379150390625, "step": 6148 }, { "epoch": 0.8372821350762527, "grad_norm": 43.16170662622969, "learning_rate": 6.281806014062763e-08, "logits/chosen": 14.731307983398438, "logits/rejected": 15.501100540161133, "logps/chosen": -4.655179023742676, "logps/rejected": -5.148763179779053, "loss": 3.9153, "rewards/accuracies": 1.0, "rewards/chosen": -46.55179214477539, "rewards/margins": 4.9358415603637695, "rewards/rejected": -51.487632751464844, "step": 6149 }, { "epoch": 0.8374183006535948, "grad_norm": 41.97275351315022, "learning_rate": 6.27158057550151e-08, "logits/chosen": 14.367761611938477, "logits/rejected": 15.01080322265625, "logps/chosen": -4.822267055511475, "logps/rejected": -5.034568786621094, "loss": 3.4488, "rewards/accuracies": 0.25, "rewards/chosen": -48.22267150878906, "rewards/margins": 2.1230154037475586, "rewards/rejected": -50.34568786621094, "step": 6150 }, { "epoch": 0.8375544662309368, "grad_norm": 40.81398957924375, "learning_rate": 6.261362758155418e-08, "logits/chosen": 13.713077545166016, "logits/rejected": 13.481290817260742, "logps/chosen": -4.353902339935303, "logps/rejected": -4.571360111236572, "loss": 3.9497, "rewards/accuracies": 0.75, "rewards/chosen": -43.539024353027344, "rewards/margins": 2.174576759338379, "rewards/rejected": -45.713600158691406, "step": 6151 }, { "epoch": 0.8376906318082789, "grad_norm": 46.036599742273935, "learning_rate": 6.251152564333298e-08, "logits/chosen": 14.949037551879883, "logits/rejected": 14.779916763305664, "logps/chosen": -4.687830448150635, "logps/rejected": -4.8106689453125, "loss": 3.9054, "rewards/accuracies": 0.75, "rewards/chosen": -46.87830352783203, "rewards/margins": 1.2283849716186523, "rewards/rejected": -48.106689453125, "step": 6152 }, { "epoch": 0.8378267973856209, "grad_norm": 43.49602663382905, "learning_rate": 6.240949996342238e-08, "logits/chosen": 15.39076042175293, "logits/rejected": 14.698583602905273, "logps/chosen": -4.809099197387695, "logps/rejected": -4.541504383087158, "loss": 4.2805, "rewards/accuracies": 0.25, "rewards/chosen": -48.09099197387695, "rewards/margins": -2.6759490966796875, "rewards/rejected": -45.415042877197266, "step": 6153 }, { "epoch": 0.8379629629629629, "grad_norm": 43.00441622467753, "learning_rate": 6.230755056487571e-08, "logits/chosen": 13.955394744873047, "logits/rejected": 14.64914321899414, "logps/chosen": -4.348712921142578, "logps/rejected": -4.6768293380737305, "loss": 3.663, "rewards/accuracies": 1.0, "rewards/chosen": -43.487125396728516, "rewards/margins": 3.2811660766601562, "rewards/rejected": -46.76829147338867, "step": 6154 }, { "epoch": 0.838099128540305, "grad_norm": 45.81551058067107, "learning_rate": 6.220567747072935e-08, "logits/chosen": 14.526515007019043, "logits/rejected": 14.381134033203125, "logps/chosen": -4.616194725036621, "logps/rejected": -4.682825565338135, "loss": 4.2947, "rewards/accuracies": 0.75, "rewards/chosen": -46.16194152832031, "rewards/margins": 0.6663112640380859, "rewards/rejected": -46.82825469970703, "step": 6155 }, { "epoch": 0.8382352941176471, "grad_norm": 42.61773921635792, "learning_rate": 6.210388070400254e-08, "logits/chosen": 14.305734634399414, "logits/rejected": 14.412460327148438, "logps/chosen": -4.345357894897461, "logps/rejected": -4.578492164611816, "loss": 3.6404, "rewards/accuracies": 0.75, "rewards/chosen": -43.453582763671875, "rewards/margins": 2.3313417434692383, "rewards/rejected": -45.78492736816406, "step": 6156 }, { "epoch": 0.838371459694989, "grad_norm": 42.75991101777564, "learning_rate": 6.200216028769687e-08, "logits/chosen": 13.465669631958008, "logits/rejected": 14.039091110229492, "logps/chosen": -4.219989776611328, "logps/rejected": -4.599555492401123, "loss": 4.4708, "rewards/accuracies": 1.0, "rewards/chosen": -42.19989776611328, "rewards/margins": 3.7956533432006836, "rewards/rejected": -45.99555206298828, "step": 6157 }, { "epoch": 0.8385076252723311, "grad_norm": 40.575830340742534, "learning_rate": 6.190051624479698e-08, "logits/chosen": 14.62089729309082, "logits/rejected": 14.632793426513672, "logps/chosen": -4.749768257141113, "logps/rejected": -4.713484764099121, "loss": 3.9904, "rewards/accuracies": 0.5, "rewards/chosen": -47.4976806640625, "rewards/margins": -0.3628358840942383, "rewards/rejected": -47.13484573364258, "step": 6158 }, { "epoch": 0.8386437908496732, "grad_norm": 51.29506727976669, "learning_rate": 6.179894859827031e-08, "logits/chosen": 15.057344436645508, "logits/rejected": 14.587727546691895, "logps/chosen": -4.8302154541015625, "logps/rejected": -4.669647216796875, "loss": 4.7241, "rewards/accuracies": 0.25, "rewards/chosen": -48.30215072631836, "rewards/margins": -1.6056804656982422, "rewards/rejected": -46.69647216796875, "step": 6159 }, { "epoch": 0.8387799564270153, "grad_norm": 41.069978596249044, "learning_rate": 6.169745737106669e-08, "logits/chosen": 14.41003131866455, "logits/rejected": 15.779656410217285, "logps/chosen": -4.593689918518066, "logps/rejected": -5.114567279815674, "loss": 4.1199, "rewards/accuracies": 0.75, "rewards/chosen": -45.93689727783203, "rewards/margins": 5.208774566650391, "rewards/rejected": -51.14567184448242, "step": 6160 }, { "epoch": 0.8389161220043573, "grad_norm": 42.86075334018611, "learning_rate": 6.159604258611902e-08, "logits/chosen": 13.928592681884766, "logits/rejected": 15.053949356079102, "logps/chosen": -4.384983062744141, "logps/rejected": -5.018642425537109, "loss": 4.0777, "rewards/accuracies": 1.0, "rewards/chosen": -43.84983444213867, "rewards/margins": 6.336591720581055, "rewards/rejected": -50.186424255371094, "step": 6161 }, { "epoch": 0.8390522875816994, "grad_norm": 42.421970733068704, "learning_rate": 6.149470426634291e-08, "logits/chosen": 14.004692077636719, "logits/rejected": 13.892975807189941, "logps/chosen": -4.165945053100586, "logps/rejected": -4.551339149475098, "loss": 4.0486, "rewards/accuracies": 1.0, "rewards/chosen": -41.65945053100586, "rewards/margins": 3.8539352416992188, "rewards/rejected": -45.51338577270508, "step": 6162 }, { "epoch": 0.8391884531590414, "grad_norm": 40.5382226086264, "learning_rate": 6.139344243463638e-08, "logits/chosen": 15.302030563354492, "logits/rejected": 15.649471282958984, "logps/chosen": -5.09224796295166, "logps/rejected": -4.971681594848633, "loss": 3.895, "rewards/accuracies": 0.5, "rewards/chosen": -50.92247772216797, "rewards/margins": -1.2056646347045898, "rewards/rejected": -49.71681213378906, "step": 6163 }, { "epoch": 0.8393246187363834, "grad_norm": 42.86342928229963, "learning_rate": 6.129225711388048e-08, "logits/chosen": 15.275354385375977, "logits/rejected": 15.164896011352539, "logps/chosen": -4.755391597747803, "logps/rejected": -4.878018379211426, "loss": 3.3966, "rewards/accuracies": 0.75, "rewards/chosen": -47.553916931152344, "rewards/margins": 1.2262706756591797, "rewards/rejected": -48.780189514160156, "step": 6164 }, { "epoch": 0.8394607843137255, "grad_norm": 44.22604437781367, "learning_rate": 6.119114832693898e-08, "logits/chosen": 14.040874481201172, "logits/rejected": 14.328506469726562, "logps/chosen": -4.452728271484375, "logps/rejected": -4.680817604064941, "loss": 4.0437, "rewards/accuracies": 0.75, "rewards/chosen": -44.527286529541016, "rewards/margins": 2.2808876037597656, "rewards/rejected": -46.80817413330078, "step": 6165 }, { "epoch": 0.8395969498910676, "grad_norm": 43.75019510745941, "learning_rate": 6.109011609665802e-08, "logits/chosen": 14.387958526611328, "logits/rejected": 14.106555938720703, "logps/chosen": -4.630713939666748, "logps/rejected": -4.886229038238525, "loss": 4.4016, "rewards/accuracies": 0.75, "rewards/chosen": -46.3071403503418, "rewards/margins": 2.5551490783691406, "rewards/rejected": -48.86228942871094, "step": 6166 }, { "epoch": 0.8397331154684096, "grad_norm": 43.44987159378303, "learning_rate": 6.098916044586682e-08, "logits/chosen": 15.291091918945312, "logits/rejected": 15.009586334228516, "logps/chosen": -4.839518070220947, "logps/rejected": -4.845471382141113, "loss": 4.0098, "rewards/accuracies": 0.5, "rewards/chosen": -48.395179748535156, "rewards/margins": 0.05953693389892578, "rewards/rejected": -48.45471954345703, "step": 6167 }, { "epoch": 0.8398692810457516, "grad_norm": 41.722772842096646, "learning_rate": 6.088828139737718e-08, "logits/chosen": 13.753020286560059, "logits/rejected": 14.362237930297852, "logps/chosen": -4.35163688659668, "logps/rejected": -4.471140384674072, "loss": 4.0216, "rewards/accuracies": 0.75, "rewards/chosen": -43.51637268066406, "rewards/margins": 1.1950340270996094, "rewards/rejected": -44.711402893066406, "step": 6168 }, { "epoch": 0.8400054466230937, "grad_norm": 44.11903921629628, "learning_rate": 6.078747897398341e-08, "logits/chosen": 14.802290916442871, "logits/rejected": 15.696369171142578, "logps/chosen": -4.582756996154785, "logps/rejected": -4.854500770568848, "loss": 3.8141, "rewards/accuracies": 0.75, "rewards/chosen": -45.82756805419922, "rewards/margins": 2.717440605163574, "rewards/rejected": -48.545005798339844, "step": 6169 }, { "epoch": 0.8401416122004357, "grad_norm": 46.67580634922217, "learning_rate": 6.068675319846268e-08, "logits/chosen": 14.090728759765625, "logits/rejected": 15.409890174865723, "logps/chosen": -4.540559768676758, "logps/rejected": -4.992587089538574, "loss": 3.6444, "rewards/accuracies": 0.75, "rewards/chosen": -45.40559768676758, "rewards/margins": 4.5202789306640625, "rewards/rejected": -49.925872802734375, "step": 6170 }, { "epoch": 0.8402777777777778, "grad_norm": 42.1163481212159, "learning_rate": 6.058610409357499e-08, "logits/chosen": 14.519412994384766, "logits/rejected": 15.336568832397461, "logps/chosen": -4.694339752197266, "logps/rejected": -4.854836463928223, "loss": 3.2948, "rewards/accuracies": 0.5, "rewards/chosen": -46.943397521972656, "rewards/margins": 1.604966163635254, "rewards/rejected": -48.548362731933594, "step": 6171 }, { "epoch": 0.8404139433551199, "grad_norm": 42.78579760205416, "learning_rate": 6.048553168206258e-08, "logits/chosen": 14.331136703491211, "logits/rejected": 13.822213172912598, "logps/chosen": -4.506701946258545, "logps/rejected": -4.408230781555176, "loss": 4.0296, "rewards/accuracies": 0.25, "rewards/chosen": -45.0670166015625, "rewards/margins": -0.9847126007080078, "rewards/rejected": -44.082305908203125, "step": 6172 }, { "epoch": 0.8405501089324618, "grad_norm": 44.300682299326105, "learning_rate": 6.038503598665077e-08, "logits/chosen": 14.490324020385742, "logits/rejected": 14.285131454467773, "logps/chosen": -4.545265197753906, "logps/rejected": -4.708888053894043, "loss": 4.2487, "rewards/accuracies": 0.5, "rewards/chosen": -45.4526481628418, "rewards/margins": 1.6362295150756836, "rewards/rejected": -47.0888786315918, "step": 6173 }, { "epoch": 0.8406862745098039, "grad_norm": 39.423337813801574, "learning_rate": 6.028461703004746e-08, "logits/chosen": 14.644183158874512, "logits/rejected": 15.183704376220703, "logps/chosen": -4.608112335205078, "logps/rejected": -5.179940223693848, "loss": 3.6123, "rewards/accuracies": 1.0, "rewards/chosen": -46.081119537353516, "rewards/margins": 5.718282699584961, "rewards/rejected": -51.799400329589844, "step": 6174 }, { "epoch": 0.840822440087146, "grad_norm": 40.24838816242825, "learning_rate": 6.018427483494295e-08, "logits/chosen": 14.186944961547852, "logits/rejected": 14.54780387878418, "logps/chosen": -4.777701377868652, "logps/rejected": -4.75429630279541, "loss": 3.8867, "rewards/accuracies": 0.25, "rewards/chosen": -47.777015686035156, "rewards/margins": -0.2340526580810547, "rewards/rejected": -47.54296112060547, "step": 6175 }, { "epoch": 0.840958605664488, "grad_norm": 47.08355899552074, "learning_rate": 6.008400942401048e-08, "logits/chosen": 14.152093887329102, "logits/rejected": 14.05659008026123, "logps/chosen": -4.026114463806152, "logps/rejected": -4.087420463562012, "loss": 3.9096, "rewards/accuracies": 0.25, "rewards/chosen": -40.261146545410156, "rewards/margins": 0.6130609512329102, "rewards/rejected": -40.87420654296875, "step": 6176 }, { "epoch": 0.8410947712418301, "grad_norm": 48.8999881876189, "learning_rate": 5.998382081990593e-08, "logits/chosen": 14.542013168334961, "logits/rejected": 14.270641326904297, "logps/chosen": -4.486303806304932, "logps/rejected": -4.665948867797852, "loss": 4.1686, "rewards/accuracies": 0.75, "rewards/chosen": -44.863037109375, "rewards/margins": 1.7964534759521484, "rewards/rejected": -46.65949249267578, "step": 6177 }, { "epoch": 0.8412309368191722, "grad_norm": 47.57219940456956, "learning_rate": 5.988370904526761e-08, "logits/chosen": 15.194438934326172, "logits/rejected": 15.5159912109375, "logps/chosen": -4.9089460372924805, "logps/rejected": -4.976038455963135, "loss": 3.8625, "rewards/accuracies": 0.75, "rewards/chosen": -49.08946228027344, "rewards/margins": 0.6709222793579102, "rewards/rejected": -49.76038360595703, "step": 6178 }, { "epoch": 0.8413671023965141, "grad_norm": 45.651446066437586, "learning_rate": 5.978367412271663e-08, "logits/chosen": 14.54029369354248, "logits/rejected": 14.997416496276855, "logps/chosen": -4.546347618103027, "logps/rejected": -4.90607213973999, "loss": 3.9171, "rewards/accuracies": 1.0, "rewards/chosen": -45.46347427368164, "rewards/margins": 3.597248077392578, "rewards/rejected": -49.06072235107422, "step": 6179 }, { "epoch": 0.8415032679738562, "grad_norm": 45.95485776414801, "learning_rate": 5.968371607485685e-08, "logits/chosen": 14.03404712677002, "logits/rejected": 13.764532089233398, "logps/chosen": -4.605472564697266, "logps/rejected": -4.4339399337768555, "loss": 4.503, "rewards/accuracies": 0.5, "rewards/chosen": -46.054725646972656, "rewards/margins": -1.715327262878418, "rewards/rejected": -44.33940124511719, "step": 6180 }, { "epoch": 0.8416394335511983, "grad_norm": 39.35415870436947, "learning_rate": 5.958383492427441e-08, "logits/chosen": 14.954928398132324, "logits/rejected": 15.302433013916016, "logps/chosen": -4.714592933654785, "logps/rejected": -5.110393524169922, "loss": 3.7243, "rewards/accuracies": 1.0, "rewards/chosen": -47.14592742919922, "rewards/margins": 3.9580068588256836, "rewards/rejected": -51.10393524169922, "step": 6181 }, { "epoch": 0.8417755991285403, "grad_norm": 47.11660174757733, "learning_rate": 5.9484030693538376e-08, "logits/chosen": 14.635830879211426, "logits/rejected": 14.685310363769531, "logps/chosen": -4.5962090492248535, "logps/rejected": -4.774572372436523, "loss": 4.4052, "rewards/accuracies": 0.75, "rewards/chosen": -45.962093353271484, "rewards/margins": 1.7836322784423828, "rewards/rejected": -47.745723724365234, "step": 6182 }, { "epoch": 0.8419117647058824, "grad_norm": 48.33083501618332, "learning_rate": 5.938430340520035e-08, "logits/chosen": 14.67400074005127, "logits/rejected": 15.344593048095703, "logps/chosen": -4.674765110015869, "logps/rejected": -4.924532890319824, "loss": 3.7397, "rewards/accuracies": 0.75, "rewards/chosen": -46.747650146484375, "rewards/margins": 2.497677803039551, "rewards/rejected": -49.245330810546875, "step": 6183 }, { "epoch": 0.8420479302832244, "grad_norm": 44.48077631572509, "learning_rate": 5.928465308179462e-08, "logits/chosen": 14.971923828125, "logits/rejected": 14.958494186401367, "logps/chosen": -4.750709056854248, "logps/rejected": -4.751724720001221, "loss": 4.1794, "rewards/accuracies": 0.75, "rewards/chosen": -47.5070915222168, "rewards/margins": 0.010160446166992188, "rewards/rejected": -47.517250061035156, "step": 6184 }, { "epoch": 0.8421840958605664, "grad_norm": 38.50708159430155, "learning_rate": 5.9185079745837794e-08, "logits/chosen": 14.403253555297852, "logits/rejected": 14.219794273376465, "logps/chosen": -4.406630516052246, "logps/rejected": -4.503719329833984, "loss": 3.723, "rewards/accuracies": 0.5, "rewards/chosen": -44.066307067871094, "rewards/margins": 0.9708843231201172, "rewards/rejected": -45.037193298339844, "step": 6185 }, { "epoch": 0.8423202614379085, "grad_norm": 39.545767900055175, "learning_rate": 5.908558341982943e-08, "logits/chosen": 13.606145858764648, "logits/rejected": 13.763849258422852, "logps/chosen": -4.365384101867676, "logps/rejected": -4.490391731262207, "loss": 3.9397, "rewards/accuracies": 0.5, "rewards/chosen": -43.65384292602539, "rewards/margins": 1.2500724792480469, "rewards/rejected": -44.90391540527344, "step": 6186 }, { "epoch": 0.8424564270152506, "grad_norm": 41.20950380059814, "learning_rate": 5.898616412625159e-08, "logits/chosen": 14.295352935791016, "logits/rejected": 14.968732833862305, "logps/chosen": -4.648285388946533, "logps/rejected": -4.857160568237305, "loss": 4.2579, "rewards/accuracies": 0.75, "rewards/chosen": -46.482852935791016, "rewards/margins": 2.088749885559082, "rewards/rejected": -48.57160186767578, "step": 6187 }, { "epoch": 0.8425925925925926, "grad_norm": 49.56542358106453, "learning_rate": 5.888682188756875e-08, "logits/chosen": 14.442590713500977, "logits/rejected": 14.69841480255127, "logps/chosen": -4.557439804077148, "logps/rejected": -4.801897048950195, "loss": 3.1682, "rewards/accuracies": 1.0, "rewards/chosen": -45.57439422607422, "rewards/margins": 2.444578170776367, "rewards/rejected": -48.01897430419922, "step": 6188 }, { "epoch": 0.8427287581699346, "grad_norm": 37.86537211864574, "learning_rate": 5.878755672622815e-08, "logits/chosen": 14.499958038330078, "logits/rejected": 14.305248260498047, "logps/chosen": -4.134953498840332, "logps/rejected": -4.630538463592529, "loss": 3.8299, "rewards/accuracies": 0.75, "rewards/chosen": -41.34953308105469, "rewards/margins": 4.95584774017334, "rewards/rejected": -46.305381774902344, "step": 6189 }, { "epoch": 0.8428649237472767, "grad_norm": 48.098288431001336, "learning_rate": 5.868836866465958e-08, "logits/chosen": 14.502309799194336, "logits/rejected": 13.973564147949219, "logps/chosen": -4.815094947814941, "logps/rejected": -4.832985877990723, "loss": 4.4377, "rewards/accuracies": 0.75, "rewards/chosen": -48.15095138549805, "rewards/margins": 0.17890644073486328, "rewards/rejected": -48.32986068725586, "step": 6190 }, { "epoch": 0.8430010893246187, "grad_norm": 43.71040216788372, "learning_rate": 5.858925772527556e-08, "logits/chosen": 14.727188110351562, "logits/rejected": 15.312585830688477, "logps/chosen": -4.692289352416992, "logps/rejected": -5.165884017944336, "loss": 4.1324, "rewards/accuracies": 0.75, "rewards/chosen": -46.92289352416992, "rewards/margins": 4.735949516296387, "rewards/rejected": -51.658843994140625, "step": 6191 }, { "epoch": 0.8431372549019608, "grad_norm": 46.71704670939221, "learning_rate": 5.849022393047076e-08, "logits/chosen": 14.283187866210938, "logits/rejected": 14.61185359954834, "logps/chosen": -4.45003604888916, "logps/rejected": -4.799247741699219, "loss": 3.8545, "rewards/accuracies": 0.75, "rewards/chosen": -44.50035858154297, "rewards/margins": 3.492115020751953, "rewards/rejected": -47.99247741699219, "step": 6192 }, { "epoch": 0.8432734204793029, "grad_norm": 47.09501485350906, "learning_rate": 5.839126730262283e-08, "logits/chosen": 14.585857391357422, "logits/rejected": 15.199870109558105, "logps/chosen": -4.695686340332031, "logps/rejected": -4.947122573852539, "loss": 3.517, "rewards/accuracies": 0.75, "rewards/chosen": -46.95686340332031, "rewards/margins": 2.5143604278564453, "rewards/rejected": -49.471221923828125, "step": 6193 }, { "epoch": 0.8434095860566448, "grad_norm": 39.3599009244653, "learning_rate": 5.829238786409188e-08, "logits/chosen": 14.373895645141602, "logits/rejected": 13.79660415649414, "logps/chosen": -4.465553283691406, "logps/rejected": -4.5787763595581055, "loss": 4.3325, "rewards/accuracies": 0.5, "rewards/chosen": -44.65553283691406, "rewards/margins": 1.1322307586669922, "rewards/rejected": -45.78776550292969, "step": 6194 }, { "epoch": 0.8435457516339869, "grad_norm": 36.753449321843405, "learning_rate": 5.819358563722043e-08, "logits/chosen": 14.867935180664062, "logits/rejected": 15.598739624023438, "logps/chosen": -4.719025611877441, "logps/rejected": -5.264179229736328, "loss": 3.4112, "rewards/accuracies": 1.0, "rewards/chosen": -47.19025421142578, "rewards/margins": 5.451536178588867, "rewards/rejected": -52.641788482666016, "step": 6195 }, { "epoch": 0.843681917211329, "grad_norm": 35.74217405994492, "learning_rate": 5.809486064433367e-08, "logits/chosen": 14.672769546508789, "logits/rejected": 15.250325202941895, "logps/chosen": -4.4757466316223145, "logps/rejected": -4.777403831481934, "loss": 3.6728, "rewards/accuracies": 0.75, "rewards/chosen": -44.757469177246094, "rewards/margins": 3.0165700912475586, "rewards/rejected": -47.77404022216797, "step": 6196 }, { "epoch": 0.843818082788671, "grad_norm": 42.4792117030147, "learning_rate": 5.7996212907739375e-08, "logits/chosen": 14.277985572814941, "logits/rejected": 13.6654691696167, "logps/chosen": -4.7256975173950195, "logps/rejected": -4.662711143493652, "loss": 4.1314, "rewards/accuracies": 0.5, "rewards/chosen": -47.25697326660156, "rewards/margins": -0.6298627853393555, "rewards/rejected": -46.62710952758789, "step": 6197 }, { "epoch": 0.8439542483660131, "grad_norm": 37.135573731371686, "learning_rate": 5.78976424497279e-08, "logits/chosen": 12.88153076171875, "logits/rejected": 14.474174499511719, "logps/chosen": -4.089600563049316, "logps/rejected": -4.517848014831543, "loss": 3.7335, "rewards/accuracies": 1.0, "rewards/chosen": -40.89600372314453, "rewards/margins": 4.282474517822266, "rewards/rejected": -45.17848205566406, "step": 6198 }, { "epoch": 0.8440904139433552, "grad_norm": 47.52594742765935, "learning_rate": 5.779914929257188e-08, "logits/chosen": 14.386947631835938, "logits/rejected": 15.327559471130371, "logps/chosen": -4.932586669921875, "logps/rejected": -5.211922645568848, "loss": 4.1235, "rewards/accuracies": 1.0, "rewards/chosen": -49.325870513916016, "rewards/margins": 2.7933530807495117, "rewards/rejected": -52.119224548339844, "step": 6199 }, { "epoch": 0.8442265795206971, "grad_norm": 41.32719125470246, "learning_rate": 5.770073345852671e-08, "logits/chosen": 13.329619407653809, "logits/rejected": 14.386636734008789, "logps/chosen": -4.218997478485107, "logps/rejected": -4.5940728187561035, "loss": 3.8554, "rewards/accuracies": 0.75, "rewards/chosen": -42.189971923828125, "rewards/margins": 3.750753402709961, "rewards/rejected": -45.94072723388672, "step": 6200 }, { "epoch": 0.8443627450980392, "grad_norm": 39.422510238342696, "learning_rate": 5.760239496983041e-08, "logits/chosen": 14.695243835449219, "logits/rejected": 15.3374662399292, "logps/chosen": -4.761492729187012, "logps/rejected": -4.95539665222168, "loss": 4.2793, "rewards/accuracies": 0.75, "rewards/chosen": -47.614925384521484, "rewards/margins": 1.9390430450439453, "rewards/rejected": -49.55397033691406, "step": 6201 }, { "epoch": 0.8444989106753813, "grad_norm": 44.924834749101, "learning_rate": 5.750413384870314e-08, "logits/chosen": 14.804901123046875, "logits/rejected": 14.554557800292969, "logps/chosen": -4.512783527374268, "logps/rejected": -4.704436302185059, "loss": 3.464, "rewards/accuracies": 0.5, "rewards/chosen": -45.127838134765625, "rewards/margins": 1.9165239334106445, "rewards/rejected": -47.04435729980469, "step": 6202 }, { "epoch": 0.8446350762527233, "grad_norm": 40.686211585490824, "learning_rate": 5.7405950117347966e-08, "logits/chosen": 14.377166748046875, "logits/rejected": 13.991857528686523, "logps/chosen": -4.717646598815918, "logps/rejected": -4.825718879699707, "loss": 3.8459, "rewards/accuracies": 0.5, "rewards/chosen": -47.17646789550781, "rewards/margins": 1.080718994140625, "rewards/rejected": -48.25718688964844, "step": 6203 }, { "epoch": 0.8447712418300654, "grad_norm": 40.18814985196234, "learning_rate": 5.7307843797950305e-08, "logits/chosen": 14.715333938598633, "logits/rejected": 14.630342483520508, "logps/chosen": -4.60203742980957, "logps/rejected": -4.564793586730957, "loss": 3.7379, "rewards/accuracies": 0.75, "rewards/chosen": -46.02037048339844, "rewards/margins": -0.3724374771118164, "rewards/rejected": -45.6479377746582, "step": 6204 }, { "epoch": 0.8449074074074074, "grad_norm": 44.346727288699206, "learning_rate": 5.720981491267802e-08, "logits/chosen": 14.113011360168457, "logits/rejected": 14.528656005859375, "logps/chosen": -4.701967239379883, "logps/rejected": -4.911996841430664, "loss": 3.7351, "rewards/accuracies": 0.75, "rewards/chosen": -47.019676208496094, "rewards/margins": 2.100295066833496, "rewards/rejected": -49.119972229003906, "step": 6205 }, { "epoch": 0.8450435729847494, "grad_norm": 40.1929447114285, "learning_rate": 5.7111863483681576e-08, "logits/chosen": 14.68751335144043, "logits/rejected": 15.204004287719727, "logps/chosen": -4.434236526489258, "logps/rejected": -4.832610607147217, "loss": 4.0509, "rewards/accuracies": 0.75, "rewards/chosen": -44.342369079589844, "rewards/margins": 3.983736991882324, "rewards/rejected": -48.32610321044922, "step": 6206 }, { "epoch": 0.8451797385620915, "grad_norm": 42.44158702476277, "learning_rate": 5.701398953309397e-08, "logits/chosen": 14.47527027130127, "logits/rejected": 15.242792129516602, "logps/chosen": -4.50540018081665, "logps/rejected": -4.985686302185059, "loss": 4.0062, "rewards/accuracies": 1.0, "rewards/chosen": -45.05400085449219, "rewards/margins": 4.802855491638184, "rewards/rejected": -49.85685729980469, "step": 6207 }, { "epoch": 0.8453159041394336, "grad_norm": 42.50897686473782, "learning_rate": 5.691619308303055e-08, "logits/chosen": 14.40042495727539, "logits/rejected": 14.9237060546875, "logps/chosen": -4.53262996673584, "logps/rejected": -4.914402008056641, "loss": 4.4044, "rewards/accuracies": 1.0, "rewards/chosen": -45.326297760009766, "rewards/margins": 3.817721366882324, "rewards/rejected": -49.144020080566406, "step": 6208 }, { "epoch": 0.8454520697167756, "grad_norm": 42.19092829868331, "learning_rate": 5.6818474155589224e-08, "logits/chosen": 14.49832534790039, "logits/rejected": 14.571982383728027, "logps/chosen": -4.506627082824707, "logps/rejected": -4.452179908752441, "loss": 3.956, "rewards/accuracies": 0.75, "rewards/chosen": -45.06626892089844, "rewards/margins": -0.544468879699707, "rewards/rejected": -44.52180099487305, "step": 6209 }, { "epoch": 0.8455882352941176, "grad_norm": 43.473302271799014, "learning_rate": 5.672083277285051e-08, "logits/chosen": 13.755977630615234, "logits/rejected": 13.71381950378418, "logps/chosen": -4.483125686645508, "logps/rejected": -4.502946376800537, "loss": 4.1454, "rewards/accuracies": 0.25, "rewards/chosen": -44.83125305175781, "rewards/margins": 0.1982107162475586, "rewards/rejected": -45.02946472167969, "step": 6210 }, { "epoch": 0.8457244008714597, "grad_norm": 46.30404626825449, "learning_rate": 5.662326895687717e-08, "logits/chosen": 14.040122032165527, "logits/rejected": 14.449213981628418, "logps/chosen": -4.892231464385986, "logps/rejected": -4.835275173187256, "loss": 3.7475, "rewards/accuracies": 0.75, "rewards/chosen": -48.92231369018555, "rewards/margins": -0.5695610046386719, "rewards/rejected": -48.352752685546875, "step": 6211 }, { "epoch": 0.8458605664488017, "grad_norm": 48.2540479334628, "learning_rate": 5.652578272971453e-08, "logits/chosen": 15.053499221801758, "logits/rejected": 15.77943229675293, "logps/chosen": -5.075888633728027, "logps/rejected": -5.304513454437256, "loss": 3.7569, "rewards/accuracies": 0.75, "rewards/chosen": -50.758888244628906, "rewards/margins": 2.286245346069336, "rewards/rejected": -53.045135498046875, "step": 6212 }, { "epoch": 0.8459967320261438, "grad_norm": 52.720389035242725, "learning_rate": 5.642837411339059e-08, "logits/chosen": 14.286209106445312, "logits/rejected": 14.218881607055664, "logps/chosen": -4.428638458251953, "logps/rejected": -4.920507907867432, "loss": 3.8659, "rewards/accuracies": 1.0, "rewards/chosen": -44.28638458251953, "rewards/margins": 4.918695449829102, "rewards/rejected": -49.205078125, "step": 6213 }, { "epoch": 0.8461328976034859, "grad_norm": 43.885281772784666, "learning_rate": 5.633104312991541e-08, "logits/chosen": 14.340051651000977, "logits/rejected": 14.738523483276367, "logps/chosen": -4.706290245056152, "logps/rejected": -4.960955619812012, "loss": 3.6147, "rewards/accuracies": 0.75, "rewards/chosen": -47.06290054321289, "rewards/margins": 2.5466537475585938, "rewards/rejected": -49.609554290771484, "step": 6214 }, { "epoch": 0.8462690631808278, "grad_norm": 45.78073209766236, "learning_rate": 5.623378980128186e-08, "logits/chosen": 15.788978576660156, "logits/rejected": 15.85013198852539, "logps/chosen": -4.870012283325195, "logps/rejected": -4.828569412231445, "loss": 3.0045, "rewards/accuracies": 0.25, "rewards/chosen": -48.70011901855469, "rewards/margins": -0.41442394256591797, "rewards/rejected": -48.28569793701172, "step": 6215 }, { "epoch": 0.8464052287581699, "grad_norm": 42.8208002314913, "learning_rate": 5.6136614149465155e-08, "logits/chosen": 14.316198348999023, "logits/rejected": 14.7881441116333, "logps/chosen": -4.340117931365967, "logps/rejected": -4.5707550048828125, "loss": 4.0904, "rewards/accuracies": 1.0, "rewards/chosen": -43.401180267333984, "rewards/margins": 2.306370735168457, "rewards/rejected": -45.707550048828125, "step": 6216 }, { "epoch": 0.846541394335512, "grad_norm": 39.644120676772516, "learning_rate": 5.603951619642284e-08, "logits/chosen": 14.468978881835938, "logits/rejected": 14.662394523620605, "logps/chosen": -4.447390556335449, "logps/rejected": -4.781644344329834, "loss": 3.8343, "rewards/accuracies": 0.75, "rewards/chosen": -44.47390365600586, "rewards/margins": 3.3425397872924805, "rewards/rejected": -47.816444396972656, "step": 6217 }, { "epoch": 0.846677559912854, "grad_norm": 47.440597522464465, "learning_rate": 5.594249596409501e-08, "logits/chosen": 14.380655288696289, "logits/rejected": 14.608487129211426, "logps/chosen": -4.612651824951172, "logps/rejected": -4.510746002197266, "loss": 3.8215, "rewards/accuracies": 0.75, "rewards/chosen": -46.12651824951172, "rewards/margins": -1.019059181213379, "rewards/rejected": -45.107460021972656, "step": 6218 }, { "epoch": 0.8468137254901961, "grad_norm": 44.717974711124945, "learning_rate": 5.5845553474404316e-08, "logits/chosen": 14.393001556396484, "logits/rejected": 15.193058967590332, "logps/chosen": -4.861289024353027, "logps/rejected": -5.047465801239014, "loss": 3.8623, "rewards/accuracies": 0.75, "rewards/chosen": -48.612892150878906, "rewards/margins": 1.8617677688598633, "rewards/rejected": -50.47465896606445, "step": 6219 }, { "epoch": 0.8469498910675382, "grad_norm": 40.292480126347826, "learning_rate": 5.574868874925553e-08, "logits/chosen": 14.203252792358398, "logits/rejected": 14.859657287597656, "logps/chosen": -4.825170993804932, "logps/rejected": -4.928400993347168, "loss": 4.0543, "rewards/accuracies": 0.75, "rewards/chosen": -48.251708984375, "rewards/margins": 1.0322999954223633, "rewards/rejected": -49.28401184082031, "step": 6220 }, { "epoch": 0.8470860566448801, "grad_norm": 40.56958977814295, "learning_rate": 5.565190181053618e-08, "logits/chosen": 13.809621810913086, "logits/rejected": 15.832788467407227, "logps/chosen": -4.397698402404785, "logps/rejected": -4.934686660766602, "loss": 3.6609, "rewards/accuracies": 1.0, "rewards/chosen": -43.97698211669922, "rewards/margins": 5.369884490966797, "rewards/rejected": -49.34686279296875, "step": 6221 }, { "epoch": 0.8472222222222222, "grad_norm": 39.882529033196924, "learning_rate": 5.555519268011606e-08, "logits/chosen": 14.248750686645508, "logits/rejected": 15.273904800415039, "logps/chosen": -4.515607833862305, "logps/rejected": -4.699661731719971, "loss": 3.9965, "rewards/accuracies": 0.75, "rewards/chosen": -45.15608215332031, "rewards/margins": 1.8405380249023438, "rewards/rejected": -46.996620178222656, "step": 6222 }, { "epoch": 0.8473583877995643, "grad_norm": 43.72397191759196, "learning_rate": 5.545856137984728e-08, "logits/chosen": 15.415872573852539, "logits/rejected": 14.989825248718262, "logps/chosen": -4.96284818649292, "logps/rejected": -5.001464366912842, "loss": 3.9071, "rewards/accuracies": 0.5, "rewards/chosen": -49.62848663330078, "rewards/margins": 0.3861570358276367, "rewards/rejected": -50.01464080810547, "step": 6223 }, { "epoch": 0.8474945533769063, "grad_norm": 42.14855924074083, "learning_rate": 5.53620079315646e-08, "logits/chosen": 14.848775863647461, "logits/rejected": 15.206839561462402, "logps/chosen": -4.6534104347229, "logps/rejected": -4.731157302856445, "loss": 4.1062, "rewards/accuracies": 0.5, "rewards/chosen": -46.53410339355469, "rewards/margins": 0.7774744033813477, "rewards/rejected": -47.31157684326172, "step": 6224 }, { "epoch": 0.8476307189542484, "grad_norm": 45.216855962146134, "learning_rate": 5.526553235708511e-08, "logits/chosen": 14.843944549560547, "logits/rejected": 15.028690338134766, "logps/chosen": -4.8807501792907715, "logps/rejected": -4.654793739318848, "loss": 3.9817, "rewards/accuracies": 0.25, "rewards/chosen": -48.80750274658203, "rewards/margins": -2.259566307067871, "rewards/rejected": -46.547935485839844, "step": 6225 }, { "epoch": 0.8477668845315904, "grad_norm": 40.89446946436304, "learning_rate": 5.5169134678208076e-08, "logits/chosen": 13.90495491027832, "logits/rejected": 14.263880729675293, "logps/chosen": -4.1724748611450195, "logps/rejected": -4.590516090393066, "loss": 4.1442, "rewards/accuracies": 0.75, "rewards/chosen": -41.72474670410156, "rewards/margins": 4.180416107177734, "rewards/rejected": -45.9051628112793, "step": 6226 }, { "epoch": 0.8479030501089324, "grad_norm": 40.35853869096806, "learning_rate": 5.5072814916715496e-08, "logits/chosen": 14.632600784301758, "logits/rejected": 14.644729614257812, "logps/chosen": -4.655913829803467, "logps/rejected": -4.804492950439453, "loss": 3.4858, "rewards/accuracies": 0.5, "rewards/chosen": -46.55913543701172, "rewards/margins": 1.4857912063598633, "rewards/rejected": -48.04492950439453, "step": 6227 }, { "epoch": 0.8480392156862745, "grad_norm": 41.90175262120909, "learning_rate": 5.497657309437165e-08, "logits/chosen": 14.310859680175781, "logits/rejected": 14.214942932128906, "logps/chosen": -4.735624313354492, "logps/rejected": -4.804656505584717, "loss": 4.3574, "rewards/accuracies": 0.5, "rewards/chosen": -47.35624313354492, "rewards/margins": 0.6903238296508789, "rewards/rejected": -48.046566009521484, "step": 6228 }, { "epoch": 0.8481753812636166, "grad_norm": 39.99078912687258, "learning_rate": 5.488040923292301e-08, "logits/chosen": 14.917655944824219, "logits/rejected": 15.194503784179688, "logps/chosen": -4.112436771392822, "logps/rejected": -4.812071800231934, "loss": 3.8968, "rewards/accuracies": 0.75, "rewards/chosen": -41.124366760253906, "rewards/margins": 6.996352195739746, "rewards/rejected": -48.12071990966797, "step": 6229 }, { "epoch": 0.8483115468409586, "grad_norm": 47.99366032428379, "learning_rate": 5.4784323354098725e-08, "logits/chosen": 13.73386001586914, "logits/rejected": 14.445808410644531, "logps/chosen": -4.317683219909668, "logps/rejected": -4.560059547424316, "loss": 4.5857, "rewards/accuracies": 0.75, "rewards/chosen": -43.17683029174805, "rewards/margins": 2.4237661361694336, "rewards/rejected": -45.6005973815918, "step": 6230 }, { "epoch": 0.8484477124183006, "grad_norm": 41.48076775942071, "learning_rate": 5.468831547961019e-08, "logits/chosen": 14.681549072265625, "logits/rejected": 14.604517936706543, "logps/chosen": -4.6313934326171875, "logps/rejected": -4.661497116088867, "loss": 3.8261, "rewards/accuracies": 0.75, "rewards/chosen": -46.313934326171875, "rewards/margins": 0.30103588104248047, "rewards/rejected": -46.61497116088867, "step": 6231 }, { "epoch": 0.8485838779956427, "grad_norm": 38.92267118467332, "learning_rate": 5.459238563115112e-08, "logits/chosen": 13.98868179321289, "logits/rejected": 14.972845077514648, "logps/chosen": -4.27315616607666, "logps/rejected": -4.527257919311523, "loss": 3.4898, "rewards/accuracies": 0.75, "rewards/chosen": -42.73155975341797, "rewards/margins": 2.5410213470458984, "rewards/rejected": -45.2725830078125, "step": 6232 }, { "epoch": 0.8487200435729847, "grad_norm": 44.73340850028767, "learning_rate": 5.449653383039767e-08, "logits/chosen": 14.469905853271484, "logits/rejected": 14.831454277038574, "logps/chosen": -4.403191089630127, "logps/rejected": -4.639496803283691, "loss": 4.292, "rewards/accuracies": 0.75, "rewards/chosen": -44.03191375732422, "rewards/margins": 2.3630599975585938, "rewards/rejected": -46.39497375488281, "step": 6233 }, { "epoch": 0.8488562091503268, "grad_norm": 39.879409805792314, "learning_rate": 5.4400760099008406e-08, "logits/chosen": 14.278127670288086, "logits/rejected": 14.802234649658203, "logps/chosen": -4.792750358581543, "logps/rejected": -4.8422346115112305, "loss": 4.0297, "rewards/accuracies": 0.5, "rewards/chosen": -47.92749786376953, "rewards/margins": 0.4948453903198242, "rewards/rejected": -48.42234420776367, "step": 6234 }, { "epoch": 0.8489923747276689, "grad_norm": 40.87303480035831, "learning_rate": 5.4305064458624126e-08, "logits/chosen": 14.369239807128906, "logits/rejected": 14.20871353149414, "logps/chosen": -4.622058868408203, "logps/rejected": -4.72779655456543, "loss": 3.709, "rewards/accuracies": 0.5, "rewards/chosen": -46.22058868408203, "rewards/margins": 1.057377815246582, "rewards/rejected": -47.27796173095703, "step": 6235 }, { "epoch": 0.849128540305011, "grad_norm": 46.106844437088846, "learning_rate": 5.420944693086804e-08, "logits/chosen": 14.5187349319458, "logits/rejected": 14.478044509887695, "logps/chosen": -4.854469299316406, "logps/rejected": -4.681634902954102, "loss": 4.223, "rewards/accuracies": 0.25, "rewards/chosen": -48.54469299316406, "rewards/margins": -1.7283401489257812, "rewards/rejected": -46.816349029541016, "step": 6236 }, { "epoch": 0.8492647058823529, "grad_norm": 43.59615126206031, "learning_rate": 5.411390753734584e-08, "logits/chosen": 14.890884399414062, "logits/rejected": 14.908217430114746, "logps/chosen": -4.406335830688477, "logps/rejected": -4.585890293121338, "loss": 4.2735, "rewards/accuracies": 0.75, "rewards/chosen": -44.06336212158203, "rewards/margins": 1.7955408096313477, "rewards/rejected": -45.85890197753906, "step": 6237 }, { "epoch": 0.849400871459695, "grad_norm": 47.42469834304133, "learning_rate": 5.401844629964527e-08, "logits/chosen": 15.001938819885254, "logits/rejected": 15.430183410644531, "logps/chosen": -4.85138463973999, "logps/rejected": -4.999936580657959, "loss": 4.0011, "rewards/accuracies": 0.5, "rewards/chosen": -48.51384735107422, "rewards/margins": 1.485520362854004, "rewards/rejected": -49.999366760253906, "step": 6238 }, { "epoch": 0.8495370370370371, "grad_norm": 40.33617868699232, "learning_rate": 5.3923063239336686e-08, "logits/chosen": 13.615911483764648, "logits/rejected": 14.588699340820312, "logps/chosen": -3.9998762607574463, "logps/rejected": -4.602031707763672, "loss": 3.7458, "rewards/accuracies": 1.0, "rewards/chosen": -39.99876022338867, "rewards/margins": 6.021554946899414, "rewards/rejected": -46.02031326293945, "step": 6239 }, { "epoch": 0.8496732026143791, "grad_norm": 48.84470611147045, "learning_rate": 5.382775837797271e-08, "logits/chosen": 14.089151382446289, "logits/rejected": 14.552359580993652, "logps/chosen": -4.297633647918701, "logps/rejected": -4.734750270843506, "loss": 4.4566, "rewards/accuracies": 0.75, "rewards/chosen": -42.97633743286133, "rewards/margins": 4.3711652755737305, "rewards/rejected": -47.347503662109375, "step": 6240 }, { "epoch": 0.8498093681917211, "grad_norm": 39.41266312724466, "learning_rate": 5.373253173708816e-08, "logits/chosen": 14.33203125, "logits/rejected": 14.619664192199707, "logps/chosen": -4.7069292068481445, "logps/rejected": -4.55074405670166, "loss": 3.8818, "rewards/accuracies": 0.5, "rewards/chosen": -47.06929016113281, "rewards/margins": -1.5618486404418945, "rewards/rejected": -45.507442474365234, "step": 6241 }, { "epoch": 0.8499455337690632, "grad_norm": 39.849276663701566, "learning_rate": 5.363738333820036e-08, "logits/chosen": 14.178266525268555, "logits/rejected": 14.477968215942383, "logps/chosen": -4.263298034667969, "logps/rejected": -4.5754313468933105, "loss": 3.8401, "rewards/accuracies": 1.0, "rewards/chosen": -42.63298034667969, "rewards/margins": 3.1213369369506836, "rewards/rejected": -45.75431442260742, "step": 6242 }, { "epoch": 0.8500816993464052, "grad_norm": 40.77599057267739, "learning_rate": 5.354231320280882e-08, "logits/chosen": 14.467096328735352, "logits/rejected": 14.507750511169434, "logps/chosen": -4.2791361808776855, "logps/rejected": -4.812912940979004, "loss": 3.7362, "rewards/accuracies": 0.75, "rewards/chosen": -42.79136276245117, "rewards/margins": 5.337766647338867, "rewards/rejected": -48.129127502441406, "step": 6243 }, { "epoch": 0.8502178649237473, "grad_norm": 47.21137677049083, "learning_rate": 5.3447321352395605e-08, "logits/chosen": 14.782535552978516, "logits/rejected": 15.167940139770508, "logps/chosen": -4.234158515930176, "logps/rejected": -4.7674760818481445, "loss": 3.9304, "rewards/accuracies": 0.75, "rewards/chosen": -42.341583251953125, "rewards/margins": 5.333174705505371, "rewards/rejected": -47.67475891113281, "step": 6244 }, { "epoch": 0.8503540305010894, "grad_norm": 42.455746175982775, "learning_rate": 5.3352407808424604e-08, "logits/chosen": 14.89515495300293, "logits/rejected": 14.93038558959961, "logps/chosen": -4.587011337280273, "logps/rejected": -4.549373626708984, "loss": 4.0351, "rewards/accuracies": 0.75, "rewards/chosen": -45.87010955810547, "rewards/margins": -0.3763761520385742, "rewards/rejected": -45.493736267089844, "step": 6245 }, { "epoch": 0.8504901960784313, "grad_norm": 62.60073723519045, "learning_rate": 5.3257572592342537e-08, "logits/chosen": 14.06978988647461, "logits/rejected": 14.683040618896484, "logps/chosen": -4.504032135009766, "logps/rejected": -4.409523963928223, "loss": 4.1447, "rewards/accuracies": 0.25, "rewards/chosen": -45.040321350097656, "rewards/margins": -0.9450798034667969, "rewards/rejected": -44.09524154663086, "step": 6246 }, { "epoch": 0.8506263616557734, "grad_norm": 43.26404965184597, "learning_rate": 5.316281572557817e-08, "logits/chosen": 13.838626861572266, "logits/rejected": 14.70407485961914, "logps/chosen": -4.537975311279297, "logps/rejected": -4.673964500427246, "loss": 3.9077, "rewards/accuracies": 0.5, "rewards/chosen": -45.379756927490234, "rewards/margins": 1.3598871231079102, "rewards/rejected": -46.739646911621094, "step": 6247 }, { "epoch": 0.8507625272331155, "grad_norm": 41.63452234875831, "learning_rate": 5.306813722954255e-08, "logits/chosen": 14.914800643920898, "logits/rejected": 15.156681060791016, "logps/chosen": -4.739239692687988, "logps/rejected": -5.087740421295166, "loss": 3.6184, "rewards/accuracies": 1.0, "rewards/chosen": -47.392398834228516, "rewards/margins": 3.4850082397460938, "rewards/rejected": -50.877403259277344, "step": 6248 }, { "epoch": 0.8508986928104575, "grad_norm": 47.6823675498259, "learning_rate": 5.2973537125629064e-08, "logits/chosen": 14.663373947143555, "logits/rejected": 14.317045211791992, "logps/chosen": -4.558366775512695, "logps/rejected": -4.775005340576172, "loss": 3.9317, "rewards/accuracies": 0.75, "rewards/chosen": -45.58366394042969, "rewards/margins": 2.166388511657715, "rewards/rejected": -47.75005340576172, "step": 6249 }, { "epoch": 0.8510348583877996, "grad_norm": 38.31506051662492, "learning_rate": 5.287901543521349e-08, "logits/chosen": 14.937076568603516, "logits/rejected": 15.264872550964355, "logps/chosen": -4.574488162994385, "logps/rejected": -4.739348411560059, "loss": 3.2735, "rewards/accuracies": 0.5, "rewards/chosen": -45.74488067626953, "rewards/margins": 1.6485977172851562, "rewards/rejected": -47.39347839355469, "step": 6250 }, { "epoch": 0.8511710239651417, "grad_norm": 40.668951818793104, "learning_rate": 5.278457217965364e-08, "logits/chosen": 13.997838020324707, "logits/rejected": 14.298007011413574, "logps/chosen": -4.1905622482299805, "logps/rejected": -4.468923091888428, "loss": 3.8093, "rewards/accuracies": 0.75, "rewards/chosen": -41.90562438964844, "rewards/margins": 2.783609390258789, "rewards/rejected": -44.689231872558594, "step": 6251 }, { "epoch": 0.8513071895424836, "grad_norm": 40.4646110910092, "learning_rate": 5.269020738028982e-08, "logits/chosen": 13.841745376586914, "logits/rejected": 14.881567001342773, "logps/chosen": -4.051892280578613, "logps/rejected": -4.624453067779541, "loss": 4.1042, "rewards/accuracies": 1.0, "rewards/chosen": -40.5189208984375, "rewards/margins": 5.725606918334961, "rewards/rejected": -46.244529724121094, "step": 6252 }, { "epoch": 0.8514433551198257, "grad_norm": 45.18559888397863, "learning_rate": 5.259592105844461e-08, "logits/chosen": 14.662347793579102, "logits/rejected": 14.709846496582031, "logps/chosen": -4.424884796142578, "logps/rejected": -4.625085830688477, "loss": 4.2062, "rewards/accuracies": 0.75, "rewards/chosen": -44.24884796142578, "rewards/margins": 2.002012252807617, "rewards/rejected": -46.250858306884766, "step": 6253 }, { "epoch": 0.8515795206971678, "grad_norm": 43.93064388515479, "learning_rate": 5.250171323542263e-08, "logits/chosen": 14.495479583740234, "logits/rejected": 14.573704719543457, "logps/chosen": -4.613450050354004, "logps/rejected": -4.747138977050781, "loss": 3.3878, "rewards/accuracies": 0.5, "rewards/chosen": -46.13450622558594, "rewards/margins": 1.3368873596191406, "rewards/rejected": -47.47138977050781, "step": 6254 }, { "epoch": 0.8517156862745098, "grad_norm": 48.671660367076, "learning_rate": 5.240758393251097e-08, "logits/chosen": 14.94912338256836, "logits/rejected": 15.375212669372559, "logps/chosen": -4.80950927734375, "logps/rejected": -5.040409088134766, "loss": 4.2627, "rewards/accuracies": 1.0, "rewards/chosen": -48.0950927734375, "rewards/margins": 2.3089962005615234, "rewards/rejected": -50.404090881347656, "step": 6255 }, { "epoch": 0.8518518518518519, "grad_norm": 42.60627937991098, "learning_rate": 5.231353317097906e-08, "logits/chosen": 14.362241744995117, "logits/rejected": 13.859838485717773, "logps/chosen": -4.635224342346191, "logps/rejected": -4.575243949890137, "loss": 4.2351, "rewards/accuracies": 0.75, "rewards/chosen": -46.35224533081055, "rewards/margins": -0.5998010635375977, "rewards/rejected": -45.75244140625, "step": 6256 }, { "epoch": 0.851988017429194, "grad_norm": 39.74079621874251, "learning_rate": 5.2219560972078223e-08, "logits/chosen": 14.388138771057129, "logits/rejected": 14.504240989685059, "logps/chosen": -4.561245441436768, "logps/rejected": -4.597083568572998, "loss": 3.6402, "rewards/accuracies": 0.5, "rewards/chosen": -45.61245346069336, "rewards/margins": 0.3583850860595703, "rewards/rejected": -45.9708366394043, "step": 6257 }, { "epoch": 0.8521241830065359, "grad_norm": 35.33034802553327, "learning_rate": 5.21256673570424e-08, "logits/chosen": 14.654308319091797, "logits/rejected": 14.966150283813477, "logps/chosen": -4.667231559753418, "logps/rejected": -5.002504348754883, "loss": 3.5675, "rewards/accuracies": 0.75, "rewards/chosen": -46.67231750488281, "rewards/margins": 3.3527259826660156, "rewards/rejected": -50.02503967285156, "step": 6258 }, { "epoch": 0.852260348583878, "grad_norm": 43.50749507909341, "learning_rate": 5.2031852347087643e-08, "logits/chosen": 14.646556854248047, "logits/rejected": 14.517036437988281, "logps/chosen": -4.693794250488281, "logps/rejected": -4.671374320983887, "loss": 4.3995, "rewards/accuracies": 0.75, "rewards/chosen": -46.93794250488281, "rewards/margins": -0.2241983413696289, "rewards/rejected": -46.7137451171875, "step": 6259 }, { "epoch": 0.8523965141612201, "grad_norm": 42.460971900255785, "learning_rate": 5.1938115963412156e-08, "logits/chosen": 13.591922760009766, "logits/rejected": 14.160041809082031, "logps/chosen": -4.224164962768555, "logps/rejected": -4.474119186401367, "loss": 3.5846, "rewards/accuracies": 0.75, "rewards/chosen": -42.24165344238281, "rewards/margins": 2.499537467956543, "rewards/rejected": -44.741188049316406, "step": 6260 }, { "epoch": 0.8525326797385621, "grad_norm": 43.84001187876652, "learning_rate": 5.184445822719641e-08, "logits/chosen": 14.463750839233398, "logits/rejected": 14.996917724609375, "logps/chosen": -5.15059757232666, "logps/rejected": -5.170513153076172, "loss": 4.276, "rewards/accuracies": 0.5, "rewards/chosen": -51.50597381591797, "rewards/margins": 0.19915390014648438, "rewards/rejected": -51.70513153076172, "step": 6261 }, { "epoch": 0.8526688453159041, "grad_norm": 43.43050829211674, "learning_rate": 5.1750879159603344e-08, "logits/chosen": 14.900352478027344, "logits/rejected": 14.989660263061523, "logps/chosen": -5.191588401794434, "logps/rejected": -5.044247627258301, "loss": 4.0844, "rewards/accuracies": 0.75, "rewards/chosen": -51.9158821105957, "rewards/margins": -1.4734067916870117, "rewards/rejected": -50.442474365234375, "step": 6262 }, { "epoch": 0.8528050108932462, "grad_norm": 38.404051360701544, "learning_rate": 5.165737878177769e-08, "logits/chosen": 14.809487342834473, "logits/rejected": 14.729862213134766, "logps/chosen": -4.876069068908691, "logps/rejected": -4.765894889831543, "loss": 4.0126, "rewards/accuracies": 0.5, "rewards/chosen": -48.76069259643555, "rewards/margins": -1.10174560546875, "rewards/rejected": -47.6589469909668, "step": 6263 }, { "epoch": 0.8529411764705882, "grad_norm": 42.682778899721214, "learning_rate": 5.1563957114846736e-08, "logits/chosen": 14.176752090454102, "logits/rejected": 14.825799942016602, "logps/chosen": -4.470516204833984, "logps/rejected": -4.590390205383301, "loss": 3.8804, "rewards/accuracies": 0.75, "rewards/chosen": -44.705162048339844, "rewards/margins": 1.198740005493164, "rewards/rejected": -45.903900146484375, "step": 6264 }, { "epoch": 0.8530773420479303, "grad_norm": 42.64659070914706, "learning_rate": 5.147061417991994e-08, "logits/chosen": 14.69723892211914, "logits/rejected": 15.005762100219727, "logps/chosen": -4.859247207641602, "logps/rejected": -4.970071792602539, "loss": 3.835, "rewards/accuracies": 0.5, "rewards/chosen": -48.592472076416016, "rewards/margins": 1.1082487106323242, "rewards/rejected": -49.700721740722656, "step": 6265 }, { "epoch": 0.8532135076252724, "grad_norm": 42.22232424316995, "learning_rate": 5.137734999808878e-08, "logits/chosen": 15.031424522399902, "logits/rejected": 15.019704818725586, "logps/chosen": -4.6542510986328125, "logps/rejected": -4.366767883300781, "loss": 4.3007, "rewards/accuracies": 0.25, "rewards/chosen": -46.542510986328125, "rewards/margins": -2.8748340606689453, "rewards/rejected": -43.66767883300781, "step": 6266 }, { "epoch": 0.8533496732026143, "grad_norm": 49.94857754311843, "learning_rate": 5.128416459042708e-08, "logits/chosen": 14.791919708251953, "logits/rejected": 15.242818832397461, "logps/chosen": -4.661333084106445, "logps/rejected": -4.720803260803223, "loss": 4.4955, "rewards/accuracies": 0.5, "rewards/chosen": -46.61333465576172, "rewards/margins": 0.5946969985961914, "rewards/rejected": -47.208030700683594, "step": 6267 }, { "epoch": 0.8534858387799564, "grad_norm": 40.12635722326916, "learning_rate": 5.119105797799106e-08, "logits/chosen": 14.903605461120605, "logits/rejected": 14.208602905273438, "logps/chosen": -4.874676704406738, "logps/rejected": -4.804818630218506, "loss": 3.633, "rewards/accuracies": 0.5, "rewards/chosen": -48.74676513671875, "rewards/margins": -0.6985807418823242, "rewards/rejected": -48.048187255859375, "step": 6268 }, { "epoch": 0.8536220043572985, "grad_norm": 45.074949829057005, "learning_rate": 5.109803018181864e-08, "logits/chosen": 14.216468811035156, "logits/rejected": 14.232769966125488, "logps/chosen": -4.597902774810791, "logps/rejected": -4.612606525421143, "loss": 3.9402, "rewards/accuracies": 0.5, "rewards/chosen": -45.979026794433594, "rewards/margins": 0.1470355987548828, "rewards/rejected": -46.126060485839844, "step": 6269 }, { "epoch": 0.8537581699346405, "grad_norm": 42.405247735850956, "learning_rate": 5.100508122293039e-08, "logits/chosen": 14.87908935546875, "logits/rejected": 14.69072151184082, "logps/chosen": -4.414097785949707, "logps/rejected": -4.6070556640625, "loss": 4.0583, "rewards/accuracies": 0.5, "rewards/chosen": -44.1409797668457, "rewards/margins": 1.9295759201049805, "rewards/rejected": -46.070556640625, "step": 6270 }, { "epoch": 0.8538943355119826, "grad_norm": 45.2949980134567, "learning_rate": 5.091221112232893e-08, "logits/chosen": 13.857084274291992, "logits/rejected": 13.886733055114746, "logps/chosen": -4.587113380432129, "logps/rejected": -4.623639106750488, "loss": 4.1273, "rewards/accuracies": 0.5, "rewards/chosen": -45.871131896972656, "rewards/margins": 0.36525917053222656, "rewards/rejected": -46.23638916015625, "step": 6271 }, { "epoch": 0.8540305010893247, "grad_norm": 42.887266752061024, "learning_rate": 5.081941990099885e-08, "logits/chosen": 14.073591232299805, "logits/rejected": 14.847406387329102, "logps/chosen": -4.277204990386963, "logps/rejected": -4.546545505523682, "loss": 4.0998, "rewards/accuracies": 0.75, "rewards/chosen": -42.77204895019531, "rewards/margins": 2.6934022903442383, "rewards/rejected": -45.4654541015625, "step": 6272 }, { "epoch": 0.8541666666666666, "grad_norm": 40.59379499303332, "learning_rate": 5.07267075799072e-08, "logits/chosen": 14.084118843078613, "logits/rejected": 13.908992767333984, "logps/chosen": -4.665022850036621, "logps/rejected": -4.759922027587891, "loss": 3.9626, "rewards/accuracies": 0.5, "rewards/chosen": -46.65022659301758, "rewards/margins": 0.9489898681640625, "rewards/rejected": -47.59921646118164, "step": 6273 }, { "epoch": 0.8543028322440087, "grad_norm": 41.10426758289774, "learning_rate": 5.063407418000323e-08, "logits/chosen": 13.409760475158691, "logits/rejected": 14.203452110290527, "logps/chosen": -4.489898681640625, "logps/rejected": -4.4064106941223145, "loss": 4.1726, "rewards/accuracies": 0.5, "rewards/chosen": -44.89898681640625, "rewards/margins": -0.8348817825317383, "rewards/rejected": -44.064109802246094, "step": 6274 }, { "epoch": 0.8544389978213508, "grad_norm": 41.61759308127034, "learning_rate": 5.054151972221796e-08, "logits/chosen": 14.50729751586914, "logits/rejected": 14.112119674682617, "logps/chosen": -4.558389663696289, "logps/rejected": -4.771623611450195, "loss": 4.5493, "rewards/accuracies": 0.75, "rewards/chosen": -45.583900451660156, "rewards/margins": 2.1323375701904297, "rewards/rejected": -47.71623611450195, "step": 6275 }, { "epoch": 0.8545751633986928, "grad_norm": 45.54240529999606, "learning_rate": 5.0449044227464946e-08, "logits/chosen": 14.538894653320312, "logits/rejected": 14.299633026123047, "logps/chosen": -4.574131011962891, "logps/rejected": -4.602267265319824, "loss": 4.6376, "rewards/accuracies": 0.5, "rewards/chosen": -45.741310119628906, "rewards/margins": 0.28136634826660156, "rewards/rejected": -46.022674560546875, "step": 6276 }, { "epoch": 0.8547113289760349, "grad_norm": 38.24810944365648, "learning_rate": 5.035664771663994e-08, "logits/chosen": 13.197016716003418, "logits/rejected": 14.577510833740234, "logps/chosen": -4.063294887542725, "logps/rejected": -4.853150367736816, "loss": 3.5887, "rewards/accuracies": 1.0, "rewards/chosen": -40.63294982910156, "rewards/margins": 7.898555755615234, "rewards/rejected": -48.5315055847168, "step": 6277 }, { "epoch": 0.8548474945533769, "grad_norm": 38.32080542291267, "learning_rate": 5.0264330210620445e-08, "logits/chosen": 14.486572265625, "logits/rejected": 14.784700393676758, "logps/chosen": -4.607344627380371, "logps/rejected": -4.88092041015625, "loss": 3.3553, "rewards/accuracies": 0.75, "rewards/chosen": -46.073448181152344, "rewards/margins": 2.7357568740844727, "rewards/rejected": -48.8092041015625, "step": 6278 }, { "epoch": 0.8549836601307189, "grad_norm": 40.35351649715194, "learning_rate": 5.0172091730266464e-08, "logits/chosen": 14.50525188446045, "logits/rejected": 15.470947265625, "logps/chosen": -4.841235637664795, "logps/rejected": -5.072400093078613, "loss": 3.9594, "rewards/accuracies": 0.5, "rewards/chosen": -48.412353515625, "rewards/margins": 2.311648368835449, "rewards/rejected": -50.72400665283203, "step": 6279 }, { "epoch": 0.855119825708061, "grad_norm": 42.69720331116207, "learning_rate": 5.007993229642018e-08, "logits/chosen": 15.197021484375, "logits/rejected": 15.080558776855469, "logps/chosen": -5.070446968078613, "logps/rejected": -4.88830041885376, "loss": 3.8534, "rewards/accuracies": 0.5, "rewards/chosen": -50.7044677734375, "rewards/margins": -1.8214597702026367, "rewards/rejected": -48.88300704956055, "step": 6280 }, { "epoch": 0.8552559912854031, "grad_norm": 48.7236418681286, "learning_rate": 4.99878519299056e-08, "logits/chosen": 14.718158721923828, "logits/rejected": 14.154272079467773, "logps/chosen": -4.603699684143066, "logps/rejected": -4.554569244384766, "loss": 4.5731, "rewards/accuracies": 0.25, "rewards/chosen": -46.03700256347656, "rewards/margins": -0.49131107330322266, "rewards/rejected": -45.54568862915039, "step": 6281 }, { "epoch": 0.8553921568627451, "grad_norm": 38.779039911094976, "learning_rate": 4.989585065152906e-08, "logits/chosen": 14.331378936767578, "logits/rejected": 14.763679504394531, "logps/chosen": -4.539652347564697, "logps/rejected": -4.922015190124512, "loss": 3.5974, "rewards/accuracies": 0.75, "rewards/chosen": -45.396522521972656, "rewards/margins": 3.8236284255981445, "rewards/rejected": -49.22015380859375, "step": 6282 }, { "epoch": 0.8555283224400871, "grad_norm": 42.32735398365069, "learning_rate": 4.980392848207917e-08, "logits/chosen": 14.68092155456543, "logits/rejected": 15.04909610748291, "logps/chosen": -4.578912734985352, "logps/rejected": -4.896073341369629, "loss": 4.211, "rewards/accuracies": 0.5, "rewards/chosen": -45.789127349853516, "rewards/margins": 3.1716079711914062, "rewards/rejected": -48.96073532104492, "step": 6283 }, { "epoch": 0.8556644880174292, "grad_norm": 40.544709812570645, "learning_rate": 4.97120854423263e-08, "logits/chosen": 14.534722328186035, "logits/rejected": 15.190847396850586, "logps/chosen": -4.680952072143555, "logps/rejected": -5.2987799644470215, "loss": 3.2291, "rewards/accuracies": 1.0, "rewards/chosen": -46.80952072143555, "rewards/margins": 6.178277015686035, "rewards/rejected": -52.987796783447266, "step": 6284 }, { "epoch": 0.8558006535947712, "grad_norm": 42.17876239877836, "learning_rate": 4.962032155302323e-08, "logits/chosen": 14.706541061401367, "logits/rejected": 14.439836502075195, "logps/chosen": -4.670192718505859, "logps/rejected": -4.707869529724121, "loss": 4.3428, "rewards/accuracies": 0.75, "rewards/chosen": -46.701927185058594, "rewards/margins": 0.3767690658569336, "rewards/rejected": -47.078697204589844, "step": 6285 }, { "epoch": 0.8559368191721133, "grad_norm": 41.8644361510905, "learning_rate": 4.952863683490482e-08, "logits/chosen": 14.859986305236816, "logits/rejected": 14.048257827758789, "logps/chosen": -4.939136505126953, "logps/rejected": -4.57236385345459, "loss": 4.1846, "rewards/accuracies": 0.0, "rewards/chosen": -49.39136505126953, "rewards/margins": -3.667727470397949, "rewards/rejected": -45.72364044189453, "step": 6286 }, { "epoch": 0.8560729847494554, "grad_norm": 39.72658566045162, "learning_rate": 4.9437031308687904e-08, "logits/chosen": 15.02701187133789, "logits/rejected": 15.570419311523438, "logps/chosen": -4.6897993087768555, "logps/rejected": -4.85080623626709, "loss": 3.5422, "rewards/accuracies": 0.75, "rewards/chosen": -46.89799118041992, "rewards/margins": 1.6100683212280273, "rewards/rejected": -48.508060455322266, "step": 6287 }, { "epoch": 0.8562091503267973, "grad_norm": 41.93972994401729, "learning_rate": 4.9345504995071554e-08, "logits/chosen": 14.814729690551758, "logits/rejected": 15.073471069335938, "logps/chosen": -4.6089372634887695, "logps/rejected": -4.831567764282227, "loss": 3.8627, "rewards/accuracies": 0.75, "rewards/chosen": -46.08937072753906, "rewards/margins": 2.2263050079345703, "rewards/rejected": -48.315677642822266, "step": 6288 }, { "epoch": 0.8563453159041394, "grad_norm": 59.607073384400294, "learning_rate": 4.925405791473687e-08, "logits/chosen": 14.178564071655273, "logits/rejected": 14.35872745513916, "logps/chosen": -4.52488374710083, "logps/rejected": -4.637661933898926, "loss": 4.2292, "rewards/accuracies": 1.0, "rewards/chosen": -45.248836517333984, "rewards/margins": 1.1277847290039062, "rewards/rejected": -46.37662124633789, "step": 6289 }, { "epoch": 0.8564814814814815, "grad_norm": 46.600380102609314, "learning_rate": 4.916269008834719e-08, "logits/chosen": 14.460933685302734, "logits/rejected": 14.759056091308594, "logps/chosen": -4.282037734985352, "logps/rejected": -4.359968185424805, "loss": 3.4927, "rewards/accuracies": 0.75, "rewards/chosen": -42.82037353515625, "rewards/margins": 0.7793035507202148, "rewards/rejected": -43.59967803955078, "step": 6290 }, { "epoch": 0.8566176470588235, "grad_norm": 40.7050360204799, "learning_rate": 4.907140153654765e-08, "logits/chosen": 13.717334747314453, "logits/rejected": 13.970897674560547, "logps/chosen": -4.3009443283081055, "logps/rejected": -4.191186904907227, "loss": 3.7405, "rewards/accuracies": 0.5, "rewards/chosen": -43.00944519042969, "rewards/margins": -1.0975770950317383, "rewards/rejected": -41.911869049072266, "step": 6291 }, { "epoch": 0.8567538126361656, "grad_norm": 41.56770690745644, "learning_rate": 4.8980192279965705e-08, "logits/chosen": 14.381246566772461, "logits/rejected": 14.9207181930542, "logps/chosen": -4.647169589996338, "logps/rejected": -4.81531286239624, "loss": 4.0085, "rewards/accuracies": 0.75, "rewards/chosen": -46.47169494628906, "rewards/margins": 1.6814327239990234, "rewards/rejected": -48.15312957763672, "step": 6292 }, { "epoch": 0.8568899782135077, "grad_norm": 53.96081267068826, "learning_rate": 4.8889062339210995e-08, "logits/chosen": 14.8453369140625, "logits/rejected": 15.567510604858398, "logps/chosen": -4.788166046142578, "logps/rejected": -4.931455612182617, "loss": 4.1797, "rewards/accuracies": 0.75, "rewards/chosen": -47.881656646728516, "rewards/margins": 1.4328975677490234, "rewards/rejected": -49.31455612182617, "step": 6293 }, { "epoch": 0.8570261437908496, "grad_norm": 50.44889144051217, "learning_rate": 4.879801173487488e-08, "logits/chosen": 14.85787582397461, "logits/rejected": 15.963068008422852, "logps/chosen": -4.668790340423584, "logps/rejected": -4.84942626953125, "loss": 4.2626, "rewards/accuracies": 0.75, "rewards/chosen": -46.687904357910156, "rewards/margins": 1.8063631057739258, "rewards/rejected": -48.494266510009766, "step": 6294 }, { "epoch": 0.8571623093681917, "grad_norm": 48.235790843708436, "learning_rate": 4.870704048753107e-08, "logits/chosen": 15.08074951171875, "logits/rejected": 15.031383514404297, "logps/chosen": -5.010343551635742, "logps/rejected": -5.186995029449463, "loss": 3.7207, "rewards/accuracies": 0.5, "rewards/chosen": -50.10343551635742, "rewards/margins": 1.7665157318115234, "rewards/rejected": -51.86994934082031, "step": 6295 }, { "epoch": 0.8572984749455338, "grad_norm": 42.54240824464034, "learning_rate": 4.861614861773526e-08, "logits/chosen": 13.87396240234375, "logits/rejected": 14.656944274902344, "logps/chosen": -4.548776626586914, "logps/rejected": -4.916203498840332, "loss": 4.225, "rewards/accuracies": 0.75, "rewards/chosen": -45.487770080566406, "rewards/margins": 3.674266815185547, "rewards/rejected": -49.16203308105469, "step": 6296 }, { "epoch": 0.8574346405228758, "grad_norm": 44.338600651257025, "learning_rate": 4.8525336146025344e-08, "logits/chosen": 14.842086791992188, "logits/rejected": 14.668060302734375, "logps/chosen": -4.793311595916748, "logps/rejected": -4.741711616516113, "loss": 3.5606, "rewards/accuracies": 0.5, "rewards/chosen": -47.9331169128418, "rewards/margins": -0.5159969329833984, "rewards/rejected": -47.417118072509766, "step": 6297 }, { "epoch": 0.8575708061002179, "grad_norm": 38.475969481438966, "learning_rate": 4.8434603092920936e-08, "logits/chosen": 14.23227310180664, "logits/rejected": 14.071279525756836, "logps/chosen": -4.529090404510498, "logps/rejected": -4.64829158782959, "loss": 3.712, "rewards/accuracies": 0.5, "rewards/chosen": -45.29090118408203, "rewards/margins": 1.1920127868652344, "rewards/rejected": -46.48291778564453, "step": 6298 }, { "epoch": 0.8577069716775599, "grad_norm": 41.47745642774331, "learning_rate": 4.834394947892404e-08, "logits/chosen": 14.410770416259766, "logits/rejected": 14.60273551940918, "logps/chosen": -4.746739864349365, "logps/rejected": -4.812582015991211, "loss": 4.1023, "rewards/accuracies": 0.5, "rewards/chosen": -47.46739959716797, "rewards/margins": 0.6584253311157227, "rewards/rejected": -48.125823974609375, "step": 6299 }, { "epoch": 0.8578431372549019, "grad_norm": 43.157498944763894, "learning_rate": 4.8253375324518676e-08, "logits/chosen": 14.441130638122559, "logits/rejected": 15.929176330566406, "logps/chosen": -4.5465850830078125, "logps/rejected": -5.071347713470459, "loss": 4.2528, "rewards/accuracies": 1.0, "rewards/chosen": -45.46584701538086, "rewards/margins": 5.247628211975098, "rewards/rejected": -50.71347427368164, "step": 6300 }, { "epoch": 0.857979302832244, "grad_norm": 41.26443529233903, "learning_rate": 4.8162880650170646e-08, "logits/chosen": 15.445053100585938, "logits/rejected": 15.650251388549805, "logps/chosen": -4.957644462585449, "logps/rejected": -5.040010929107666, "loss": 3.7055, "rewards/accuracies": 0.75, "rewards/chosen": -49.576446533203125, "rewards/margins": 0.8236637115478516, "rewards/rejected": -50.400108337402344, "step": 6301 }, { "epoch": 0.8581154684095861, "grad_norm": 43.234385236447, "learning_rate": 4.807246547632804e-08, "logits/chosen": 13.696060180664062, "logits/rejected": 13.563565254211426, "logps/chosen": -4.359265327453613, "logps/rejected": -4.308632850646973, "loss": 3.6375, "rewards/accuracies": 0.5, "rewards/chosen": -43.592655181884766, "rewards/margins": -0.5063304901123047, "rewards/rejected": -43.086326599121094, "step": 6302 }, { "epoch": 0.858251633986928, "grad_norm": 39.62011769105936, "learning_rate": 4.7982129823420915e-08, "logits/chosen": 14.943552017211914, "logits/rejected": 15.079517364501953, "logps/chosen": -4.837647914886475, "logps/rejected": -5.076125144958496, "loss": 3.6655, "rewards/accuracies": 0.5, "rewards/chosen": -48.37648010253906, "rewards/margins": 2.3847713470458984, "rewards/rejected": -50.76124954223633, "step": 6303 }, { "epoch": 0.8583877995642701, "grad_norm": 40.37194015109196, "learning_rate": 4.789187371186143e-08, "logits/chosen": 13.200212478637695, "logits/rejected": 14.621530532836914, "logps/chosen": -4.166866302490234, "logps/rejected": -4.674034595489502, "loss": 4.09, "rewards/accuracies": 0.5, "rewards/chosen": -41.668663024902344, "rewards/margins": 5.071681976318359, "rewards/rejected": -46.7403450012207, "step": 6304 }, { "epoch": 0.8585239651416122, "grad_norm": 41.48672130584644, "learning_rate": 4.780169716204358e-08, "logits/chosen": 15.118908882141113, "logits/rejected": 15.30639362335205, "logps/chosen": -4.72259521484375, "logps/rejected": -4.8330230712890625, "loss": 3.5422, "rewards/accuracies": 0.75, "rewards/chosen": -47.2259521484375, "rewards/margins": 1.1042804718017578, "rewards/rejected": -48.330230712890625, "step": 6305 }, { "epoch": 0.8586601307189542, "grad_norm": 40.093605194250536, "learning_rate": 4.7711600194343526e-08, "logits/chosen": 14.98690414428711, "logits/rejected": 15.116438865661621, "logps/chosen": -5.0176472663879395, "logps/rejected": -5.222922325134277, "loss": 3.9199, "rewards/accuracies": 0.5, "rewards/chosen": -50.176475524902344, "rewards/margins": 2.0527515411376953, "rewards/rejected": -52.229225158691406, "step": 6306 }, { "epoch": 0.8587962962962963, "grad_norm": 42.043347310522634, "learning_rate": 4.762158282911959e-08, "logits/chosen": 13.33022689819336, "logits/rejected": 14.15957260131836, "logps/chosen": -4.094928741455078, "logps/rejected": -4.395134925842285, "loss": 3.9306, "rewards/accuracies": 0.75, "rewards/chosen": -40.94928741455078, "rewards/margins": 3.0020647048950195, "rewards/rejected": -43.95134735107422, "step": 6307 }, { "epoch": 0.8589324618736384, "grad_norm": 43.20411863952872, "learning_rate": 4.753164508671168e-08, "logits/chosen": 15.639893531799316, "logits/rejected": 14.366548538208008, "logps/chosen": -4.794039726257324, "logps/rejected": -4.818411827087402, "loss": 3.9548, "rewards/accuracies": 0.5, "rewards/chosen": -47.940399169921875, "rewards/margins": 0.24372005462646484, "rewards/rejected": -48.184120178222656, "step": 6308 }, { "epoch": 0.8590686274509803, "grad_norm": 43.03147496122341, "learning_rate": 4.7441786987442125e-08, "logits/chosen": 13.896462440490723, "logits/rejected": 14.733318328857422, "logps/chosen": -4.295198917388916, "logps/rejected": -4.730501651763916, "loss": 3.6926, "rewards/accuracies": 1.0, "rewards/chosen": -42.95199203491211, "rewards/margins": 4.353025436401367, "rewards/rejected": -47.305015563964844, "step": 6309 }, { "epoch": 0.8592047930283224, "grad_norm": 39.4762608695056, "learning_rate": 4.735200855161512e-08, "logits/chosen": 14.175823211669922, "logits/rejected": 14.620439529418945, "logps/chosen": -4.807154655456543, "logps/rejected": -4.759788990020752, "loss": 3.9819, "rewards/accuracies": 0.5, "rewards/chosen": -48.07155227661133, "rewards/margins": -0.4736595153808594, "rewards/rejected": -47.59789276123047, "step": 6310 }, { "epoch": 0.8593409586056645, "grad_norm": 40.97574845235922, "learning_rate": 4.7262309799516754e-08, "logits/chosen": 14.949630737304688, "logits/rejected": 14.98392105102539, "logps/chosen": -4.861994743347168, "logps/rejected": -4.697976112365723, "loss": 4.3336, "rewards/accuracies": 0.5, "rewards/chosen": -48.61994934082031, "rewards/margins": -1.6401872634887695, "rewards/rejected": -46.979759216308594, "step": 6311 }, { "epoch": 0.8594771241830066, "grad_norm": 42.94487071368635, "learning_rate": 4.717269075141521e-08, "logits/chosen": 13.565885543823242, "logits/rejected": 14.627691268920898, "logps/chosen": -4.258866310119629, "logps/rejected": -4.669299125671387, "loss": 3.5679, "rewards/accuracies": 1.0, "rewards/chosen": -42.588661193847656, "rewards/margins": 4.104330062866211, "rewards/rejected": -46.6929931640625, "step": 6312 }, { "epoch": 0.8596132897603486, "grad_norm": 41.0428773914691, "learning_rate": 4.708315142756079e-08, "logits/chosen": 14.89725112915039, "logits/rejected": 14.762121200561523, "logps/chosen": -4.798399925231934, "logps/rejected": -4.861655235290527, "loss": 3.7082, "rewards/accuracies": 0.75, "rewards/chosen": -47.9839973449707, "rewards/margins": 0.6325588226318359, "rewards/rejected": -48.616554260253906, "step": 6313 }, { "epoch": 0.8597494553376906, "grad_norm": 41.41694434175986, "learning_rate": 4.699369184818547e-08, "logits/chosen": 14.659509658813477, "logits/rejected": 14.664175987243652, "logps/chosen": -4.6260881423950195, "logps/rejected": -4.9479804039001465, "loss": 4.0385, "rewards/accuracies": 0.75, "rewards/chosen": -46.260887145996094, "rewards/margins": 3.218918800354004, "rewards/rejected": -49.47980499267578, "step": 6314 }, { "epoch": 0.8598856209150327, "grad_norm": 39.29642991125794, "learning_rate": 4.690431203350344e-08, "logits/chosen": 14.606109619140625, "logits/rejected": 14.948873519897461, "logps/chosen": -4.352011680603027, "logps/rejected": -4.775235176086426, "loss": 3.4735, "rewards/accuracies": 1.0, "rewards/chosen": -43.520118713378906, "rewards/margins": 4.232234001159668, "rewards/rejected": -47.752349853515625, "step": 6315 }, { "epoch": 0.8600217864923747, "grad_norm": 48.760845502104736, "learning_rate": 4.681501200371096e-08, "logits/chosen": 14.042289733886719, "logits/rejected": 14.759323120117188, "logps/chosen": -4.573490142822266, "logps/rejected": -4.859196662902832, "loss": 3.9157, "rewards/accuracies": 0.75, "rewards/chosen": -45.73489761352539, "rewards/margins": 2.8570661544799805, "rewards/rejected": -48.59196472167969, "step": 6316 }, { "epoch": 0.8601579520697168, "grad_norm": 41.183985407748054, "learning_rate": 4.6725791778985835e-08, "logits/chosen": 15.210393905639648, "logits/rejected": 14.87628173828125, "logps/chosen": -4.7060441970825195, "logps/rejected": -4.775981903076172, "loss": 4.2777, "rewards/accuracies": 0.25, "rewards/chosen": -47.06044006347656, "rewards/margins": 0.6993751525878906, "rewards/rejected": -47.75981903076172, "step": 6317 }, { "epoch": 0.8602941176470589, "grad_norm": 40.545265398704515, "learning_rate": 4.663665137948829e-08, "logits/chosen": 13.771717071533203, "logits/rejected": 15.098974227905273, "logps/chosen": -4.5988264083862305, "logps/rejected": -5.034485816955566, "loss": 3.5475, "rewards/accuracies": 1.0, "rewards/chosen": -45.98826599121094, "rewards/margins": 4.356590270996094, "rewards/rejected": -50.34485626220703, "step": 6318 }, { "epoch": 0.8604302832244008, "grad_norm": 45.21311356154638, "learning_rate": 4.654759082536035e-08, "logits/chosen": 14.540565490722656, "logits/rejected": 14.531932830810547, "logps/chosen": -4.89148473739624, "logps/rejected": -4.831225395202637, "loss": 3.8738, "rewards/accuracies": 0.5, "rewards/chosen": -48.91484451293945, "rewards/margins": -0.6025867462158203, "rewards/rejected": -48.312259674072266, "step": 6319 }, { "epoch": 0.8605664488017429, "grad_norm": 47.17968909423781, "learning_rate": 4.645861013672583e-08, "logits/chosen": 14.7527494430542, "logits/rejected": 14.323036193847656, "logps/chosen": -4.531525135040283, "logps/rejected": -4.327242851257324, "loss": 4.529, "rewards/accuracies": 0.25, "rewards/chosen": -45.31525421142578, "rewards/margins": -2.0428199768066406, "rewards/rejected": -43.272430419921875, "step": 6320 }, { "epoch": 0.860702614379085, "grad_norm": 41.15192549813199, "learning_rate": 4.636970933369082e-08, "logits/chosen": 14.818778991699219, "logits/rejected": 14.971782684326172, "logps/chosen": -5.204492568969727, "logps/rejected": -5.410921096801758, "loss": 3.6666, "rewards/accuracies": 0.75, "rewards/chosen": -52.044925689697266, "rewards/margins": 2.0642833709716797, "rewards/rejected": -54.10920715332031, "step": 6321 }, { "epoch": 0.860838779956427, "grad_norm": 40.117922424227075, "learning_rate": 4.6280888436343166e-08, "logits/chosen": 14.06245231628418, "logits/rejected": 14.592514038085938, "logps/chosen": -4.539057731628418, "logps/rejected": -4.7479047775268555, "loss": 4.2401, "rewards/accuracies": 0.75, "rewards/chosen": -45.39057922363281, "rewards/margins": 2.088465690612793, "rewards/rejected": -47.479042053222656, "step": 6322 }, { "epoch": 0.8609749455337691, "grad_norm": 43.76056264499648, "learning_rate": 4.619214746475255e-08, "logits/chosen": 14.702350616455078, "logits/rejected": 14.121801376342773, "logps/chosen": -4.628621578216553, "logps/rejected": -4.874842166900635, "loss": 4.2245, "rewards/accuracies": 0.5, "rewards/chosen": -46.286216735839844, "rewards/margins": 2.462204933166504, "rewards/rejected": -48.74842071533203, "step": 6323 }, { "epoch": 0.8611111111111112, "grad_norm": 38.83967818604107, "learning_rate": 4.610348643897084e-08, "logits/chosen": 14.286881446838379, "logits/rejected": 15.122183799743652, "logps/chosen": -4.6562180519104, "logps/rejected": -5.015712738037109, "loss": 3.7653, "rewards/accuracies": 1.0, "rewards/chosen": -46.56217956542969, "rewards/margins": 3.59494686126709, "rewards/rejected": -50.15712356567383, "step": 6324 }, { "epoch": 0.8612472766884531, "grad_norm": 48.486586310919286, "learning_rate": 4.6014905379031744e-08, "logits/chosen": 13.991632461547852, "logits/rejected": 14.560279846191406, "logps/chosen": -4.4987335205078125, "logps/rejected": -4.7353057861328125, "loss": 4.1857, "rewards/accuracies": 0.75, "rewards/chosen": -44.987335205078125, "rewards/margins": 2.36572265625, "rewards/rejected": -47.353057861328125, "step": 6325 }, { "epoch": 0.8613834422657952, "grad_norm": 51.073724636746036, "learning_rate": 4.592640430495081e-08, "logits/chosen": 14.60379409790039, "logits/rejected": 14.543889999389648, "logps/chosen": -4.645276069641113, "logps/rejected": -4.8601179122924805, "loss": 4.0087, "rewards/accuracies": 0.75, "rewards/chosen": -46.4527587890625, "rewards/margins": 2.1484193801879883, "rewards/rejected": -48.60117721557617, "step": 6326 }, { "epoch": 0.8615196078431373, "grad_norm": 44.37552435141933, "learning_rate": 4.583798323672563e-08, "logits/chosen": 14.778709411621094, "logits/rejected": 14.252876281738281, "logps/chosen": -4.691988945007324, "logps/rejected": -4.736557960510254, "loss": 4.1005, "rewards/accuracies": 0.5, "rewards/chosen": -46.919891357421875, "rewards/margins": 0.4456901550292969, "rewards/rejected": -47.36558151245117, "step": 6327 }, { "epoch": 0.8616557734204793, "grad_norm": 41.58296445525313, "learning_rate": 4.574964219433575e-08, "logits/chosen": 14.737336158752441, "logits/rejected": 14.721292495727539, "logps/chosen": -4.731456279754639, "logps/rejected": -4.671577453613281, "loss": 4.075, "rewards/accuracies": 0.5, "rewards/chosen": -47.3145637512207, "rewards/margins": -0.5987892150878906, "rewards/rejected": -46.71577453613281, "step": 6328 }, { "epoch": 0.8617919389978214, "grad_norm": 43.356911183046925, "learning_rate": 4.566138119774239e-08, "logits/chosen": 14.6694974899292, "logits/rejected": 14.662923812866211, "logps/chosen": -4.817527770996094, "logps/rejected": -4.636959552764893, "loss": 4.0968, "rewards/accuracies": 0.25, "rewards/chosen": -48.17527770996094, "rewards/margins": -1.8056859970092773, "rewards/rejected": -46.36959457397461, "step": 6329 }, { "epoch": 0.8619281045751634, "grad_norm": 39.69011438009944, "learning_rate": 4.5573200266888936e-08, "logits/chosen": 14.619917869567871, "logits/rejected": 15.10900592803955, "logps/chosen": -4.605718612670898, "logps/rejected": -4.9174723625183105, "loss": 4.242, "rewards/accuracies": 1.0, "rewards/chosen": -46.05718231201172, "rewards/margins": 3.1175403594970703, "rewards/rejected": -49.174720764160156, "step": 6330 }, { "epoch": 0.8620642701525054, "grad_norm": 43.63898992903208, "learning_rate": 4.548509942170065e-08, "logits/chosen": 14.773000717163086, "logits/rejected": 14.56869888305664, "logps/chosen": -4.884891033172607, "logps/rejected": -4.654040336608887, "loss": 3.596, "rewards/accuracies": 0.5, "rewards/chosen": -48.848907470703125, "rewards/margins": -2.308506965637207, "rewards/rejected": -46.5404052734375, "step": 6331 }, { "epoch": 0.8622004357298475, "grad_norm": 39.96752811130165, "learning_rate": 4.5397078682084575e-08, "logits/chosen": 13.9705810546875, "logits/rejected": 14.163375854492188, "logps/chosen": -4.684244632720947, "logps/rejected": -4.637913703918457, "loss": 3.7616, "rewards/accuracies": 0.5, "rewards/chosen": -46.842445373535156, "rewards/margins": -0.46330833435058594, "rewards/rejected": -46.37913513183594, "step": 6332 }, { "epoch": 0.8623366013071896, "grad_norm": 43.78922700856473, "learning_rate": 4.530913806792971e-08, "logits/chosen": 14.890533447265625, "logits/rejected": 14.773855209350586, "logps/chosen": -4.672646522521973, "logps/rejected": -4.688262939453125, "loss": 4.5098, "rewards/accuracies": 0.75, "rewards/chosen": -46.726463317871094, "rewards/margins": 0.15616703033447266, "rewards/rejected": -46.882633209228516, "step": 6333 }, { "epoch": 0.8624727668845316, "grad_norm": 43.7724096704515, "learning_rate": 4.522127759910712e-08, "logits/chosen": 14.701539993286133, "logits/rejected": 14.691061019897461, "logps/chosen": -4.905083656311035, "logps/rejected": -4.563934326171875, "loss": 4.2737, "rewards/accuracies": 0.0, "rewards/chosen": -49.05083465576172, "rewards/margins": -3.4114913940429688, "rewards/rejected": -45.639339447021484, "step": 6334 }, { "epoch": 0.8626089324618736, "grad_norm": 45.27387442885299, "learning_rate": 4.513349729546938e-08, "logits/chosen": 14.757425308227539, "logits/rejected": 14.292789459228516, "logps/chosen": -4.699358940124512, "logps/rejected": -4.555130958557129, "loss": 4.3433, "rewards/accuracies": 0.25, "rewards/chosen": -46.99359130859375, "rewards/margins": -1.4422788619995117, "rewards/rejected": -45.55131149291992, "step": 6335 }, { "epoch": 0.8627450980392157, "grad_norm": 40.03218996400962, "learning_rate": 4.5045797176851284e-08, "logits/chosen": 14.142051696777344, "logits/rejected": 15.006393432617188, "logps/chosen": -4.265716552734375, "logps/rejected": -4.607076644897461, "loss": 3.8123, "rewards/accuracies": 0.5, "rewards/chosen": -42.65716552734375, "rewards/margins": 3.4136009216308594, "rewards/rejected": -46.070770263671875, "step": 6336 }, { "epoch": 0.8628812636165577, "grad_norm": 41.851327853236995, "learning_rate": 4.495817726306952e-08, "logits/chosen": 14.603702545166016, "logits/rejected": 15.154951095581055, "logps/chosen": -4.379068851470947, "logps/rejected": -4.790209770202637, "loss": 4.0162, "rewards/accuracies": 1.0, "rewards/chosen": -43.79069137573242, "rewards/margins": 4.1114044189453125, "rewards/rejected": -47.90209197998047, "step": 6337 }, { "epoch": 0.8630174291938998, "grad_norm": 37.81342693039244, "learning_rate": 4.4870637573922286e-08, "logits/chosen": 14.009843826293945, "logits/rejected": 14.177896499633789, "logps/chosen": -4.296201229095459, "logps/rejected": -4.625694274902344, "loss": 3.6759, "rewards/accuracies": 0.5, "rewards/chosen": -42.962013244628906, "rewards/margins": 3.2949323654174805, "rewards/rejected": -46.25694274902344, "step": 6338 }, { "epoch": 0.8631535947712419, "grad_norm": 39.73631982376044, "learning_rate": 4.4783178129190036e-08, "logits/chosen": 14.758922576904297, "logits/rejected": 14.828794479370117, "logps/chosen": -4.676549434661865, "logps/rejected": -4.767045021057129, "loss": 3.8921, "rewards/accuracies": 0.75, "rewards/chosen": -46.76549530029297, "rewards/margins": 0.9049520492553711, "rewards/rejected": -47.67044448852539, "step": 6339 }, { "epoch": 0.8632897603485838, "grad_norm": 37.70030322134508, "learning_rate": 4.4695798948635e-08, "logits/chosen": 13.584487915039062, "logits/rejected": 14.635496139526367, "logps/chosen": -4.230460166931152, "logps/rejected": -4.693565845489502, "loss": 3.7315, "rewards/accuracies": 1.0, "rewards/chosen": -42.30459976196289, "rewards/margins": 4.631060600280762, "rewards/rejected": -46.93566131591797, "step": 6340 }, { "epoch": 0.8634259259259259, "grad_norm": 41.29640056631451, "learning_rate": 4.460850005200107e-08, "logits/chosen": 13.822229385375977, "logits/rejected": 14.965007781982422, "logps/chosen": -4.558654308319092, "logps/rejected": -4.903943061828613, "loss": 4.0553, "rewards/accuracies": 0.75, "rewards/chosen": -45.58654022216797, "rewards/margins": 3.4528884887695312, "rewards/rejected": -49.0394287109375, "step": 6341 }, { "epoch": 0.863562091503268, "grad_norm": 46.327502062753375, "learning_rate": 4.4521281459014307e-08, "logits/chosen": 14.086519241333008, "logits/rejected": 14.530445098876953, "logps/chosen": -4.634749889373779, "logps/rejected": -4.709981918334961, "loss": 4.4955, "rewards/accuracies": 0.5, "rewards/chosen": -46.34749984741211, "rewards/margins": 0.7523174285888672, "rewards/rejected": -47.099815368652344, "step": 6342 }, { "epoch": 0.86369825708061, "grad_norm": 44.00716127104467, "learning_rate": 4.443414318938248e-08, "logits/chosen": 14.031911849975586, "logits/rejected": 15.128154754638672, "logps/chosen": -4.561458587646484, "logps/rejected": -4.800580024719238, "loss": 4.0651, "rewards/accuracies": 0.75, "rewards/chosen": -45.614585876464844, "rewards/margins": 2.3912181854248047, "rewards/rejected": -48.005802154541016, "step": 6343 }, { "epoch": 0.8638344226579521, "grad_norm": 40.84001647721901, "learning_rate": 4.4347085262795e-08, "logits/chosen": 14.604297637939453, "logits/rejected": 15.107023239135742, "logps/chosen": -4.743901252746582, "logps/rejected": -4.755979061126709, "loss": 3.9704, "rewards/accuracies": 0.25, "rewards/chosen": -47.43901443481445, "rewards/margins": 0.12077617645263672, "rewards/rejected": -47.559791564941406, "step": 6344 }, { "epoch": 0.8639705882352942, "grad_norm": 42.58691249513218, "learning_rate": 4.4260107698923524e-08, "logits/chosen": 14.114423751831055, "logits/rejected": 14.993121147155762, "logps/chosen": -4.243510723114014, "logps/rejected": -4.359095096588135, "loss": 4.0655, "rewards/accuracies": 0.5, "rewards/chosen": -42.43510818481445, "rewards/margins": 1.1558408737182617, "rewards/rejected": -43.59095001220703, "step": 6345 }, { "epoch": 0.8641067538126361, "grad_norm": 41.369749839099335, "learning_rate": 4.4173210517421334e-08, "logits/chosen": 14.282176971435547, "logits/rejected": 14.344535827636719, "logps/chosen": -4.499890327453613, "logps/rejected": -4.672081470489502, "loss": 4.1784, "rewards/accuracies": 0.75, "rewards/chosen": -44.998905181884766, "rewards/margins": 1.7219104766845703, "rewards/rejected": -46.7208137512207, "step": 6346 }, { "epoch": 0.8642429193899782, "grad_norm": 39.05261827221241, "learning_rate": 4.408639373792349e-08, "logits/chosen": 13.832597732543945, "logits/rejected": 14.54166030883789, "logps/chosen": -4.245262145996094, "logps/rejected": -4.494441032409668, "loss": 4.0135, "rewards/accuracies": 0.5, "rewards/chosen": -42.45262145996094, "rewards/margins": 2.491787910461426, "rewards/rejected": -44.94441223144531, "step": 6347 }, { "epoch": 0.8643790849673203, "grad_norm": 42.147502265776474, "learning_rate": 4.3999657380046965e-08, "logits/chosen": 14.420530319213867, "logits/rejected": 13.947183609008789, "logps/chosen": -4.411360263824463, "logps/rejected": -4.746288299560547, "loss": 4.0856, "rewards/accuracies": 0.75, "rewards/chosen": -44.11360168457031, "rewards/margins": 3.3492774963378906, "rewards/rejected": -47.46288299560547, "step": 6348 }, { "epoch": 0.8645152505446623, "grad_norm": 39.98790306311034, "learning_rate": 4.391300146339065e-08, "logits/chosen": 14.580240249633789, "logits/rejected": 14.955343246459961, "logps/chosen": -4.588663578033447, "logps/rejected": -4.512876510620117, "loss": 3.8271, "rewards/accuracies": 0.25, "rewards/chosen": -45.886634826660156, "rewards/margins": -0.7578697204589844, "rewards/rejected": -45.12876892089844, "step": 6349 }, { "epoch": 0.8646514161220044, "grad_norm": 40.10148726615173, "learning_rate": 4.3826426007535035e-08, "logits/chosen": 13.968156814575195, "logits/rejected": 14.652351379394531, "logps/chosen": -4.159331798553467, "logps/rejected": -4.637884140014648, "loss": 3.7396, "rewards/accuracies": 0.75, "rewards/chosen": -41.59331512451172, "rewards/margins": 4.785528182983398, "rewards/rejected": -46.37884521484375, "step": 6350 }, { "epoch": 0.8647875816993464, "grad_norm": 43.47842396015869, "learning_rate": 4.373993103204259e-08, "logits/chosen": 14.387123107910156, "logits/rejected": 15.542119979858398, "logps/chosen": -4.653557777404785, "logps/rejected": -4.9191484451293945, "loss": 4.1849, "rewards/accuracies": 1.0, "rewards/chosen": -46.53557586669922, "rewards/margins": 2.6559104919433594, "rewards/rejected": -49.19148635864258, "step": 6351 }, { "epoch": 0.8649237472766884, "grad_norm": 44.066483609170824, "learning_rate": 4.3653516556457725e-08, "logits/chosen": 13.80273151397705, "logits/rejected": 14.840538024902344, "logps/chosen": -4.493968486785889, "logps/rejected": -4.990978240966797, "loss": 4.1117, "rewards/accuracies": 1.0, "rewards/chosen": -44.93968963623047, "rewards/margins": 4.970095634460449, "rewards/rejected": -49.90978240966797, "step": 6352 }, { "epoch": 0.8650599128540305, "grad_norm": 39.91830563816491, "learning_rate": 4.356718260030629e-08, "logits/chosen": 14.177532196044922, "logits/rejected": 15.099817276000977, "logps/chosen": -4.401972770690918, "logps/rejected": -4.678278923034668, "loss": 3.9141, "rewards/accuracies": 0.75, "rewards/chosen": -44.01972198486328, "rewards/margins": 2.763068199157715, "rewards/rejected": -46.78279495239258, "step": 6353 }, { "epoch": 0.8651960784313726, "grad_norm": 41.879633903386434, "learning_rate": 4.348092918309625e-08, "logits/chosen": 15.43372917175293, "logits/rejected": 15.257126808166504, "logps/chosen": -4.8400092124938965, "logps/rejected": -4.957301616668701, "loss": 4.1642, "rewards/accuracies": 0.75, "rewards/chosen": -48.400089263916016, "rewards/margins": 1.1729259490966797, "rewards/rejected": -49.57301712036133, "step": 6354 }, { "epoch": 0.8653322440087146, "grad_norm": 40.515106404425126, "learning_rate": 4.339475632431737e-08, "logits/chosen": 14.544576644897461, "logits/rejected": 14.431136131286621, "logps/chosen": -4.802483558654785, "logps/rejected": -4.880718231201172, "loss": 3.9551, "rewards/accuracies": 0.5, "rewards/chosen": -48.02483367919922, "rewards/margins": 0.7823467254638672, "rewards/rejected": -48.80718231201172, "step": 6355 }, { "epoch": 0.8654684095860566, "grad_norm": 41.584753404066944, "learning_rate": 4.330866404344093e-08, "logits/chosen": 13.927349090576172, "logits/rejected": 14.932347297668457, "logps/chosen": -4.457209587097168, "logps/rejected": -4.587226867675781, "loss": 3.2975, "rewards/accuracies": 0.5, "rewards/chosen": -44.57209014892578, "rewards/margins": 1.300175666809082, "rewards/rejected": -45.87226867675781, "step": 6356 }, { "epoch": 0.8656045751633987, "grad_norm": 42.95211774044062, "learning_rate": 4.3222652359920265e-08, "logits/chosen": 13.583545684814453, "logits/rejected": 14.297296524047852, "logps/chosen": -4.289696216583252, "logps/rejected": -4.737851142883301, "loss": 3.3181, "rewards/accuracies": 0.75, "rewards/chosen": -42.89696502685547, "rewards/margins": 4.4815473556518555, "rewards/rejected": -47.378509521484375, "step": 6357 }, { "epoch": 0.8657407407407407, "grad_norm": 43.188163648307956, "learning_rate": 4.313672129319057e-08, "logits/chosen": 13.75119686126709, "logits/rejected": 14.029589653015137, "logps/chosen": -4.101480484008789, "logps/rejected": -4.628345012664795, "loss": 3.8864, "rewards/accuracies": 0.75, "rewards/chosen": -41.014801025390625, "rewards/margins": 5.268646240234375, "rewards/rejected": -46.283447265625, "step": 6358 }, { "epoch": 0.8658769063180828, "grad_norm": 41.100817336531264, "learning_rate": 4.30508708626685e-08, "logits/chosen": 13.851776123046875, "logits/rejected": 14.399234771728516, "logps/chosen": -4.300530910491943, "logps/rejected": -4.6372270584106445, "loss": 3.8591, "rewards/accuracies": 0.75, "rewards/chosen": -43.00531005859375, "rewards/margins": 3.366962432861328, "rewards/rejected": -46.37227249145508, "step": 6359 }, { "epoch": 0.8660130718954249, "grad_norm": 47.37092600598072, "learning_rate": 4.2965101087752663e-08, "logits/chosen": 14.494430541992188, "logits/rejected": 15.579567909240723, "logps/chosen": -4.612027168273926, "logps/rejected": -4.90233039855957, "loss": 3.5853, "rewards/accuracies": 0.5, "rewards/chosen": -46.120269775390625, "rewards/margins": 2.9030332565307617, "rewards/rejected": -49.02330017089844, "step": 6360 }, { "epoch": 0.8661492374727668, "grad_norm": 37.419734312338775, "learning_rate": 4.287941198782365e-08, "logits/chosen": 13.678643226623535, "logits/rejected": 15.037521362304688, "logps/chosen": -4.363683700561523, "logps/rejected": -4.918086051940918, "loss": 3.3145, "rewards/accuracies": 1.0, "rewards/chosen": -43.63683319091797, "rewards/margins": 5.544022560119629, "rewards/rejected": -49.18085479736328, "step": 6361 }, { "epoch": 0.8662854030501089, "grad_norm": 42.25807077268014, "learning_rate": 4.2793803582243406e-08, "logits/chosen": 14.589322090148926, "logits/rejected": 15.37535285949707, "logps/chosen": -4.533709526062012, "logps/rejected": -4.740451812744141, "loss": 3.6361, "rewards/accuracies": 0.75, "rewards/chosen": -45.33709716796875, "rewards/margins": 2.0674209594726562, "rewards/rejected": -47.404518127441406, "step": 6362 }, { "epoch": 0.866421568627451, "grad_norm": 44.33557900116541, "learning_rate": 4.27082758903559e-08, "logits/chosen": 13.633357048034668, "logits/rejected": 14.09014892578125, "logps/chosen": -4.568943977355957, "logps/rejected": -4.649266719818115, "loss": 4.1253, "rewards/accuracies": 0.75, "rewards/chosen": -45.6894416809082, "rewards/margins": 0.8032302856445312, "rewards/rejected": -46.49266815185547, "step": 6363 }, { "epoch": 0.866557734204793, "grad_norm": 41.97357519364502, "learning_rate": 4.2622828931486985e-08, "logits/chosen": 14.630714416503906, "logits/rejected": 15.059240341186523, "logps/chosen": -4.917322158813477, "logps/rejected": -5.0772705078125, "loss": 3.9163, "rewards/accuracies": 0.5, "rewards/chosen": -49.173221588134766, "rewards/margins": 1.5994834899902344, "rewards/rejected": -50.772705078125, "step": 6364 }, { "epoch": 0.8666938997821351, "grad_norm": 42.58372736990901, "learning_rate": 4.2537462724943875e-08, "logits/chosen": 13.23505973815918, "logits/rejected": 15.19888687133789, "logps/chosen": -4.269374847412109, "logps/rejected": -4.821521759033203, "loss": 4.0464, "rewards/accuracies": 0.75, "rewards/chosen": -42.693748474121094, "rewards/margins": 5.521470069885254, "rewards/rejected": -48.21521759033203, "step": 6365 }, { "epoch": 0.8668300653594772, "grad_norm": 42.85447875169397, "learning_rate": 4.245217729001589e-08, "logits/chosen": 14.003107070922852, "logits/rejected": 14.66788101196289, "logps/chosen": -4.6236772537231445, "logps/rejected": -4.867849826812744, "loss": 4.5579, "rewards/accuracies": 0.75, "rewards/chosen": -46.236778259277344, "rewards/margins": 2.441722869873047, "rewards/rejected": -48.678497314453125, "step": 6366 }, { "epoch": 0.8669662309368191, "grad_norm": 42.7074060643165, "learning_rate": 4.236697264597402e-08, "logits/chosen": 13.881614685058594, "logits/rejected": 14.115291595458984, "logps/chosen": -4.622912406921387, "logps/rejected": -4.9642462730407715, "loss": 3.947, "rewards/accuracies": 1.0, "rewards/chosen": -46.229122161865234, "rewards/margins": 3.413339614868164, "rewards/rejected": -49.64246368408203, "step": 6367 }, { "epoch": 0.8671023965141612, "grad_norm": 42.607566059235374, "learning_rate": 4.228184881207087e-08, "logits/chosen": 14.168756484985352, "logits/rejected": 13.863138198852539, "logps/chosen": -4.8452534675598145, "logps/rejected": -4.785819053649902, "loss": 3.9181, "rewards/accuracies": 0.5, "rewards/chosen": -48.452537536621094, "rewards/margins": -0.5943460464477539, "rewards/rejected": -47.85818862915039, "step": 6368 }, { "epoch": 0.8672385620915033, "grad_norm": 44.894269433448166, "learning_rate": 4.21968058075409e-08, "logits/chosen": 14.457267761230469, "logits/rejected": 14.066794395446777, "logps/chosen": -4.689170837402344, "logps/rejected": -4.649724960327148, "loss": 3.7869, "rewards/accuracies": 0.5, "rewards/chosen": -46.89170837402344, "rewards/margins": -0.39446163177490234, "rewards/rejected": -46.497249603271484, "step": 6369 }, { "epoch": 0.8673747276688453, "grad_norm": 41.504486574027865, "learning_rate": 4.211184365160032e-08, "logits/chosen": 13.989864349365234, "logits/rejected": 14.17346477508545, "logps/chosen": -4.368061065673828, "logps/rejected": -4.510915756225586, "loss": 4.5236, "rewards/accuracies": 0.5, "rewards/chosen": -43.68061065673828, "rewards/margins": 1.4285478591918945, "rewards/rejected": -45.109161376953125, "step": 6370 }, { "epoch": 0.8675108932461874, "grad_norm": 43.18023249926604, "learning_rate": 4.202696236344696e-08, "logits/chosen": 15.620107650756836, "logits/rejected": 15.415658950805664, "logps/chosen": -5.0275044441223145, "logps/rejected": -4.914933204650879, "loss": 4.2991, "rewards/accuracies": 0.25, "rewards/chosen": -50.27504348754883, "rewards/margins": -1.1257143020629883, "rewards/rejected": -49.149330139160156, "step": 6371 }, { "epoch": 0.8676470588235294, "grad_norm": 42.590074505294254, "learning_rate": 4.1942161962260505e-08, "logits/chosen": 13.943916320800781, "logits/rejected": 14.221452713012695, "logps/chosen": -4.584142684936523, "logps/rejected": -4.680835723876953, "loss": 3.8497, "rewards/accuracies": 1.0, "rewards/chosen": -45.8414306640625, "rewards/margins": 0.9669322967529297, "rewards/rejected": -46.8083610534668, "step": 6372 }, { "epoch": 0.8677832244008714, "grad_norm": 38.1657424452638, "learning_rate": 4.185744246720233e-08, "logits/chosen": 14.501251220703125, "logits/rejected": 14.368917465209961, "logps/chosen": -4.416582107543945, "logps/rejected": -4.615049839019775, "loss": 3.7788, "rewards/accuracies": 1.0, "rewards/chosen": -44.16582107543945, "rewards/margins": 1.9846782684326172, "rewards/rejected": -46.15049743652344, "step": 6373 }, { "epoch": 0.8679193899782135, "grad_norm": 48.361213638429966, "learning_rate": 4.17728038974154e-08, "logits/chosen": 15.12282657623291, "logits/rejected": 14.960447311401367, "logps/chosen": -4.524136066436768, "logps/rejected": -4.648266792297363, "loss": 4.0033, "rewards/accuracies": 0.75, "rewards/chosen": -45.241355895996094, "rewards/margins": 1.2413082122802734, "rewards/rejected": -46.482666015625, "step": 6374 }, { "epoch": 0.8680555555555556, "grad_norm": 42.22957898408609, "learning_rate": 4.1688246272024586e-08, "logits/chosen": 13.331878662109375, "logits/rejected": 13.525925636291504, "logps/chosen": -4.191385269165039, "logps/rejected": -4.481723308563232, "loss": 3.8714, "rewards/accuracies": 1.0, "rewards/chosen": -41.913856506347656, "rewards/margins": 2.903379440307617, "rewards/rejected": -44.81723403930664, "step": 6375 }, { "epoch": 0.8681917211328976, "grad_norm": 40.61850048248018, "learning_rate": 4.160376961013643e-08, "logits/chosen": 14.772937774658203, "logits/rejected": 14.585604667663574, "logps/chosen": -4.566389560699463, "logps/rejected": -4.7483038902282715, "loss": 3.9079, "rewards/accuracies": 0.75, "rewards/chosen": -45.66389465332031, "rewards/margins": 1.819143295288086, "rewards/rejected": -47.48303985595703, "step": 6376 }, { "epoch": 0.8683278867102396, "grad_norm": 52.967538738212504, "learning_rate": 4.1519373930838995e-08, "logits/chosen": 15.032428741455078, "logits/rejected": 14.702106475830078, "logps/chosen": -4.63688850402832, "logps/rejected": -4.85512638092041, "loss": 3.9364, "rewards/accuracies": 0.5, "rewards/chosen": -46.3688850402832, "rewards/margins": 2.182379722595215, "rewards/rejected": -48.55126190185547, "step": 6377 }, { "epoch": 0.8684640522875817, "grad_norm": 50.10324504222451, "learning_rate": 4.143505925320223e-08, "logits/chosen": 14.469335556030273, "logits/rejected": 14.659378051757812, "logps/chosen": -4.458823204040527, "logps/rejected": -4.6633710861206055, "loss": 3.7424, "rewards/accuracies": 0.75, "rewards/chosen": -44.588233947753906, "rewards/margins": 2.045480728149414, "rewards/rejected": -46.63371276855469, "step": 6378 }, { "epoch": 0.8686002178649237, "grad_norm": 42.699406454524976, "learning_rate": 4.135082559627783e-08, "logits/chosen": 14.636507034301758, "logits/rejected": 14.758623123168945, "logps/chosen": -4.535435676574707, "logps/rejected": -4.830925941467285, "loss": 3.8233, "rewards/accuracies": 0.75, "rewards/chosen": -45.35436248779297, "rewards/margins": 2.9549007415771484, "rewards/rejected": -48.309261322021484, "step": 6379 }, { "epoch": 0.8687363834422658, "grad_norm": 41.04267472015733, "learning_rate": 4.126667297909896e-08, "logits/chosen": 14.970678329467773, "logits/rejected": 14.933744430541992, "logps/chosen": -4.892548561096191, "logps/rejected": -4.808859825134277, "loss": 4.063, "rewards/accuracies": 0.5, "rewards/chosen": -48.92548370361328, "rewards/margins": -0.8368902206420898, "rewards/rejected": -48.088592529296875, "step": 6380 }, { "epoch": 0.8688725490196079, "grad_norm": 47.11172522409361, "learning_rate": 4.118260142068064e-08, "logits/chosen": 14.136611938476562, "logits/rejected": 14.641871452331543, "logps/chosen": -4.5040483474731445, "logps/rejected": -4.794240474700928, "loss": 4.4111, "rewards/accuracies": 0.75, "rewards/chosen": -45.04048156738281, "rewards/margins": 2.901921272277832, "rewards/rejected": -47.942405700683594, "step": 6381 }, { "epoch": 0.8690087145969498, "grad_norm": 42.6506034148568, "learning_rate": 4.1098610940019587e-08, "logits/chosen": 13.88927936553955, "logits/rejected": 14.781251907348633, "logps/chosen": -4.368145942687988, "logps/rejected": -4.623218059539795, "loss": 4.0193, "rewards/accuracies": 0.75, "rewards/chosen": -43.681461334228516, "rewards/margins": 2.5507192611694336, "rewards/rejected": -46.232181549072266, "step": 6382 }, { "epoch": 0.8691448801742919, "grad_norm": 38.65572978137001, "learning_rate": 4.101470155609408e-08, "logits/chosen": 13.966506958007812, "logits/rejected": 14.680952072143555, "logps/chosen": -4.284915924072266, "logps/rejected": -4.849477767944336, "loss": 3.8148, "rewards/accuracies": 0.75, "rewards/chosen": -42.849159240722656, "rewards/margins": 5.645618438720703, "rewards/rejected": -48.494773864746094, "step": 6383 }, { "epoch": 0.869281045751634, "grad_norm": 49.8128066713936, "learning_rate": 4.093087328786411e-08, "logits/chosen": 14.888933181762695, "logits/rejected": 15.754182815551758, "logps/chosen": -4.836578369140625, "logps/rejected": -5.028322219848633, "loss": 4.0239, "rewards/accuracies": 0.5, "rewards/chosen": -48.36578369140625, "rewards/margins": 1.9174365997314453, "rewards/rejected": -50.28322219848633, "step": 6384 }, { "epoch": 0.869417211328976, "grad_norm": 41.04897425329368, "learning_rate": 4.08471261542715e-08, "logits/chosen": 13.882354736328125, "logits/rejected": 15.210415840148926, "logps/chosen": -4.44007682800293, "logps/rejected": -5.099309921264648, "loss": 3.5628, "rewards/accuracies": 0.75, "rewards/chosen": -44.40076446533203, "rewards/margins": 6.5923309326171875, "rewards/rejected": -50.99309539794922, "step": 6385 }, { "epoch": 0.8695533769063181, "grad_norm": 41.965613972308475, "learning_rate": 4.076346017423948e-08, "logits/chosen": 14.438671112060547, "logits/rejected": 15.128961563110352, "logps/chosen": -4.619084358215332, "logps/rejected": -5.1563029289245605, "loss": 3.8744, "rewards/accuracies": 0.75, "rewards/chosen": -46.19084548950195, "rewards/margins": 5.372183799743652, "rewards/rejected": -51.563026428222656, "step": 6386 }, { "epoch": 0.8696895424836601, "grad_norm": 40.107833784741516, "learning_rate": 4.06798753666731e-08, "logits/chosen": 14.125547409057617, "logits/rejected": 14.825126647949219, "logps/chosen": -4.395268440246582, "logps/rejected": -4.736660957336426, "loss": 3.5999, "rewards/accuracies": 0.75, "rewards/chosen": -43.95268249511719, "rewards/margins": 3.4139270782470703, "rewards/rejected": -47.366607666015625, "step": 6387 }, { "epoch": 0.8698257080610022, "grad_norm": 41.638831589452025, "learning_rate": 4.0596371750459026e-08, "logits/chosen": 14.296073913574219, "logits/rejected": 15.060995101928711, "logps/chosen": -4.497125625610352, "logps/rejected": -5.108025074005127, "loss": 3.6505, "rewards/accuracies": 0.75, "rewards/chosen": -44.97125244140625, "rewards/margins": 6.10899543762207, "rewards/rejected": -51.08024597167969, "step": 6388 }, { "epoch": 0.8699618736383442, "grad_norm": 41.66470086759793, "learning_rate": 4.051294934446572e-08, "logits/chosen": 14.025723457336426, "logits/rejected": 14.472965240478516, "logps/chosen": -4.572669506072998, "logps/rejected": -4.476799488067627, "loss": 3.8484, "rewards/accuracies": 0.5, "rewards/chosen": -45.72669219970703, "rewards/margins": -0.9587001800537109, "rewards/rejected": -44.76799774169922, "step": 6389 }, { "epoch": 0.8700980392156863, "grad_norm": 37.800196719052224, "learning_rate": 4.0429608167543e-08, "logits/chosen": 14.606400489807129, "logits/rejected": 14.768648147583008, "logps/chosen": -4.598665714263916, "logps/rejected": -4.607712745666504, "loss": 4.0462, "rewards/accuracies": 0.5, "rewards/chosen": -45.986656188964844, "rewards/margins": 0.0904693603515625, "rewards/rejected": -46.077125549316406, "step": 6390 }, { "epoch": 0.8702342047930284, "grad_norm": 40.88663114898547, "learning_rate": 4.03463482385225e-08, "logits/chosen": 14.120285987854004, "logits/rejected": 14.586524963378906, "logps/chosen": -4.403279781341553, "logps/rejected": -4.524953365325928, "loss": 4.3596, "rewards/accuracies": 0.75, "rewards/chosen": -44.032798767089844, "rewards/margins": 1.2167377471923828, "rewards/rejected": -45.249534606933594, "step": 6391 }, { "epoch": 0.8703703703703703, "grad_norm": 45.499716704648186, "learning_rate": 4.026316957621767e-08, "logits/chosen": 14.115317344665527, "logits/rejected": 14.148710250854492, "logps/chosen": -4.3001604080200195, "logps/rejected": -4.551989555358887, "loss": 4.0253, "rewards/accuracies": 1.0, "rewards/chosen": -43.00160598754883, "rewards/margins": 2.5182924270629883, "rewards/rejected": -45.5198974609375, "step": 6392 }, { "epoch": 0.8705065359477124, "grad_norm": 41.99989065389096, "learning_rate": 4.0180072199423164e-08, "logits/chosen": 15.08333969116211, "logits/rejected": 14.954740524291992, "logps/chosen": -4.689822196960449, "logps/rejected": -4.610916614532471, "loss": 4.0639, "rewards/accuracies": 0.25, "rewards/chosen": -46.898223876953125, "rewards/margins": -0.7890596389770508, "rewards/rejected": -46.109161376953125, "step": 6393 }, { "epoch": 0.8706427015250545, "grad_norm": 38.001731786289845, "learning_rate": 4.0097056126915694e-08, "logits/chosen": 14.832304954528809, "logits/rejected": 14.61151123046875, "logps/chosen": -4.597582817077637, "logps/rejected": -4.961291313171387, "loss": 3.7667, "rewards/accuracies": 0.75, "rewards/chosen": -45.975833892822266, "rewards/margins": 3.637082099914551, "rewards/rejected": -49.6129150390625, "step": 6394 }, { "epoch": 0.8707788671023965, "grad_norm": 42.10905286440806, "learning_rate": 4.0014121377453325e-08, "logits/chosen": 15.23127269744873, "logits/rejected": 14.410822868347168, "logps/chosen": -4.793584823608398, "logps/rejected": -4.828470706939697, "loss": 3.8948, "rewards/accuracies": 0.5, "rewards/chosen": -47.93585205078125, "rewards/margins": 0.34885501861572266, "rewards/rejected": -48.284706115722656, "step": 6395 }, { "epoch": 0.8709150326797386, "grad_norm": 40.68439028455641, "learning_rate": 3.993126796977604e-08, "logits/chosen": 14.346855163574219, "logits/rejected": 14.48862075805664, "logps/chosen": -4.455068588256836, "logps/rejected": -4.495464324951172, "loss": 4.1014, "rewards/accuracies": 0.5, "rewards/chosen": -44.55068588256836, "rewards/margins": 0.4039583206176758, "rewards/rejected": -44.95464324951172, "step": 6396 }, { "epoch": 0.8710511982570807, "grad_norm": 40.65724446745605, "learning_rate": 3.9848495922605e-08, "logits/chosen": 14.703393936157227, "logits/rejected": 14.600422859191895, "logps/chosen": -4.688957214355469, "logps/rejected": -4.627396106719971, "loss": 3.3974, "rewards/accuracies": 0.5, "rewards/chosen": -46.88957214355469, "rewards/margins": -0.6156091690063477, "rewards/rejected": -46.273963928222656, "step": 6397 }, { "epoch": 0.8711873638344226, "grad_norm": 41.08398093633835, "learning_rate": 3.976580525464337e-08, "logits/chosen": 14.482548713684082, "logits/rejected": 14.078485488891602, "logps/chosen": -4.59613037109375, "logps/rejected": -4.185117244720459, "loss": 4.1754, "rewards/accuracies": 0.25, "rewards/chosen": -45.9613037109375, "rewards/margins": -4.110129356384277, "rewards/rejected": -41.851173400878906, "step": 6398 }, { "epoch": 0.8713235294117647, "grad_norm": 40.09545864786599, "learning_rate": 3.968319598457581e-08, "logits/chosen": 15.322626113891602, "logits/rejected": 14.41812515258789, "logps/chosen": -4.610895156860352, "logps/rejected": -4.4572954177856445, "loss": 3.7712, "rewards/accuracies": 0.25, "rewards/chosen": -46.10895538330078, "rewards/margins": -1.5360040664672852, "rewards/rejected": -44.57295227050781, "step": 6399 }, { "epoch": 0.8714596949891068, "grad_norm": 42.024904357989094, "learning_rate": 3.960066813106851e-08, "logits/chosen": 15.424028396606445, "logits/rejected": 15.276802062988281, "logps/chosen": -4.700650215148926, "logps/rejected": -4.719232082366943, "loss": 3.9752, "rewards/accuracies": 0.5, "rewards/chosen": -47.006500244140625, "rewards/margins": 0.1858224868774414, "rewards/rejected": -47.19232177734375, "step": 6400 }, { "epoch": 0.8715958605664488, "grad_norm": 46.81865016045427, "learning_rate": 3.951822171276928e-08, "logits/chosen": 14.86922836303711, "logits/rejected": 15.03872013092041, "logps/chosen": -4.595913887023926, "logps/rejected": -4.496757507324219, "loss": 4.1968, "rewards/accuracies": 0.5, "rewards/chosen": -45.959136962890625, "rewards/margins": -0.9915599822998047, "rewards/rejected": -44.96757507324219, "step": 6401 }, { "epoch": 0.8717320261437909, "grad_norm": 44.97613589190836, "learning_rate": 3.943585674830765e-08, "logits/chosen": 14.34731674194336, "logits/rejected": 14.185139656066895, "logps/chosen": -4.551504135131836, "logps/rejected": -4.46076774597168, "loss": 4.408, "rewards/accuracies": 0.25, "rewards/chosen": -45.515045166015625, "rewards/margins": -0.9073667526245117, "rewards/rejected": -44.60767364501953, "step": 6402 }, { "epoch": 0.871868191721133, "grad_norm": 41.01291973079213, "learning_rate": 3.9353573256294715e-08, "logits/chosen": 13.342748641967773, "logits/rejected": 13.742281913757324, "logps/chosen": -4.444042205810547, "logps/rejected": -4.535294532775879, "loss": 4.1521, "rewards/accuracies": 0.5, "rewards/chosen": -44.44042205810547, "rewards/margins": 0.9125204086303711, "rewards/rejected": -45.35293960571289, "step": 6403 }, { "epoch": 0.8720043572984749, "grad_norm": 37.391370052719495, "learning_rate": 3.9271371255322985e-08, "logits/chosen": 14.557657241821289, "logits/rejected": 14.235107421875, "logps/chosen": -4.365511894226074, "logps/rejected": -4.417013645172119, "loss": 3.7393, "rewards/accuracies": 0.5, "rewards/chosen": -43.655120849609375, "rewards/margins": 0.5150156021118164, "rewards/rejected": -44.17013931274414, "step": 6404 }, { "epoch": 0.872140522875817, "grad_norm": 44.99744134982474, "learning_rate": 3.918925076396671e-08, "logits/chosen": 15.346841812133789, "logits/rejected": 14.845622062683105, "logps/chosen": -4.851337432861328, "logps/rejected": -4.708362579345703, "loss": 4.4081, "rewards/accuracies": 0.25, "rewards/chosen": -48.51337432861328, "rewards/margins": -1.4297466278076172, "rewards/rejected": -47.08362579345703, "step": 6405 }, { "epoch": 0.8722766884531591, "grad_norm": 42.2267386174794, "learning_rate": 3.9107211800781804e-08, "logits/chosen": 14.51369857788086, "logits/rejected": 15.114175796508789, "logps/chosen": -4.709648609161377, "logps/rejected": -4.777191162109375, "loss": 3.8613, "rewards/accuracies": 0.5, "rewards/chosen": -47.09648132324219, "rewards/margins": 0.6754245758056641, "rewards/rejected": -47.77191162109375, "step": 6406 }, { "epoch": 0.8724128540305011, "grad_norm": 39.63074485655019, "learning_rate": 3.902525438430544e-08, "logits/chosen": 14.66457462310791, "logits/rejected": 14.853537559509277, "logps/chosen": -4.549746513366699, "logps/rejected": -4.815005779266357, "loss": 3.7519, "rewards/accuracies": 0.75, "rewards/chosen": -45.497467041015625, "rewards/margins": 2.652592658996582, "rewards/rejected": -48.150054931640625, "step": 6407 }, { "epoch": 0.8725490196078431, "grad_norm": 38.09640254005919, "learning_rate": 3.894337853305676e-08, "logits/chosen": 14.10064697265625, "logits/rejected": 14.961237907409668, "logps/chosen": -4.331182479858398, "logps/rejected": -4.689443111419678, "loss": 3.8694, "rewards/accuracies": 0.75, "rewards/chosen": -43.311824798583984, "rewards/margins": 3.5826101303100586, "rewards/rejected": -46.89443588256836, "step": 6408 }, { "epoch": 0.8726851851851852, "grad_norm": 47.00478984356269, "learning_rate": 3.8861584265536253e-08, "logits/chosen": 14.11602783203125, "logits/rejected": 14.436549186706543, "logps/chosen": -4.454368591308594, "logps/rejected": -4.320531845092773, "loss": 3.8884, "rewards/accuracies": 0.25, "rewards/chosen": -44.54368591308594, "rewards/margins": -1.3383665084838867, "rewards/rejected": -43.205318450927734, "step": 6409 }, { "epoch": 0.8728213507625272, "grad_norm": 42.30256631124283, "learning_rate": 3.877987160022593e-08, "logits/chosen": 13.7400541305542, "logits/rejected": 14.169689178466797, "logps/chosen": -4.45499324798584, "logps/rejected": -4.745632171630859, "loss": 3.5656, "rewards/accuracies": 0.75, "rewards/chosen": -44.549930572509766, "rewards/margins": 2.9063940048217773, "rewards/rejected": -47.45632553100586, "step": 6410 }, { "epoch": 0.8729575163398693, "grad_norm": 42.567890087213975, "learning_rate": 3.869824055558948e-08, "logits/chosen": 13.654338836669922, "logits/rejected": 13.965325355529785, "logps/chosen": -4.368600845336914, "logps/rejected": -4.452110290527344, "loss": 3.7366, "rewards/accuracies": 0.5, "rewards/chosen": -43.686004638671875, "rewards/margins": 0.8350963592529297, "rewards/rejected": -44.52110290527344, "step": 6411 }, { "epoch": 0.8730936819172114, "grad_norm": 43.1068654992256, "learning_rate": 3.861669115007222e-08, "logits/chosen": 13.987333297729492, "logits/rejected": 14.724018096923828, "logps/chosen": -4.638034820556641, "logps/rejected": -4.829804420471191, "loss": 3.6892, "rewards/accuracies": 0.5, "rewards/chosen": -46.380348205566406, "rewards/margins": 1.9176959991455078, "rewards/rejected": -48.29804229736328, "step": 6412 }, { "epoch": 0.8732298474945533, "grad_norm": 38.98585650957292, "learning_rate": 3.8535223402100757e-08, "logits/chosen": 15.43985366821289, "logits/rejected": 15.482429504394531, "logps/chosen": -4.903306007385254, "logps/rejected": -4.8922119140625, "loss": 4.0114, "rewards/accuracies": 0.25, "rewards/chosen": -49.03306198120117, "rewards/margins": -0.11093997955322266, "rewards/rejected": -48.922119140625, "step": 6413 }, { "epoch": 0.8733660130718954, "grad_norm": 43.07800299683582, "learning_rate": 3.8453837330083425e-08, "logits/chosen": 13.833650588989258, "logits/rejected": 14.298687934875488, "logps/chosen": -4.511828422546387, "logps/rejected": -4.465058326721191, "loss": 4.3997, "rewards/accuracies": 0.5, "rewards/chosen": -45.1182861328125, "rewards/margins": -0.4677009582519531, "rewards/rejected": -44.65058517456055, "step": 6414 }, { "epoch": 0.8735021786492375, "grad_norm": 45.3806600373023, "learning_rate": 3.837253295241023e-08, "logits/chosen": 14.91526985168457, "logits/rejected": 15.45034122467041, "logps/chosen": -5.058285236358643, "logps/rejected": -5.090456962585449, "loss": 4.4944, "rewards/accuracies": 0.5, "rewards/chosen": -50.582847595214844, "rewards/margins": 0.32172107696533203, "rewards/rejected": -50.904571533203125, "step": 6415 }, { "epoch": 0.8736383442265795, "grad_norm": 38.73870349553519, "learning_rate": 3.829131028745234e-08, "logits/chosen": 14.120180130004883, "logits/rejected": 13.841076850891113, "logps/chosen": -4.384881973266602, "logps/rejected": -4.417059898376465, "loss": 3.7885, "rewards/accuracies": 0.5, "rewards/chosen": -43.84881591796875, "rewards/margins": 0.32178306579589844, "rewards/rejected": -44.17060089111328, "step": 6416 }, { "epoch": 0.8737745098039216, "grad_norm": 45.27389003367396, "learning_rate": 3.821016935356285e-08, "logits/chosen": 14.575355529785156, "logits/rejected": 15.297415733337402, "logps/chosen": -4.441620826721191, "logps/rejected": -5.143866539001465, "loss": 3.4982, "rewards/accuracies": 0.75, "rewards/chosen": -44.41620635986328, "rewards/margins": 7.022459983825684, "rewards/rejected": -51.43866729736328, "step": 6417 }, { "epoch": 0.8739106753812637, "grad_norm": 41.1275815810631, "learning_rate": 3.8129110169076206e-08, "logits/chosen": 14.011616706848145, "logits/rejected": 13.975326538085938, "logps/chosen": -4.350865364074707, "logps/rejected": -4.537623405456543, "loss": 3.9613, "rewards/accuracies": 0.75, "rewards/chosen": -43.5086555480957, "rewards/margins": 1.8675813674926758, "rewards/rejected": -45.37623596191406, "step": 6418 }, { "epoch": 0.8740468409586056, "grad_norm": 42.19849937827322, "learning_rate": 3.804813275230834e-08, "logits/chosen": 14.608383178710938, "logits/rejected": 15.0851411819458, "logps/chosen": -4.492208003997803, "logps/rejected": -4.851807594299316, "loss": 3.675, "rewards/accuracies": 1.0, "rewards/chosen": -44.92207717895508, "rewards/margins": 3.5959959030151367, "rewards/rejected": -48.51807403564453, "step": 6419 }, { "epoch": 0.8741830065359477, "grad_norm": 42.815569912464696, "learning_rate": 3.796723712155678e-08, "logits/chosen": 14.37472152709961, "logits/rejected": 14.204998016357422, "logps/chosen": -4.395698070526123, "logps/rejected": -4.734023094177246, "loss": 4.1844, "rewards/accuracies": 0.75, "rewards/chosen": -43.95698165893555, "rewards/margins": 3.3832502365112305, "rewards/rejected": -47.340232849121094, "step": 6420 }, { "epoch": 0.8743191721132898, "grad_norm": 40.112734298058776, "learning_rate": 3.78864232951007e-08, "logits/chosen": 13.652288436889648, "logits/rejected": 13.939918518066406, "logps/chosen": -4.105578422546387, "logps/rejected": -4.436390399932861, "loss": 4.0389, "rewards/accuracies": 1.0, "rewards/chosen": -41.0557861328125, "rewards/margins": 3.3081178665161133, "rewards/rejected": -44.3639030456543, "step": 6421 }, { "epoch": 0.8744553376906318, "grad_norm": 43.48851320564497, "learning_rate": 3.7805691291200417e-08, "logits/chosen": 14.79528522491455, "logits/rejected": 14.498201370239258, "logps/chosen": -4.450712203979492, "logps/rejected": -4.829965114593506, "loss": 4.075, "rewards/accuracies": 1.0, "rewards/chosen": -44.50712585449219, "rewards/margins": 3.792527198791504, "rewards/rejected": -48.29964828491211, "step": 6422 }, { "epoch": 0.8745915032679739, "grad_norm": 44.15160066759477, "learning_rate": 3.7725041128098134e-08, "logits/chosen": 14.580506324768066, "logits/rejected": 14.578330039978027, "logps/chosen": -4.698866844177246, "logps/rejected": -4.371504306793213, "loss": 3.7561, "rewards/accuracies": 0.25, "rewards/chosen": -46.988670349121094, "rewards/margins": -3.273622512817383, "rewards/rejected": -43.71504592895508, "step": 6423 }, { "epoch": 0.8747276688453159, "grad_norm": 41.33582586782996, "learning_rate": 3.764447282401746e-08, "logits/chosen": 14.80050277709961, "logits/rejected": 15.038887977600098, "logps/chosen": -4.305050849914551, "logps/rejected": -4.9819865226745605, "loss": 3.4782, "rewards/accuracies": 1.0, "rewards/chosen": -43.050506591796875, "rewards/margins": 6.769359588623047, "rewards/rejected": -49.81986618041992, "step": 6424 }, { "epoch": 0.8748638344226579, "grad_norm": 41.10436256223727, "learning_rate": 3.7563986397163386e-08, "logits/chosen": 14.610893249511719, "logits/rejected": 14.552228927612305, "logps/chosen": -4.557262420654297, "logps/rejected": -4.5472564697265625, "loss": 3.568, "rewards/accuracies": 0.25, "rewards/chosen": -45.57262420654297, "rewards/margins": -0.10005760192871094, "rewards/rejected": -45.47256851196289, "step": 6425 }, { "epoch": 0.875, "grad_norm": 44.3531452074862, "learning_rate": 3.7483581865722467e-08, "logits/chosen": 14.37594223022461, "logits/rejected": 14.56154727935791, "logps/chosen": -4.574322700500488, "logps/rejected": -4.832045078277588, "loss": 4.0097, "rewards/accuracies": 0.75, "rewards/chosen": -45.743228912353516, "rewards/margins": 2.5772199630737305, "rewards/rejected": -48.32044982910156, "step": 6426 }, { "epoch": 0.8751361655773421, "grad_norm": 40.05475603657183, "learning_rate": 3.74032592478629e-08, "logits/chosen": 14.51449203491211, "logits/rejected": 13.926562309265137, "logps/chosen": -4.841498374938965, "logps/rejected": -4.389255046844482, "loss": 3.5149, "rewards/accuracies": 0.0, "rewards/chosen": -48.41498565673828, "rewards/margins": -4.52243709564209, "rewards/rejected": -43.89255142211914, "step": 6427 }, { "epoch": 0.8752723311546841, "grad_norm": 53.669215150434084, "learning_rate": 3.73230185617341e-08, "logits/chosen": 13.879491806030273, "logits/rejected": 14.552377700805664, "logps/chosen": -4.559189319610596, "logps/rejected": -4.744523048400879, "loss": 3.9691, "rewards/accuracies": 0.75, "rewards/chosen": -45.591896057128906, "rewards/margins": 1.8533363342285156, "rewards/rejected": -47.44523239135742, "step": 6428 }, { "epoch": 0.8754084967320261, "grad_norm": 43.49486151522678, "learning_rate": 3.7242859825467174e-08, "logits/chosen": 13.910137176513672, "logits/rejected": 14.117281913757324, "logps/chosen": -4.221071243286133, "logps/rejected": -4.426587104797363, "loss": 4.6138, "rewards/accuracies": 0.75, "rewards/chosen": -42.21071243286133, "rewards/margins": 2.055161476135254, "rewards/rejected": -44.26587677001953, "step": 6429 }, { "epoch": 0.8755446623093682, "grad_norm": 42.91899324594755, "learning_rate": 3.7162783057174704e-08, "logits/chosen": 14.514890670776367, "logits/rejected": 15.382699012756348, "logps/chosen": -4.439688682556152, "logps/rejected": -4.952658653259277, "loss": 3.7825, "rewards/accuracies": 0.75, "rewards/chosen": -44.396888732910156, "rewards/margins": 5.129700660705566, "rewards/rejected": -49.526588439941406, "step": 6430 }, { "epoch": 0.8756808278867102, "grad_norm": 39.094443638107506, "learning_rate": 3.7082788274950574e-08, "logits/chosen": 13.901899337768555, "logits/rejected": 14.236474990844727, "logps/chosen": -4.643949508666992, "logps/rejected": -4.769536972045898, "loss": 3.6448, "rewards/accuracies": 0.5, "rewards/chosen": -46.43949508666992, "rewards/margins": 1.2558765411376953, "rewards/rejected": -47.69537353515625, "step": 6431 }, { "epoch": 0.8758169934640523, "grad_norm": 42.0031687053988, "learning_rate": 3.7002875496870354e-08, "logits/chosen": 13.744634628295898, "logits/rejected": 13.959112167358398, "logps/chosen": -4.6006975173950195, "logps/rejected": -4.387789726257324, "loss": 3.8143, "rewards/accuracies": 0.25, "rewards/chosen": -46.00697326660156, "rewards/margins": -2.1290769577026367, "rewards/rejected": -43.87789535522461, "step": 6432 }, { "epoch": 0.8759531590413944, "grad_norm": 41.25802254889097, "learning_rate": 3.692304474099104e-08, "logits/chosen": 14.545984268188477, "logits/rejected": 14.969621658325195, "logps/chosen": -4.65802001953125, "logps/rejected": -4.757960319519043, "loss": 3.9156, "rewards/accuracies": 0.5, "rewards/chosen": -46.5802001953125, "rewards/margins": 0.9994001388549805, "rewards/rejected": -47.5796012878418, "step": 6433 }, { "epoch": 0.8760893246187363, "grad_norm": 41.30447949751879, "learning_rate": 3.6843296025350945e-08, "logits/chosen": 14.476856231689453, "logits/rejected": 14.868753433227539, "logps/chosen": -4.675858974456787, "logps/rejected": -4.997003555297852, "loss": 3.8067, "rewards/accuracies": 1.0, "rewards/chosen": -46.75858688354492, "rewards/margins": 3.2114439010620117, "rewards/rejected": -49.97003173828125, "step": 6434 }, { "epoch": 0.8762254901960784, "grad_norm": 41.201310603394866, "learning_rate": 3.6763629367969974e-08, "logits/chosen": 14.547531127929688, "logits/rejected": 14.511619567871094, "logps/chosen": -4.748650074005127, "logps/rejected": -4.8319292068481445, "loss": 3.3944, "rewards/accuracies": 0.5, "rewards/chosen": -47.48649978637695, "rewards/margins": 0.832794189453125, "rewards/rejected": -48.31929397583008, "step": 6435 }, { "epoch": 0.8763616557734205, "grad_norm": 41.7204004225423, "learning_rate": 3.668404478684954e-08, "logits/chosen": 14.045064926147461, "logits/rejected": 14.036134719848633, "logps/chosen": -4.058584213256836, "logps/rejected": -4.4949798583984375, "loss": 3.8233, "rewards/accuracies": 0.75, "rewards/chosen": -40.58584213256836, "rewards/margins": 4.363955497741699, "rewards/rejected": -44.949798583984375, "step": 6436 }, { "epoch": 0.8764978213507625, "grad_norm": 44.09106373728671, "learning_rate": 3.660454229997234e-08, "logits/chosen": 13.999302864074707, "logits/rejected": 14.180981636047363, "logps/chosen": -4.466971397399902, "logps/rejected": -4.584668159484863, "loss": 4.099, "rewards/accuracies": 0.75, "rewards/chosen": -44.669715881347656, "rewards/margins": 1.1769657135009766, "rewards/rejected": -45.8466796875, "step": 6437 }, { "epoch": 0.8766339869281046, "grad_norm": 41.884657416751296, "learning_rate": 3.65251219253027e-08, "logits/chosen": 14.88894271850586, "logits/rejected": 15.491671562194824, "logps/chosen": -4.801467418670654, "logps/rejected": -4.914383411407471, "loss": 4.2988, "rewards/accuracies": 0.5, "rewards/chosen": -48.01467514038086, "rewards/margins": 1.129159927368164, "rewards/rejected": -49.143836975097656, "step": 6438 }, { "epoch": 0.8767701525054467, "grad_norm": 39.85832182875122, "learning_rate": 3.644578368078628e-08, "logits/chosen": 14.089617729187012, "logits/rejected": 14.402820587158203, "logps/chosen": -4.611952304840088, "logps/rejected": -4.730417251586914, "loss": 3.5864, "rewards/accuracies": 0.5, "rewards/chosen": -46.11952209472656, "rewards/margins": 1.1846542358398438, "rewards/rejected": -47.304176330566406, "step": 6439 }, { "epoch": 0.8769063180827886, "grad_norm": 46.23777175593684, "learning_rate": 3.636652758435019e-08, "logits/chosen": 14.08960247039795, "logits/rejected": 14.578428268432617, "logps/chosen": -4.167566299438477, "logps/rejected": -4.259790897369385, "loss": 4.2031, "rewards/accuracies": 0.75, "rewards/chosen": -41.67566680908203, "rewards/margins": 0.9222421646118164, "rewards/rejected": -42.59790802001953, "step": 6440 }, { "epoch": 0.8770424836601307, "grad_norm": 42.875076239628825, "learning_rate": 3.6287353653903006e-08, "logits/chosen": 14.230186462402344, "logits/rejected": 14.958675384521484, "logps/chosen": -4.338247299194336, "logps/rejected": -4.519060134887695, "loss": 3.9981, "rewards/accuracies": 0.75, "rewards/chosen": -43.38247299194336, "rewards/margins": 1.8081283569335938, "rewards/rejected": -45.19060134887695, "step": 6441 }, { "epoch": 0.8771786492374728, "grad_norm": 38.28537282114948, "learning_rate": 3.620826190733477e-08, "logits/chosen": 14.281573295593262, "logits/rejected": 14.693061828613281, "logps/chosen": -4.715940475463867, "logps/rejected": -4.887265205383301, "loss": 3.5451, "rewards/accuracies": 0.75, "rewards/chosen": -47.15940856933594, "rewards/margins": 1.7132434844970703, "rewards/rejected": -48.87265396118164, "step": 6442 }, { "epoch": 0.8773148148148148, "grad_norm": 42.460345229520556, "learning_rate": 3.612925236251687e-08, "logits/chosen": 14.587732315063477, "logits/rejected": 14.823476791381836, "logps/chosen": -4.223249912261963, "logps/rejected": -4.58455228805542, "loss": 3.8818, "rewards/accuracies": 1.0, "rewards/chosen": -42.23249816894531, "rewards/margins": 3.6130218505859375, "rewards/rejected": -45.84552001953125, "step": 6443 }, { "epoch": 0.8774509803921569, "grad_norm": 42.56250755042208, "learning_rate": 3.605032503730214e-08, "logits/chosen": 14.350861549377441, "logits/rejected": 14.890335083007812, "logps/chosen": -4.62319278717041, "logps/rejected": -4.779537200927734, "loss": 3.8212, "rewards/accuracies": 0.75, "rewards/chosen": -46.23193359375, "rewards/margins": 1.5634422302246094, "rewards/rejected": -47.79537582397461, "step": 6444 }, { "epoch": 0.8775871459694989, "grad_norm": 39.67691294164033, "learning_rate": 3.597147994952503e-08, "logits/chosen": 13.97555160522461, "logits/rejected": 14.787656784057617, "logps/chosen": -4.336845397949219, "logps/rejected": -4.944334983825684, "loss": 4.141, "rewards/accuracies": 1.0, "rewards/chosen": -43.36845397949219, "rewards/margins": 6.074892997741699, "rewards/rejected": -49.4433479309082, "step": 6445 }, { "epoch": 0.8777233115468409, "grad_norm": 45.27103398198845, "learning_rate": 3.5892717117001013e-08, "logits/chosen": 15.14605712890625, "logits/rejected": 15.060980796813965, "logps/chosen": -4.8140716552734375, "logps/rejected": -5.014303684234619, "loss": 4.07, "rewards/accuracies": 0.5, "rewards/chosen": -48.14071273803711, "rewards/margins": 2.0023231506347656, "rewards/rejected": -50.143035888671875, "step": 6446 }, { "epoch": 0.877859477124183, "grad_norm": 44.83513390133848, "learning_rate": 3.581403655752733e-08, "logits/chosen": 15.038406372070312, "logits/rejected": 14.961989402770996, "logps/chosen": -4.793720245361328, "logps/rejected": -4.9053802490234375, "loss": 4.28, "rewards/accuracies": 0.75, "rewards/chosen": -47.93720245361328, "rewards/margins": 1.1166000366210938, "rewards/rejected": -49.05380630493164, "step": 6447 }, { "epoch": 0.8779956427015251, "grad_norm": 49.97181794655337, "learning_rate": 3.57354382888825e-08, "logits/chosen": 14.503435134887695, "logits/rejected": 15.232942581176758, "logps/chosen": -4.675107002258301, "logps/rejected": -5.079447269439697, "loss": 4.1323, "rewards/accuracies": 0.75, "rewards/chosen": -46.751068115234375, "rewards/margins": 4.043401718139648, "rewards/rejected": -50.794471740722656, "step": 6448 }, { "epoch": 0.878131808278867, "grad_norm": 90.16296477838569, "learning_rate": 3.565692232882638e-08, "logits/chosen": 14.394107818603516, "logits/rejected": 14.5171480178833, "logps/chosen": -4.37000846862793, "logps/rejected": -4.834317207336426, "loss": 4.4075, "rewards/accuracies": 1.0, "rewards/chosen": -43.70008087158203, "rewards/margins": 4.643093109130859, "rewards/rejected": -48.343177795410156, "step": 6449 }, { "epoch": 0.8782679738562091, "grad_norm": 40.196178484432934, "learning_rate": 3.557848869510036e-08, "logits/chosen": 14.120899200439453, "logits/rejected": 13.896772384643555, "logps/chosen": -4.325850009918213, "logps/rejected": -4.312047958374023, "loss": 3.9243, "rewards/accuracies": 0.5, "rewards/chosen": -43.25849914550781, "rewards/margins": -0.1380176544189453, "rewards/rejected": -43.1204833984375, "step": 6450 }, { "epoch": 0.8784041394335512, "grad_norm": 40.21684483339266, "learning_rate": 3.550013740542725e-08, "logits/chosen": 14.210731506347656, "logits/rejected": 14.95730972290039, "logps/chosen": -4.572120666503906, "logps/rejected": -4.972184658050537, "loss": 4.1443, "rewards/accuracies": 0.5, "rewards/chosen": -45.72120666503906, "rewards/margins": 4.000638008117676, "rewards/rejected": -49.72184371948242, "step": 6451 }, { "epoch": 0.8785403050108932, "grad_norm": 40.12332672495221, "learning_rate": 3.542186847751099e-08, "logits/chosen": 14.657721519470215, "logits/rejected": 15.334850311279297, "logps/chosen": -4.420055389404297, "logps/rejected": -4.863298416137695, "loss": 4.053, "rewards/accuracies": 1.0, "rewards/chosen": -44.20055389404297, "rewards/margins": 4.432431221008301, "rewards/rejected": -48.63298797607422, "step": 6452 }, { "epoch": 0.8786764705882353, "grad_norm": 47.4786892561668, "learning_rate": 3.534368192903714e-08, "logits/chosen": 14.798040390014648, "logits/rejected": 15.024089813232422, "logps/chosen": -4.823304176330566, "logps/rejected": -4.9386138916015625, "loss": 4.3971, "rewards/accuracies": 0.5, "rewards/chosen": -48.23303985595703, "rewards/margins": 1.1530981063842773, "rewards/rejected": -49.386138916015625, "step": 6453 }, { "epoch": 0.8788126361655774, "grad_norm": 40.9527994292995, "learning_rate": 3.526557777767278e-08, "logits/chosen": 14.401956558227539, "logits/rejected": 14.873482704162598, "logps/chosen": -4.6540632247924805, "logps/rejected": -4.805634498596191, "loss": 3.6314, "rewards/accuracies": 0.5, "rewards/chosen": -46.54063034057617, "rewards/margins": 1.5157175064086914, "rewards/rejected": -48.05635070800781, "step": 6454 }, { "epoch": 0.8789488017429193, "grad_norm": 42.01241032231907, "learning_rate": 3.518755604106594e-08, "logits/chosen": 14.981515884399414, "logits/rejected": 15.332988739013672, "logps/chosen": -4.766072750091553, "logps/rejected": -4.808893203735352, "loss": 3.7527, "rewards/accuracies": 0.25, "rewards/chosen": -47.660728454589844, "rewards/margins": 0.4282064437866211, "rewards/rejected": -48.088932037353516, "step": 6455 }, { "epoch": 0.8790849673202614, "grad_norm": 43.40094425962744, "learning_rate": 3.510961673684636e-08, "logits/chosen": 13.962148666381836, "logits/rejected": 14.917896270751953, "logps/chosen": -4.499581336975098, "logps/rejected": -4.814534664154053, "loss": 4.0637, "rewards/accuracies": 0.5, "rewards/chosen": -44.995811462402344, "rewards/margins": 3.149538040161133, "rewards/rejected": -48.145347595214844, "step": 6456 }, { "epoch": 0.8792211328976035, "grad_norm": 43.642805527546514, "learning_rate": 3.503175988262521e-08, "logits/chosen": 13.948875427246094, "logits/rejected": 14.639915466308594, "logps/chosen": -4.318565845489502, "logps/rejected": -4.522521018981934, "loss": 4.2908, "rewards/accuracies": 0.75, "rewards/chosen": -43.18566131591797, "rewards/margins": 2.039553642272949, "rewards/rejected": -45.22521209716797, "step": 6457 }, { "epoch": 0.8793572984749455, "grad_norm": 43.84424567212211, "learning_rate": 3.495398549599469e-08, "logits/chosen": 14.021892547607422, "logits/rejected": 15.083389282226562, "logps/chosen": -4.280743598937988, "logps/rejected": -4.719585418701172, "loss": 3.2622, "rewards/accuracies": 0.75, "rewards/chosen": -42.807437896728516, "rewards/margins": 4.3884172439575195, "rewards/rejected": -47.19585418701172, "step": 6458 }, { "epoch": 0.8794934640522876, "grad_norm": 41.983786731150325, "learning_rate": 3.487629359452859e-08, "logits/chosen": 14.815229415893555, "logits/rejected": 14.742867469787598, "logps/chosen": -4.747209548950195, "logps/rejected": -4.683256149291992, "loss": 4.1898, "rewards/accuracies": 0.5, "rewards/chosen": -47.47209548950195, "rewards/margins": -0.6395330429077148, "rewards/rejected": -46.83256530761719, "step": 6459 }, { "epoch": 0.8796296296296297, "grad_norm": 47.812255300378986, "learning_rate": 3.479868419578223e-08, "logits/chosen": 14.796087265014648, "logits/rejected": 14.487672805786133, "logps/chosen": -4.746779441833496, "logps/rejected": -4.612703323364258, "loss": 4.5405, "rewards/accuracies": 0.5, "rewards/chosen": -47.46779251098633, "rewards/margins": -1.3407611846923828, "rewards/rejected": -46.12702941894531, "step": 6460 }, { "epoch": 0.8797657952069716, "grad_norm": 41.60235371376423, "learning_rate": 3.472115731729186e-08, "logits/chosen": 15.170961380004883, "logits/rejected": 15.767915725708008, "logps/chosen": -4.820622444152832, "logps/rejected": -5.000680923461914, "loss": 3.6625, "rewards/accuracies": 0.75, "rewards/chosen": -48.20622253417969, "rewards/margins": 1.800583839416504, "rewards/rejected": -50.00680923461914, "step": 6461 }, { "epoch": 0.8799019607843137, "grad_norm": 48.15717304541938, "learning_rate": 3.464371297657544e-08, "logits/chosen": 13.785826683044434, "logits/rejected": 14.001524925231934, "logps/chosen": -4.77761173248291, "logps/rejected": -4.7195329666137695, "loss": 4.1122, "rewards/accuracies": 0.5, "rewards/chosen": -47.77611541748047, "rewards/margins": -0.5807857513427734, "rewards/rejected": -47.19533157348633, "step": 6462 }, { "epoch": 0.8800381263616558, "grad_norm": 42.040793475884, "learning_rate": 3.4566351191132226e-08, "logits/chosen": 15.715401649475098, "logits/rejected": 15.194194793701172, "logps/chosen": -5.034067153930664, "logps/rejected": -4.852687835693359, "loss": 3.7393, "rewards/accuracies": 0.25, "rewards/chosen": -50.340675354003906, "rewards/margins": -1.8137950897216797, "rewards/rejected": -48.526878356933594, "step": 6463 }, { "epoch": 0.8801742919389978, "grad_norm": 41.38811026041794, "learning_rate": 3.4489071978442577e-08, "logits/chosen": 15.014703750610352, "logits/rejected": 14.007401466369629, "logps/chosen": -4.856417655944824, "logps/rejected": -4.382270812988281, "loss": 3.9207, "rewards/accuracies": 0.0, "rewards/chosen": -48.564178466796875, "rewards/margins": -4.7414703369140625, "rewards/rejected": -43.82270812988281, "step": 6464 }, { "epoch": 0.8803104575163399, "grad_norm": 43.4582489725445, "learning_rate": 3.4411875355968436e-08, "logits/chosen": 13.725510597229004, "logits/rejected": 14.214553833007812, "logps/chosen": -4.58829927444458, "logps/rejected": -4.592363357543945, "loss": 3.8775, "rewards/accuracies": 0.5, "rewards/chosen": -45.88299560546875, "rewards/margins": 0.04064178466796875, "rewards/rejected": -45.92363357543945, "step": 6465 }, { "epoch": 0.8804466230936819, "grad_norm": 42.07977712283212, "learning_rate": 3.433476134115314e-08, "logits/chosen": 15.17442512512207, "logits/rejected": 15.388168334960938, "logps/chosen": -4.713858604431152, "logps/rejected": -5.002692699432373, "loss": 3.8284, "rewards/accuracies": 0.75, "rewards/chosen": -47.138580322265625, "rewards/margins": 2.8883447647094727, "rewards/rejected": -50.02692794799805, "step": 6466 }, { "epoch": 0.880582788671024, "grad_norm": 44.68903430557207, "learning_rate": 3.425772995142107e-08, "logits/chosen": 14.409282684326172, "logits/rejected": 14.693170547485352, "logps/chosen": -4.409704208374023, "logps/rejected": -4.505117893218994, "loss": 3.5887, "rewards/accuracies": 0.5, "rewards/chosen": -44.09703826904297, "rewards/margins": 0.9541397094726562, "rewards/rejected": -45.051177978515625, "step": 6467 }, { "epoch": 0.880718954248366, "grad_norm": 43.69758952723479, "learning_rate": 3.418078120417815e-08, "logits/chosen": 13.77707290649414, "logits/rejected": 14.35982894897461, "logps/chosen": -4.4069671630859375, "logps/rejected": -4.445528984069824, "loss": 4.0838, "rewards/accuracies": 0.5, "rewards/chosen": -44.069671630859375, "rewards/margins": 0.3856191635131836, "rewards/rejected": -44.455291748046875, "step": 6468 }, { "epoch": 0.8808551198257081, "grad_norm": 39.94059517503577, "learning_rate": 3.41039151168117e-08, "logits/chosen": 13.490219116210938, "logits/rejected": 14.290493965148926, "logps/chosen": -4.339943885803223, "logps/rejected": -4.686841011047363, "loss": 4.0266, "rewards/accuracies": 0.75, "rewards/chosen": -43.399436950683594, "rewards/margins": 3.4689693450927734, "rewards/rejected": -46.868408203125, "step": 6469 }, { "epoch": 0.8809912854030502, "grad_norm": 40.54902067898724, "learning_rate": 3.402713170669007e-08, "logits/chosen": 13.624223709106445, "logits/rejected": 14.443082809448242, "logps/chosen": -4.576337814331055, "logps/rejected": -4.761371612548828, "loss": 3.8283, "rewards/accuracies": 0.75, "rewards/chosen": -45.76337432861328, "rewards/margins": 1.850337028503418, "rewards/rejected": -47.613712310791016, "step": 6470 }, { "epoch": 0.8811274509803921, "grad_norm": 42.525384698884444, "learning_rate": 3.395043099116317e-08, "logits/chosen": 14.559825897216797, "logits/rejected": 14.418835639953613, "logps/chosen": -4.8278350830078125, "logps/rejected": -5.027181625366211, "loss": 4.0919, "rewards/accuracies": 0.5, "rewards/chosen": -48.278350830078125, "rewards/margins": 1.9934673309326172, "rewards/rejected": -50.271820068359375, "step": 6471 }, { "epoch": 0.8812636165577342, "grad_norm": 41.29081664110678, "learning_rate": 3.387381298756229e-08, "logits/chosen": 14.85017204284668, "logits/rejected": 14.391462326049805, "logps/chosen": -4.649078369140625, "logps/rejected": -4.861116409301758, "loss": 4.3781, "rewards/accuracies": 0.75, "rewards/chosen": -46.49078369140625, "rewards/margins": 2.1203813552856445, "rewards/rejected": -48.61116409301758, "step": 6472 }, { "epoch": 0.8813997821350763, "grad_norm": 40.196309302331805, "learning_rate": 3.379727771319971e-08, "logits/chosen": 13.866641998291016, "logits/rejected": 14.261594772338867, "logps/chosen": -4.545894622802734, "logps/rejected": -4.510229587554932, "loss": 3.3588, "rewards/accuracies": 0.5, "rewards/chosen": -45.45895004272461, "rewards/margins": -0.35665416717529297, "rewards/rejected": -45.102294921875, "step": 6473 }, { "epoch": 0.8815359477124183, "grad_norm": 41.22019565887942, "learning_rate": 3.372082518536934e-08, "logits/chosen": 15.261153221130371, "logits/rejected": 15.191059112548828, "logps/chosen": -4.6285319328308105, "logps/rejected": -4.875308990478516, "loss": 4.0054, "rewards/accuracies": 0.75, "rewards/chosen": -46.28532028198242, "rewards/margins": 2.4677677154541016, "rewards/rejected": -48.753089904785156, "step": 6474 }, { "epoch": 0.8816721132897604, "grad_norm": 43.51945805679201, "learning_rate": 3.364445542134624e-08, "logits/chosen": 14.067322731018066, "logits/rejected": 14.736043930053711, "logps/chosen": -4.797096252441406, "logps/rejected": -4.742808818817139, "loss": 4.0557, "rewards/accuracies": 0.25, "rewards/chosen": -47.97096633911133, "rewards/margins": -0.5428762435913086, "rewards/rejected": -47.4280891418457, "step": 6475 }, { "epoch": 0.8818082788671024, "grad_norm": 48.08311060233914, "learning_rate": 3.35681684383867e-08, "logits/chosen": 14.059168815612793, "logits/rejected": 15.293012619018555, "logps/chosen": -4.602128505706787, "logps/rejected": -4.773210525512695, "loss": 4.127, "rewards/accuracies": 0.75, "rewards/chosen": -46.02128601074219, "rewards/margins": 1.7108221054077148, "rewards/rejected": -47.73210525512695, "step": 6476 }, { "epoch": 0.8819444444444444, "grad_norm": 43.449383247887184, "learning_rate": 3.349196425372844e-08, "logits/chosen": 14.616607666015625, "logits/rejected": 14.637468338012695, "logps/chosen": -4.41302490234375, "logps/rejected": -4.435385704040527, "loss": 4.0944, "rewards/accuracies": 0.5, "rewards/chosen": -44.1302490234375, "rewards/margins": 0.22361373901367188, "rewards/rejected": -44.35386276245117, "step": 6477 }, { "epoch": 0.8820806100217865, "grad_norm": 43.0499166037025, "learning_rate": 3.341584288459054e-08, "logits/chosen": 14.473791122436523, "logits/rejected": 15.158480644226074, "logps/chosen": -4.403746128082275, "logps/rejected": -4.7284369468688965, "loss": 3.5909, "rewards/accuracies": 1.0, "rewards/chosen": -44.03746032714844, "rewards/margins": 3.246908187866211, "rewards/rejected": -47.28437042236328, "step": 6478 }, { "epoch": 0.8822167755991286, "grad_norm": 43.59259271167369, "learning_rate": 3.333980434817305e-08, "logits/chosen": 15.794915199279785, "logits/rejected": 15.710230827331543, "logps/chosen": -4.88861083984375, "logps/rejected": -5.102190971374512, "loss": 3.8392, "rewards/accuracies": 0.75, "rewards/chosen": -48.8861083984375, "rewards/margins": 2.135798454284668, "rewards/rejected": -51.021907806396484, "step": 6479 }, { "epoch": 0.8823529411764706, "grad_norm": 40.507806627681916, "learning_rate": 3.326384866165765e-08, "logits/chosen": 14.814793586730957, "logits/rejected": 14.958614349365234, "logps/chosen": -4.59736442565918, "logps/rejected": -5.075818061828613, "loss": 3.9119, "rewards/accuracies": 0.75, "rewards/chosen": -45.97364807128906, "rewards/margins": 4.784533500671387, "rewards/rejected": -50.7581787109375, "step": 6480 }, { "epoch": 0.8824891067538126, "grad_norm": 44.47308086362996, "learning_rate": 3.3187975842207163e-08, "logits/chosen": 14.664337158203125, "logits/rejected": 15.290634155273438, "logps/chosen": -4.413264274597168, "logps/rejected": -4.772562026977539, "loss": 4.1218, "rewards/accuracies": 1.0, "rewards/chosen": -44.13264465332031, "rewards/margins": 3.5929765701293945, "rewards/rejected": -47.72562026977539, "step": 6481 }, { "epoch": 0.8826252723311547, "grad_norm": 40.32526518961961, "learning_rate": 3.3112185906965586e-08, "logits/chosen": 14.413631439208984, "logits/rejected": 14.768783569335938, "logps/chosen": -4.535499572753906, "logps/rejected": -4.639052867889404, "loss": 3.7133, "rewards/accuracies": 0.5, "rewards/chosen": -45.35499572753906, "rewards/margins": 1.0355358123779297, "rewards/rejected": -46.39052963256836, "step": 6482 }, { "epoch": 0.8827614379084967, "grad_norm": 43.00070711719718, "learning_rate": 3.303647887305834e-08, "logits/chosen": 14.891782760620117, "logits/rejected": 14.910598754882812, "logps/chosen": -4.735958099365234, "logps/rejected": -4.672118186950684, "loss": 4.1653, "rewards/accuracies": 0.5, "rewards/chosen": -47.359580993652344, "rewards/margins": -0.638401985168457, "rewards/rejected": -46.7211799621582, "step": 6483 }, { "epoch": 0.8828976034858388, "grad_norm": 49.82602201037091, "learning_rate": 3.296085475759205e-08, "logits/chosen": 14.23892879486084, "logits/rejected": 13.921567916870117, "logps/chosen": -4.470156669616699, "logps/rejected": -4.497491836547852, "loss": 4.4629, "rewards/accuracies": 0.75, "rewards/chosen": -44.70156478881836, "rewards/margins": 0.27335453033447266, "rewards/rejected": -44.97492218017578, "step": 6484 }, { "epoch": 0.8830337690631809, "grad_norm": 41.48435324254079, "learning_rate": 3.28853135776546e-08, "logits/chosen": 14.252158164978027, "logits/rejected": 14.450550079345703, "logps/chosen": -4.50166654586792, "logps/rejected": -4.620513916015625, "loss": 3.8509, "rewards/accuracies": 0.5, "rewards/chosen": -45.01666259765625, "rewards/margins": 1.1884746551513672, "rewards/rejected": -46.20513916015625, "step": 6485 }, { "epoch": 0.8831699346405228, "grad_norm": 38.823950928091215, "learning_rate": 3.280985535031511e-08, "logits/chosen": 13.678445816040039, "logits/rejected": 13.771688461303711, "logps/chosen": -4.475760459899902, "logps/rejected": -4.815646171569824, "loss": 3.4277, "rewards/accuracies": 1.0, "rewards/chosen": -44.757606506347656, "rewards/margins": 3.3988542556762695, "rewards/rejected": -48.15645980834961, "step": 6486 }, { "epoch": 0.8833061002178649, "grad_norm": 42.33459315777052, "learning_rate": 3.2734480092624053e-08, "logits/chosen": 14.17140007019043, "logits/rejected": 14.566744804382324, "logps/chosen": -4.524493217468262, "logps/rejected": -4.43181037902832, "loss": 3.3656, "rewards/accuracies": 0.0, "rewards/chosen": -45.24493408203125, "rewards/margins": -0.9268274307250977, "rewards/rejected": -44.3181037902832, "step": 6487 }, { "epoch": 0.883442265795207, "grad_norm": 39.64190165111549, "learning_rate": 3.2659187821613096e-08, "logits/chosen": 14.393903732299805, "logits/rejected": 14.82382583618164, "logps/chosen": -4.581073760986328, "logps/rejected": -4.693782806396484, "loss": 4.0049, "rewards/accuracies": 0.5, "rewards/chosen": -45.81074142456055, "rewards/margins": 1.1270856857299805, "rewards/rejected": -46.937828063964844, "step": 6488 }, { "epoch": 0.883578431372549, "grad_norm": 40.63200943250871, "learning_rate": 3.258397855429509e-08, "logits/chosen": 14.144599914550781, "logits/rejected": 14.081396102905273, "logps/chosen": -4.737330436706543, "logps/rejected": -4.8037567138671875, "loss": 3.8958, "rewards/accuracies": 0.5, "rewards/chosen": -47.37329864501953, "rewards/margins": 0.6642646789550781, "rewards/rejected": -48.037567138671875, "step": 6489 }, { "epoch": 0.8837145969498911, "grad_norm": 40.619107155719746, "learning_rate": 3.25088523076642e-08, "logits/chosen": 14.76836109161377, "logits/rejected": 15.345159530639648, "logps/chosen": -5.0202131271362305, "logps/rejected": -5.1906938552856445, "loss": 3.9671, "rewards/accuracies": 0.75, "rewards/chosen": -50.20212936401367, "rewards/margins": 1.7048091888427734, "rewards/rejected": -51.90693664550781, "step": 6490 }, { "epoch": 0.8838507625272332, "grad_norm": 41.20531026911161, "learning_rate": 3.243380909869593e-08, "logits/chosen": 14.28277587890625, "logits/rejected": 15.139841079711914, "logps/chosen": -4.512971878051758, "logps/rejected": -5.025304794311523, "loss": 3.9358, "rewards/accuracies": 0.75, "rewards/chosen": -45.12971878051758, "rewards/margins": 5.12332820892334, "rewards/rejected": -50.25304412841797, "step": 6491 }, { "epoch": 0.8839869281045751, "grad_norm": 39.31301167058253, "learning_rate": 3.235884894434675e-08, "logits/chosen": 13.824957847595215, "logits/rejected": 13.710512161254883, "logps/chosen": -4.404124736785889, "logps/rejected": -4.498624324798584, "loss": 3.8165, "rewards/accuracies": 0.75, "rewards/chosen": -44.04124450683594, "rewards/margins": 0.9449958801269531, "rewards/rejected": -44.986244201660156, "step": 6492 }, { "epoch": 0.8841230936819172, "grad_norm": 47.39402746082289, "learning_rate": 3.2283971861554626e-08, "logits/chosen": 14.477638244628906, "logits/rejected": 14.931137084960938, "logps/chosen": -4.904433250427246, "logps/rejected": -5.075382709503174, "loss": 3.9436, "rewards/accuracies": 0.75, "rewards/chosen": -49.044334411621094, "rewards/margins": 1.7094907760620117, "rewards/rejected": -50.75382614135742, "step": 6493 }, { "epoch": 0.8842592592592593, "grad_norm": 42.06046863248118, "learning_rate": 3.22091778672386e-08, "logits/chosen": 15.408424377441406, "logits/rejected": 15.382572174072266, "logps/chosen": -4.557273864746094, "logps/rejected": -4.98514461517334, "loss": 3.7553, "rewards/accuracies": 1.0, "rewards/chosen": -45.57273864746094, "rewards/margins": 4.278707504272461, "rewards/rejected": -49.851444244384766, "step": 6494 }, { "epoch": 0.8843954248366013, "grad_norm": 39.87933949169843, "learning_rate": 3.213446697829911e-08, "logits/chosen": 13.817535400390625, "logits/rejected": 15.336187362670898, "logps/chosen": -4.475554466247559, "logps/rejected": -4.8715901374816895, "loss": 3.8883, "rewards/accuracies": 1.0, "rewards/chosen": -44.75554656982422, "rewards/margins": 3.9603567123413086, "rewards/rejected": -48.715904235839844, "step": 6495 }, { "epoch": 0.8845315904139434, "grad_norm": 38.814079331654575, "learning_rate": 3.2059839211617545e-08, "logits/chosen": 13.840075492858887, "logits/rejected": 14.263221740722656, "logps/chosen": -4.259716033935547, "logps/rejected": -4.625599384307861, "loss": 3.5367, "rewards/accuracies": 1.0, "rewards/chosen": -42.59716033935547, "rewards/margins": 3.658834457397461, "rewards/rejected": -46.2559928894043, "step": 6496 }, { "epoch": 0.8846677559912854, "grad_norm": 38.77241931079456, "learning_rate": 3.198529458405672e-08, "logits/chosen": 14.28968334197998, "logits/rejected": 13.826550483703613, "logps/chosen": -4.491115570068359, "logps/rejected": -4.563628673553467, "loss": 3.0748, "rewards/accuracies": 0.5, "rewards/chosen": -44.91115951538086, "rewards/margins": 0.7251291275024414, "rewards/rejected": -45.63629150390625, "step": 6497 }, { "epoch": 0.8848039215686274, "grad_norm": 40.43710978299733, "learning_rate": 3.191083311246072e-08, "logits/chosen": 14.92817497253418, "logits/rejected": 14.667062759399414, "logps/chosen": -4.753589630126953, "logps/rejected": -4.898029804229736, "loss": 3.5388, "rewards/accuracies": 0.5, "rewards/chosen": -47.53589630126953, "rewards/margins": 1.4443988800048828, "rewards/rejected": -48.98029708862305, "step": 6498 }, { "epoch": 0.8849400871459695, "grad_norm": 45.49326291307882, "learning_rate": 3.1836454813654536e-08, "logits/chosen": 14.658668518066406, "logits/rejected": 14.414983749389648, "logps/chosen": -4.751599311828613, "logps/rejected": -4.716909408569336, "loss": 3.8441, "rewards/accuracies": 0.5, "rewards/chosen": -47.5159912109375, "rewards/margins": -0.34689807891845703, "rewards/rejected": -47.169097900390625, "step": 6499 }, { "epoch": 0.8850762527233116, "grad_norm": 40.69150073855459, "learning_rate": 3.176215970444467e-08, "logits/chosen": 13.537582397460938, "logits/rejected": 13.73614501953125, "logps/chosen": -4.281247138977051, "logps/rejected": -4.585131645202637, "loss": 3.7029, "rewards/accuracies": 0.75, "rewards/chosen": -42.812469482421875, "rewards/margins": 3.0388479232788086, "rewards/rejected": -45.851318359375, "step": 6500 }, { "epoch": 0.8852124183006536, "grad_norm": 44.048787938588305, "learning_rate": 3.1687947801618765e-08, "logits/chosen": 14.181817054748535, "logits/rejected": 14.246225357055664, "logps/chosen": -4.469305515289307, "logps/rejected": -4.976237773895264, "loss": 4.0403, "rewards/accuracies": 0.75, "rewards/chosen": -44.69305419921875, "rewards/margins": 5.0693206787109375, "rewards/rejected": -49.76237487792969, "step": 6501 }, { "epoch": 0.8853485838779956, "grad_norm": 49.06849658032216, "learning_rate": 3.1613819121945493e-08, "logits/chosen": 13.844470977783203, "logits/rejected": 14.460190773010254, "logps/chosen": -4.389010429382324, "logps/rejected": -4.840890884399414, "loss": 4.8009, "rewards/accuracies": 0.75, "rewards/chosen": -43.890106201171875, "rewards/margins": 4.518801689147949, "rewards/rejected": -48.408905029296875, "step": 6502 }, { "epoch": 0.8854847494553377, "grad_norm": 39.77984072420202, "learning_rate": 3.1539773682174885e-08, "logits/chosen": 14.506372451782227, "logits/rejected": 14.529525756835938, "logps/chosen": -4.574348449707031, "logps/rejected": -4.730242729187012, "loss": 4.1882, "rewards/accuracies": 0.75, "rewards/chosen": -45.74348449707031, "rewards/margins": 1.5589380264282227, "rewards/rejected": -47.30242156982422, "step": 6503 }, { "epoch": 0.8856209150326797, "grad_norm": 48.03786389905275, "learning_rate": 3.146581149903818e-08, "logits/chosen": 14.730087280273438, "logits/rejected": 14.480015754699707, "logps/chosen": -4.745375156402588, "logps/rejected": -4.427130222320557, "loss": 4.1635, "rewards/accuracies": 0.25, "rewards/chosen": -47.45375061035156, "rewards/margins": -3.1824493408203125, "rewards/rejected": -44.27130126953125, "step": 6504 }, { "epoch": 0.8857570806100218, "grad_norm": 42.76798440244562, "learning_rate": 3.1391932589247727e-08, "logits/chosen": 15.125001907348633, "logits/rejected": 14.81982421875, "logps/chosen": -4.886661052703857, "logps/rejected": -4.559961318969727, "loss": 4.1075, "rewards/accuracies": 0.25, "rewards/chosen": -48.86661148071289, "rewards/margins": -3.267001152038574, "rewards/rejected": -45.599609375, "step": 6505 }, { "epoch": 0.8858932461873639, "grad_norm": 45.2404393247132, "learning_rate": 3.131813696949699e-08, "logits/chosen": 14.852771759033203, "logits/rejected": 15.146718978881836, "logps/chosen": -4.825152397155762, "logps/rejected": -4.826972961425781, "loss": 3.1866, "rewards/accuracies": 0.5, "rewards/chosen": -48.25152587890625, "rewards/margins": 0.018204689025878906, "rewards/rejected": -48.26972961425781, "step": 6506 }, { "epoch": 0.8860294117647058, "grad_norm": 43.72266151004736, "learning_rate": 3.124442465646075e-08, "logits/chosen": 14.397920608520508, "logits/rejected": 15.154706001281738, "logps/chosen": -4.710071563720703, "logps/rejected": -5.054646968841553, "loss": 4.2482, "rewards/accuracies": 1.0, "rewards/chosen": -47.10071563720703, "rewards/margins": 3.445754051208496, "rewards/rejected": -50.546470642089844, "step": 6507 }, { "epoch": 0.8861655773420479, "grad_norm": 47.59874443937234, "learning_rate": 3.1170795666795036e-08, "logits/chosen": 15.240253448486328, "logits/rejected": 16.135223388671875, "logps/chosen": -4.769532680511475, "logps/rejected": -5.384277820587158, "loss": 3.9184, "rewards/accuracies": 1.0, "rewards/chosen": -47.69532775878906, "rewards/margins": 6.147454261779785, "rewards/rejected": -53.84278106689453, "step": 6508 }, { "epoch": 0.88630174291939, "grad_norm": 40.40838048430186, "learning_rate": 3.10972500171367e-08, "logits/chosen": 13.84785270690918, "logits/rejected": 14.389228820800781, "logps/chosen": -4.5732035636901855, "logps/rejected": -4.781741142272949, "loss": 3.6628, "rewards/accuracies": 0.75, "rewards/chosen": -45.732032775878906, "rewards/margins": 2.0853805541992188, "rewards/rejected": -47.817413330078125, "step": 6509 }, { "epoch": 0.886437908496732, "grad_norm": 43.14680474018495, "learning_rate": 3.10237877241041e-08, "logits/chosen": 14.904081344604492, "logits/rejected": 15.48947811126709, "logps/chosen": -4.665904998779297, "logps/rejected": -4.912626266479492, "loss": 4.055, "rewards/accuracies": 0.75, "rewards/chosen": -46.659053802490234, "rewards/margins": 2.4672060012817383, "rewards/rejected": -49.126258850097656, "step": 6510 }, { "epoch": 0.8865740740740741, "grad_norm": 39.7052067229587, "learning_rate": 3.0950408804296754e-08, "logits/chosen": 13.062255859375, "logits/rejected": 14.497112274169922, "logps/chosen": -4.357197284698486, "logps/rejected": -4.857753753662109, "loss": 3.472, "rewards/accuracies": 0.75, "rewards/chosen": -43.57197189331055, "rewards/margins": 5.005566596984863, "rewards/rejected": -48.577537536621094, "step": 6511 }, { "epoch": 0.8867102396514162, "grad_norm": 48.60740180277183, "learning_rate": 3.0877113274295055e-08, "logits/chosen": 14.156336784362793, "logits/rejected": 14.673973083496094, "logps/chosen": -4.444496154785156, "logps/rejected": -4.852148056030273, "loss": 4.3114, "rewards/accuracies": 1.0, "rewards/chosen": -44.44496154785156, "rewards/margins": 4.076516151428223, "rewards/rejected": -48.52147674560547, "step": 6512 }, { "epoch": 0.8868464052287581, "grad_norm": 42.8863460136087, "learning_rate": 3.0803901150660805e-08, "logits/chosen": 14.229606628417969, "logits/rejected": 13.836257934570312, "logps/chosen": -4.574427127838135, "logps/rejected": -4.839954853057861, "loss": 4.1378, "rewards/accuracies": 0.75, "rewards/chosen": -45.74427032470703, "rewards/margins": 2.6552743911743164, "rewards/rejected": -48.3995475769043, "step": 6513 }, { "epoch": 0.8869825708061002, "grad_norm": 40.67415302135693, "learning_rate": 3.073077244993696e-08, "logits/chosen": 14.235902786254883, "logits/rejected": 15.267007827758789, "logps/chosen": -4.594359397888184, "logps/rejected": -4.860743045806885, "loss": 3.5368, "rewards/accuracies": 0.75, "rewards/chosen": -45.94359588623047, "rewards/margins": 2.6638336181640625, "rewards/rejected": -48.60742950439453, "step": 6514 }, { "epoch": 0.8871187363834423, "grad_norm": 51.972519006233426, "learning_rate": 3.065772718864745e-08, "logits/chosen": 13.801729202270508, "logits/rejected": 14.112140655517578, "logps/chosen": -4.366376876831055, "logps/rejected": -4.63001823425293, "loss": 3.6426, "rewards/accuracies": 0.75, "rewards/chosen": -43.66376876831055, "rewards/margins": 2.63641357421875, "rewards/rejected": -46.30018615722656, "step": 6515 }, { "epoch": 0.8872549019607843, "grad_norm": 42.20691199157864, "learning_rate": 3.058476538329748e-08, "logits/chosen": 14.484088897705078, "logits/rejected": 14.3545503616333, "logps/chosen": -4.71586275100708, "logps/rejected": -4.693017959594727, "loss": 3.7706, "rewards/accuracies": 0.5, "rewards/chosen": -47.15863037109375, "rewards/margins": -0.22844886779785156, "rewards/rejected": -46.93018341064453, "step": 6516 }, { "epoch": 0.8873910675381264, "grad_norm": 39.03919829215692, "learning_rate": 3.051188705037346e-08, "logits/chosen": 14.427352905273438, "logits/rejected": 15.383798599243164, "logps/chosen": -4.557003974914551, "logps/rejected": -5.164378643035889, "loss": 3.2336, "rewards/accuracies": 1.0, "rewards/chosen": -45.57004165649414, "rewards/margins": 6.073744773864746, "rewards/rejected": -51.64378356933594, "step": 6517 }, { "epoch": 0.8875272331154684, "grad_norm": 53.319655020771904, "learning_rate": 3.043909220634271e-08, "logits/chosen": 13.155858993530273, "logits/rejected": 14.093515396118164, "logps/chosen": -4.322413444519043, "logps/rejected": -4.512382507324219, "loss": 3.9761, "rewards/accuracies": 0.75, "rewards/chosen": -43.2241325378418, "rewards/margins": 1.8996963500976562, "rewards/rejected": -45.12382888793945, "step": 6518 }, { "epoch": 0.8876633986928104, "grad_norm": 44.161380902202005, "learning_rate": 3.036638086765388e-08, "logits/chosen": 15.157207489013672, "logits/rejected": 14.81922721862793, "logps/chosen": -4.474312782287598, "logps/rejected": -5.119687557220459, "loss": 3.9007, "rewards/accuracies": 1.0, "rewards/chosen": -44.743125915527344, "rewards/margins": 6.453751564025879, "rewards/rejected": -51.196876525878906, "step": 6519 }, { "epoch": 0.8877995642701525, "grad_norm": 43.964455494646934, "learning_rate": 3.029375305073678e-08, "logits/chosen": 14.784942626953125, "logits/rejected": 13.82758903503418, "logps/chosen": -4.429802894592285, "logps/rejected": -4.129275798797607, "loss": 4.1908, "rewards/accuracies": 0.25, "rewards/chosen": -44.29802703857422, "rewards/margins": -3.005270004272461, "rewards/rejected": -41.29275894165039, "step": 6520 }, { "epoch": 0.8879357298474946, "grad_norm": 45.389255081775744, "learning_rate": 3.022120877200218e-08, "logits/chosen": 14.86091136932373, "logits/rejected": 14.837562561035156, "logps/chosen": -4.756810188293457, "logps/rejected": -4.770349502563477, "loss": 3.5191, "rewards/accuracies": 0.5, "rewards/chosen": -47.56809997558594, "rewards/margins": 0.1353912353515625, "rewards/rejected": -47.7034912109375, "step": 6521 }, { "epoch": 0.8880718954248366, "grad_norm": 41.43170852418786, "learning_rate": 3.014874804784204e-08, "logits/chosen": 14.222824096679688, "logits/rejected": 14.664876937866211, "logps/chosen": -4.483436584472656, "logps/rejected": -5.02998161315918, "loss": 4.0867, "rewards/accuracies": 1.0, "rewards/chosen": -44.8343620300293, "rewards/margins": 5.465452194213867, "rewards/rejected": -50.29981231689453, "step": 6522 }, { "epoch": 0.8882080610021786, "grad_norm": 45.26410852877724, "learning_rate": 3.007637089462958e-08, "logits/chosen": 14.024113655090332, "logits/rejected": 14.583223342895508, "logps/chosen": -4.672956466674805, "logps/rejected": -4.805482387542725, "loss": 4.1215, "rewards/accuracies": 0.75, "rewards/chosen": -46.72956085205078, "rewards/margins": 1.3252601623535156, "rewards/rejected": -48.05482482910156, "step": 6523 }, { "epoch": 0.8883442265795207, "grad_norm": 39.286780851268105, "learning_rate": 3.000407732871886e-08, "logits/chosen": 14.769009590148926, "logits/rejected": 15.247848510742188, "logps/chosen": -4.702369689941406, "logps/rejected": -4.881096839904785, "loss": 3.9964, "rewards/accuracies": 0.5, "rewards/chosen": -47.02369689941406, "rewards/margins": 1.7872724533081055, "rewards/rejected": -48.81096649169922, "step": 6524 }, { "epoch": 0.8884803921568627, "grad_norm": 43.15528369767623, "learning_rate": 2.9931867366445306e-08, "logits/chosen": 14.79001235961914, "logits/rejected": 14.255219459533691, "logps/chosen": -4.723740100860596, "logps/rejected": -4.49666690826416, "loss": 4.0558, "rewards/accuracies": 0.0, "rewards/chosen": -47.23740005493164, "rewards/margins": -2.270730972290039, "rewards/rejected": -44.966670989990234, "step": 6525 }, { "epoch": 0.8886165577342048, "grad_norm": 46.57121953943031, "learning_rate": 2.985974102412538e-08, "logits/chosen": 14.560258865356445, "logits/rejected": 14.716794967651367, "logps/chosen": -4.472012519836426, "logps/rejected": -4.844395637512207, "loss": 4.5409, "rewards/accuracies": 0.5, "rewards/chosen": -44.720123291015625, "rewards/margins": 3.7238330841064453, "rewards/rejected": -48.44395446777344, "step": 6526 }, { "epoch": 0.8887527233115469, "grad_norm": 41.77981008250386, "learning_rate": 2.97876983180565e-08, "logits/chosen": 14.083147048950195, "logits/rejected": 15.670121192932129, "logps/chosen": -4.558938026428223, "logps/rejected": -5.054705619812012, "loss": 3.6424, "rewards/accuracies": 1.0, "rewards/chosen": -45.589378356933594, "rewards/margins": 4.95767879486084, "rewards/rejected": -50.54705810546875, "step": 6527 }, { "epoch": 0.8888888888888888, "grad_norm": 38.058633432946756, "learning_rate": 2.9715739264517447e-08, "logits/chosen": 14.330047607421875, "logits/rejected": 15.063133239746094, "logps/chosen": -4.491429805755615, "logps/rejected": -4.757390975952148, "loss": 3.6147, "rewards/accuracies": 1.0, "rewards/chosen": -44.91429901123047, "rewards/margins": 2.65960693359375, "rewards/rejected": -47.57390594482422, "step": 6528 }, { "epoch": 0.8890250544662309, "grad_norm": 43.08704482303595, "learning_rate": 2.964386387976794e-08, "logits/chosen": 14.417969703674316, "logits/rejected": 14.603568077087402, "logps/chosen": -4.448860168457031, "logps/rejected": -4.704768180847168, "loss": 4.0917, "rewards/accuracies": 1.0, "rewards/chosen": -44.48860549926758, "rewards/margins": 2.5590744018554688, "rewards/rejected": -47.04768371582031, "step": 6529 }, { "epoch": 0.889161220043573, "grad_norm": 45.89484231499072, "learning_rate": 2.9572072180048713e-08, "logits/chosen": 14.22073745727539, "logits/rejected": 14.440317153930664, "logps/chosen": -4.536288261413574, "logps/rejected": -4.631163120269775, "loss": 3.8522, "rewards/accuracies": 0.5, "rewards/chosen": -45.362884521484375, "rewards/margins": 0.9487466812133789, "rewards/rejected": -46.31163024902344, "step": 6530 }, { "epoch": 0.889297385620915, "grad_norm": 47.32861130909843, "learning_rate": 2.950036418158177e-08, "logits/chosen": 14.118019104003906, "logits/rejected": 14.405445098876953, "logps/chosen": -4.245192527770996, "logps/rejected": -4.449519157409668, "loss": 4.0525, "rewards/accuracies": 0.75, "rewards/chosen": -42.45192337036133, "rewards/margins": 2.0432662963867188, "rewards/rejected": -44.49518966674805, "step": 6531 }, { "epoch": 0.8894335511982571, "grad_norm": 41.224674208461416, "learning_rate": 2.94287399005702e-08, "logits/chosen": 14.011262893676758, "logits/rejected": 14.911115646362305, "logps/chosen": -4.433196067810059, "logps/rejected": -5.217355728149414, "loss": 4.0036, "rewards/accuracies": 1.0, "rewards/chosen": -44.33195877075195, "rewards/margins": 7.841597557067871, "rewards/rejected": -52.17355728149414, "step": 6532 }, { "epoch": 0.8895697167755992, "grad_norm": 42.17217524578748, "learning_rate": 2.9357199353197936e-08, "logits/chosen": 14.846839904785156, "logits/rejected": 15.181829452514648, "logps/chosen": -4.732494354248047, "logps/rejected": -5.005756378173828, "loss": 3.7662, "rewards/accuracies": 1.0, "rewards/chosen": -47.32494354248047, "rewards/margins": 2.7326202392578125, "rewards/rejected": -50.05756378173828, "step": 6533 }, { "epoch": 0.8897058823529411, "grad_norm": 42.93077190950949, "learning_rate": 2.9285742555630233e-08, "logits/chosen": 14.920097351074219, "logits/rejected": 14.076130867004395, "logps/chosen": -4.804957866668701, "logps/rejected": -4.442132949829102, "loss": 3.9026, "rewards/accuracies": 0.0, "rewards/chosen": -48.04957580566406, "rewards/margins": -3.628244400024414, "rewards/rejected": -44.42133331298828, "step": 6534 }, { "epoch": 0.8898420479302832, "grad_norm": 46.09786756129358, "learning_rate": 2.921436952401346e-08, "logits/chosen": 14.976652145385742, "logits/rejected": 14.797658920288086, "logps/chosen": -4.980266571044922, "logps/rejected": -5.207162857055664, "loss": 4.2902, "rewards/accuracies": 0.75, "rewards/chosen": -49.80266571044922, "rewards/margins": 2.2689638137817383, "rewards/rejected": -52.071632385253906, "step": 6535 }, { "epoch": 0.8899782135076253, "grad_norm": 42.642180679947, "learning_rate": 2.9143080274474717e-08, "logits/chosen": 14.47365951538086, "logits/rejected": 14.120927810668945, "logps/chosen": -4.578913688659668, "logps/rejected": -4.521067142486572, "loss": 3.8674, "rewards/accuracies": 0.5, "rewards/chosen": -45.78913879394531, "rewards/margins": -0.5784683227539062, "rewards/rejected": -45.210670471191406, "step": 6536 }, { "epoch": 0.8901143790849673, "grad_norm": 44.704698276727605, "learning_rate": 2.9071874823122587e-08, "logits/chosen": 13.68294906616211, "logits/rejected": 14.942068099975586, "logps/chosen": -4.5224289894104, "logps/rejected": -4.993646621704102, "loss": 4.3661, "rewards/accuracies": 1.0, "rewards/chosen": -45.22428894042969, "rewards/margins": 4.712174415588379, "rewards/rejected": -49.936466217041016, "step": 6537 }, { "epoch": 0.8902505446623094, "grad_norm": 47.312062963563875, "learning_rate": 2.9000753186046466e-08, "logits/chosen": 14.23097038269043, "logits/rejected": 14.832452774047852, "logps/chosen": -4.608640670776367, "logps/rejected": -4.703634262084961, "loss": 4.2101, "rewards/accuracies": 0.5, "rewards/chosen": -46.086402893066406, "rewards/margins": 0.9499406814575195, "rewards/rejected": -47.036346435546875, "step": 6538 }, { "epoch": 0.8903867102396514, "grad_norm": 46.025519597803616, "learning_rate": 2.8929715379316832e-08, "logits/chosen": 14.672126770019531, "logits/rejected": 15.761420249938965, "logps/chosen": -4.445803642272949, "logps/rejected": -4.873211860656738, "loss": 4.3996, "rewards/accuracies": 0.75, "rewards/chosen": -44.458038330078125, "rewards/margins": 4.274079322814941, "rewards/rejected": -48.73211669921875, "step": 6539 }, { "epoch": 0.8905228758169934, "grad_norm": 42.90468452063866, "learning_rate": 2.8858761418985334e-08, "logits/chosen": 15.479925155639648, "logits/rejected": 15.065272331237793, "logps/chosen": -4.8613433837890625, "logps/rejected": -4.599819183349609, "loss": 4.3062, "rewards/accuracies": 0.5, "rewards/chosen": -48.613433837890625, "rewards/margins": -2.6152420043945312, "rewards/rejected": -45.998191833496094, "step": 6540 }, { "epoch": 0.8906590413943355, "grad_norm": 44.09799705199716, "learning_rate": 2.8787891321084612e-08, "logits/chosen": 14.82752799987793, "logits/rejected": 14.77133560180664, "logps/chosen": -4.69389533996582, "logps/rejected": -4.68951416015625, "loss": 4.0867, "rewards/accuracies": 0.25, "rewards/chosen": -46.93894958496094, "rewards/margins": -0.04380989074707031, "rewards/rejected": -46.8951416015625, "step": 6541 }, { "epoch": 0.8907952069716776, "grad_norm": 50.82573568275497, "learning_rate": 2.871710510162826e-08, "logits/chosen": 14.262765884399414, "logits/rejected": 14.79769229888916, "logps/chosen": -4.414350509643555, "logps/rejected": -4.8659515380859375, "loss": 4.0334, "rewards/accuracies": 1.0, "rewards/chosen": -44.14350509643555, "rewards/margins": 4.5160112380981445, "rewards/rejected": -48.659515380859375, "step": 6542 }, { "epoch": 0.8909313725490197, "grad_norm": 41.22216315860043, "learning_rate": 2.8646402776611078e-08, "logits/chosen": 14.835981369018555, "logits/rejected": 14.858800888061523, "logps/chosen": -4.977211952209473, "logps/rejected": -5.037518501281738, "loss": 3.8926, "rewards/accuracies": 0.75, "rewards/chosen": -49.772117614746094, "rewards/margins": 0.6030664443969727, "rewards/rejected": -50.37518310546875, "step": 6543 }, { "epoch": 0.8910675381263616, "grad_norm": 39.92050371088288, "learning_rate": 2.857578436200887e-08, "logits/chosen": 14.883134841918945, "logits/rejected": 15.081969261169434, "logps/chosen": -4.73288631439209, "logps/rejected": -4.998301029205322, "loss": 4.3177, "rewards/accuracies": 0.75, "rewards/chosen": -47.32886505126953, "rewards/margins": 2.654147148132324, "rewards/rejected": -49.98301696777344, "step": 6544 }, { "epoch": 0.8912037037037037, "grad_norm": 42.54328111613652, "learning_rate": 2.850524987377838e-08, "logits/chosen": 14.908931732177734, "logits/rejected": 15.57131576538086, "logps/chosen": -4.773127555847168, "logps/rejected": -5.20202112197876, "loss": 3.5253, "rewards/accuracies": 0.75, "rewards/chosen": -47.73127746582031, "rewards/margins": 4.288934707641602, "rewards/rejected": -52.02021026611328, "step": 6545 }, { "epoch": 0.8913398692810458, "grad_norm": 48.74715623292887, "learning_rate": 2.8434799327857438e-08, "logits/chosen": 14.188970565795898, "logits/rejected": 14.324066162109375, "logps/chosen": -4.532864570617676, "logps/rejected": -4.721199989318848, "loss": 3.8773, "rewards/accuracies": 0.5, "rewards/chosen": -45.32864761352539, "rewards/margins": 1.8833503723144531, "rewards/rejected": -47.211997985839844, "step": 6546 }, { "epoch": 0.8914760348583878, "grad_norm": 46.82909151855678, "learning_rate": 2.836443274016509e-08, "logits/chosen": 13.885762214660645, "logits/rejected": 14.597662925720215, "logps/chosen": -4.565786361694336, "logps/rejected": -5.007596969604492, "loss": 4.3763, "rewards/accuracies": 1.0, "rewards/chosen": -45.657867431640625, "rewards/margins": 4.418102264404297, "rewards/rejected": -50.075965881347656, "step": 6547 }, { "epoch": 0.8916122004357299, "grad_norm": 44.72024276174794, "learning_rate": 2.8294150126601058e-08, "logits/chosen": 15.759336471557617, "logits/rejected": 14.897823333740234, "logps/chosen": -4.799797058105469, "logps/rejected": -4.834068298339844, "loss": 4.2191, "rewards/accuracies": 0.5, "rewards/chosen": -47.99797058105469, "rewards/margins": 0.3427143096923828, "rewards/rejected": -48.34068298339844, "step": 6548 }, { "epoch": 0.891748366013072, "grad_norm": 39.25551139017163, "learning_rate": 2.822395150304633e-08, "logits/chosen": 13.700119018554688, "logits/rejected": 13.98355770111084, "logps/chosen": -4.429030418395996, "logps/rejected": -4.902409076690674, "loss": 3.5582, "rewards/accuracies": 1.0, "rewards/chosen": -44.290306091308594, "rewards/margins": 4.733785629272461, "rewards/rejected": -49.02408981323242, "step": 6549 }, { "epoch": 0.8918845315904139, "grad_norm": 47.85168566361548, "learning_rate": 2.815383688536297e-08, "logits/chosen": 14.479780197143555, "logits/rejected": 14.720808029174805, "logps/chosen": -4.639355182647705, "logps/rejected": -4.9894022941589355, "loss": 4.0009, "rewards/accuracies": 0.75, "rewards/chosen": -46.3935546875, "rewards/margins": 3.500472068786621, "rewards/rejected": -49.89402389526367, "step": 6550 }, { "epoch": 0.892020697167756, "grad_norm": 40.20390570117491, "learning_rate": 2.808380628939382e-08, "logits/chosen": 14.808664321899414, "logits/rejected": 14.440749168395996, "logps/chosen": -4.578726768493652, "logps/rejected": -4.3862152099609375, "loss": 4.402, "rewards/accuracies": 0.5, "rewards/chosen": -45.78727340698242, "rewards/margins": -1.925119400024414, "rewards/rejected": -43.862152099609375, "step": 6551 }, { "epoch": 0.8921568627450981, "grad_norm": 42.998804847870474, "learning_rate": 2.801385973096293e-08, "logits/chosen": 14.100672721862793, "logits/rejected": 14.718347549438477, "logps/chosen": -4.579357147216797, "logps/rejected": -4.964768409729004, "loss": 3.4966, "rewards/accuracies": 0.75, "rewards/chosen": -45.79357147216797, "rewards/margins": 3.8541154861450195, "rewards/rejected": -49.64768981933594, "step": 6552 }, { "epoch": 0.8922930283224401, "grad_norm": 43.506581947498454, "learning_rate": 2.794399722587535e-08, "logits/chosen": 14.667903900146484, "logits/rejected": 15.222501754760742, "logps/chosen": -4.655949592590332, "logps/rejected": -4.836057662963867, "loss": 3.9581, "rewards/accuracies": 0.5, "rewards/chosen": -46.55949401855469, "rewards/margins": 1.8010826110839844, "rewards/rejected": -48.36057662963867, "step": 6553 }, { "epoch": 0.8924291938997821, "grad_norm": 44.42761762795968, "learning_rate": 2.787421878991698e-08, "logits/chosen": 14.253257751464844, "logits/rejected": 15.052058219909668, "logps/chosen": -4.762690544128418, "logps/rejected": -4.8941192626953125, "loss": 4.7473, "rewards/accuracies": 0.5, "rewards/chosen": -47.62690734863281, "rewards/margins": 1.3142871856689453, "rewards/rejected": -48.941192626953125, "step": 6554 }, { "epoch": 0.8925653594771242, "grad_norm": 41.23133630394535, "learning_rate": 2.7804524438854947e-08, "logits/chosen": 14.003382682800293, "logits/rejected": 14.316011428833008, "logps/chosen": -4.496988296508789, "logps/rejected": -4.834866046905518, "loss": 3.8218, "rewards/accuracies": 1.0, "rewards/chosen": -44.96988296508789, "rewards/margins": 3.3787784576416016, "rewards/rejected": -48.348663330078125, "step": 6555 }, { "epoch": 0.8927015250544662, "grad_norm": 40.45916489780261, "learning_rate": 2.773491418843723e-08, "logits/chosen": 14.4097900390625, "logits/rejected": 14.824409484863281, "logps/chosen": -4.843506336212158, "logps/rejected": -4.874680519104004, "loss": 3.9722, "rewards/accuracies": 0.75, "rewards/chosen": -48.435062408447266, "rewards/margins": 0.31174278259277344, "rewards/rejected": -48.746803283691406, "step": 6556 }, { "epoch": 0.8928376906318083, "grad_norm": 39.314248714794154, "learning_rate": 2.7665388054392758e-08, "logits/chosen": 14.096378326416016, "logits/rejected": 14.537426948547363, "logps/chosen": -4.283380508422852, "logps/rejected": -4.69374418258667, "loss": 3.9677, "rewards/accuracies": 1.0, "rewards/chosen": -42.83380889892578, "rewards/margins": 4.103633880615234, "rewards/rejected": -46.93743896484375, "step": 6557 }, { "epoch": 0.8929738562091504, "grad_norm": 51.162183805577094, "learning_rate": 2.7595946052431628e-08, "logits/chosen": 14.251462936401367, "logits/rejected": 14.558479309082031, "logps/chosen": -4.682110786437988, "logps/rejected": -4.6943583488464355, "loss": 4.2108, "rewards/accuracies": 0.5, "rewards/chosen": -46.821109771728516, "rewards/margins": 0.12247180938720703, "rewards/rejected": -46.943580627441406, "step": 6558 }, { "epoch": 0.8931100217864923, "grad_norm": 42.95439350465689, "learning_rate": 2.752658819824485e-08, "logits/chosen": 14.176885604858398, "logits/rejected": 14.914576530456543, "logps/chosen": -4.716659069061279, "logps/rejected": -4.992732048034668, "loss": 3.9426, "rewards/accuracies": 1.0, "rewards/chosen": -47.16659164428711, "rewards/margins": 2.7607297897338867, "rewards/rejected": -49.92732238769531, "step": 6559 }, { "epoch": 0.8932461873638344, "grad_norm": 34.7759441099903, "learning_rate": 2.7457314507504326e-08, "logits/chosen": 15.620574951171875, "logits/rejected": 15.395759582519531, "logps/chosen": -5.016655921936035, "logps/rejected": -5.214810371398926, "loss": 3.5523, "rewards/accuracies": 0.75, "rewards/chosen": -50.16655731201172, "rewards/margins": 1.9815483093261719, "rewards/rejected": -52.14810562133789, "step": 6560 }, { "epoch": 0.8933823529411765, "grad_norm": 49.182168281459305, "learning_rate": 2.738812499586305e-08, "logits/chosen": 13.939793586730957, "logits/rejected": 14.975593566894531, "logps/chosen": -4.191817283630371, "logps/rejected": -4.846192359924316, "loss": 3.9068, "rewards/accuracies": 1.0, "rewards/chosen": -41.91817092895508, "rewards/margins": 6.543750762939453, "rewards/rejected": -48.46192169189453, "step": 6561 }, { "epoch": 0.8935185185185185, "grad_norm": 47.55665505106208, "learning_rate": 2.7319019678955046e-08, "logits/chosen": 15.279884338378906, "logits/rejected": 14.807687759399414, "logps/chosen": -4.694601058959961, "logps/rejected": -4.768943786621094, "loss": 3.9539, "rewards/accuracies": 0.25, "rewards/chosen": -46.946006774902344, "rewards/margins": 0.7434253692626953, "rewards/rejected": -47.68943405151367, "step": 6562 }, { "epoch": 0.8936546840958606, "grad_norm": 40.73159614771325, "learning_rate": 2.7249998572395073e-08, "logits/chosen": 14.776139259338379, "logits/rejected": 14.684370040893555, "logps/chosen": -5.149169445037842, "logps/rejected": -5.146333694458008, "loss": 3.9672, "rewards/accuracies": 0.5, "rewards/chosen": -51.49169158935547, "rewards/margins": -0.028359413146972656, "rewards/rejected": -51.46333312988281, "step": 6563 }, { "epoch": 0.8937908496732027, "grad_norm": 41.65162026980432, "learning_rate": 2.718106169177914e-08, "logits/chosen": 14.835015296936035, "logits/rejected": 14.087282180786133, "logps/chosen": -4.507628917694092, "logps/rejected": -4.593357086181641, "loss": 3.9412, "rewards/accuracies": 0.75, "rewards/chosen": -45.07628631591797, "rewards/margins": 0.8572826385498047, "rewards/rejected": -45.933570861816406, "step": 6564 }, { "epoch": 0.8939270152505446, "grad_norm": 42.81258367112067, "learning_rate": 2.711220905268412e-08, "logits/chosen": 14.099512100219727, "logits/rejected": 14.292187690734863, "logps/chosen": -4.746670722961426, "logps/rejected": -4.953699111938477, "loss": 3.8207, "rewards/accuracies": 0.75, "rewards/chosen": -47.46670913696289, "rewards/margins": 2.070280075073242, "rewards/rejected": -49.5369873046875, "step": 6565 }, { "epoch": 0.8940631808278867, "grad_norm": 48.697826842729675, "learning_rate": 2.7043440670667705e-08, "logits/chosen": 15.71395492553711, "logits/rejected": 15.196203231811523, "logps/chosen": -4.832036972045898, "logps/rejected": -4.596547603607178, "loss": 3.952, "rewards/accuracies": 0.5, "rewards/chosen": -48.32036590576172, "rewards/margins": -2.3548898696899414, "rewards/rejected": -45.965476989746094, "step": 6566 }, { "epoch": 0.8941993464052288, "grad_norm": 42.973739096224314, "learning_rate": 2.6974756561268754e-08, "logits/chosen": 13.778997421264648, "logits/rejected": 14.17897891998291, "logps/chosen": -4.2922163009643555, "logps/rejected": -4.569117546081543, "loss": 3.7323, "rewards/accuracies": 0.75, "rewards/chosen": -42.922157287597656, "rewards/margins": 2.769017219543457, "rewards/rejected": -45.69117736816406, "step": 6567 }, { "epoch": 0.8943355119825708, "grad_norm": 41.923027581235544, "learning_rate": 2.6906156740007115e-08, "logits/chosen": 14.751784324645996, "logits/rejected": 14.621479034423828, "logps/chosen": -4.5842390060424805, "logps/rejected": -4.820810794830322, "loss": 3.6256, "rewards/accuracies": 1.0, "rewards/chosen": -45.84239196777344, "rewards/margins": 2.365715980529785, "rewards/rejected": -48.208106994628906, "step": 6568 }, { "epoch": 0.8944716775599129, "grad_norm": 50.114846238408795, "learning_rate": 2.683764122238328e-08, "logits/chosen": 13.641611099243164, "logits/rejected": 14.408815383911133, "logps/chosen": -4.439493179321289, "logps/rejected": -4.6988019943237305, "loss": 4.2009, "rewards/accuracies": 1.0, "rewards/chosen": -44.39493179321289, "rewards/margins": 2.593088150024414, "rewards/rejected": -46.98802185058594, "step": 6569 }, { "epoch": 0.8946078431372549, "grad_norm": 45.771004003694514, "learning_rate": 2.676921002387904e-08, "logits/chosen": 13.903179168701172, "logits/rejected": 14.023199081420898, "logps/chosen": -4.02486515045166, "logps/rejected": -4.3189287185668945, "loss": 3.9428, "rewards/accuracies": 0.75, "rewards/chosen": -40.24864959716797, "rewards/margins": 2.9406356811523438, "rewards/rejected": -43.18928527832031, "step": 6570 }, { "epoch": 0.8947440087145969, "grad_norm": 41.01463556056628, "learning_rate": 2.670086315995701e-08, "logits/chosen": 14.109639167785645, "logits/rejected": 14.511480331420898, "logps/chosen": -4.2906951904296875, "logps/rejected": -4.364314079284668, "loss": 3.8846, "rewards/accuracies": 0.5, "rewards/chosen": -42.906951904296875, "rewards/margins": 0.7361917495727539, "rewards/rejected": -43.64314270019531, "step": 6571 }, { "epoch": 0.894880174291939, "grad_norm": 45.93016989293542, "learning_rate": 2.6632600646060566e-08, "logits/chosen": 15.04213809967041, "logits/rejected": 15.229743003845215, "logps/chosen": -4.790735244750977, "logps/rejected": -5.1327033042907715, "loss": 4.3496, "rewards/accuracies": 0.75, "rewards/chosen": -47.907352447509766, "rewards/margins": 3.419677734375, "rewards/rejected": -51.327030181884766, "step": 6572 }, { "epoch": 0.8950163398692811, "grad_norm": 44.5650584043284, "learning_rate": 2.6564422497614348e-08, "logits/chosen": 13.940345764160156, "logits/rejected": 14.556112289428711, "logps/chosen": -4.626977920532227, "logps/rejected": -4.750796318054199, "loss": 4.4955, "rewards/accuracies": 0.75, "rewards/chosen": -46.26978302001953, "rewards/margins": 1.2381839752197266, "rewards/rejected": -47.507965087890625, "step": 6573 }, { "epoch": 0.8951525054466231, "grad_norm": 42.72609642226264, "learning_rate": 2.6496328730023766e-08, "logits/chosen": 14.568811416625977, "logits/rejected": 14.75271987915039, "logps/chosen": -4.299591064453125, "logps/rejected": -4.361888885498047, "loss": 3.9936, "rewards/accuracies": 0.5, "rewards/chosen": -42.99591064453125, "rewards/margins": 0.6229791641235352, "rewards/rejected": -43.61888885498047, "step": 6574 }, { "epoch": 0.8952886710239651, "grad_norm": 41.33392840214374, "learning_rate": 2.6428319358675045e-08, "logits/chosen": 14.480035781860352, "logits/rejected": 14.47778606414795, "logps/chosen": -4.358654975891113, "logps/rejected": -4.541121482849121, "loss": 3.6509, "rewards/accuracies": 0.5, "rewards/chosen": -43.586551666259766, "rewards/margins": 1.8246631622314453, "rewards/rejected": -45.411216735839844, "step": 6575 }, { "epoch": 0.8954248366013072, "grad_norm": 42.60821209538101, "learning_rate": 2.6360394398935537e-08, "logits/chosen": 14.140027046203613, "logits/rejected": 14.99643611907959, "logps/chosen": -4.265511512756348, "logps/rejected": -4.930877208709717, "loss": 4.057, "rewards/accuracies": 1.0, "rewards/chosen": -42.655113220214844, "rewards/margins": 6.653656005859375, "rewards/rejected": -49.30876922607422, "step": 6576 }, { "epoch": 0.8955610021786492, "grad_norm": 40.35851819846573, "learning_rate": 2.62925538661535e-08, "logits/chosen": 14.358802795410156, "logits/rejected": 14.836260795593262, "logps/chosen": -4.885253429412842, "logps/rejected": -4.9848103523254395, "loss": 4.0946, "rewards/accuracies": 0.75, "rewards/chosen": -48.85253143310547, "rewards/margins": 0.9955682754516602, "rewards/rejected": -49.84810256958008, "step": 6577 }, { "epoch": 0.8956971677559913, "grad_norm": 43.116589175317976, "learning_rate": 2.6224797775657957e-08, "logits/chosen": 14.84131908416748, "logits/rejected": 14.825276374816895, "logps/chosen": -4.639319896697998, "logps/rejected": -4.747673034667969, "loss": 3.705, "rewards/accuracies": 0.5, "rewards/chosen": -46.39319610595703, "rewards/margins": 1.0835323333740234, "rewards/rejected": -47.47673034667969, "step": 6578 }, { "epoch": 0.8958333333333334, "grad_norm": 57.67020207965758, "learning_rate": 2.6157126142759023e-08, "logits/chosen": 14.821390151977539, "logits/rejected": 14.810379028320312, "logps/chosen": -4.950883865356445, "logps/rejected": -4.853640079498291, "loss": 3.5278, "rewards/accuracies": 0.5, "rewards/chosen": -49.50883865356445, "rewards/margins": -0.9724416732788086, "rewards/rejected": -48.536399841308594, "step": 6579 }, { "epoch": 0.8959694989106753, "grad_norm": 40.45175641576069, "learning_rate": 2.6089538982747748e-08, "logits/chosen": 14.293622970581055, "logits/rejected": 15.33925724029541, "logps/chosen": -4.943133354187012, "logps/rejected": -4.998432159423828, "loss": 3.9664, "rewards/accuracies": 0.5, "rewards/chosen": -49.431331634521484, "rewards/margins": 0.5529928207397461, "rewards/rejected": -49.98432540893555, "step": 6580 }, { "epoch": 0.8961056644880174, "grad_norm": 46.577512393282376, "learning_rate": 2.6022036310895834e-08, "logits/chosen": 14.64176082611084, "logits/rejected": 14.861875534057617, "logps/chosen": -4.763521194458008, "logps/rejected": -4.681957721710205, "loss": 4.0169, "rewards/accuracies": 0.25, "rewards/chosen": -47.63520812988281, "rewards/margins": -0.8156347274780273, "rewards/rejected": -46.819576263427734, "step": 6581 }, { "epoch": 0.8962418300653595, "grad_norm": 41.237034871954904, "learning_rate": 2.5954618142456142e-08, "logits/chosen": 14.830507278442383, "logits/rejected": 14.590995788574219, "logps/chosen": -4.634166240692139, "logps/rejected": -4.761716842651367, "loss": 3.6891, "rewards/accuracies": 0.5, "rewards/chosen": -46.34165954589844, "rewards/margins": 1.2755041122436523, "rewards/rejected": -47.617164611816406, "step": 6582 }, { "epoch": 0.8963779956427015, "grad_norm": 41.01976548158545, "learning_rate": 2.5887284492662397e-08, "logits/chosen": 14.523824691772461, "logits/rejected": 14.849679946899414, "logps/chosen": -4.649500846862793, "logps/rejected": -4.933089733123779, "loss": 4.2251, "rewards/accuracies": 0.75, "rewards/chosen": -46.4950065612793, "rewards/margins": 2.8358917236328125, "rewards/rejected": -49.330894470214844, "step": 6583 }, { "epoch": 0.8965141612200436, "grad_norm": 38.84654649609442, "learning_rate": 2.5820035376729143e-08, "logits/chosen": 14.474481582641602, "logits/rejected": 13.915340423583984, "logps/chosen": -4.390017509460449, "logps/rejected": -4.549873352050781, "loss": 3.9745, "rewards/accuracies": 0.75, "rewards/chosen": -43.900177001953125, "rewards/margins": 1.598555564880371, "rewards/rejected": -45.49873352050781, "step": 6584 }, { "epoch": 0.8966503267973857, "grad_norm": 40.21926256996712, "learning_rate": 2.575287080985191e-08, "logits/chosen": 14.43305778503418, "logits/rejected": 14.62663459777832, "logps/chosen": -4.652863025665283, "logps/rejected": -4.860627174377441, "loss": 3.9013, "rewards/accuracies": 0.75, "rewards/chosen": -46.528629302978516, "rewards/margins": 2.0776405334472656, "rewards/rejected": -48.60626983642578, "step": 6585 }, { "epoch": 0.8967864923747276, "grad_norm": 41.35123839163675, "learning_rate": 2.5685790807207098e-08, "logits/chosen": 14.903141975402832, "logits/rejected": 14.546773910522461, "logps/chosen": -4.858029842376709, "logps/rejected": -4.730367660522461, "loss": 3.9564, "rewards/accuracies": 0.25, "rewards/chosen": -48.58029556274414, "rewards/margins": -1.2766246795654297, "rewards/rejected": -47.303672790527344, "step": 6586 }, { "epoch": 0.8969226579520697, "grad_norm": 42.107875174412115, "learning_rate": 2.5618795383951952e-08, "logits/chosen": 12.740978240966797, "logits/rejected": 13.441350936889648, "logps/chosen": -4.153252601623535, "logps/rejected": -4.6139702796936035, "loss": 3.876, "rewards/accuracies": 1.0, "rewards/chosen": -41.53252410888672, "rewards/margins": 4.607181549072266, "rewards/rejected": -46.139705657958984, "step": 6587 }, { "epoch": 0.8970588235294118, "grad_norm": 44.54949962067429, "learning_rate": 2.5551884555224633e-08, "logits/chosen": 14.368650436401367, "logits/rejected": 15.141088485717773, "logps/chosen": -4.578349590301514, "logps/rejected": -4.912569046020508, "loss": 4.2907, "rewards/accuracies": 0.75, "rewards/chosen": -45.78349685668945, "rewards/margins": 3.342193603515625, "rewards/rejected": -49.12569046020508, "step": 6588 }, { "epoch": 0.8971949891067538, "grad_norm": 39.99248760983401, "learning_rate": 2.5485058336144206e-08, "logits/chosen": 14.605381965637207, "logits/rejected": 14.413202285766602, "logps/chosen": -4.548278331756592, "logps/rejected": -4.676705360412598, "loss": 3.9872, "rewards/accuracies": 0.5, "rewards/chosen": -45.48278045654297, "rewards/margins": 1.2842721939086914, "rewards/rejected": -46.76705551147461, "step": 6589 }, { "epoch": 0.8973311546840959, "grad_norm": 48.80793041722023, "learning_rate": 2.5418316741810674e-08, "logits/chosen": 14.441923141479492, "logits/rejected": 15.357817649841309, "logps/chosen": -4.49782657623291, "logps/rejected": -5.117785930633545, "loss": 4.3683, "rewards/accuracies": 0.75, "rewards/chosen": -44.97826385498047, "rewards/margins": 6.199594497680664, "rewards/rejected": -51.1778564453125, "step": 6590 }, { "epoch": 0.8974673202614379, "grad_norm": 46.26950774359632, "learning_rate": 2.535165978730478e-08, "logits/chosen": 14.733354568481445, "logits/rejected": 14.86907958984375, "logps/chosen": -4.644752025604248, "logps/rejected": -4.596690654754639, "loss": 4.0924, "rewards/accuracies": 0.5, "rewards/chosen": -46.44751739501953, "rewards/margins": -0.4806098937988281, "rewards/rejected": -45.96691131591797, "step": 6591 }, { "epoch": 0.8976034858387799, "grad_norm": 48.40059568693438, "learning_rate": 2.5285087487688205e-08, "logits/chosen": 14.634366035461426, "logits/rejected": 14.838611602783203, "logps/chosen": -4.9117326736450195, "logps/rejected": -5.076918125152588, "loss": 4.361, "rewards/accuracies": 0.5, "rewards/chosen": -49.117332458496094, "rewards/margins": 1.651850700378418, "rewards/rejected": -50.76918029785156, "step": 6592 }, { "epoch": 0.897739651416122, "grad_norm": 50.36276606868991, "learning_rate": 2.5218599858003586e-08, "logits/chosen": 14.564204216003418, "logits/rejected": 13.618989944458008, "logps/chosen": -4.584262847900391, "logps/rejected": -4.623375415802002, "loss": 3.8786, "rewards/accuracies": 0.5, "rewards/chosen": -45.842628479003906, "rewards/margins": 0.3911266326904297, "rewards/rejected": -46.2337532043457, "step": 6593 }, { "epoch": 0.8978758169934641, "grad_norm": 42.01773081082473, "learning_rate": 2.515219691327428e-08, "logits/chosen": 13.578137397766113, "logits/rejected": 14.060562133789062, "logps/chosen": -4.490099906921387, "logps/rejected": -4.599183082580566, "loss": 4.1239, "rewards/accuracies": 0.5, "rewards/chosen": -44.9010009765625, "rewards/margins": 1.0908279418945312, "rewards/rejected": -45.99182891845703, "step": 6594 }, { "epoch": 0.898011982570806, "grad_norm": 41.3070302902014, "learning_rate": 2.5085878668504555e-08, "logits/chosen": 14.15907096862793, "logits/rejected": 14.686300277709961, "logps/chosen": -4.8761796951293945, "logps/rejected": -4.627857685089111, "loss": 4.0069, "rewards/accuracies": 0.25, "rewards/chosen": -48.76179504394531, "rewards/margins": -2.483217239379883, "rewards/rejected": -46.2785758972168, "step": 6595 }, { "epoch": 0.8981481481481481, "grad_norm": 46.638656265211026, "learning_rate": 2.501964513867967e-08, "logits/chosen": 13.58960247039795, "logits/rejected": 14.336051940917969, "logps/chosen": -4.135716438293457, "logps/rejected": -4.42486572265625, "loss": 4.2508, "rewards/accuracies": 0.5, "rewards/chosen": -41.3571662902832, "rewards/margins": 2.8914928436279297, "rewards/rejected": -44.248661041259766, "step": 6596 }, { "epoch": 0.8982843137254902, "grad_norm": 40.783491991199256, "learning_rate": 2.49534963387656e-08, "logits/chosen": 14.531208992004395, "logits/rejected": 14.0476713180542, "logps/chosen": -4.846938133239746, "logps/rejected": -4.700399875640869, "loss": 3.841, "rewards/accuracies": 0.5, "rewards/chosen": -48.46937942504883, "rewards/margins": -1.465378761291504, "rewards/rejected": -47.003997802734375, "step": 6597 }, { "epoch": 0.8984204793028322, "grad_norm": 43.00325795424459, "learning_rate": 2.4887432283709155e-08, "logits/chosen": 14.850250244140625, "logits/rejected": 14.943075180053711, "logps/chosen": -4.789754867553711, "logps/rejected": -5.254841327667236, "loss": 4.3297, "rewards/accuracies": 1.0, "rewards/chosen": -47.897552490234375, "rewards/margins": 4.650861740112305, "rewards/rejected": -52.54841613769531, "step": 6598 }, { "epoch": 0.8985566448801743, "grad_norm": 47.05453223344335, "learning_rate": 2.482145298843812e-08, "logits/chosen": 14.361631393432617, "logits/rejected": 14.631954193115234, "logps/chosen": -4.4328694343566895, "logps/rejected": -4.769039154052734, "loss": 4.1409, "rewards/accuracies": 0.75, "rewards/chosen": -44.32869338989258, "rewards/margins": 3.361701011657715, "rewards/rejected": -47.690391540527344, "step": 6599 }, { "epoch": 0.8986928104575164, "grad_norm": 39.119222047074324, "learning_rate": 2.475555846786106e-08, "logits/chosen": 14.571577072143555, "logits/rejected": 15.000720977783203, "logps/chosen": -4.7144269943237305, "logps/rejected": -4.845949172973633, "loss": 3.8066, "rewards/accuracies": 0.5, "rewards/chosen": -47.14427185058594, "rewards/margins": 1.3152246475219727, "rewards/rejected": -48.459495544433594, "step": 6600 }, { "epoch": 0.8988289760348583, "grad_norm": 38.120281666459384, "learning_rate": 2.468974873686731e-08, "logits/chosen": 14.30759048461914, "logits/rejected": 14.468982696533203, "logps/chosen": -4.320596694946289, "logps/rejected": -4.6870927810668945, "loss": 3.5268, "rewards/accuracies": 0.75, "rewards/chosen": -43.20596694946289, "rewards/margins": 3.6649646759033203, "rewards/rejected": -46.87092971801758, "step": 6601 }, { "epoch": 0.8989651416122004, "grad_norm": 41.82697925349995, "learning_rate": 2.4624023810327198e-08, "logits/chosen": 14.49470329284668, "logits/rejected": 13.780467987060547, "logps/chosen": -4.554579734802246, "logps/rejected": -4.443431377410889, "loss": 4.3571, "rewards/accuracies": 0.25, "rewards/chosen": -45.54579544067383, "rewards/margins": -1.1114845275878906, "rewards/rejected": -44.43431091308594, "step": 6602 }, { "epoch": 0.8991013071895425, "grad_norm": 39.129362950366826, "learning_rate": 2.455838370309182e-08, "logits/chosen": 14.475899696350098, "logits/rejected": 13.902650833129883, "logps/chosen": -4.790998935699463, "logps/rejected": -4.642266273498535, "loss": 3.9855, "rewards/accuracies": 0.25, "rewards/chosen": -47.90999221801758, "rewards/margins": -1.4873228073120117, "rewards/rejected": -46.42266845703125, "step": 6603 }, { "epoch": 0.8992374727668845, "grad_norm": 38.46196156996043, "learning_rate": 2.4492828429993094e-08, "logits/chosen": 14.230965614318848, "logits/rejected": 14.254557609558105, "logps/chosen": -4.795049667358398, "logps/rejected": -4.956223487854004, "loss": 3.9031, "rewards/accuracies": 0.5, "rewards/chosen": -47.950496673583984, "rewards/margins": 1.611739158630371, "rewards/rejected": -49.56223678588867, "step": 6604 }, { "epoch": 0.8993736383442266, "grad_norm": 43.088910928938475, "learning_rate": 2.4427358005843703e-08, "logits/chosen": 14.863174438476562, "logits/rejected": 14.41417121887207, "logps/chosen": -4.547157287597656, "logps/rejected": -4.478557109832764, "loss": 3.3155, "rewards/accuracies": 0.5, "rewards/chosen": -45.47157287597656, "rewards/margins": -0.6860036849975586, "rewards/rejected": -44.78556823730469, "step": 6605 }, { "epoch": 0.8995098039215687, "grad_norm": 41.48437023253003, "learning_rate": 2.4361972445437317e-08, "logits/chosen": 14.219696044921875, "logits/rejected": 14.359471321105957, "logps/chosen": -4.579258918762207, "logps/rejected": -4.692013740539551, "loss": 3.6157, "rewards/accuracies": 1.0, "rewards/chosen": -45.7925910949707, "rewards/margins": 1.1275482177734375, "rewards/rejected": -46.92013931274414, "step": 6606 }, { "epoch": 0.8996459694989106, "grad_norm": 45.75308976836651, "learning_rate": 2.4296671763548348e-08, "logits/chosen": 12.989416122436523, "logits/rejected": 13.9384765625, "logps/chosen": -4.255228042602539, "logps/rejected": -4.6452860832214355, "loss": 4.308, "rewards/accuracies": 1.0, "rewards/chosen": -42.552284240722656, "rewards/margins": 3.9005746841430664, "rewards/rejected": -46.452857971191406, "step": 6607 }, { "epoch": 0.8997821350762527, "grad_norm": 46.36330404985194, "learning_rate": 2.4231455974931924e-08, "logits/chosen": 14.453847885131836, "logits/rejected": 14.192617416381836, "logps/chosen": -4.50557279586792, "logps/rejected": -4.588769435882568, "loss": 3.6235, "rewards/accuracies": 0.5, "rewards/chosen": -45.055728912353516, "rewards/margins": 0.831965446472168, "rewards/rejected": -45.8876953125, "step": 6608 }, { "epoch": 0.8999183006535948, "grad_norm": 39.12130013877436, "learning_rate": 2.416632509432417e-08, "logits/chosen": 13.661439895629883, "logits/rejected": 14.35805606842041, "logps/chosen": -4.449451923370361, "logps/rejected": -4.535680770874023, "loss": 3.5801, "rewards/accuracies": 0.5, "rewards/chosen": -44.49452209472656, "rewards/margins": 0.8622856140136719, "rewards/rejected": -45.35680389404297, "step": 6609 }, { "epoch": 0.9000544662309368, "grad_norm": 46.457703223707185, "learning_rate": 2.4101279136441978e-08, "logits/chosen": 13.845407485961914, "logits/rejected": 14.723451614379883, "logps/chosen": -4.351393699645996, "logps/rejected": -4.631321907043457, "loss": 3.8638, "rewards/accuracies": 0.75, "rewards/chosen": -43.51393508911133, "rewards/margins": 2.7992820739746094, "rewards/rejected": -46.31321716308594, "step": 6610 }, { "epoch": 0.9001906318082789, "grad_norm": 38.24255484181969, "learning_rate": 2.4036318115982924e-08, "logits/chosen": 13.69852352142334, "logits/rejected": 15.192856788635254, "logps/chosen": -4.323352336883545, "logps/rejected": -4.777180194854736, "loss": 4.0998, "rewards/accuracies": 1.0, "rewards/chosen": -43.233524322509766, "rewards/margins": 4.538276672363281, "rewards/rejected": -47.77180480957031, "step": 6611 }, { "epoch": 0.9003267973856209, "grad_norm": 42.91940176340694, "learning_rate": 2.3971442047625535e-08, "logits/chosen": 15.522686004638672, "logits/rejected": 14.53306770324707, "logps/chosen": -5.108304977416992, "logps/rejected": -4.866522789001465, "loss": 3.7787, "rewards/accuracies": 0.5, "rewards/chosen": -51.08305358886719, "rewards/margins": -2.417827606201172, "rewards/rejected": -48.665225982666016, "step": 6612 }, { "epoch": 0.9004629629629629, "grad_norm": 43.23917175680192, "learning_rate": 2.390665094602915e-08, "logits/chosen": 14.596944808959961, "logits/rejected": 14.803720474243164, "logps/chosen": -4.801316738128662, "logps/rejected": -4.654937744140625, "loss": 3.6174, "rewards/accuracies": 0.5, "rewards/chosen": -48.01316833496094, "rewards/margins": -1.4637947082519531, "rewards/rejected": -46.549373626708984, "step": 6613 }, { "epoch": 0.900599128540305, "grad_norm": 40.101805351730725, "learning_rate": 2.384194482583375e-08, "logits/chosen": 14.53053092956543, "logits/rejected": 15.070972442626953, "logps/chosen": -4.694120407104492, "logps/rejected": -5.068093299865723, "loss": 4.1315, "rewards/accuracies": 1.0, "rewards/chosen": -46.94120788574219, "rewards/margins": 3.7397241592407227, "rewards/rejected": -50.680931091308594, "step": 6614 }, { "epoch": 0.9007352941176471, "grad_norm": 44.443393816612016, "learning_rate": 2.37773237016603e-08, "logits/chosen": 13.876293182373047, "logits/rejected": 14.613698959350586, "logps/chosen": -4.45341682434082, "logps/rejected": -4.837693691253662, "loss": 4.7785, "rewards/accuracies": 0.75, "rewards/chosen": -44.5341682434082, "rewards/margins": 3.842771530151367, "rewards/rejected": -48.37693786621094, "step": 6615 }, { "epoch": 0.900871459694989, "grad_norm": 40.84004561353091, "learning_rate": 2.371278758811046e-08, "logits/chosen": 13.734946250915527, "logits/rejected": 14.034526824951172, "logps/chosen": -4.436408519744873, "logps/rejected": -4.587111949920654, "loss": 4.103, "rewards/accuracies": 0.75, "rewards/chosen": -44.36408233642578, "rewards/margins": 1.5070323944091797, "rewards/rejected": -45.871116638183594, "step": 6616 }, { "epoch": 0.9010076252723311, "grad_norm": 41.01570751706207, "learning_rate": 2.3648336499766653e-08, "logits/chosen": 14.109833717346191, "logits/rejected": 14.556406021118164, "logps/chosen": -4.354866981506348, "logps/rejected": -4.545698165893555, "loss": 4.1311, "rewards/accuracies": 1.0, "rewards/chosen": -43.548667907714844, "rewards/margins": 1.9083137512207031, "rewards/rejected": -45.45698547363281, "step": 6617 }, { "epoch": 0.9011437908496732, "grad_norm": 38.857884347435245, "learning_rate": 2.358397045119216e-08, "logits/chosen": 14.212953567504883, "logits/rejected": 14.689332962036133, "logps/chosen": -4.331437110900879, "logps/rejected": -4.687047481536865, "loss": 3.5014, "rewards/accuracies": 1.0, "rewards/chosen": -43.314369201660156, "rewards/margins": 3.5561065673828125, "rewards/rejected": -46.87047576904297, "step": 6618 }, { "epoch": 0.9012799564270153, "grad_norm": 40.98770166172769, "learning_rate": 2.3519689456931124e-08, "logits/chosen": 15.176350593566895, "logits/rejected": 15.358345031738281, "logps/chosen": -4.703244686126709, "logps/rejected": -4.708144187927246, "loss": 3.8002, "rewards/accuracies": 0.5, "rewards/chosen": -47.032447814941406, "rewards/margins": 0.048996925354003906, "rewards/rejected": -47.081443786621094, "step": 6619 }, { "epoch": 0.9014161220043573, "grad_norm": 45.41058111913426, "learning_rate": 2.3455493531508197e-08, "logits/chosen": 12.96987533569336, "logits/rejected": 14.167156219482422, "logps/chosen": -4.3516526222229, "logps/rejected": -4.656923294067383, "loss": 3.9202, "rewards/accuracies": 0.75, "rewards/chosen": -43.51652526855469, "rewards/margins": 3.052706718444824, "rewards/rejected": -46.56922912597656, "step": 6620 }, { "epoch": 0.9015522875816994, "grad_norm": 37.42759409492895, "learning_rate": 2.3391382689429018e-08, "logits/chosen": 13.515890121459961, "logits/rejected": 14.851162910461426, "logps/chosen": -4.210671901702881, "logps/rejected": -4.938775062561035, "loss": 3.7131, "rewards/accuracies": 1.0, "rewards/chosen": -42.10671615600586, "rewards/margins": 7.281030654907227, "rewards/rejected": -49.38774871826172, "step": 6621 }, { "epoch": 0.9016884531590414, "grad_norm": 41.305311266809994, "learning_rate": 2.3327356945180086e-08, "logits/chosen": 14.472702026367188, "logits/rejected": 14.843656539916992, "logps/chosen": -4.617489814758301, "logps/rejected": -4.721074104309082, "loss": 4.0343, "rewards/accuracies": 0.5, "rewards/chosen": -46.17490005493164, "rewards/margins": 1.0358448028564453, "rewards/rejected": -47.21074676513672, "step": 6622 }, { "epoch": 0.9018246187363834, "grad_norm": 42.60995942756478, "learning_rate": 2.326341631322841e-08, "logits/chosen": 13.670720100402832, "logits/rejected": 14.209677696228027, "logps/chosen": -4.0514702796936035, "logps/rejected": -4.631076812744141, "loss": 3.8074, "rewards/accuracies": 1.0, "rewards/chosen": -40.51470184326172, "rewards/margins": 5.796063423156738, "rewards/rejected": -46.31076431274414, "step": 6623 }, { "epoch": 0.9019607843137255, "grad_norm": 49.09672057964268, "learning_rate": 2.3199560808021946e-08, "logits/chosen": 14.632899284362793, "logits/rejected": 14.971511840820312, "logps/chosen": -4.926749229431152, "logps/rejected": -5.064959526062012, "loss": 3.6348, "rewards/accuracies": 0.75, "rewards/chosen": -49.267494201660156, "rewards/margins": 1.3821029663085938, "rewards/rejected": -50.64959716796875, "step": 6624 }, { "epoch": 0.9020969498910676, "grad_norm": 45.03411541948308, "learning_rate": 2.313579044398941e-08, "logits/chosen": 14.501262664794922, "logits/rejected": 14.764598846435547, "logps/chosen": -4.6288228034973145, "logps/rejected": -4.803999900817871, "loss": 3.7778, "rewards/accuracies": 0.75, "rewards/chosen": -46.288230895996094, "rewards/margins": 1.7517690658569336, "rewards/rejected": -48.040000915527344, "step": 6625 }, { "epoch": 0.9022331154684096, "grad_norm": 40.09766242277699, "learning_rate": 2.3072105235540173e-08, "logits/chosen": 14.347946166992188, "logits/rejected": 14.042180061340332, "logps/chosen": -4.5975661277771, "logps/rejected": -4.446233749389648, "loss": 4.16, "rewards/accuracies": 0.25, "rewards/chosen": -45.97566223144531, "rewards/margins": -1.5133209228515625, "rewards/rejected": -44.46234130859375, "step": 6626 }, { "epoch": 0.9023692810457516, "grad_norm": 48.124194539301115, "learning_rate": 2.3008505197064497e-08, "logits/chosen": 14.737473487854004, "logits/rejected": 14.532309532165527, "logps/chosen": -4.734158515930176, "logps/rejected": -4.71622371673584, "loss": 4.3959, "rewards/accuracies": 0.5, "rewards/chosen": -47.341583251953125, "rewards/margins": -0.17934322357177734, "rewards/rejected": -47.16223907470703, "step": 6627 }, { "epoch": 0.9025054466230937, "grad_norm": 40.850198499849284, "learning_rate": 2.294499034293338e-08, "logits/chosen": 14.050617218017578, "logits/rejected": 14.555561065673828, "logps/chosen": -4.496124267578125, "logps/rejected": -4.76030158996582, "loss": 3.8681, "rewards/accuracies": 1.0, "rewards/chosen": -44.96124267578125, "rewards/margins": 2.6417760848999023, "rewards/rejected": -47.60301971435547, "step": 6628 }, { "epoch": 0.9026416122004357, "grad_norm": 39.841208047019975, "learning_rate": 2.288156068749836e-08, "logits/chosen": 14.729616165161133, "logits/rejected": 15.051721572875977, "logps/chosen": -4.751150131225586, "logps/rejected": -4.817196369171143, "loss": 4.0023, "rewards/accuracies": 0.25, "rewards/chosen": -47.511505126953125, "rewards/margins": 0.6604595184326172, "rewards/rejected": -48.171966552734375, "step": 6629 }, { "epoch": 0.9027777777777778, "grad_norm": 40.446215025362775, "learning_rate": 2.281821624509206e-08, "logits/chosen": 13.477128028869629, "logits/rejected": 14.013031005859375, "logps/chosen": -4.336541175842285, "logps/rejected": -4.476435661315918, "loss": 4.0283, "rewards/accuracies": 0.5, "rewards/chosen": -43.365413665771484, "rewards/margins": 1.3989391326904297, "rewards/rejected": -44.76435089111328, "step": 6630 }, { "epoch": 0.9029139433551199, "grad_norm": 44.5194320447146, "learning_rate": 2.275495703002761e-08, "logits/chosen": 14.755099296569824, "logits/rejected": 14.00799560546875, "logps/chosen": -4.592947959899902, "logps/rejected": -4.7509002685546875, "loss": 3.7208, "rewards/accuracies": 0.75, "rewards/chosen": -45.929481506347656, "rewards/margins": 1.579519271850586, "rewards/rejected": -47.509002685546875, "step": 6631 }, { "epoch": 0.9030501089324618, "grad_norm": 46.8003287560935, "learning_rate": 2.2691783056598913e-08, "logits/chosen": 14.09131908416748, "logits/rejected": 13.355628967285156, "logps/chosen": -4.678750038146973, "logps/rejected": -4.256147861480713, "loss": 4.0622, "rewards/accuracies": 0.0, "rewards/chosen": -46.78750228881836, "rewards/margins": -4.2260236740112305, "rewards/rejected": -42.56147766113281, "step": 6632 }, { "epoch": 0.9031862745098039, "grad_norm": 43.61369863142673, "learning_rate": 2.2628694339080724e-08, "logits/chosen": 13.79692554473877, "logits/rejected": 14.490657806396484, "logps/chosen": -4.395462989807129, "logps/rejected": -4.5332255363464355, "loss": 4.3163, "rewards/accuracies": 0.75, "rewards/chosen": -43.95463180541992, "rewards/margins": 1.37762451171875, "rewards/rejected": -45.33225631713867, "step": 6633 }, { "epoch": 0.903322440087146, "grad_norm": 40.04482193268204, "learning_rate": 2.2565690891728482e-08, "logits/chosen": 13.764474868774414, "logits/rejected": 14.049985885620117, "logps/chosen": -4.316680908203125, "logps/rejected": -4.575900077819824, "loss": 3.6826, "rewards/accuracies": 1.0, "rewards/chosen": -43.16680908203125, "rewards/margins": 2.5921945571899414, "rewards/rejected": -45.759002685546875, "step": 6634 }, { "epoch": 0.903458605664488, "grad_norm": 41.29835914003808, "learning_rate": 2.250277272877823e-08, "logits/chosen": 14.248673439025879, "logits/rejected": 14.10324478149414, "logps/chosen": -4.320493698120117, "logps/rejected": -4.637654781341553, "loss": 4.3333, "rewards/accuracies": 0.75, "rewards/chosen": -43.20494079589844, "rewards/margins": 3.171609878540039, "rewards/rejected": -46.376548767089844, "step": 6635 }, { "epoch": 0.9035947712418301, "grad_norm": 77.24868626695081, "learning_rate": 2.24399398644469e-08, "logits/chosen": 15.145013809204102, "logits/rejected": 15.233470916748047, "logps/chosen": -4.639134883880615, "logps/rejected": -4.592113971710205, "loss": 4.2481, "rewards/accuracies": 0.5, "rewards/chosen": -46.3913459777832, "rewards/margins": -0.47020530700683594, "rewards/rejected": -45.921138763427734, "step": 6636 }, { "epoch": 0.9037309368191722, "grad_norm": 51.39501433143997, "learning_rate": 2.2377192312932157e-08, "logits/chosen": 13.914634704589844, "logits/rejected": 14.134689331054688, "logps/chosen": -4.569584369659424, "logps/rejected": -4.634587287902832, "loss": 4.4558, "rewards/accuracies": 0.5, "rewards/chosen": -45.69584655761719, "rewards/margins": 0.6500282287597656, "rewards/rejected": -46.34587097167969, "step": 6637 }, { "epoch": 0.9038671023965141, "grad_norm": 42.389204185702326, "learning_rate": 2.2314530088412175e-08, "logits/chosen": 14.283220291137695, "logits/rejected": 14.44314193725586, "logps/chosen": -4.651002883911133, "logps/rejected": -4.633033752441406, "loss": 3.8569, "rewards/accuracies": 0.5, "rewards/chosen": -46.510032653808594, "rewards/margins": -0.1796884536743164, "rewards/rejected": -46.33034133911133, "step": 6638 }, { "epoch": 0.9040032679738562, "grad_norm": 39.40666970463507, "learning_rate": 2.225195320504616e-08, "logits/chosen": 12.953451156616211, "logits/rejected": 14.91528034210205, "logps/chosen": -4.169700622558594, "logps/rejected": -5.06016731262207, "loss": 3.8879, "rewards/accuracies": 1.0, "rewards/chosen": -41.69700622558594, "rewards/margins": 8.904668807983398, "rewards/rejected": -50.6016731262207, "step": 6639 }, { "epoch": 0.9041394335511983, "grad_norm": 45.55135697737713, "learning_rate": 2.218946167697382e-08, "logits/chosen": 14.100295066833496, "logits/rejected": 15.535682678222656, "logps/chosen": -4.440946102142334, "logps/rejected": -5.266910552978516, "loss": 4.1105, "rewards/accuracies": 1.0, "rewards/chosen": -44.409461975097656, "rewards/margins": 8.25964069366455, "rewards/rejected": -52.669105529785156, "step": 6640 }, { "epoch": 0.9042755991285403, "grad_norm": 46.39811693862066, "learning_rate": 2.212705551831564e-08, "logits/chosen": 14.614197731018066, "logits/rejected": 14.711920738220215, "logps/chosen": -4.416437149047852, "logps/rejected": -4.646538257598877, "loss": 3.4827, "rewards/accuracies": 0.5, "rewards/chosen": -44.16436767578125, "rewards/margins": 2.301013946533203, "rewards/rejected": -46.46538162231445, "step": 6641 }, { "epoch": 0.9044117647058824, "grad_norm": 43.33109354194937, "learning_rate": 2.2064734743172742e-08, "logits/chosen": 15.096620559692383, "logits/rejected": 15.285106658935547, "logps/chosen": -4.930114269256592, "logps/rejected": -4.941020965576172, "loss": 3.661, "rewards/accuracies": 0.25, "rewards/chosen": -49.301143646240234, "rewards/margins": 0.10906600952148438, "rewards/rejected": -49.41020965576172, "step": 6642 }, { "epoch": 0.9045479302832244, "grad_norm": 42.43489941610128, "learning_rate": 2.2002499365627146e-08, "logits/chosen": 14.728046417236328, "logits/rejected": 14.836980819702148, "logps/chosen": -4.434186935424805, "logps/rejected": -5.002631187438965, "loss": 4.1791, "rewards/accuracies": 1.0, "rewards/chosen": -44.34187316894531, "rewards/margins": 5.6844377517700195, "rewards/rejected": -50.026309967041016, "step": 6643 }, { "epoch": 0.9046840958605664, "grad_norm": 38.41540239386309, "learning_rate": 2.1940349399741296e-08, "logits/chosen": 14.084810256958008, "logits/rejected": 14.495725631713867, "logps/chosen": -4.459628105163574, "logps/rejected": -4.704826354980469, "loss": 3.7006, "rewards/accuracies": 0.75, "rewards/chosen": -44.596282958984375, "rewards/margins": 2.4519805908203125, "rewards/rejected": -47.04826354980469, "step": 6644 }, { "epoch": 0.9048202614379085, "grad_norm": 42.94036114523743, "learning_rate": 2.1878284859558583e-08, "logits/chosen": 14.210733413696289, "logits/rejected": 14.914236068725586, "logps/chosen": -4.664942264556885, "logps/rejected": -4.762699604034424, "loss": 3.6918, "rewards/accuracies": 0.75, "rewards/chosen": -46.64942169189453, "rewards/margins": 0.9775714874267578, "rewards/rejected": -47.62699508666992, "step": 6645 }, { "epoch": 0.9049564270152506, "grad_norm": 39.123166872289445, "learning_rate": 2.1816305759103072e-08, "logits/chosen": 14.658554077148438, "logits/rejected": 14.919204711914062, "logps/chosen": -4.570672512054443, "logps/rejected": -4.8375043869018555, "loss": 3.7414, "rewards/accuracies": 0.75, "rewards/chosen": -45.70672607421875, "rewards/margins": 2.668313980102539, "rewards/rejected": -48.375038146972656, "step": 6646 }, { "epoch": 0.9050925925925926, "grad_norm": 43.080338238001204, "learning_rate": 2.1754412112379295e-08, "logits/chosen": 14.541765213012695, "logits/rejected": 14.739055633544922, "logps/chosen": -4.791943073272705, "logps/rejected": -4.924604415893555, "loss": 4.1412, "rewards/accuracies": 0.5, "rewards/chosen": -47.919429779052734, "rewards/margins": 1.326615333557129, "rewards/rejected": -49.24604797363281, "step": 6647 }, { "epoch": 0.9052287581699346, "grad_norm": 45.96486543732579, "learning_rate": 2.1692603933372733e-08, "logits/chosen": 14.105734825134277, "logits/rejected": 14.5213623046875, "logps/chosen": -4.553252220153809, "logps/rejected": -4.880348205566406, "loss": 3.6967, "rewards/accuracies": 0.75, "rewards/chosen": -45.53252410888672, "rewards/margins": 3.270960807800293, "rewards/rejected": -48.80348205566406, "step": 6648 }, { "epoch": 0.9053649237472767, "grad_norm": 39.766785780979134, "learning_rate": 2.163088123604946e-08, "logits/chosen": 14.18639087677002, "logits/rejected": 14.761214256286621, "logps/chosen": -4.474785804748535, "logps/rejected": -4.951552867889404, "loss": 3.4852, "rewards/accuracies": 0.75, "rewards/chosen": -44.74785614013672, "rewards/margins": 4.767672538757324, "rewards/rejected": -49.51552963256836, "step": 6649 }, { "epoch": 0.9055010893246187, "grad_norm": 44.80996733462384, "learning_rate": 2.1569244034356184e-08, "logits/chosen": 15.035260200500488, "logits/rejected": 14.926736831665039, "logps/chosen": -4.719832420349121, "logps/rejected": -4.729714393615723, "loss": 3.4419, "rewards/accuracies": 0.75, "rewards/chosen": -47.198326110839844, "rewards/margins": 0.09882068634033203, "rewards/rejected": -47.29714584350586, "step": 6650 }, { "epoch": 0.9056372549019608, "grad_norm": 44.43722030464297, "learning_rate": 2.150769234222034e-08, "logits/chosen": 13.514665603637695, "logits/rejected": 13.204534530639648, "logps/chosen": -4.484996318817139, "logps/rejected": -4.5527873039245605, "loss": 3.9644, "rewards/accuracies": 1.0, "rewards/chosen": -44.84996795654297, "rewards/margins": 0.6779079437255859, "rewards/rejected": -45.52787399291992, "step": 6651 }, { "epoch": 0.9057734204793029, "grad_norm": 37.833896394996025, "learning_rate": 2.1446226173550097e-08, "logits/chosen": 14.372304916381836, "logits/rejected": 15.108009338378906, "logps/chosen": -4.638570785522461, "logps/rejected": -4.744853496551514, "loss": 3.9358, "rewards/accuracies": 0.5, "rewards/chosen": -46.385711669921875, "rewards/margins": 1.062826156616211, "rewards/rejected": -47.44853210449219, "step": 6652 }, { "epoch": 0.9059095860566448, "grad_norm": 42.79734488340313, "learning_rate": 2.1384845542234166e-08, "logits/chosen": 13.618810653686523, "logits/rejected": 15.190618515014648, "logps/chosen": -4.387542724609375, "logps/rejected": -4.8051605224609375, "loss": 3.4955, "rewards/accuracies": 1.0, "rewards/chosen": -43.87542724609375, "rewards/margins": 4.176177978515625, "rewards/rejected": -48.05160140991211, "step": 6653 }, { "epoch": 0.9060457516339869, "grad_norm": 45.08959825674754, "learning_rate": 2.132355046214207e-08, "logits/chosen": 13.985715866088867, "logits/rejected": 14.268783569335938, "logps/chosen": -4.505951881408691, "logps/rejected": -4.875575065612793, "loss": 3.8721, "rewards/accuracies": 1.0, "rewards/chosen": -45.05952072143555, "rewards/margins": 3.69622802734375, "rewards/rejected": -48.7557487487793, "step": 6654 }, { "epoch": 0.906181917211329, "grad_norm": 38.69583395471052, "learning_rate": 2.1262340947123937e-08, "logits/chosen": 14.135068893432617, "logits/rejected": 14.372303009033203, "logps/chosen": -4.590676307678223, "logps/rejected": -4.604943752288818, "loss": 4.028, "rewards/accuracies": 0.5, "rewards/chosen": -45.906761169433594, "rewards/margins": 0.14267635345458984, "rewards/rejected": -46.0494384765625, "step": 6655 }, { "epoch": 0.906318082788671, "grad_norm": 42.51472683102128, "learning_rate": 2.120121701101052e-08, "logits/chosen": 13.955775260925293, "logits/rejected": 14.563097953796387, "logps/chosen": -4.5857367515563965, "logps/rejected": -4.991122722625732, "loss": 3.8348, "rewards/accuracies": 0.75, "rewards/chosen": -45.85736846923828, "rewards/margins": 4.053858757019043, "rewards/rejected": -49.911224365234375, "step": 6656 }, { "epoch": 0.9064542483660131, "grad_norm": 39.16419822913624, "learning_rate": 2.1140178667613264e-08, "logits/chosen": 14.282608032226562, "logits/rejected": 14.526337623596191, "logps/chosen": -4.406243324279785, "logps/rejected": -4.661991119384766, "loss": 4.1843, "rewards/accuracies": 0.75, "rewards/chosen": -44.062435150146484, "rewards/margins": 2.557478904724121, "rewards/rejected": -46.61991500854492, "step": 6657 }, { "epoch": 0.9065904139433552, "grad_norm": 46.13356008315945, "learning_rate": 2.107922593072442e-08, "logits/chosen": 15.12122917175293, "logits/rejected": 13.741338729858398, "logps/chosen": -4.803373336791992, "logps/rejected": -4.660734176635742, "loss": 4.296, "rewards/accuracies": 0.5, "rewards/chosen": -48.03373336791992, "rewards/margins": -1.4263887405395508, "rewards/rejected": -46.60734558105469, "step": 6658 }, { "epoch": 0.9067265795206971, "grad_norm": 41.28376434947718, "learning_rate": 2.1018358814116577e-08, "logits/chosen": 14.134804725646973, "logits/rejected": 14.422489166259766, "logps/chosen": -4.152235984802246, "logps/rejected": -4.581939697265625, "loss": 3.5552, "rewards/accuracies": 0.75, "rewards/chosen": -41.52235794067383, "rewards/margins": 4.297039985656738, "rewards/rejected": -45.81939697265625, "step": 6659 }, { "epoch": 0.9068627450980392, "grad_norm": 41.5463673493398, "learning_rate": 2.095757733154331e-08, "logits/chosen": 14.46471118927002, "logits/rejected": 15.164369583129883, "logps/chosen": -4.857754707336426, "logps/rejected": -4.813946723937988, "loss": 3.4522, "rewards/accuracies": 0.75, "rewards/chosen": -48.577545166015625, "rewards/margins": -0.4380788803100586, "rewards/rejected": -48.139469146728516, "step": 6660 }, { "epoch": 0.9069989106753813, "grad_norm": 37.82631894545564, "learning_rate": 2.089688149673865e-08, "logits/chosen": 14.694070816040039, "logits/rejected": 14.895160675048828, "logps/chosen": -4.728250026702881, "logps/rejected": -4.876530647277832, "loss": 3.9092, "rewards/accuracies": 0.5, "rewards/chosen": -47.282501220703125, "rewards/margins": 1.482804298400879, "rewards/rejected": -48.76530456542969, "step": 6661 }, { "epoch": 0.9071350762527233, "grad_norm": 54.594797410516776, "learning_rate": 2.0836271323417276e-08, "logits/chosen": 14.304737091064453, "logits/rejected": 15.131388664245605, "logps/chosen": -4.724032402038574, "logps/rejected": -4.790124893188477, "loss": 4.0541, "rewards/accuracies": 0.5, "rewards/chosen": -47.240325927734375, "rewards/margins": 0.6609249114990234, "rewards/rejected": -47.90125274658203, "step": 6662 }, { "epoch": 0.9072712418300654, "grad_norm": 44.06903497805218, "learning_rate": 2.077574682527459e-08, "logits/chosen": 14.199321746826172, "logits/rejected": 14.822487831115723, "logps/chosen": -4.621512413024902, "logps/rejected": -5.117500305175781, "loss": 3.3442, "rewards/accuracies": 1.0, "rewards/chosen": -46.215126037597656, "rewards/margins": 4.959877967834473, "rewards/rejected": -51.17500305175781, "step": 6663 }, { "epoch": 0.9074074074074074, "grad_norm": 45.08668219191591, "learning_rate": 2.0715308015986664e-08, "logits/chosen": 14.021478652954102, "logits/rejected": 14.167856216430664, "logps/chosen": -4.436680793762207, "logps/rejected": -4.719158172607422, "loss": 3.9171, "rewards/accuracies": 0.75, "rewards/chosen": -44.3668098449707, "rewards/margins": 2.824770927429199, "rewards/rejected": -47.19158172607422, "step": 6664 }, { "epoch": 0.9075435729847494, "grad_norm": 47.75758525616645, "learning_rate": 2.065495490921001e-08, "logits/chosen": 14.478649139404297, "logits/rejected": 15.114907264709473, "logps/chosen": -4.553188800811768, "logps/rejected": -4.864815711975098, "loss": 4.3256, "rewards/accuracies": 0.5, "rewards/chosen": -45.531890869140625, "rewards/margins": 3.116270065307617, "rewards/rejected": -48.64815902709961, "step": 6665 }, { "epoch": 0.9076797385620915, "grad_norm": 50.90100662253481, "learning_rate": 2.059468751858202e-08, "logits/chosen": 15.02569580078125, "logits/rejected": 15.39999008178711, "logps/chosen": -4.721734046936035, "logps/rejected": -4.964895248413086, "loss": 3.8983, "rewards/accuracies": 0.75, "rewards/chosen": -47.21733856201172, "rewards/margins": 2.4316091537475586, "rewards/rejected": -49.648948669433594, "step": 6666 }, { "epoch": 0.9078159041394336, "grad_norm": 40.03319392347927, "learning_rate": 2.0534505857720653e-08, "logits/chosen": 14.660245895385742, "logits/rejected": 15.637784004211426, "logps/chosen": -4.487544536590576, "logps/rejected": -4.904115200042725, "loss": 3.6474, "rewards/accuracies": 0.75, "rewards/chosen": -44.87544250488281, "rewards/margins": 4.165707588195801, "rewards/rejected": -49.04115295410156, "step": 6667 }, { "epoch": 0.9079520697167756, "grad_norm": 44.03935238363702, "learning_rate": 2.0474409940224313e-08, "logits/chosen": 14.180974960327148, "logits/rejected": 14.39753532409668, "logps/chosen": -4.713033676147461, "logps/rejected": -4.934231758117676, "loss": 4.3456, "rewards/accuracies": 0.75, "rewards/chosen": -47.130340576171875, "rewards/margins": 2.211977958679199, "rewards/rejected": -49.342315673828125, "step": 6668 }, { "epoch": 0.9080882352941176, "grad_norm": 42.9312269346551, "learning_rate": 2.041439977967223e-08, "logits/chosen": 14.222246170043945, "logits/rejected": 14.929895401000977, "logps/chosen": -4.550518035888672, "logps/rejected": -4.911617279052734, "loss": 3.7549, "rewards/accuracies": 0.75, "rewards/chosen": -45.50518035888672, "rewards/margins": 3.610992431640625, "rewards/rejected": -49.116172790527344, "step": 6669 }, { "epoch": 0.9082244008714597, "grad_norm": 39.2141226763571, "learning_rate": 2.0354475389624224e-08, "logits/chosen": 14.873812675476074, "logits/rejected": 14.375962257385254, "logps/chosen": -4.73745584487915, "logps/rejected": -4.913836479187012, "loss": 3.8359, "rewards/accuracies": 0.5, "rewards/chosen": -47.37455749511719, "rewards/margins": 1.7638072967529297, "rewards/rejected": -49.138362884521484, "step": 6670 }, { "epoch": 0.9083605664488017, "grad_norm": 44.09479048914531, "learning_rate": 2.0294636783620667e-08, "logits/chosen": 14.388895034790039, "logits/rejected": 13.862878799438477, "logps/chosen": -4.590476989746094, "logps/rejected": -4.46435546875, "loss": 4.335, "rewards/accuracies": 0.25, "rewards/chosen": -45.90476989746094, "rewards/margins": -1.2612133026123047, "rewards/rejected": -44.6435546875, "step": 6671 }, { "epoch": 0.9084967320261438, "grad_norm": 37.512528230077194, "learning_rate": 2.0234883975182605e-08, "logits/chosen": 14.876432418823242, "logits/rejected": 14.755614280700684, "logps/chosen": -4.670368194580078, "logps/rejected": -4.758132457733154, "loss": 3.9339, "rewards/accuracies": 0.5, "rewards/chosen": -46.70368194580078, "rewards/margins": 0.8776445388793945, "rewards/rejected": -47.58132553100586, "step": 6672 }, { "epoch": 0.9086328976034859, "grad_norm": 43.22098203211344, "learning_rate": 2.017521697781177e-08, "logits/chosen": 14.217578887939453, "logits/rejected": 13.894996643066406, "logps/chosen": -4.936863899230957, "logps/rejected": -4.675816059112549, "loss": 4.4634, "rewards/accuracies": 0.25, "rewards/chosen": -49.36863708496094, "rewards/margins": -2.6104736328125, "rewards/rejected": -46.75816345214844, "step": 6673 }, { "epoch": 0.9087690631808278, "grad_norm": 41.6549988852382, "learning_rate": 2.0115635804990228e-08, "logits/chosen": 14.729822158813477, "logits/rejected": 14.851058959960938, "logps/chosen": -4.462913513183594, "logps/rejected": -4.638492584228516, "loss": 4.1256, "rewards/accuracies": 0.5, "rewards/chosen": -44.62913131713867, "rewards/margins": 1.7557916641235352, "rewards/rejected": -46.384925842285156, "step": 6674 }, { "epoch": 0.9089052287581699, "grad_norm": 45.26665122116407, "learning_rate": 2.0056140470180937e-08, "logits/chosen": 14.552682876586914, "logits/rejected": 14.417192459106445, "logps/chosen": -4.822709083557129, "logps/rejected": -4.928576469421387, "loss": 4.236, "rewards/accuracies": 0.75, "rewards/chosen": -48.227088928222656, "rewards/margins": 1.0586776733398438, "rewards/rejected": -49.2857666015625, "step": 6675 }, { "epoch": 0.909041394335512, "grad_norm": 39.85042255257345, "learning_rate": 1.999673098682737e-08, "logits/chosen": 14.170969009399414, "logits/rejected": 15.001583099365234, "logps/chosen": -4.635040283203125, "logps/rejected": -4.944853782653809, "loss": 4.0853, "rewards/accuracies": 0.75, "rewards/chosen": -46.35040283203125, "rewards/margins": 3.0981359481811523, "rewards/rejected": -49.44853973388672, "step": 6676 }, { "epoch": 0.909177559912854, "grad_norm": 34.241980504058375, "learning_rate": 1.9937407368353588e-08, "logits/chosen": 13.807014465332031, "logits/rejected": 14.478693008422852, "logps/chosen": -4.064448356628418, "logps/rejected": -4.562741279602051, "loss": 3.4409, "rewards/accuracies": 0.75, "rewards/chosen": -40.64448165893555, "rewards/margins": 4.982929229736328, "rewards/rejected": -45.627410888671875, "step": 6677 }, { "epoch": 0.9093137254901961, "grad_norm": 42.87765210052695, "learning_rate": 1.98781696281642e-08, "logits/chosen": 14.697832107543945, "logits/rejected": 14.865301132202148, "logps/chosen": -4.4123430252075195, "logps/rejected": -4.468003273010254, "loss": 4.3417, "rewards/accuracies": 0.75, "rewards/chosen": -44.123435974121094, "rewards/margins": 0.5565958023071289, "rewards/rejected": -44.680030822753906, "step": 6678 }, { "epoch": 0.9094498910675382, "grad_norm": 43.4589829726411, "learning_rate": 1.9819017779644544e-08, "logits/chosen": 13.828777313232422, "logits/rejected": 15.027697563171387, "logps/chosen": -4.122118949890137, "logps/rejected": -4.649411678314209, "loss": 4.1308, "rewards/accuracies": 0.75, "rewards/chosen": -41.22119140625, "rewards/margins": 5.272928237915039, "rewards/rejected": -46.494117736816406, "step": 6679 }, { "epoch": 0.9095860566448801, "grad_norm": 38.19251314129574, "learning_rate": 1.9759951836160416e-08, "logits/chosen": 14.315871238708496, "logits/rejected": 15.115632057189941, "logps/chosen": -4.443115711212158, "logps/rejected": -4.818454742431641, "loss": 3.7978, "rewards/accuracies": 0.75, "rewards/chosen": -44.431156158447266, "rewards/margins": 3.753390312194824, "rewards/rejected": -48.184547424316406, "step": 6680 }, { "epoch": 0.9097222222222222, "grad_norm": 41.08607617662466, "learning_rate": 1.9700971811058253e-08, "logits/chosen": 13.962602615356445, "logits/rejected": 14.77692699432373, "logps/chosen": -4.265133857727051, "logps/rejected": -4.604477882385254, "loss": 3.5991, "rewards/accuracies": 0.75, "rewards/chosen": -42.651336669921875, "rewards/margins": 3.3934383392333984, "rewards/rejected": -46.044776916503906, "step": 6681 }, { "epoch": 0.9098583877995643, "grad_norm": 65.03332660769314, "learning_rate": 1.9642077717665128e-08, "logits/chosen": 14.486157417297363, "logits/rejected": 13.947319984436035, "logps/chosen": -4.960582733154297, "logps/rejected": -4.885175704956055, "loss": 4.5721, "rewards/accuracies": 0.5, "rewards/chosen": -49.605831146240234, "rewards/margins": -0.7540769577026367, "rewards/rejected": -48.85175323486328, "step": 6682 }, { "epoch": 0.9099945533769063, "grad_norm": 40.407378824564525, "learning_rate": 1.9583269569288575e-08, "logits/chosen": 13.807184219360352, "logits/rejected": 15.761478424072266, "logps/chosen": -4.217436790466309, "logps/rejected": -4.8401780128479, "loss": 4.0665, "rewards/accuracies": 1.0, "rewards/chosen": -42.17436981201172, "rewards/margins": 6.227410316467285, "rewards/rejected": -48.40177917480469, "step": 6683 }, { "epoch": 0.9101307189542484, "grad_norm": 45.02049748747809, "learning_rate": 1.9524547379216848e-08, "logits/chosen": 14.3883056640625, "logits/rejected": 13.908073425292969, "logps/chosen": -4.662555694580078, "logps/rejected": -4.438049793243408, "loss": 3.9894, "rewards/accuracies": 0.25, "rewards/chosen": -46.625553131103516, "rewards/margins": -2.245054244995117, "rewards/rejected": -44.38050079345703, "step": 6684 }, { "epoch": 0.9102668845315904, "grad_norm": 51.35007217792168, "learning_rate": 1.9465911160718674e-08, "logits/chosen": 13.700751304626465, "logits/rejected": 14.562773704528809, "logps/chosen": -4.236029624938965, "logps/rejected": -4.631062030792236, "loss": 3.568, "rewards/accuracies": 1.0, "rewards/chosen": -42.36029815673828, "rewards/margins": 3.9503183364868164, "rewards/rejected": -46.31061553955078, "step": 6685 }, { "epoch": 0.9104030501089324, "grad_norm": 38.25388896823731, "learning_rate": 1.9407360927043403e-08, "logits/chosen": 14.572431564331055, "logits/rejected": 14.002555847167969, "logps/chosen": -4.709508895874023, "logps/rejected": -4.7939300537109375, "loss": 3.6558, "rewards/accuracies": 0.5, "rewards/chosen": -47.09508514404297, "rewards/margins": 0.8442106246948242, "rewards/rejected": -47.939300537109375, "step": 6686 }, { "epoch": 0.9105392156862745, "grad_norm": 38.82708115881686, "learning_rate": 1.93488966914209e-08, "logits/chosen": 14.406234741210938, "logits/rejected": 14.637176513671875, "logps/chosen": -4.616641521453857, "logps/rejected": -4.720925331115723, "loss": 3.4082, "rewards/accuracies": 0.75, "rewards/chosen": -46.166412353515625, "rewards/margins": 1.042837142944336, "rewards/rejected": -47.209251403808594, "step": 6687 }, { "epoch": 0.9106753812636166, "grad_norm": 38.686453354636285, "learning_rate": 1.9290518467061712e-08, "logits/chosen": 15.371472358703613, "logits/rejected": 15.395671844482422, "logps/chosen": -5.093179702758789, "logps/rejected": -5.1485066413879395, "loss": 3.79, "rewards/accuracies": 0.5, "rewards/chosen": -50.931793212890625, "rewards/margins": 0.5532684326171875, "rewards/rejected": -51.48506164550781, "step": 6688 }, { "epoch": 0.9108115468409586, "grad_norm": 46.36762598396369, "learning_rate": 1.923222626715688e-08, "logits/chosen": 14.756339073181152, "logits/rejected": 14.788352966308594, "logps/chosen": -5.007007598876953, "logps/rejected": -4.740532398223877, "loss": 4.2639, "rewards/accuracies": 0.0, "rewards/chosen": -50.07007598876953, "rewards/margins": -2.664750099182129, "rewards/rejected": -47.40532684326172, "step": 6689 }, { "epoch": 0.9109477124183006, "grad_norm": 38.851020768355625, "learning_rate": 1.9174020104877965e-08, "logits/chosen": 15.126924514770508, "logits/rejected": 13.649419784545898, "logps/chosen": -4.9746317863464355, "logps/rejected": -4.70156192779541, "loss": 4.0402, "rewards/accuracies": 0.0, "rewards/chosen": -49.746315002441406, "rewards/margins": -2.730696678161621, "rewards/rejected": -47.01561737060547, "step": 6690 }, { "epoch": 0.9110838779956427, "grad_norm": 38.86733928936323, "learning_rate": 1.9115899993377104e-08, "logits/chosen": 13.848287582397461, "logits/rejected": 14.715869903564453, "logps/chosen": -4.8120503425598145, "logps/rejected": -4.954735279083252, "loss": 3.8749, "rewards/accuracies": 0.75, "rewards/chosen": -48.120506286621094, "rewards/margins": 1.4268455505371094, "rewards/rejected": -49.5473518371582, "step": 6691 }, { "epoch": 0.9112200435729847, "grad_norm": 40.93174804414606, "learning_rate": 1.9057865945787133e-08, "logits/chosen": 14.351924896240234, "logits/rejected": 14.941909790039062, "logps/chosen": -4.4026641845703125, "logps/rejected": -4.651601314544678, "loss": 4.0671, "rewards/accuracies": 0.75, "rewards/chosen": -44.026641845703125, "rewards/margins": 2.4893712997436523, "rewards/rejected": -46.516014099121094, "step": 6692 }, { "epoch": 0.9113562091503268, "grad_norm": 39.90634816337801, "learning_rate": 1.89999179752212e-08, "logits/chosen": 14.99413013458252, "logits/rejected": 15.615421295166016, "logps/chosen": -4.8381757736206055, "logps/rejected": -5.094505310058594, "loss": 4.127, "rewards/accuracies": 1.0, "rewards/chosen": -48.38175582885742, "rewards/margins": 2.563295364379883, "rewards/rejected": -50.94505310058594, "step": 6693 }, { "epoch": 0.9114923747276689, "grad_norm": 43.579764454651794, "learning_rate": 1.8942056094773196e-08, "logits/chosen": 14.963214874267578, "logits/rejected": 14.543905258178711, "logps/chosen": -4.609870910644531, "logps/rejected": -4.69120979309082, "loss": 4.2025, "rewards/accuracies": 0.5, "rewards/chosen": -46.09870529174805, "rewards/margins": 0.8133916854858398, "rewards/rejected": -46.9120979309082, "step": 6694 }, { "epoch": 0.911628540305011, "grad_norm": 40.817667071013986, "learning_rate": 1.8884280317517453e-08, "logits/chosen": 14.827702522277832, "logits/rejected": 15.322016716003418, "logps/chosen": -4.553913116455078, "logps/rejected": -4.987846374511719, "loss": 4.1207, "rewards/accuracies": 0.5, "rewards/chosen": -45.53913116455078, "rewards/margins": 4.339326858520508, "rewards/rejected": -49.87846374511719, "step": 6695 }, { "epoch": 0.9117647058823529, "grad_norm": 43.381449039514074, "learning_rate": 1.8826590656508955e-08, "logits/chosen": 14.286949157714844, "logits/rejected": 14.504220962524414, "logps/chosen": -4.626726150512695, "logps/rejected": -4.786688804626465, "loss": 4.1882, "rewards/accuracies": 0.75, "rewards/chosen": -46.26726150512695, "rewards/margins": 1.5996274948120117, "rewards/rejected": -47.86688995361328, "step": 6696 }, { "epoch": 0.911900871459695, "grad_norm": 42.34798754026035, "learning_rate": 1.8768987124783054e-08, "logits/chosen": 14.53094482421875, "logits/rejected": 14.99316120147705, "logps/chosen": -4.586512565612793, "logps/rejected": -4.684614181518555, "loss": 3.4521, "rewards/accuracies": 0.5, "rewards/chosen": -45.8651237487793, "rewards/margins": 0.9810161590576172, "rewards/rejected": -46.84613800048828, "step": 6697 }, { "epoch": 0.9120370370370371, "grad_norm": 51.44593263114024, "learning_rate": 1.8711469735355824e-08, "logits/chosen": 15.368955612182617, "logits/rejected": 15.721224784851074, "logps/chosen": -4.951221466064453, "logps/rejected": -4.6934709548950195, "loss": 4.4546, "rewards/accuracies": 0.25, "rewards/chosen": -49.51221466064453, "rewards/margins": -2.5775041580200195, "rewards/rejected": -46.93470764160156, "step": 6698 }, { "epoch": 0.9121732026143791, "grad_norm": 43.83660977057471, "learning_rate": 1.86540385012238e-08, "logits/chosen": 14.7268648147583, "logits/rejected": 14.98118782043457, "logps/chosen": -4.7958984375, "logps/rejected": -5.021492958068848, "loss": 4.3372, "rewards/accuracies": 0.75, "rewards/chosen": -47.958984375, "rewards/margins": 2.2559404373168945, "rewards/rejected": -50.21492385864258, "step": 6699 }, { "epoch": 0.9123093681917211, "grad_norm": 46.13214225443444, "learning_rate": 1.8596693435363985e-08, "logits/chosen": 14.13222599029541, "logits/rejected": 13.727469444274902, "logps/chosen": -4.445971965789795, "logps/rejected": -4.522481441497803, "loss": 3.8857, "rewards/accuracies": 0.75, "rewards/chosen": -44.459720611572266, "rewards/margins": 0.7650947570800781, "rewards/rejected": -45.224815368652344, "step": 6700 }, { "epoch": 0.9124455337690632, "grad_norm": 35.848091091274206, "learning_rate": 1.8539434550733967e-08, "logits/chosen": 12.918415069580078, "logits/rejected": 14.408086776733398, "logps/chosen": -4.559051513671875, "logps/rejected": -4.8100266456604, "loss": 3.6369, "rewards/accuracies": 0.5, "rewards/chosen": -45.59051513671875, "rewards/margins": 2.509751319885254, "rewards/rejected": -48.10026550292969, "step": 6701 }, { "epoch": 0.9125816993464052, "grad_norm": 41.821585739778854, "learning_rate": 1.848226186027193e-08, "logits/chosen": 13.285126686096191, "logits/rejected": 14.111709594726562, "logps/chosen": -4.309845924377441, "logps/rejected": -4.754433631896973, "loss": 3.5877, "rewards/accuracies": 1.0, "rewards/chosen": -43.09845733642578, "rewards/margins": 4.445880889892578, "rewards/rejected": -47.54433822631836, "step": 6702 }, { "epoch": 0.9127178649237473, "grad_norm": 40.774900820261585, "learning_rate": 1.842517537689652e-08, "logits/chosen": 14.281402587890625, "logits/rejected": 14.916107177734375, "logps/chosen": -4.358415126800537, "logps/rejected": -4.542661666870117, "loss": 3.7699, "rewards/accuracies": 0.75, "rewards/chosen": -43.58414840698242, "rewards/margins": 1.8424711227416992, "rewards/rejected": -45.42662048339844, "step": 6703 }, { "epoch": 0.9128540305010894, "grad_norm": 40.807946468926794, "learning_rate": 1.8368175113506834e-08, "logits/chosen": 14.480512619018555, "logits/rejected": 13.648452758789062, "logps/chosen": -4.572599411010742, "logps/rejected": -4.693899154663086, "loss": 3.8786, "rewards/accuracies": 0.75, "rewards/chosen": -45.725990295410156, "rewards/margins": 1.2129955291748047, "rewards/rejected": -46.938987731933594, "step": 6704 }, { "epoch": 0.9129901960784313, "grad_norm": 41.25801256281276, "learning_rate": 1.8311261082982532e-08, "logits/chosen": 14.157268524169922, "logits/rejected": 14.261289596557617, "logps/chosen": -4.485854148864746, "logps/rejected": -4.726783275604248, "loss": 3.627, "rewards/accuracies": 0.5, "rewards/chosen": -44.85853576660156, "rewards/margins": 2.409295082092285, "rewards/rejected": -47.2678337097168, "step": 6705 }, { "epoch": 0.9131263616557734, "grad_norm": 40.65861577640479, "learning_rate": 1.8254433298183948e-08, "logits/chosen": 14.113595962524414, "logits/rejected": 14.458602905273438, "logps/chosen": -4.440264701843262, "logps/rejected": -4.382683753967285, "loss": 3.6992, "rewards/accuracies": 0.5, "rewards/chosen": -44.40264129638672, "rewards/margins": -0.5758028030395508, "rewards/rejected": -43.82684326171875, "step": 6706 }, { "epoch": 0.9132625272331155, "grad_norm": 39.43297556115268, "learning_rate": 1.8197691771951652e-08, "logits/chosen": 13.502429962158203, "logits/rejected": 14.333898544311523, "logps/chosen": -4.546031475067139, "logps/rejected": -4.620264530181885, "loss": 3.7819, "rewards/accuracies": 0.5, "rewards/chosen": -45.4603157043457, "rewards/margins": 0.7423305511474609, "rewards/rejected": -46.20264434814453, "step": 6707 }, { "epoch": 0.9133986928104575, "grad_norm": 40.089516419589486, "learning_rate": 1.81410365171069e-08, "logits/chosen": 15.379501342773438, "logits/rejected": 15.249211311340332, "logps/chosen": -5.190189838409424, "logps/rejected": -5.303492069244385, "loss": 3.7315, "rewards/accuracies": 0.75, "rewards/chosen": -51.90190124511719, "rewards/margins": 1.1330204010009766, "rewards/rejected": -53.03491973876953, "step": 6708 }, { "epoch": 0.9135348583877996, "grad_norm": 41.55132180979413, "learning_rate": 1.8084467546451452e-08, "logits/chosen": 13.715373992919922, "logits/rejected": 14.497365951538086, "logps/chosen": -4.379607200622559, "logps/rejected": -4.740215301513672, "loss": 4.2446, "rewards/accuracies": 0.75, "rewards/chosen": -43.79607391357422, "rewards/margins": 3.606081008911133, "rewards/rejected": -47.40215301513672, "step": 6709 }, { "epoch": 0.9136710239651417, "grad_norm": 41.22646720392411, "learning_rate": 1.8027984872767488e-08, "logits/chosen": 14.152936935424805, "logits/rejected": 14.707353591918945, "logps/chosen": -4.454685688018799, "logps/rejected": -4.781818866729736, "loss": 3.7138, "rewards/accuracies": 0.75, "rewards/chosen": -44.54685592651367, "rewards/margins": 3.2713308334350586, "rewards/rejected": -47.81818389892578, "step": 6710 }, { "epoch": 0.9138071895424836, "grad_norm": 43.90658154716276, "learning_rate": 1.797158850881777e-08, "logits/chosen": 14.433002471923828, "logits/rejected": 14.768560409545898, "logps/chosen": -4.618204593658447, "logps/rejected": -4.723602294921875, "loss": 3.7389, "rewards/accuracies": 0.5, "rewards/chosen": -46.182044982910156, "rewards/margins": 1.0539798736572266, "rewards/rejected": -47.23602294921875, "step": 6711 }, { "epoch": 0.9139433551198257, "grad_norm": 48.423176076482214, "learning_rate": 1.791527846734553e-08, "logits/chosen": 14.72031307220459, "logits/rejected": 15.5094633102417, "logps/chosen": -5.0324387550354, "logps/rejected": -5.227556228637695, "loss": 3.963, "rewards/accuracies": 0.75, "rewards/chosen": -50.32438659667969, "rewards/margins": 1.9511709213256836, "rewards/rejected": -52.27555847167969, "step": 6712 }, { "epoch": 0.9140795206971678, "grad_norm": 42.50065562406987, "learning_rate": 1.785905476107441e-08, "logits/chosen": 14.414649963378906, "logits/rejected": 14.79471206665039, "logps/chosen": -4.65838098526001, "logps/rejected": -4.923117637634277, "loss": 3.6274, "rewards/accuracies": 1.0, "rewards/chosen": -46.58380889892578, "rewards/margins": 2.647369384765625, "rewards/rejected": -49.231178283691406, "step": 6713 }, { "epoch": 0.9142156862745098, "grad_norm": 43.76930006633924, "learning_rate": 1.7802917402708696e-08, "logits/chosen": 13.92312240600586, "logits/rejected": 14.534712791442871, "logps/chosen": -4.32607364654541, "logps/rejected": -4.768518447875977, "loss": 3.797, "rewards/accuracies": 1.0, "rewards/chosen": -43.26073455810547, "rewards/margins": 4.424447059631348, "rewards/rejected": -47.685184478759766, "step": 6714 }, { "epoch": 0.9143518518518519, "grad_norm": 41.85010605549837, "learning_rate": 1.774686640493308e-08, "logits/chosen": 13.986568450927734, "logits/rejected": 14.496861457824707, "logps/chosen": -4.404998302459717, "logps/rejected": -4.995025634765625, "loss": 3.7752, "rewards/accuracies": 1.0, "rewards/chosen": -44.049983978271484, "rewards/margins": 5.900274276733398, "rewards/rejected": -49.950260162353516, "step": 6715 }, { "epoch": 0.914488017429194, "grad_norm": 41.27668032671389, "learning_rate": 1.7690901780412725e-08, "logits/chosen": 14.737095832824707, "logits/rejected": 15.092412948608398, "logps/chosen": -4.8873186111450195, "logps/rejected": -5.128111362457275, "loss": 3.9172, "rewards/accuracies": 0.75, "rewards/chosen": -48.87318420410156, "rewards/margins": 2.4079294204711914, "rewards/rejected": -51.28111267089844, "step": 6716 }, { "epoch": 0.9146241830065359, "grad_norm": 40.32187977431285, "learning_rate": 1.7635023541793292e-08, "logits/chosen": 14.944141387939453, "logits/rejected": 15.024419784545898, "logps/chosen": -4.860836029052734, "logps/rejected": -5.001791000366211, "loss": 4.176, "rewards/accuracies": 0.75, "rewards/chosen": -48.608360290527344, "rewards/margins": 1.4095458984375, "rewards/rejected": -50.017906188964844, "step": 6717 }, { "epoch": 0.914760348583878, "grad_norm": 42.56638134996496, "learning_rate": 1.7579231701701035e-08, "logits/chosen": 14.793991088867188, "logits/rejected": 15.513692855834961, "logps/chosen": -4.741247177124023, "logps/rejected": -5.022244453430176, "loss": 3.864, "rewards/accuracies": 0.5, "rewards/chosen": -47.4124755859375, "rewards/margins": 2.8099756240844727, "rewards/rejected": -50.222450256347656, "step": 6718 }, { "epoch": 0.9148965141612201, "grad_norm": 44.95002109270664, "learning_rate": 1.7523526272742405e-08, "logits/chosen": 13.958932876586914, "logits/rejected": 14.611984252929688, "logps/chosen": -4.457433700561523, "logps/rejected": -4.747585296630859, "loss": 4.2033, "rewards/accuracies": 1.0, "rewards/chosen": -44.57433319091797, "rewards/margins": 2.901521682739258, "rewards/rejected": -47.47585678100586, "step": 6719 }, { "epoch": 0.9150326797385621, "grad_norm": 43.22442504455347, "learning_rate": 1.7467907267504623e-08, "logits/chosen": 13.439298629760742, "logits/rejected": 14.031906127929688, "logps/chosen": -4.345303535461426, "logps/rejected": -4.464112281799316, "loss": 4.0251, "rewards/accuracies": 0.75, "rewards/chosen": -43.453033447265625, "rewards/margins": 1.188084602355957, "rewards/rejected": -44.64112091064453, "step": 6720 }, { "epoch": 0.9151688453159041, "grad_norm": 44.35018737874633, "learning_rate": 1.7412374698555275e-08, "logits/chosen": 14.217777252197266, "logits/rejected": 14.56861686706543, "logps/chosen": -4.7062177658081055, "logps/rejected": -4.929343223571777, "loss": 4.346, "rewards/accuracies": 0.75, "rewards/chosen": -47.06217575073242, "rewards/margins": 2.231259346008301, "rewards/rejected": -49.293434143066406, "step": 6721 }, { "epoch": 0.9153050108932462, "grad_norm": 57.25186184273717, "learning_rate": 1.735692857844233e-08, "logits/chosen": 15.05665397644043, "logits/rejected": 14.861652374267578, "logps/chosen": -4.863471031188965, "logps/rejected": -4.638768196105957, "loss": 4.2603, "rewards/accuracies": 0.25, "rewards/chosen": -48.63471221923828, "rewards/margins": -2.247030258178711, "rewards/rejected": -46.38768005371094, "step": 6722 }, { "epoch": 0.9154411764705882, "grad_norm": 41.23354841680553, "learning_rate": 1.73015689196943e-08, "logits/chosen": 13.701273918151855, "logits/rejected": 14.397151947021484, "logps/chosen": -4.424712181091309, "logps/rejected": -4.600324630737305, "loss": 3.8169, "rewards/accuracies": 0.75, "rewards/chosen": -44.24712371826172, "rewards/margins": 1.756119728088379, "rewards/rejected": -46.00324249267578, "step": 6723 }, { "epoch": 0.9155773420479303, "grad_norm": 40.24866032631033, "learning_rate": 1.724629573482028e-08, "logits/chosen": 14.852397918701172, "logits/rejected": 15.255749702453613, "logps/chosen": -4.531219482421875, "logps/rejected": -5.039289951324463, "loss": 3.9503, "rewards/accuracies": 0.75, "rewards/chosen": -45.31220245361328, "rewards/margins": 5.080699920654297, "rewards/rejected": -50.39289855957031, "step": 6724 }, { "epoch": 0.9157135076252724, "grad_norm": 39.54251185031297, "learning_rate": 1.7191109036309536e-08, "logits/chosen": 13.42332935333252, "logits/rejected": 13.542121887207031, "logps/chosen": -4.263423919677734, "logps/rejected": -4.533936023712158, "loss": 3.8419, "rewards/accuracies": 0.75, "rewards/chosen": -42.634239196777344, "rewards/margins": 2.705118179321289, "rewards/rejected": -45.33935546875, "step": 6725 }, { "epoch": 0.9158496732026143, "grad_norm": 45.48154358440942, "learning_rate": 1.7136008836632042e-08, "logits/chosen": 14.188180923461914, "logits/rejected": 15.133886337280273, "logps/chosen": -4.487006187438965, "logps/rejected": -4.716798305511475, "loss": 3.965, "rewards/accuracies": 1.0, "rewards/chosen": -44.870059967041016, "rewards/margins": 2.297922134399414, "rewards/rejected": -47.16798400878906, "step": 6726 }, { "epoch": 0.9159858387799564, "grad_norm": 42.89347969158693, "learning_rate": 1.7080995148238152e-08, "logits/chosen": 14.326642036437988, "logits/rejected": 14.835664749145508, "logps/chosen": -4.486357688903809, "logps/rejected": -4.715433597564697, "loss": 3.7088, "rewards/accuracies": 0.5, "rewards/chosen": -44.863582611083984, "rewards/margins": 2.290755271911621, "rewards/rejected": -47.15433883666992, "step": 6727 }, { "epoch": 0.9161220043572985, "grad_norm": 43.786167521772434, "learning_rate": 1.7026067983558635e-08, "logits/chosen": 14.43188190460205, "logits/rejected": 15.10136604309082, "logps/chosen": -4.372806072235107, "logps/rejected": -4.870115280151367, "loss": 4.0125, "rewards/accuracies": 0.75, "rewards/chosen": -43.72806167602539, "rewards/margins": 4.973092079162598, "rewards/rejected": -48.70115280151367, "step": 6728 }, { "epoch": 0.9162581699346405, "grad_norm": 45.31612044064354, "learning_rate": 1.697122735500476e-08, "logits/chosen": 14.732308387756348, "logits/rejected": 15.232099533081055, "logps/chosen": -4.342516899108887, "logps/rejected": -4.725614547729492, "loss": 3.7229, "rewards/accuracies": 1.0, "rewards/chosen": -43.425167083740234, "rewards/margins": 3.830979347229004, "rewards/rejected": -47.25614547729492, "step": 6729 }, { "epoch": 0.9163943355119826, "grad_norm": 40.268609039421996, "learning_rate": 1.691647327496826e-08, "logits/chosen": 14.176493644714355, "logits/rejected": 14.880973815917969, "logps/chosen": -4.495622158050537, "logps/rejected": -4.536306858062744, "loss": 3.4962, "rewards/accuracies": 0.5, "rewards/chosen": -44.95622253417969, "rewards/margins": 0.4068479537963867, "rewards/rejected": -45.36307144165039, "step": 6730 }, { "epoch": 0.9165305010893247, "grad_norm": 48.415396884772846, "learning_rate": 1.686180575582119e-08, "logits/chosen": 14.487730979919434, "logits/rejected": 14.27824878692627, "logps/chosen": -4.512663841247559, "logps/rejected": -4.490056991577148, "loss": 4.3658, "rewards/accuracies": 0.25, "rewards/chosen": -45.12664031982422, "rewards/margins": -0.22606658935546875, "rewards/rejected": -44.90057373046875, "step": 6731 }, { "epoch": 0.9166666666666666, "grad_norm": 50.7459216057353, "learning_rate": 1.680722480991612e-08, "logits/chosen": 14.335134506225586, "logits/rejected": 14.808963775634766, "logps/chosen": -4.730999946594238, "logps/rejected": -4.912758827209473, "loss": 4.32, "rewards/accuracies": 0.5, "rewards/chosen": -47.30999755859375, "rewards/margins": 1.8175907135009766, "rewards/rejected": -49.12759017944336, "step": 6732 }, { "epoch": 0.9168028322440087, "grad_norm": 47.118525018580264, "learning_rate": 1.675273044958616e-08, "logits/chosen": 14.497742652893066, "logits/rejected": 14.879541397094727, "logps/chosen": -4.626609802246094, "logps/rejected": -4.9829864501953125, "loss": 4.3823, "rewards/accuracies": 0.75, "rewards/chosen": -46.26609802246094, "rewards/margins": 3.563767433166504, "rewards/rejected": -49.829864501953125, "step": 6733 }, { "epoch": 0.9169389978213508, "grad_norm": 44.35659718146597, "learning_rate": 1.6698322687144707e-08, "logits/chosen": 14.889957427978516, "logits/rejected": 14.365741729736328, "logps/chosen": -4.439936637878418, "logps/rejected": -4.32230281829834, "loss": 4.4394, "rewards/accuracies": 0.25, "rewards/chosen": -44.39936065673828, "rewards/margins": -1.1763343811035156, "rewards/rejected": -43.22303009033203, "step": 6734 }, { "epoch": 0.9170751633986928, "grad_norm": 53.61066480052035, "learning_rate": 1.664400153488561e-08, "logits/chosen": 15.330438613891602, "logits/rejected": 15.428049087524414, "logps/chosen": -5.0300397872924805, "logps/rejected": -5.2213921546936035, "loss": 3.8766, "rewards/accuracies": 0.75, "rewards/chosen": -50.30039596557617, "rewards/margins": 1.9135265350341797, "rewards/rejected": -52.21392059326172, "step": 6735 }, { "epoch": 0.9172113289760349, "grad_norm": 39.58803816417253, "learning_rate": 1.6589767005083276e-08, "logits/chosen": 13.966545104980469, "logits/rejected": 14.517465591430664, "logps/chosen": -4.079880237579346, "logps/rejected": -4.760558128356934, "loss": 3.5071, "rewards/accuracies": 1.0, "rewards/chosen": -40.798805236816406, "rewards/margins": 6.80677604675293, "rewards/rejected": -47.6055793762207, "step": 6736 }, { "epoch": 0.9173474945533769, "grad_norm": 48.40057326372423, "learning_rate": 1.6535619109992305e-08, "logits/chosen": 14.706247329711914, "logits/rejected": 14.076663970947266, "logps/chosen": -4.701305389404297, "logps/rejected": -4.6523566246032715, "loss": 4.0232, "rewards/accuracies": 0.5, "rewards/chosen": -47.013057708740234, "rewards/margins": -0.4894905090332031, "rewards/rejected": -46.52356719970703, "step": 6737 }, { "epoch": 0.9174836601307189, "grad_norm": 41.50168273397593, "learning_rate": 1.6481557861847973e-08, "logits/chosen": 14.53156852722168, "logits/rejected": 15.15737533569336, "logps/chosen": -4.646544456481934, "logps/rejected": -4.959150314331055, "loss": 3.8397, "rewards/accuracies": 0.5, "rewards/chosen": -46.46544647216797, "rewards/margins": 3.126058578491211, "rewards/rejected": -49.59149932861328, "step": 6738 }, { "epoch": 0.917619825708061, "grad_norm": 41.03290057927514, "learning_rate": 1.642758327286593e-08, "logits/chosen": 13.473488807678223, "logits/rejected": 13.711894989013672, "logps/chosen": -4.445440292358398, "logps/rejected": -4.378737926483154, "loss": 3.6151, "rewards/accuracies": 0.5, "rewards/chosen": -44.45439910888672, "rewards/margins": -0.667022705078125, "rewards/rejected": -43.787376403808594, "step": 6739 }, { "epoch": 0.9177559912854031, "grad_norm": 40.02899836939286, "learning_rate": 1.637369535524198e-08, "logits/chosen": 14.427011489868164, "logits/rejected": 14.736498832702637, "logps/chosen": -4.59568977355957, "logps/rejected": -4.76918888092041, "loss": 3.4852, "rewards/accuracies": 0.75, "rewards/chosen": -45.95690155029297, "rewards/margins": 1.734990119934082, "rewards/rejected": -47.691890716552734, "step": 6740 }, { "epoch": 0.9178921568627451, "grad_norm": 44.52920555957329, "learning_rate": 1.631989412115269e-08, "logits/chosen": 13.620780944824219, "logits/rejected": 14.122085571289062, "logps/chosen": -4.3185834884643555, "logps/rejected": -4.514713764190674, "loss": 3.9711, "rewards/accuracies": 0.5, "rewards/chosen": -43.18583679199219, "rewards/margins": 1.9613008499145508, "rewards/rejected": -45.14714050292969, "step": 6741 }, { "epoch": 0.9180283224400871, "grad_norm": 49.916227872092506, "learning_rate": 1.6266179582754868e-08, "logits/chosen": 13.42918586730957, "logits/rejected": 13.922933578491211, "logps/chosen": -4.316636562347412, "logps/rejected": -4.341921806335449, "loss": 4.4519, "rewards/accuracies": 0.25, "rewards/chosen": -43.16636657714844, "rewards/margins": 0.25284862518310547, "rewards/rejected": -43.41921615600586, "step": 6742 }, { "epoch": 0.9181644880174292, "grad_norm": 43.6778648545696, "learning_rate": 1.62125517521857e-08, "logits/chosen": 14.191356658935547, "logits/rejected": 14.549787521362305, "logps/chosen": -4.443716049194336, "logps/rejected": -4.895580768585205, "loss": 3.3165, "rewards/accuracies": 1.0, "rewards/chosen": -44.437164306640625, "rewards/margins": 4.518644332885742, "rewards/rejected": -48.955810546875, "step": 6743 }, { "epoch": 0.9183006535947712, "grad_norm": 46.73787271359297, "learning_rate": 1.615901064156291e-08, "logits/chosen": 13.593523025512695, "logits/rejected": 13.435396194458008, "logps/chosen": -4.089503288269043, "logps/rejected": -4.15757942199707, "loss": 4.053, "rewards/accuracies": 0.5, "rewards/chosen": -40.89503479003906, "rewards/margins": 0.6807594299316406, "rewards/rejected": -41.57579040527344, "step": 6744 }, { "epoch": 0.9184368191721133, "grad_norm": 40.75589067794002, "learning_rate": 1.6105556262984556e-08, "logits/chosen": 14.489201545715332, "logits/rejected": 14.92846393585205, "logps/chosen": -4.556798934936523, "logps/rejected": -5.077003002166748, "loss": 3.7231, "rewards/accuracies": 1.0, "rewards/chosen": -45.56798553466797, "rewards/margins": 5.202040672302246, "rewards/rejected": -50.7700309753418, "step": 6745 }, { "epoch": 0.9185729847494554, "grad_norm": 46.29820450714252, "learning_rate": 1.6052188628529017e-08, "logits/chosen": 13.927278518676758, "logits/rejected": 14.484935760498047, "logps/chosen": -4.588665962219238, "logps/rejected": -4.739291191101074, "loss": 4.557, "rewards/accuracies": 0.5, "rewards/chosen": -45.886661529541016, "rewards/margins": 1.5062494277954102, "rewards/rejected": -47.392913818359375, "step": 6746 }, { "epoch": 0.9187091503267973, "grad_norm": 43.964968521757044, "learning_rate": 1.599890775025523e-08, "logits/chosen": 14.140157699584961, "logits/rejected": 14.450202941894531, "logps/chosen": -4.222975730895996, "logps/rejected": -4.5796217918396, "loss": 3.7768, "rewards/accuracies": 1.0, "rewards/chosen": -42.229759216308594, "rewards/margins": 3.566460609436035, "rewards/rejected": -45.79621887207031, "step": 6747 }, { "epoch": 0.9188453159041394, "grad_norm": 43.68527912387259, "learning_rate": 1.594571364020245e-08, "logits/chosen": 15.004860877990723, "logits/rejected": 15.183977127075195, "logps/chosen": -4.665379524230957, "logps/rejected": -4.9420695304870605, "loss": 3.9631, "rewards/accuracies": 0.75, "rewards/chosen": -46.65379333496094, "rewards/margins": 2.7669029235839844, "rewards/rejected": -49.420692443847656, "step": 6748 }, { "epoch": 0.9189814814814815, "grad_norm": 48.82324288848725, "learning_rate": 1.5892606310390266e-08, "logits/chosen": 14.38282299041748, "logits/rejected": 15.771282196044922, "logps/chosen": -4.781447410583496, "logps/rejected": -5.238945484161377, "loss": 4.2146, "rewards/accuracies": 1.0, "rewards/chosen": -47.81446838378906, "rewards/margins": 4.574982643127441, "rewards/rejected": -52.38945388793945, "step": 6749 }, { "epoch": 0.9191176470588235, "grad_norm": 44.00155275863371, "learning_rate": 1.583958577281872e-08, "logits/chosen": 14.801801681518555, "logits/rejected": 14.909854888916016, "logps/chosen": -4.511538505554199, "logps/rejected": -4.7479352951049805, "loss": 3.8806, "rewards/accuracies": 1.0, "rewards/chosen": -45.115386962890625, "rewards/margins": 2.363962173461914, "rewards/rejected": -47.479347229003906, "step": 6750 }, { "epoch": 0.9192538126361656, "grad_norm": 50.793516817686424, "learning_rate": 1.5786652039468317e-08, "logits/chosen": 14.787588119506836, "logits/rejected": 14.529532432556152, "logps/chosen": -4.914092063903809, "logps/rejected": -4.900106906890869, "loss": 4.0166, "rewards/accuracies": 0.5, "rewards/chosen": -49.14092254638672, "rewards/margins": -0.13985538482666016, "rewards/rejected": -49.001068115234375, "step": 6751 }, { "epoch": 0.9193899782135077, "grad_norm": 42.59819657595761, "learning_rate": 1.5733805122299803e-08, "logits/chosen": 14.148103713989258, "logits/rejected": 14.522682189941406, "logps/chosen": -4.587550640106201, "logps/rejected": -4.7668962478637695, "loss": 3.2267, "rewards/accuracies": 0.75, "rewards/chosen": -45.87550354003906, "rewards/margins": 1.793461799621582, "rewards/rejected": -47.668968200683594, "step": 6752 }, { "epoch": 0.9195261437908496, "grad_norm": 47.05560866892509, "learning_rate": 1.5681045033254382e-08, "logits/chosen": 14.611883163452148, "logits/rejected": 14.81380844116211, "logps/chosen": -4.56358003616333, "logps/rejected": -4.583906650543213, "loss": 4.3663, "rewards/accuracies": 0.5, "rewards/chosen": -45.635799407958984, "rewards/margins": 0.20327091217041016, "rewards/rejected": -45.83906936645508, "step": 6753 }, { "epoch": 0.9196623093681917, "grad_norm": 43.644988905292266, "learning_rate": 1.5628371784253713e-08, "logits/chosen": 14.37531566619873, "logits/rejected": 14.337075233459473, "logps/chosen": -4.458844184875488, "logps/rejected": -4.672021389007568, "loss": 3.6571, "rewards/accuracies": 0.5, "rewards/chosen": -44.588443756103516, "rewards/margins": 2.131768226623535, "rewards/rejected": -46.72021484375, "step": 6754 }, { "epoch": 0.9197984749455338, "grad_norm": 45.57851972606284, "learning_rate": 1.557578538719966e-08, "logits/chosen": 14.512767791748047, "logits/rejected": 15.072948455810547, "logps/chosen": -4.841028213500977, "logps/rejected": -5.053831100463867, "loss": 3.9657, "rewards/accuracies": 0.75, "rewards/chosen": -48.41028594970703, "rewards/margins": 2.128026008605957, "rewards/rejected": -50.538307189941406, "step": 6755 }, { "epoch": 0.9199346405228758, "grad_norm": 44.692863806081114, "learning_rate": 1.5523285853974532e-08, "logits/chosen": 15.06356430053711, "logits/rejected": 15.145618438720703, "logps/chosen": -4.766707897186279, "logps/rejected": -4.833586692810059, "loss": 4.6117, "rewards/accuracies": 0.5, "rewards/chosen": -47.66707992553711, "rewards/margins": 0.6687898635864258, "rewards/rejected": -48.33586883544922, "step": 6756 }, { "epoch": 0.9200708061002179, "grad_norm": 37.505846207592704, "learning_rate": 1.5470873196441157e-08, "logits/chosen": 14.31532096862793, "logits/rejected": 15.016408920288086, "logps/chosen": -4.775619983673096, "logps/rejected": -5.1004438400268555, "loss": 4.0564, "rewards/accuracies": 1.0, "rewards/chosen": -47.75619888305664, "rewards/margins": 3.2482385635375977, "rewards/rejected": -51.00444030761719, "step": 6757 }, { "epoch": 0.9202069716775599, "grad_norm": 42.51440506126448, "learning_rate": 1.5418547426442465e-08, "logits/chosen": 14.228719711303711, "logits/rejected": 14.444573402404785, "logps/chosen": -4.674832344055176, "logps/rejected": -4.6006317138671875, "loss": 4.0139, "rewards/accuracies": 0.5, "rewards/chosen": -46.748321533203125, "rewards/margins": -0.7420034408569336, "rewards/rejected": -46.006317138671875, "step": 6758 }, { "epoch": 0.9203431372549019, "grad_norm": 43.73487751274473, "learning_rate": 1.536630855580201e-08, "logits/chosen": 13.324379920959473, "logits/rejected": 14.612096786499023, "logps/chosen": -4.299986362457275, "logps/rejected": -4.646446228027344, "loss": 4.5725, "rewards/accuracies": 0.75, "rewards/chosen": -42.9998664855957, "rewards/margins": 3.4646005630493164, "rewards/rejected": -46.4644660949707, "step": 6759 }, { "epoch": 0.920479302832244, "grad_norm": 39.483854155790205, "learning_rate": 1.5314156596323557e-08, "logits/chosen": 14.416642189025879, "logits/rejected": 14.319658279418945, "logps/chosen": -4.546533584594727, "logps/rejected": -4.668818950653076, "loss": 3.277, "rewards/accuracies": 0.75, "rewards/chosen": -45.46533203125, "rewards/margins": 1.2228574752807617, "rewards/rejected": -46.68819046020508, "step": 6760 }, { "epoch": 0.9206154684095861, "grad_norm": 41.279709510569006, "learning_rate": 1.5262091559791234e-08, "logits/chosen": 14.641911506652832, "logits/rejected": 15.338605880737305, "logps/chosen": -4.477970123291016, "logps/rejected": -4.558156490325928, "loss": 3.8815, "rewards/accuracies": 0.75, "rewards/chosen": -44.779701232910156, "rewards/margins": 0.8018674850463867, "rewards/rejected": -45.581565856933594, "step": 6761 }, { "epoch": 0.920751633986928, "grad_norm": 43.14494322689173, "learning_rate": 1.5210113457969587e-08, "logits/chosen": 13.784461975097656, "logits/rejected": 14.775447845458984, "logps/chosen": -4.543388366699219, "logps/rejected": -4.675769805908203, "loss": 4.0538, "rewards/accuracies": 0.5, "rewards/chosen": -45.43388366699219, "rewards/margins": 1.3238162994384766, "rewards/rejected": -46.75769805908203, "step": 6762 }, { "epoch": 0.9208877995642701, "grad_norm": 46.731212436941924, "learning_rate": 1.5158222302603573e-08, "logits/chosen": 13.906042098999023, "logits/rejected": 15.198945045471191, "logps/chosen": -4.431740760803223, "logps/rejected": -4.933783531188965, "loss": 3.4129, "rewards/accuracies": 1.0, "rewards/chosen": -44.317413330078125, "rewards/margins": 5.02042293548584, "rewards/rejected": -49.337833404541016, "step": 6763 }, { "epoch": 0.9210239651416122, "grad_norm": 48.03616677058671, "learning_rate": 1.5106418105418307e-08, "logits/chosen": 14.56702995300293, "logits/rejected": 13.996644020080566, "logps/chosen": -4.850089073181152, "logps/rejected": -4.685321807861328, "loss": 4.1504, "rewards/accuracies": 0.5, "rewards/chosen": -48.500892639160156, "rewards/margins": -1.6476726531982422, "rewards/rejected": -46.85321807861328, "step": 6764 }, { "epoch": 0.9211601307189542, "grad_norm": 40.84891132590205, "learning_rate": 1.5054700878119442e-08, "logits/chosen": 14.0270414352417, "logits/rejected": 14.260913848876953, "logps/chosen": -4.6758575439453125, "logps/rejected": -4.9003472328186035, "loss": 3.6062, "rewards/accuracies": 0.75, "rewards/chosen": -46.75857162475586, "rewards/margins": 2.244898796081543, "rewards/rejected": -49.00347137451172, "step": 6765 }, { "epoch": 0.9212962962962963, "grad_norm": 44.35638204317317, "learning_rate": 1.5003070632392924e-08, "logits/chosen": 14.803614616394043, "logits/rejected": 14.863611221313477, "logps/chosen": -4.735879898071289, "logps/rejected": -4.893359184265137, "loss": 3.8922, "rewards/accuracies": 0.75, "rewards/chosen": -47.35879898071289, "rewards/margins": 1.5747900009155273, "rewards/rejected": -48.933589935302734, "step": 6766 }, { "epoch": 0.9214324618736384, "grad_norm": 42.54697143585407, "learning_rate": 1.4951527379904973e-08, "logits/chosen": 13.955964088439941, "logits/rejected": 14.27647876739502, "logps/chosen": -4.547407150268555, "logps/rejected": -4.701155662536621, "loss": 3.9236, "rewards/accuracies": 0.5, "rewards/chosen": -45.47407150268555, "rewards/margins": 1.537485122680664, "rewards/rejected": -47.01155471801758, "step": 6767 }, { "epoch": 0.9215686274509803, "grad_norm": 42.56700768880189, "learning_rate": 1.4900071132302272e-08, "logits/chosen": 13.477507591247559, "logits/rejected": 14.194658279418945, "logps/chosen": -4.081647872924805, "logps/rejected": -4.418722152709961, "loss": 3.4503, "rewards/accuracies": 0.75, "rewards/chosen": -40.81648254394531, "rewards/margins": 3.3707408905029297, "rewards/rejected": -44.187217712402344, "step": 6768 }, { "epoch": 0.9217047930283224, "grad_norm": 40.69099136994903, "learning_rate": 1.4848701901211835e-08, "logits/chosen": 14.161725997924805, "logits/rejected": 14.947760581970215, "logps/chosen": -4.288275241851807, "logps/rejected": -4.625242710113525, "loss": 4.1488, "rewards/accuracies": 0.5, "rewards/chosen": -42.882755279541016, "rewards/margins": 3.3696727752685547, "rewards/rejected": -46.25242614746094, "step": 6769 }, { "epoch": 0.9218409586056645, "grad_norm": 39.942572889931206, "learning_rate": 1.479741969824082e-08, "logits/chosen": 14.773813247680664, "logits/rejected": 14.3272066116333, "logps/chosen": -4.466123104095459, "logps/rejected": -4.487545490264893, "loss": 3.9774, "rewards/accuracies": 0.5, "rewards/chosen": -44.661231994628906, "rewards/margins": 0.21422290802001953, "rewards/rejected": -44.87545394897461, "step": 6770 }, { "epoch": 0.9219771241830066, "grad_norm": 38.03026277394239, "learning_rate": 1.4746224534976936e-08, "logits/chosen": 14.221427917480469, "logits/rejected": 14.6093111038208, "logps/chosen": -4.5460004806518555, "logps/rejected": -4.622384071350098, "loss": 3.9059, "rewards/accuracies": 0.75, "rewards/chosen": -45.46000671386719, "rewards/margins": 0.7638301849365234, "rewards/rejected": -46.223838806152344, "step": 6771 }, { "epoch": 0.9221132897603486, "grad_norm": 41.707762382880794, "learning_rate": 1.4695116422988219e-08, "logits/chosen": 13.796150207519531, "logits/rejected": 14.295923233032227, "logps/chosen": -4.528137683868408, "logps/rejected": -4.760712623596191, "loss": 3.4889, "rewards/accuracies": 0.75, "rewards/chosen": -45.28137969970703, "rewards/margins": 2.325747489929199, "rewards/rejected": -47.60712432861328, "step": 6772 }, { "epoch": 0.9222494553376906, "grad_norm": 44.915823672562766, "learning_rate": 1.4644095373822851e-08, "logits/chosen": 14.00808334350586, "logits/rejected": 14.243919372558594, "logps/chosen": -4.196809768676758, "logps/rejected": -4.837044715881348, "loss": 4.4, "rewards/accuracies": 1.0, "rewards/chosen": -41.96809768676758, "rewards/margins": 6.402349472045898, "rewards/rejected": -48.370445251464844, "step": 6773 }, { "epoch": 0.9223856209150327, "grad_norm": 43.389796687727916, "learning_rate": 1.4593161399009523e-08, "logits/chosen": 14.242349624633789, "logits/rejected": 14.496246337890625, "logps/chosen": -4.42404842376709, "logps/rejected": -4.661789894104004, "loss": 3.9538, "rewards/accuracies": 0.25, "rewards/chosen": -44.24048614501953, "rewards/margins": 2.3774166107177734, "rewards/rejected": -46.61790466308594, "step": 6774 }, { "epoch": 0.9225217864923747, "grad_norm": 40.30604116842655, "learning_rate": 1.4542314510057207e-08, "logits/chosen": 14.147954940795898, "logits/rejected": 14.537179946899414, "logps/chosen": -4.468812465667725, "logps/rejected": -4.5650553703308105, "loss": 4.0749, "rewards/accuracies": 0.75, "rewards/chosen": -44.68812561035156, "rewards/margins": 0.9624290466308594, "rewards/rejected": -45.65055465698242, "step": 6775 }, { "epoch": 0.9226579520697168, "grad_norm": 42.619381704495, "learning_rate": 1.4491554718455157e-08, "logits/chosen": 14.697086334228516, "logits/rejected": 15.47967529296875, "logps/chosen": -4.746865749359131, "logps/rejected": -5.19781494140625, "loss": 3.2954, "rewards/accuracies": 0.75, "rewards/chosen": -47.468658447265625, "rewards/margins": 4.509489059448242, "rewards/rejected": -51.9781494140625, "step": 6776 }, { "epoch": 0.9227941176470589, "grad_norm": 45.19852615597026, "learning_rate": 1.4440882035672907e-08, "logits/chosen": 14.110578536987305, "logits/rejected": 14.930730819702148, "logps/chosen": -4.348196983337402, "logps/rejected": -5.051416873931885, "loss": 4.501, "rewards/accuracies": 1.0, "rewards/chosen": -43.48196792602539, "rewards/margins": 7.03220272064209, "rewards/rejected": -50.51416778564453, "step": 6777 }, { "epoch": 0.9229302832244008, "grad_norm": 45.780503703258205, "learning_rate": 1.43902964731605e-08, "logits/chosen": 13.254469871520996, "logits/rejected": 13.682064056396484, "logps/chosen": -4.149381637573242, "logps/rejected": -4.398719787597656, "loss": 3.2073, "rewards/accuracies": 1.0, "rewards/chosen": -41.493812561035156, "rewards/margins": 2.493380546569824, "rewards/rejected": -43.9871940612793, "step": 6778 }, { "epoch": 0.9230664488017429, "grad_norm": 44.68901190285503, "learning_rate": 1.4339798042348039e-08, "logits/chosen": 14.555624008178711, "logits/rejected": 14.754658699035645, "logps/chosen": -4.642744064331055, "logps/rejected": -4.751301288604736, "loss": 3.5147, "rewards/accuracies": 0.5, "rewards/chosen": -46.42744445800781, "rewards/margins": 1.0855712890625, "rewards/rejected": -47.51301574707031, "step": 6779 }, { "epoch": 0.923202614379085, "grad_norm": 42.18387941053631, "learning_rate": 1.4289386754646126e-08, "logits/chosen": 14.293551445007324, "logits/rejected": 14.626333236694336, "logps/chosen": -4.431190013885498, "logps/rejected": -4.696178436279297, "loss": 3.6014, "rewards/accuracies": 0.5, "rewards/chosen": -44.3119010925293, "rewards/margins": 2.6498842239379883, "rewards/rejected": -46.96178436279297, "step": 6780 }, { "epoch": 0.923338779956427, "grad_norm": 45.029050820387006, "learning_rate": 1.4239062621445608e-08, "logits/chosen": 14.58551025390625, "logits/rejected": 14.55484390258789, "logps/chosen": -4.547111511230469, "logps/rejected": -4.641956329345703, "loss": 3.7327, "rewards/accuracies": 0.5, "rewards/chosen": -45.47111511230469, "rewards/margins": 0.9484443664550781, "rewards/rejected": -46.419559478759766, "step": 6781 }, { "epoch": 0.9234749455337691, "grad_norm": 40.671470879688556, "learning_rate": 1.418882565411761e-08, "logits/chosen": 14.019132614135742, "logits/rejected": 15.257169723510742, "logps/chosen": -4.447349548339844, "logps/rejected": -4.891264915466309, "loss": 3.9718, "rewards/accuracies": 0.75, "rewards/chosen": -44.47349548339844, "rewards/margins": 4.439154624938965, "rewards/rejected": -48.91264724731445, "step": 6782 }, { "epoch": 0.9236111111111112, "grad_norm": 40.980271557765846, "learning_rate": 1.4138675864013583e-08, "logits/chosen": 14.961189270019531, "logits/rejected": 15.608867645263672, "logps/chosen": -4.797214984893799, "logps/rejected": -5.220175743103027, "loss": 4.0862, "rewards/accuracies": 0.75, "rewards/chosen": -47.972145080566406, "rewards/margins": 4.229611396789551, "rewards/rejected": -52.201759338378906, "step": 6783 }, { "epoch": 0.9237472766884531, "grad_norm": 40.320080063411154, "learning_rate": 1.4088613262465355e-08, "logits/chosen": 14.757402420043945, "logits/rejected": 15.244171142578125, "logps/chosen": -4.631191730499268, "logps/rejected": -5.033164024353027, "loss": 3.4714, "rewards/accuracies": 0.75, "rewards/chosen": -46.311920166015625, "rewards/margins": 4.019722938537598, "rewards/rejected": -50.331642150878906, "step": 6784 }, { "epoch": 0.9238834422657952, "grad_norm": 37.80221384663999, "learning_rate": 1.4038637860784897e-08, "logits/chosen": 13.510490417480469, "logits/rejected": 14.337658882141113, "logps/chosen": -4.304315090179443, "logps/rejected": -4.476646423339844, "loss": 3.9667, "rewards/accuracies": 0.75, "rewards/chosen": -43.04315185546875, "rewards/margins": 1.7233142852783203, "rewards/rejected": -44.76646423339844, "step": 6785 }, { "epoch": 0.9240196078431373, "grad_norm": 39.28245621442695, "learning_rate": 1.3988749670264554e-08, "logits/chosen": 13.914095878601074, "logits/rejected": 14.004167556762695, "logps/chosen": -4.375773906707764, "logps/rejected": -4.5246171951293945, "loss": 4.1213, "rewards/accuracies": 0.5, "rewards/chosen": -43.75774002075195, "rewards/margins": 1.4884328842163086, "rewards/rejected": -45.24617004394531, "step": 6786 }, { "epoch": 0.9241557734204793, "grad_norm": 43.98209433215757, "learning_rate": 1.393894870217709e-08, "logits/chosen": 14.417642593383789, "logits/rejected": 14.91773796081543, "logps/chosen": -4.507721900939941, "logps/rejected": -4.883083820343018, "loss": 3.5354, "rewards/accuracies": 0.75, "rewards/chosen": -45.07721710205078, "rewards/margins": 3.75362491607666, "rewards/rejected": -48.830841064453125, "step": 6787 }, { "epoch": 0.9242919389978214, "grad_norm": 40.675462898519534, "learning_rate": 1.3889234967775409e-08, "logits/chosen": 14.171514511108398, "logits/rejected": 14.45113754272461, "logps/chosen": -4.718999862670898, "logps/rejected": -5.022078037261963, "loss": 3.7732, "rewards/accuracies": 0.5, "rewards/chosen": -47.18999481201172, "rewards/margins": 3.03078556060791, "rewards/rejected": -50.22077941894531, "step": 6788 }, { "epoch": 0.9244281045751634, "grad_norm": 41.39736059309218, "learning_rate": 1.3839608478292664e-08, "logits/chosen": 14.346257209777832, "logits/rejected": 14.347829818725586, "logps/chosen": -4.669155120849609, "logps/rejected": -4.6030378341674805, "loss": 3.6551, "rewards/accuracies": 0.5, "rewards/chosen": -46.691551208496094, "rewards/margins": -0.6611766815185547, "rewards/rejected": -46.03037643432617, "step": 6789 }, { "epoch": 0.9245642701525054, "grad_norm": 44.13531176604979, "learning_rate": 1.3790069244942415e-08, "logits/chosen": 13.950839042663574, "logits/rejected": 14.424667358398438, "logps/chosen": -4.515474319458008, "logps/rejected": -4.619921684265137, "loss": 3.7735, "rewards/accuracies": 0.5, "rewards/chosen": -45.15474319458008, "rewards/margins": 1.0444707870483398, "rewards/rejected": -46.199214935302734, "step": 6790 }, { "epoch": 0.9247004357298475, "grad_norm": 43.14034303783659, "learning_rate": 1.3740617278918509e-08, "logits/chosen": 14.436854362487793, "logits/rejected": 15.524492263793945, "logps/chosen": -4.51909065246582, "logps/rejected": -4.882948875427246, "loss": 4.0235, "rewards/accuracies": 1.0, "rewards/chosen": -45.19091033935547, "rewards/margins": 3.6385812759399414, "rewards/rejected": -48.829490661621094, "step": 6791 }, { "epoch": 0.9248366013071896, "grad_norm": 41.044784506396695, "learning_rate": 1.3691252591394897e-08, "logits/chosen": 14.551023483276367, "logits/rejected": 15.504243850708008, "logps/chosen": -4.611681938171387, "logps/rejected": -4.6921491622924805, "loss": 4.2915, "rewards/accuracies": 0.5, "rewards/chosen": -46.116817474365234, "rewards/margins": 0.8046760559082031, "rewards/rejected": -46.92149353027344, "step": 6792 }, { "epoch": 0.9249727668845316, "grad_norm": 37.87842059996649, "learning_rate": 1.3641975193526079e-08, "logits/chosen": 14.880453109741211, "logits/rejected": 15.119278907775879, "logps/chosen": -4.503069877624512, "logps/rejected": -4.835857391357422, "loss": 4.0431, "rewards/accuracies": 0.75, "rewards/chosen": -45.03070068359375, "rewards/margins": 3.327871322631836, "rewards/rejected": -48.35857009887695, "step": 6793 }, { "epoch": 0.9251089324618736, "grad_norm": 47.891162695302214, "learning_rate": 1.3592785096446613e-08, "logits/chosen": 13.557901382446289, "logits/rejected": 15.202877044677734, "logps/chosen": -4.578295707702637, "logps/rejected": -4.712339401245117, "loss": 3.5264, "rewards/accuracies": 0.5, "rewards/chosen": -45.782958984375, "rewards/margins": 1.3404350280761719, "rewards/rejected": -47.123390197753906, "step": 6794 }, { "epoch": 0.9252450980392157, "grad_norm": 51.94735615425791, "learning_rate": 1.3543682311271476e-08, "logits/chosen": 13.758283615112305, "logits/rejected": 14.422635078430176, "logps/chosen": -4.13035774230957, "logps/rejected": -4.73408317565918, "loss": 4.0386, "rewards/accuracies": 1.0, "rewards/chosen": -41.3035774230957, "rewards/margins": 6.037252426147461, "rewards/rejected": -47.3408317565918, "step": 6795 }, { "epoch": 0.9253812636165577, "grad_norm": 40.56903870782872, "learning_rate": 1.3494666849095748e-08, "logits/chosen": 13.794588088989258, "logits/rejected": 13.251556396484375, "logps/chosen": -4.21988582611084, "logps/rejected": -4.171363830566406, "loss": 3.8327, "rewards/accuracies": 0.5, "rewards/chosen": -42.19886016845703, "rewards/margins": -0.48522281646728516, "rewards/rejected": -41.71363830566406, "step": 6796 }, { "epoch": 0.9255174291938998, "grad_norm": 39.36379930490999, "learning_rate": 1.3445738720994925e-08, "logits/chosen": 14.578933715820312, "logits/rejected": 15.243785858154297, "logps/chosen": -4.557867527008057, "logps/rejected": -4.883699417114258, "loss": 3.827, "rewards/accuracies": 0.75, "rewards/chosen": -45.57867431640625, "rewards/margins": 3.2583208084106445, "rewards/rejected": -48.83699417114258, "step": 6797 }, { "epoch": 0.9256535947712419, "grad_norm": 46.14070468700627, "learning_rate": 1.3396897938024788e-08, "logits/chosen": 14.073331832885742, "logits/rejected": 14.560345649719238, "logps/chosen": -4.635652542114258, "logps/rejected": -4.9270405769348145, "loss": 3.5317, "rewards/accuracies": 0.5, "rewards/chosen": -46.35652542114258, "rewards/margins": 2.91387939453125, "rewards/rejected": -49.27040481567383, "step": 6798 }, { "epoch": 0.9257897603485838, "grad_norm": 41.57024774064826, "learning_rate": 1.3348144511221216e-08, "logits/chosen": 15.126147270202637, "logits/rejected": 14.304495811462402, "logps/chosen": -5.112196922302246, "logps/rejected": -4.912537574768066, "loss": 3.6692, "rewards/accuracies": 0.25, "rewards/chosen": -51.12196731567383, "rewards/margins": -1.9965906143188477, "rewards/rejected": -49.12537384033203, "step": 6799 }, { "epoch": 0.9259259259259259, "grad_norm": 44.78352399047249, "learning_rate": 1.3299478451600465e-08, "logits/chosen": 15.462804794311523, "logits/rejected": 15.222739219665527, "logps/chosen": -4.865962505340576, "logps/rejected": -5.1308794021606445, "loss": 3.3757, "rewards/accuracies": 0.75, "rewards/chosen": -48.65962219238281, "rewards/margins": 2.6491737365722656, "rewards/rejected": -51.308799743652344, "step": 6800 }, { "epoch": 0.926062091503268, "grad_norm": 49.830393861163046, "learning_rate": 1.3250899770159074e-08, "logits/chosen": 14.119539260864258, "logits/rejected": 14.76306438446045, "logps/chosen": -4.387840270996094, "logps/rejected": -4.695355415344238, "loss": 4.4822, "rewards/accuracies": 0.75, "rewards/chosen": -43.87840270996094, "rewards/margins": 3.0751514434814453, "rewards/rejected": -46.953556060791016, "step": 6801 }, { "epoch": 0.92619825708061, "grad_norm": 47.99410376389737, "learning_rate": 1.3202408477873816e-08, "logits/chosen": 14.931583404541016, "logits/rejected": 15.332601547241211, "logps/chosen": -4.497415065765381, "logps/rejected": -4.467586517333984, "loss": 3.8277, "rewards/accuracies": 0.5, "rewards/chosen": -44.974151611328125, "rewards/margins": -0.29828643798828125, "rewards/rejected": -44.675865173339844, "step": 6802 }, { "epoch": 0.9263344226579521, "grad_norm": 43.292148162232294, "learning_rate": 1.3154004585701662e-08, "logits/chosen": 14.020215034484863, "logits/rejected": 14.930686950683594, "logps/chosen": -4.523578643798828, "logps/rejected": -5.009856224060059, "loss": 3.7263, "rewards/accuracies": 1.0, "rewards/chosen": -45.23578643798828, "rewards/margins": 4.862776756286621, "rewards/rejected": -50.09856414794922, "step": 6803 }, { "epoch": 0.9264705882352942, "grad_norm": 46.95905644020762, "learning_rate": 1.3105688104579814e-08, "logits/chosen": 13.882823944091797, "logits/rejected": 14.184961318969727, "logps/chosen": -4.795082092285156, "logps/rejected": -4.942070007324219, "loss": 4.5735, "rewards/accuracies": 0.75, "rewards/chosen": -47.95082092285156, "rewards/margins": 1.469879150390625, "rewards/rejected": -49.42070007324219, "step": 6804 }, { "epoch": 0.9266067538126361, "grad_norm": 44.166825659228365, "learning_rate": 1.30574590454259e-08, "logits/chosen": 13.882579803466797, "logits/rejected": 15.44838809967041, "logps/chosen": -4.36814022064209, "logps/rejected": -4.963750839233398, "loss": 4.1691, "rewards/accuracies": 1.0, "rewards/chosen": -43.681396484375, "rewards/margins": 5.9561052322387695, "rewards/rejected": -49.63750457763672, "step": 6805 }, { "epoch": 0.9267429193899782, "grad_norm": 44.939833658219186, "learning_rate": 1.30093174191376e-08, "logits/chosen": 14.074735641479492, "logits/rejected": 14.765722274780273, "logps/chosen": -4.576440811157227, "logps/rejected": -4.643331050872803, "loss": 4.2098, "rewards/accuracies": 0.5, "rewards/chosen": -45.764408111572266, "rewards/margins": 0.6689023971557617, "rewards/rejected": -46.433311462402344, "step": 6806 }, { "epoch": 0.9268790849673203, "grad_norm": 39.242110836822135, "learning_rate": 1.296126323659288e-08, "logits/chosen": 15.34489917755127, "logits/rejected": 14.971809387207031, "logps/chosen": -4.9505815505981445, "logps/rejected": -5.131821632385254, "loss": 3.8377, "rewards/accuracies": 0.75, "rewards/chosen": -49.50581359863281, "rewards/margins": 1.812403678894043, "rewards/rejected": -51.318214416503906, "step": 6807 }, { "epoch": 0.9270152505446623, "grad_norm": 42.43443897454243, "learning_rate": 1.2913296508650117e-08, "logits/chosen": 14.967772483825684, "logits/rejected": 14.447614669799805, "logps/chosen": -4.378334045410156, "logps/rejected": -4.544655799865723, "loss": 4.1012, "rewards/accuracies": 0.75, "rewards/chosen": -43.78334045410156, "rewards/margins": 1.663217544555664, "rewards/rejected": -45.446556091308594, "step": 6808 }, { "epoch": 0.9271514161220044, "grad_norm": 42.474215611348484, "learning_rate": 1.2865417246147626e-08, "logits/chosen": 14.351016998291016, "logits/rejected": 15.170116424560547, "logps/chosen": -4.410572052001953, "logps/rejected": -4.868417263031006, "loss": 3.4532, "rewards/accuracies": 1.0, "rewards/chosen": -44.105716705322266, "rewards/margins": 4.578451156616211, "rewards/rejected": -48.68416976928711, "step": 6809 }, { "epoch": 0.9272875816993464, "grad_norm": 41.67131090677845, "learning_rate": 1.2817625459904214e-08, "logits/chosen": 13.792198181152344, "logits/rejected": 13.262957572937012, "logps/chosen": -4.35244083404541, "logps/rejected": -4.212399959564209, "loss": 4.0096, "rewards/accuracies": 0.5, "rewards/chosen": -43.52440643310547, "rewards/margins": -1.4004087448120117, "rewards/rejected": -42.124000549316406, "step": 6810 }, { "epoch": 0.9274237472766884, "grad_norm": 45.97516848446598, "learning_rate": 1.2769921160718845e-08, "logits/chosen": 14.919551849365234, "logits/rejected": 15.393167495727539, "logps/chosen": -4.717129230499268, "logps/rejected": -4.763381004333496, "loss": 3.8741, "rewards/accuracies": 0.5, "rewards/chosen": -47.171295166015625, "rewards/margins": 0.46251678466796875, "rewards/rejected": -47.633811950683594, "step": 6811 }, { "epoch": 0.9275599128540305, "grad_norm": 50.053489536971, "learning_rate": 1.2722304359370628e-08, "logits/chosen": 14.240507125854492, "logits/rejected": 13.94351863861084, "logps/chosen": -4.743525505065918, "logps/rejected": -4.5304951667785645, "loss": 3.3497, "rewards/accuracies": 0.5, "rewards/chosen": -47.43525695800781, "rewards/margins": -2.130303382873535, "rewards/rejected": -45.30495071411133, "step": 6812 }, { "epoch": 0.9276960784313726, "grad_norm": 46.684657929640004, "learning_rate": 1.2674775066619003e-08, "logits/chosen": 14.936235427856445, "logits/rejected": 14.154708862304688, "logps/chosen": -4.745233058929443, "logps/rejected": -4.846002578735352, "loss": 3.3303, "rewards/accuracies": 0.5, "rewards/chosen": -47.45233154296875, "rewards/margins": 1.0076923370361328, "rewards/rejected": -48.46002197265625, "step": 6813 }, { "epoch": 0.9278322440087146, "grad_norm": 44.255169939812426, "learning_rate": 1.2627333293203646e-08, "logits/chosen": 14.248638153076172, "logits/rejected": 14.567628860473633, "logps/chosen": -4.531932830810547, "logps/rejected": -4.744208335876465, "loss": 3.7588, "rewards/accuracies": 0.75, "rewards/chosen": -45.319332122802734, "rewards/margins": 2.122756004333496, "rewards/rejected": -47.44208526611328, "step": 6814 }, { "epoch": 0.9279684095860566, "grad_norm": 41.832607010711016, "learning_rate": 1.2579979049844336e-08, "logits/chosen": 14.743961334228516, "logits/rejected": 14.43391227722168, "logps/chosen": -4.747077941894531, "logps/rejected": -4.798740863800049, "loss": 3.6282, "rewards/accuracies": 0.75, "rewards/chosen": -47.47077941894531, "rewards/margins": 0.5166263580322266, "rewards/rejected": -47.98740768432617, "step": 6815 }, { "epoch": 0.9281045751633987, "grad_norm": 41.47783958109193, "learning_rate": 1.2532712347241226e-08, "logits/chosen": 14.534767150878906, "logits/rejected": 14.673720359802246, "logps/chosen": -4.323418617248535, "logps/rejected": -4.8644585609436035, "loss": 4.2292, "rewards/accuracies": 0.75, "rewards/chosen": -43.234188079833984, "rewards/margins": 5.410398483276367, "rewards/rejected": -48.64458465576172, "step": 6816 }, { "epoch": 0.9282407407407407, "grad_norm": 37.63626769444746, "learning_rate": 1.2485533196074661e-08, "logits/chosen": 14.44247055053711, "logits/rejected": 15.180987358093262, "logps/chosen": -4.676215171813965, "logps/rejected": -4.9397430419921875, "loss": 3.6513, "rewards/accuracies": 0.75, "rewards/chosen": -46.762149810791016, "rewards/margins": 2.635282516479492, "rewards/rejected": -49.39743423461914, "step": 6817 }, { "epoch": 0.9283769063180828, "grad_norm": 37.440202689889595, "learning_rate": 1.2438441607005046e-08, "logits/chosen": 14.411815643310547, "logits/rejected": 14.854447364807129, "logps/chosen": -4.406407833099365, "logps/rejected": -4.499478340148926, "loss": 3.612, "rewards/accuracies": 0.75, "rewards/chosen": -44.06407928466797, "rewards/margins": 0.9307060241699219, "rewards/rejected": -44.99478530883789, "step": 6818 }, { "epoch": 0.9285130718954249, "grad_norm": 40.76723355889497, "learning_rate": 1.2391437590673116e-08, "logits/chosen": 14.13672161102295, "logits/rejected": 14.734207153320312, "logps/chosen": -4.32354736328125, "logps/rejected": -4.561983108520508, "loss": 3.771, "rewards/accuracies": 0.75, "rewards/chosen": -43.2354736328125, "rewards/margins": 2.3843603134155273, "rewards/rejected": -45.619834899902344, "step": 6819 }, { "epoch": 0.9286492374727668, "grad_norm": 44.307224570174476, "learning_rate": 1.2344521157699972e-08, "logits/chosen": 14.719215393066406, "logits/rejected": 14.847542762756348, "logps/chosen": -4.4958415031433105, "logps/rejected": -4.646947383880615, "loss": 4.1131, "rewards/accuracies": 0.5, "rewards/chosen": -44.958412170410156, "rewards/margins": 1.5110588073730469, "rewards/rejected": -46.46947479248047, "step": 6820 }, { "epoch": 0.9287854030501089, "grad_norm": 41.685173295697695, "learning_rate": 1.2297692318686604e-08, "logits/chosen": 14.561368942260742, "logits/rejected": 15.660706520080566, "logps/chosen": -4.532009601593018, "logps/rejected": -5.010805606842041, "loss": 3.6209, "rewards/accuracies": 1.0, "rewards/chosen": -45.320091247558594, "rewards/margins": 4.787965774536133, "rewards/rejected": -50.10805892944336, "step": 6821 }, { "epoch": 0.928921568627451, "grad_norm": 43.25112414987553, "learning_rate": 1.2250951084214412e-08, "logits/chosen": 13.683297157287598, "logits/rejected": 14.689645767211914, "logps/chosen": -4.54910135269165, "logps/rejected": -4.7684526443481445, "loss": 3.9069, "rewards/accuracies": 0.75, "rewards/chosen": -45.49101257324219, "rewards/margins": 2.1935129165649414, "rewards/rejected": -47.68452453613281, "step": 6822 }, { "epoch": 0.929057734204793, "grad_norm": 45.89844940251257, "learning_rate": 1.220429746484508e-08, "logits/chosen": 13.78103256225586, "logits/rejected": 14.262883186340332, "logps/chosen": -4.134953498840332, "logps/rejected": -4.617044925689697, "loss": 3.9214, "rewards/accuracies": 1.0, "rewards/chosen": -41.34953308105469, "rewards/margins": 4.820918083190918, "rewards/rejected": -46.17045211791992, "step": 6823 }, { "epoch": 0.9291938997821351, "grad_norm": 43.299341876577834, "learning_rate": 1.2157731471120181e-08, "logits/chosen": 14.273653030395508, "logits/rejected": 14.84665584564209, "logps/chosen": -4.936089515686035, "logps/rejected": -5.115044593811035, "loss": 4.4478, "rewards/accuracies": 0.75, "rewards/chosen": -49.36089324951172, "rewards/margins": 1.789555549621582, "rewards/rejected": -51.150447845458984, "step": 6824 }, { "epoch": 0.9293300653594772, "grad_norm": 41.86711933093271, "learning_rate": 1.2111253113561826e-08, "logits/chosen": 14.816075325012207, "logits/rejected": 14.209869384765625, "logps/chosen": -4.637279987335205, "logps/rejected": -4.554003715515137, "loss": 4.1989, "rewards/accuracies": 0.25, "rewards/chosen": -46.372802734375, "rewards/margins": -0.8327608108520508, "rewards/rejected": -45.5400390625, "step": 6825 }, { "epoch": 0.9294662309368191, "grad_norm": 38.61319958898135, "learning_rate": 1.2064862402672194e-08, "logits/chosen": 14.644523620605469, "logits/rejected": 14.787063598632812, "logps/chosen": -4.639031887054443, "logps/rejected": -4.832553386688232, "loss": 3.4997, "rewards/accuracies": 0.5, "rewards/chosen": -46.39031982421875, "rewards/margins": 1.9352140426635742, "rewards/rejected": -48.325531005859375, "step": 6826 }, { "epoch": 0.9296023965141612, "grad_norm": 43.082348151595106, "learning_rate": 1.2018559348933565e-08, "logits/chosen": 13.964320182800293, "logits/rejected": 14.741704940795898, "logps/chosen": -4.524424076080322, "logps/rejected": -4.770031929016113, "loss": 3.5781, "rewards/accuracies": 0.75, "rewards/chosen": -45.244239807128906, "rewards/margins": 2.4560766220092773, "rewards/rejected": -47.7003173828125, "step": 6827 }, { "epoch": 0.9297385620915033, "grad_norm": 41.20293250439699, "learning_rate": 1.197234396280855e-08, "logits/chosen": 14.042362213134766, "logits/rejected": 13.51156997680664, "logps/chosen": -4.473462104797363, "logps/rejected": -4.2088623046875, "loss": 3.5567, "rewards/accuracies": 0.25, "rewards/chosen": -44.734619140625, "rewards/margins": -2.6459951400756836, "rewards/rejected": -42.088623046875, "step": 6828 }, { "epoch": 0.9298747276688453, "grad_norm": 42.138354097666145, "learning_rate": 1.1926216254739908e-08, "logits/chosen": 14.43367862701416, "logits/rejected": 14.675575256347656, "logps/chosen": -4.891173362731934, "logps/rejected": -4.890615463256836, "loss": 3.9418, "rewards/accuracies": 0.5, "rewards/chosen": -48.91173553466797, "rewards/margins": -0.005580902099609375, "rewards/rejected": -48.906150817871094, "step": 6829 }, { "epoch": 0.9300108932461874, "grad_norm": 45.11817823176374, "learning_rate": 1.1880176235150542e-08, "logits/chosen": 14.366971969604492, "logits/rejected": 13.971528053283691, "logps/chosen": -4.550327301025391, "logps/rejected": -4.591680526733398, "loss": 4.0927, "rewards/accuracies": 0.5, "rewards/chosen": -45.503273010253906, "rewards/margins": 0.4135313034057617, "rewards/rejected": -45.916805267333984, "step": 6830 }, { "epoch": 0.9301470588235294, "grad_norm": 44.681426105640504, "learning_rate": 1.1834223914443553e-08, "logits/chosen": 13.927125930786133, "logits/rejected": 14.095054626464844, "logps/chosen": -4.277551174163818, "logps/rejected": -4.527559280395508, "loss": 3.6046, "rewards/accuracies": 1.0, "rewards/chosen": -42.7755126953125, "rewards/margins": 2.5000762939453125, "rewards/rejected": -45.27558898925781, "step": 6831 }, { "epoch": 0.9302832244008714, "grad_norm": 41.88068510857336, "learning_rate": 1.1788359303002326e-08, "logits/chosen": 14.854968070983887, "logits/rejected": 14.327155113220215, "logps/chosen": -4.414320945739746, "logps/rejected": -4.469879150390625, "loss": 3.9414, "rewards/accuracies": 0.5, "rewards/chosen": -44.14320755004883, "rewards/margins": 0.5555820465087891, "rewards/rejected": -44.69879150390625, "step": 6832 }, { "epoch": 0.9304193899782135, "grad_norm": 39.4223502545656, "learning_rate": 1.1742582411190305e-08, "logits/chosen": 14.249805450439453, "logits/rejected": 13.96307373046875, "logps/chosen": -4.1873369216918945, "logps/rejected": -4.416213035583496, "loss": 3.6381, "rewards/accuracies": 0.75, "rewards/chosen": -41.87337112426758, "rewards/margins": 2.288760185241699, "rewards/rejected": -44.162132263183594, "step": 6833 }, { "epoch": 0.9305555555555556, "grad_norm": 39.82868262563326, "learning_rate": 1.1696893249351125e-08, "logits/chosen": 15.326889038085938, "logits/rejected": 15.375059127807617, "logps/chosen": -5.020552158355713, "logps/rejected": -4.928747177124023, "loss": 3.6222, "rewards/accuracies": 0.25, "rewards/chosen": -50.20552062988281, "rewards/margins": -0.9180488586425781, "rewards/rejected": -49.287471771240234, "step": 6834 }, { "epoch": 0.9306917211328976, "grad_norm": 46.071701816968364, "learning_rate": 1.1651291827808662e-08, "logits/chosen": 14.353043556213379, "logits/rejected": 14.077020645141602, "logps/chosen": -4.303255558013916, "logps/rejected": -4.383349895477295, "loss": 4.1453, "rewards/accuracies": 0.75, "rewards/chosen": -43.032554626464844, "rewards/margins": 0.8009443283081055, "rewards/rejected": -43.833499908447266, "step": 6835 }, { "epoch": 0.9308278867102396, "grad_norm": 47.19782232073604, "learning_rate": 1.1605778156866942e-08, "logits/chosen": 15.72195816040039, "logits/rejected": 15.205635070800781, "logps/chosen": -4.5485429763793945, "logps/rejected": -5.012184143066406, "loss": 3.8737, "rewards/accuracies": 1.0, "rewards/chosen": -45.48542785644531, "rewards/margins": 4.636415481567383, "rewards/rejected": -50.12184143066406, "step": 6836 }, { "epoch": 0.9309640522875817, "grad_norm": 41.49731258729726, "learning_rate": 1.1560352246810135e-08, "logits/chosen": 14.414050102233887, "logits/rejected": 14.95100212097168, "logps/chosen": -4.450064659118652, "logps/rejected": -4.50115966796875, "loss": 4.1485, "rewards/accuracies": 0.5, "rewards/chosen": -44.50064468383789, "rewards/margins": 0.5109519958496094, "rewards/rejected": -45.0115966796875, "step": 6837 }, { "epoch": 0.9311002178649237, "grad_norm": 43.96908167509617, "learning_rate": 1.1515014107902654e-08, "logits/chosen": 14.553272247314453, "logits/rejected": 14.897878646850586, "logps/chosen": -4.840450286865234, "logps/rejected": -4.893021106719971, "loss": 4.2123, "rewards/accuracies": 0.5, "rewards/chosen": -48.40450668334961, "rewards/margins": 0.5257072448730469, "rewards/rejected": -48.930213928222656, "step": 6838 }, { "epoch": 0.9312363834422658, "grad_norm": 37.0761067400311, "learning_rate": 1.1469763750388973e-08, "logits/chosen": 14.577103614807129, "logits/rejected": 15.11781120300293, "logps/chosen": -4.5175065994262695, "logps/rejected": -4.986605167388916, "loss": 3.7441, "rewards/accuracies": 1.0, "rewards/chosen": -45.17506408691406, "rewards/margins": 4.6909894943237305, "rewards/rejected": -49.866050720214844, "step": 6839 }, { "epoch": 0.9313725490196079, "grad_norm": 40.78544356046149, "learning_rate": 1.1424601184493753e-08, "logits/chosen": 15.534957885742188, "logits/rejected": 15.359867095947266, "logps/chosen": -5.032819747924805, "logps/rejected": -4.865486145019531, "loss": 3.8273, "rewards/accuracies": 0.0, "rewards/chosen": -50.32820129394531, "rewards/margins": -1.6733427047729492, "rewards/rejected": -48.65486145019531, "step": 6840 }, { "epoch": 0.9315087145969498, "grad_norm": 41.365862228982174, "learning_rate": 1.1379526420421947e-08, "logits/chosen": 13.691926956176758, "logits/rejected": 14.106900215148926, "logps/chosen": -4.599076747894287, "logps/rejected": -4.593097686767578, "loss": 4.1494, "rewards/accuracies": 0.5, "rewards/chosen": -45.99076843261719, "rewards/margins": -0.059790611267089844, "rewards/rejected": -45.93097686767578, "step": 6841 }, { "epoch": 0.9316448801742919, "grad_norm": 44.64912935903031, "learning_rate": 1.1334539468358473e-08, "logits/chosen": 14.588306427001953, "logits/rejected": 14.823476791381836, "logps/chosen": -4.576956272125244, "logps/rejected": -4.4984450340271, "loss": 4.3158, "rewards/accuracies": 0.25, "rewards/chosen": -45.769561767578125, "rewards/margins": -0.7851142883300781, "rewards/rejected": -44.98444747924805, "step": 6842 }, { "epoch": 0.931781045751634, "grad_norm": 43.246502604436856, "learning_rate": 1.128964033846853e-08, "logits/chosen": 14.072507858276367, "logits/rejected": 15.03666877746582, "logps/chosen": -4.498435020446777, "logps/rejected": -4.817373275756836, "loss": 3.9082, "rewards/accuracies": 1.0, "rewards/chosen": -44.984352111816406, "rewards/margins": 3.1893815994262695, "rewards/rejected": -48.173736572265625, "step": 6843 }, { "epoch": 0.931917211328976, "grad_norm": 45.624677349476926, "learning_rate": 1.1244829040897564e-08, "logits/chosen": 14.556089401245117, "logits/rejected": 14.340676307678223, "logps/chosen": -4.78104305267334, "logps/rejected": -4.769586563110352, "loss": 4.3026, "rewards/accuracies": 0.5, "rewards/chosen": -47.81043243408203, "rewards/margins": -0.11456298828125, "rewards/rejected": -47.69586944580078, "step": 6844 }, { "epoch": 0.9320533769063181, "grad_norm": 43.357364236356815, "learning_rate": 1.1200105585770847e-08, "logits/chosen": 15.078008651733398, "logits/rejected": 14.940640449523926, "logps/chosen": -4.676421165466309, "logps/rejected": -4.795801639556885, "loss": 4.286, "rewards/accuracies": 0.75, "rewards/chosen": -46.76420593261719, "rewards/margins": 1.193807601928711, "rewards/rejected": -47.95801544189453, "step": 6845 }, { "epoch": 0.9321895424836601, "grad_norm": 44.30037210193184, "learning_rate": 1.1155469983194165e-08, "logits/chosen": 14.746892929077148, "logits/rejected": 15.30131721496582, "logps/chosen": -4.35122013092041, "logps/rejected": -4.989719390869141, "loss": 4.1347, "rewards/accuracies": 1.0, "rewards/chosen": -43.512203216552734, "rewards/margins": 6.3849945068359375, "rewards/rejected": -49.897193908691406, "step": 6846 }, { "epoch": 0.9323257080610022, "grad_norm": 42.32357507688563, "learning_rate": 1.1110922243253318e-08, "logits/chosen": 13.860211372375488, "logits/rejected": 14.659642219543457, "logps/chosen": -4.228435516357422, "logps/rejected": -4.629107475280762, "loss": 4.4499, "rewards/accuracies": 1.0, "rewards/chosen": -42.28435516357422, "rewards/margins": 4.006718635559082, "rewards/rejected": -46.291072845458984, "step": 6847 }, { "epoch": 0.9324618736383442, "grad_norm": 41.58590408012931, "learning_rate": 1.1066462376014118e-08, "logits/chosen": 14.635881423950195, "logits/rejected": 15.40927505493164, "logps/chosen": -4.669826984405518, "logps/rejected": -4.981544494628906, "loss": 3.9989, "rewards/accuracies": 1.0, "rewards/chosen": -46.698272705078125, "rewards/margins": 3.1171741485595703, "rewards/rejected": -49.81544494628906, "step": 6848 }, { "epoch": 0.9325980392156863, "grad_norm": 41.98850973369946, "learning_rate": 1.1022090391522709e-08, "logits/chosen": 14.519847869873047, "logits/rejected": 14.145217895507812, "logps/chosen": -4.7471723556518555, "logps/rejected": -4.856633186340332, "loss": 4.0825, "rewards/accuracies": 0.75, "rewards/chosen": -47.47172546386719, "rewards/margins": 1.094609260559082, "rewards/rejected": -48.56633377075195, "step": 6849 }, { "epoch": 0.9327342047930284, "grad_norm": 41.1582710172295, "learning_rate": 1.0977806299805292e-08, "logits/chosen": 14.848958015441895, "logits/rejected": 15.307422637939453, "logps/chosen": -4.5570268630981445, "logps/rejected": -4.90714693069458, "loss": 3.8594, "rewards/accuracies": 1.0, "rewards/chosen": -45.57026672363281, "rewards/margins": 3.5011978149414062, "rewards/rejected": -49.07146453857422, "step": 6850 }, { "epoch": 0.9328703703703703, "grad_norm": 47.29533507128946, "learning_rate": 1.0933610110868263e-08, "logits/chosen": 14.491256713867188, "logits/rejected": 15.716161727905273, "logps/chosen": -4.526363372802734, "logps/rejected": -4.813316822052002, "loss": 4.3503, "rewards/accuracies": 0.75, "rewards/chosen": -45.263633728027344, "rewards/margins": 2.869532585144043, "rewards/rejected": -48.1331672668457, "step": 6851 }, { "epoch": 0.9330065359477124, "grad_norm": 42.87610591773992, "learning_rate": 1.0889501834698033e-08, "logits/chosen": 14.83009147644043, "logits/rejected": 15.12741470336914, "logps/chosen": -4.845651149749756, "logps/rejected": -4.929164409637451, "loss": 3.7055, "rewards/accuracies": 0.75, "rewards/chosen": -48.456512451171875, "rewards/margins": 0.8351345062255859, "rewards/rejected": -49.29164505004883, "step": 6852 }, { "epoch": 0.9331427015250545, "grad_norm": 43.03782517946119, "learning_rate": 1.0845481481261343e-08, "logits/chosen": 14.584213256835938, "logits/rejected": 14.364288330078125, "logps/chosen": -4.637338638305664, "logps/rejected": -4.552652359008789, "loss": 3.8436, "rewards/accuracies": 0.5, "rewards/chosen": -46.373390197753906, "rewards/margins": -0.8468637466430664, "rewards/rejected": -45.526527404785156, "step": 6853 }, { "epoch": 0.9332788671023965, "grad_norm": 42.17009450915577, "learning_rate": 1.0801549060504855e-08, "logits/chosen": 13.458466529846191, "logits/rejected": 14.954694747924805, "logps/chosen": -4.418586730957031, "logps/rejected": -4.648889064788818, "loss": 4.0297, "rewards/accuracies": 0.75, "rewards/chosen": -44.18586349487305, "rewards/margins": 2.3030290603637695, "rewards/rejected": -46.4888916015625, "step": 6854 }, { "epoch": 0.9334150326797386, "grad_norm": 42.6439066809783, "learning_rate": 1.075770458235552e-08, "logits/chosen": 15.143668174743652, "logits/rejected": 15.051082611083984, "logps/chosen": -4.769762992858887, "logps/rejected": -4.774326324462891, "loss": 3.8071, "rewards/accuracies": 0.25, "rewards/chosen": -47.69762420654297, "rewards/margins": 0.045635223388671875, "rewards/rejected": -47.743263244628906, "step": 6855 }, { "epoch": 0.9335511982570807, "grad_norm": 43.186825113778674, "learning_rate": 1.071394805672039e-08, "logits/chosen": 13.929965019226074, "logits/rejected": 14.559854507446289, "logps/chosen": -4.492833137512207, "logps/rejected": -4.638473033905029, "loss": 4.1788, "rewards/accuracies": 0.5, "rewards/chosen": -44.92832946777344, "rewards/margins": 1.4563961029052734, "rewards/rejected": -46.384727478027344, "step": 6856 }, { "epoch": 0.9336873638344226, "grad_norm": 41.54437815987252, "learning_rate": 1.0670279493486489e-08, "logits/chosen": 14.837064743041992, "logits/rejected": 14.960268020629883, "logps/chosen": -4.815189361572266, "logps/rejected": -4.7327165603637695, "loss": 3.502, "rewards/accuracies": 0.5, "rewards/chosen": -48.15188980102539, "rewards/margins": -0.8247222900390625, "rewards/rejected": -47.32716751098633, "step": 6857 }, { "epoch": 0.9338235294117647, "grad_norm": 54.89694434673775, "learning_rate": 1.062669890252117e-08, "logits/chosen": 14.466697692871094, "logits/rejected": 14.329699516296387, "logps/chosen": -4.920681953430176, "logps/rejected": -4.744776725769043, "loss": 4.1137, "rewards/accuracies": 0.0, "rewards/chosen": -49.20682144165039, "rewards/margins": -1.759054183959961, "rewards/rejected": -47.44776916503906, "step": 6858 }, { "epoch": 0.9339596949891068, "grad_norm": 44.28896447731792, "learning_rate": 1.0583206293671887e-08, "logits/chosen": 14.177772521972656, "logits/rejected": 14.616545677185059, "logps/chosen": -4.436943054199219, "logps/rejected": -4.566309452056885, "loss": 3.9724, "rewards/accuracies": 0.5, "rewards/chosen": -44.36943054199219, "rewards/margins": 1.2936649322509766, "rewards/rejected": -45.66309356689453, "step": 6859 }, { "epoch": 0.9340958605664488, "grad_norm": 40.487797566789496, "learning_rate": 1.0539801676766068e-08, "logits/chosen": 14.277473449707031, "logits/rejected": 14.782121658325195, "logps/chosen": -4.60321044921875, "logps/rejected": -4.641080379486084, "loss": 3.9378, "rewards/accuracies": 0.25, "rewards/chosen": -46.0321044921875, "rewards/margins": 0.37869739532470703, "rewards/rejected": -46.410804748535156, "step": 6860 }, { "epoch": 0.9342320261437909, "grad_norm": 42.30878305456616, "learning_rate": 1.0496485061611338e-08, "logits/chosen": 13.859188079833984, "logits/rejected": 14.514060020446777, "logps/chosen": -4.547896385192871, "logps/rejected": -4.688656806945801, "loss": 3.9159, "rewards/accuracies": 0.5, "rewards/chosen": -45.478965759277344, "rewards/margins": 1.4076080322265625, "rewards/rejected": -46.88656997680664, "step": 6861 }, { "epoch": 0.934368191721133, "grad_norm": 40.06880670781742, "learning_rate": 1.0453256457995552e-08, "logits/chosen": 14.875210762023926, "logits/rejected": 14.91059398651123, "logps/chosen": -4.52433443069458, "logps/rejected": -4.709494113922119, "loss": 3.659, "rewards/accuracies": 0.75, "rewards/chosen": -45.243343353271484, "rewards/margins": 1.8516016006469727, "rewards/rejected": -47.09494400024414, "step": 6862 }, { "epoch": 0.9345043572984749, "grad_norm": 44.5884086526111, "learning_rate": 1.041011587568641e-08, "logits/chosen": 14.22126579284668, "logits/rejected": 14.469697952270508, "logps/chosen": -4.829853057861328, "logps/rejected": -4.799950122833252, "loss": 3.3702, "rewards/accuracies": 0.5, "rewards/chosen": -48.29853057861328, "rewards/margins": -0.2990274429321289, "rewards/rejected": -47.99950408935547, "step": 6863 }, { "epoch": 0.934640522875817, "grad_norm": 45.498790327770614, "learning_rate": 1.0367063324432023e-08, "logits/chosen": 14.65110969543457, "logits/rejected": 15.023660659790039, "logps/chosen": -4.946689605712891, "logps/rejected": -4.910632133483887, "loss": 4.0521, "rewards/accuracies": 0.25, "rewards/chosen": -49.466896057128906, "rewards/margins": -0.36057472229003906, "rewards/rejected": -49.1063232421875, "step": 6864 }, { "epoch": 0.9347766884531591, "grad_norm": 41.178509988267294, "learning_rate": 1.0324098813960435e-08, "logits/chosen": 14.117189407348633, "logits/rejected": 14.783201217651367, "logps/chosen": -4.434813499450684, "logps/rejected": -4.874578952789307, "loss": 3.952, "rewards/accuracies": 0.75, "rewards/chosen": -44.34813690185547, "rewards/margins": 4.39765739440918, "rewards/rejected": -48.745792388916016, "step": 6865 }, { "epoch": 0.9349128540305011, "grad_norm": 41.12683176458967, "learning_rate": 1.0281222353979746e-08, "logits/chosen": 14.831857681274414, "logits/rejected": 15.004317283630371, "logps/chosen": -4.9436421394348145, "logps/rejected": -4.856261730194092, "loss": 3.6708, "rewards/accuracies": 0.5, "rewards/chosen": -49.43642044067383, "rewards/margins": -0.8738059997558594, "rewards/rejected": -48.56261444091797, "step": 6866 }, { "epoch": 0.9350490196078431, "grad_norm": 48.658995391575, "learning_rate": 1.0238433954178338e-08, "logits/chosen": 13.542425155639648, "logits/rejected": 14.27503776550293, "logps/chosen": -4.2186479568481445, "logps/rejected": -4.510953903198242, "loss": 4.166, "rewards/accuracies": 0.75, "rewards/chosen": -42.18647766113281, "rewards/margins": 2.923063278198242, "rewards/rejected": -45.10954284667969, "step": 6867 }, { "epoch": 0.9351851851851852, "grad_norm": 46.88448004155229, "learning_rate": 1.0195733624224611e-08, "logits/chosen": 14.65890121459961, "logits/rejected": 14.710172653198242, "logps/chosen": -4.704077243804932, "logps/rejected": -4.7129716873168945, "loss": 4.0034, "rewards/accuracies": 0.5, "rewards/chosen": -47.040771484375, "rewards/margins": 0.08894157409667969, "rewards/rejected": -47.12971496582031, "step": 6868 }, { "epoch": 0.9353213507625272, "grad_norm": 45.20634142853858, "learning_rate": 1.0153121373766982e-08, "logits/chosen": 14.753766059875488, "logits/rejected": 14.919416427612305, "logps/chosen": -4.567616939544678, "logps/rejected": -4.757887363433838, "loss": 3.2929, "rewards/accuracies": 0.75, "rewards/chosen": -45.676170349121094, "rewards/margins": 1.9027061462402344, "rewards/rejected": -47.57887649536133, "step": 6869 }, { "epoch": 0.9354575163398693, "grad_norm": 42.23726720178839, "learning_rate": 1.0110597212434102e-08, "logits/chosen": 14.805512428283691, "logits/rejected": 14.90770149230957, "logps/chosen": -4.695461273193359, "logps/rejected": -4.868034362792969, "loss": 3.4182, "rewards/accuracies": 0.75, "rewards/chosen": -46.954612731933594, "rewards/margins": 1.7257347106933594, "rewards/rejected": -48.68034744262695, "step": 6870 }, { "epoch": 0.9355936819172114, "grad_norm": 47.2946841789118, "learning_rate": 1.0068161149834687e-08, "logits/chosen": 14.206809997558594, "logits/rejected": 14.39288330078125, "logps/chosen": -4.5460124015808105, "logps/rejected": -4.823742389678955, "loss": 4.2051, "rewards/accuracies": 0.5, "rewards/chosen": -45.46012496948242, "rewards/margins": 2.7772979736328125, "rewards/rejected": -48.237422943115234, "step": 6871 }, { "epoch": 0.9357298474945533, "grad_norm": 41.57136684853464, "learning_rate": 1.002581319555742e-08, "logits/chosen": 14.88151741027832, "logits/rejected": 15.821741104125977, "logps/chosen": -4.783535003662109, "logps/rejected": -5.218263626098633, "loss": 3.5909, "rewards/accuracies": 1.0, "rewards/chosen": -47.835350036621094, "rewards/margins": 4.347285270690918, "rewards/rejected": -52.18263244628906, "step": 6872 }, { "epoch": 0.9358660130718954, "grad_norm": 41.29491260068738, "learning_rate": 9.983553359171225e-09, "logits/chosen": 14.739368438720703, "logits/rejected": 15.175424575805664, "logps/chosen": -4.425223350524902, "logps/rejected": -4.95692253112793, "loss": 3.9386, "rewards/accuracies": 1.0, "rewards/chosen": -44.252235412597656, "rewards/margins": 5.316990852355957, "rewards/rejected": -49.56922912597656, "step": 6873 }, { "epoch": 0.9360021786492375, "grad_norm": 42.14439149395716, "learning_rate": 9.941381650225089e-09, "logits/chosen": 13.816999435424805, "logits/rejected": 15.207093238830566, "logps/chosen": -4.808359622955322, "logps/rejected": -4.919498920440674, "loss": 4.1363, "rewards/accuracies": 0.5, "rewards/chosen": -48.083595275878906, "rewards/margins": 1.1113929748535156, "rewards/rejected": -49.19499206542969, "step": 6874 }, { "epoch": 0.9361383442265795, "grad_norm": 45.72646154010182, "learning_rate": 9.899298078247965e-09, "logits/chosen": 14.662151336669922, "logits/rejected": 14.687002182006836, "logps/chosen": -4.618411540985107, "logps/rejected": -4.592325210571289, "loss": 4.1921, "rewards/accuracies": 0.75, "rewards/chosen": -46.184120178222656, "rewards/margins": -0.2608661651611328, "rewards/rejected": -45.92325210571289, "step": 6875 }, { "epoch": 0.9362745098039216, "grad_norm": 42.50743570301805, "learning_rate": 9.857302652749088e-09, "logits/chosen": 14.470312118530273, "logits/rejected": 15.636731147766113, "logps/chosen": -4.467907905578613, "logps/rejected": -4.893316268920898, "loss": 4.0798, "rewards/accuracies": 0.75, "rewards/chosen": -44.679080963134766, "rewards/margins": 4.254082679748535, "rewards/rejected": -48.933162689208984, "step": 6876 }, { "epoch": 0.9364106753812637, "grad_norm": 42.47081582849322, "learning_rate": 9.815395383217628e-09, "logits/chosen": 14.04140853881836, "logits/rejected": 14.653924942016602, "logps/chosen": -4.586031913757324, "logps/rejected": -4.820502281188965, "loss": 3.7896, "rewards/accuracies": 1.0, "rewards/chosen": -45.86031723022461, "rewards/margins": 2.34470272064209, "rewards/rejected": -48.20501708984375, "step": 6877 }, { "epoch": 0.9365468409586056, "grad_norm": 41.54746459708711, "learning_rate": 9.773576279122852e-09, "logits/chosen": 15.131757736206055, "logits/rejected": 15.141615867614746, "logps/chosen": -5.001343727111816, "logps/rejected": -4.762801170349121, "loss": 4.0795, "rewards/accuracies": 0.5, "rewards/chosen": -50.01343536376953, "rewards/margins": -2.3854246139526367, "rewards/rejected": -47.628013610839844, "step": 6878 }, { "epoch": 0.9366830065359477, "grad_norm": 39.68123340025826, "learning_rate": 9.73184534991418e-09, "logits/chosen": 14.945575714111328, "logits/rejected": 14.958456039428711, "logps/chosen": -4.855557441711426, "logps/rejected": -4.848855972290039, "loss": 3.6595, "rewards/accuracies": 0.5, "rewards/chosen": -48.55557632446289, "rewards/margins": -0.0670175552368164, "rewards/rejected": -48.48855972290039, "step": 6879 }, { "epoch": 0.9368191721132898, "grad_norm": 40.30237088405677, "learning_rate": 9.690202605021092e-09, "logits/chosen": 13.878363609313965, "logits/rejected": 14.145366668701172, "logps/chosen": -4.3919782638549805, "logps/rejected": -4.674375534057617, "loss": 3.6436, "rewards/accuracies": 1.0, "rewards/chosen": -43.91978073120117, "rewards/margins": 2.823976516723633, "rewards/rejected": -46.74375534057617, "step": 6880 }, { "epoch": 0.9369553376906318, "grad_norm": 46.61602758246859, "learning_rate": 9.648648053852993e-09, "logits/chosen": 14.494006156921387, "logits/rejected": 15.185625076293945, "logps/chosen": -4.694488048553467, "logps/rejected": -4.961798191070557, "loss": 4.518, "rewards/accuracies": 0.75, "rewards/chosen": -46.944881439208984, "rewards/margins": 2.673098564147949, "rewards/rejected": -49.61798095703125, "step": 6881 }, { "epoch": 0.9370915032679739, "grad_norm": 42.05098858631764, "learning_rate": 9.607181705799527e-09, "logits/chosen": 14.696211814880371, "logits/rejected": 15.335310935974121, "logps/chosen": -4.729795455932617, "logps/rejected": -4.700606346130371, "loss": 3.9167, "rewards/accuracies": 0.5, "rewards/chosen": -47.29795455932617, "rewards/margins": -0.29189109802246094, "rewards/rejected": -47.006065368652344, "step": 6882 }, { "epoch": 0.9372276688453159, "grad_norm": 39.080155780994914, "learning_rate": 9.565803570230446e-09, "logits/chosen": 14.221214294433594, "logits/rejected": 14.464757919311523, "logps/chosen": -4.342190742492676, "logps/rejected": -4.897492408752441, "loss": 3.5117, "rewards/accuracies": 1.0, "rewards/chosen": -43.42190933227539, "rewards/margins": 5.553012847900391, "rewards/rejected": -48.97492218017578, "step": 6883 }, { "epoch": 0.9373638344226579, "grad_norm": 40.72673454942065, "learning_rate": 9.524513656495337e-09, "logits/chosen": 15.04909610748291, "logits/rejected": 14.985183715820312, "logps/chosen": -4.613934516906738, "logps/rejected": -4.838591575622559, "loss": 3.9233, "rewards/accuracies": 0.75, "rewards/chosen": -46.13934326171875, "rewards/margins": 2.246565818786621, "rewards/rejected": -48.38591003417969, "step": 6884 }, { "epoch": 0.9375, "grad_norm": 42.21499048183973, "learning_rate": 9.483311973924114e-09, "logits/chosen": 14.272483825683594, "logits/rejected": 14.843310356140137, "logps/chosen": -4.227461814880371, "logps/rejected": -4.534058094024658, "loss": 4.0817, "rewards/accuracies": 1.0, "rewards/chosen": -42.274620056152344, "rewards/margins": 3.0659561157226562, "rewards/rejected": -45.340576171875, "step": 6885 }, { "epoch": 0.9376361655773421, "grad_norm": 42.119559835697544, "learning_rate": 9.442198531826573e-09, "logits/chosen": 15.222925186157227, "logits/rejected": 13.83249282836914, "logps/chosen": -4.835831642150879, "logps/rejected": -4.462825298309326, "loss": 4.201, "rewards/accuracies": 0.25, "rewards/chosen": -48.358314514160156, "rewards/margins": -3.7300615310668945, "rewards/rejected": -44.62825012207031, "step": 6886 }, { "epoch": 0.9377723311546841, "grad_norm": 41.194231642631806, "learning_rate": 9.401173339492708e-09, "logits/chosen": 14.909950256347656, "logits/rejected": 14.152828216552734, "logps/chosen": -4.532943248748779, "logps/rejected": -4.4675493240356445, "loss": 4.2699, "rewards/accuracies": 0.25, "rewards/chosen": -45.32943344116211, "rewards/margins": -0.6539402008056641, "rewards/rejected": -44.67549133300781, "step": 6887 }, { "epoch": 0.9379084967320261, "grad_norm": 42.07863408226484, "learning_rate": 9.36023640619239e-09, "logits/chosen": 14.346305847167969, "logits/rejected": 14.851910591125488, "logps/chosen": -4.336190700531006, "logps/rejected": -4.865900039672852, "loss": 3.5256, "rewards/accuracies": 1.0, "rewards/chosen": -43.36190414428711, "rewards/margins": 5.2970991134643555, "rewards/rejected": -48.65900421142578, "step": 6888 }, { "epoch": 0.9380446623093682, "grad_norm": 43.297252152686866, "learning_rate": 9.319387741175688e-09, "logits/chosen": 14.450002670288086, "logits/rejected": 14.810288429260254, "logps/chosen": -4.581588268280029, "logps/rejected": -4.948993682861328, "loss": 3.5524, "rewards/accuracies": 0.75, "rewards/chosen": -45.815879821777344, "rewards/margins": 3.674057960510254, "rewards/rejected": -49.48994064331055, "step": 6889 }, { "epoch": 0.9381808278867102, "grad_norm": 40.61284851459513, "learning_rate": 9.278627353672819e-09, "logits/chosen": 14.802713394165039, "logits/rejected": 14.625804901123047, "logps/chosen": -4.79136848449707, "logps/rejected": -4.73417854309082, "loss": 3.8976, "rewards/accuracies": 0.25, "rewards/chosen": -47.91368103027344, "rewards/margins": -0.5718965530395508, "rewards/rejected": -47.34178924560547, "step": 6890 }, { "epoch": 0.9383169934640523, "grad_norm": 41.13869254983578, "learning_rate": 9.237955252893792e-09, "logits/chosen": 14.628450393676758, "logits/rejected": 14.77076244354248, "logps/chosen": -4.584856033325195, "logps/rejected": -4.952444076538086, "loss": 3.5664, "rewards/accuracies": 0.75, "rewards/chosen": -45.84855651855469, "rewards/margins": 3.6758832931518555, "rewards/rejected": -49.524444580078125, "step": 6891 }, { "epoch": 0.9384531590413944, "grad_norm": 47.43959048653393, "learning_rate": 9.197371448028812e-09, "logits/chosen": 14.658512115478516, "logits/rejected": 14.725698471069336, "logps/chosen": -4.610936164855957, "logps/rejected": -4.595935821533203, "loss": 4.2178, "rewards/accuracies": 0.25, "rewards/chosen": -46.1093635559082, "rewards/margins": -0.15000629425048828, "rewards/rejected": -45.95935821533203, "step": 6892 }, { "epoch": 0.9385893246187363, "grad_norm": 41.216502516855805, "learning_rate": 9.156875948248188e-09, "logits/chosen": 14.653944969177246, "logits/rejected": 15.464923858642578, "logps/chosen": -4.444697380065918, "logps/rejected": -4.9911298751831055, "loss": 4.3893, "rewards/accuracies": 1.0, "rewards/chosen": -44.44697570800781, "rewards/margins": 5.464321136474609, "rewards/rejected": -49.91129684448242, "step": 6893 }, { "epoch": 0.9387254901960784, "grad_norm": 48.02133607611285, "learning_rate": 9.1164687627022e-09, "logits/chosen": 13.688940048217773, "logits/rejected": 14.133329391479492, "logps/chosen": -4.674432754516602, "logps/rejected": -4.759232044219971, "loss": 4.4923, "rewards/accuracies": 0.5, "rewards/chosen": -46.74433135986328, "rewards/margins": 0.8479890823364258, "rewards/rejected": -47.592323303222656, "step": 6894 }, { "epoch": 0.9388616557734205, "grad_norm": 43.80214787617769, "learning_rate": 9.076149900521191e-09, "logits/chosen": 14.702388763427734, "logits/rejected": 15.095048904418945, "logps/chosen": -4.740135192871094, "logps/rejected": -4.856230735778809, "loss": 4.4536, "rewards/accuracies": 0.5, "rewards/chosen": -47.40135192871094, "rewards/margins": 1.16094970703125, "rewards/rejected": -48.56230545043945, "step": 6895 }, { "epoch": 0.9389978213507625, "grad_norm": 41.8089958782932, "learning_rate": 9.03591937081547e-09, "logits/chosen": 14.254979133605957, "logits/rejected": 14.54865837097168, "logps/chosen": -4.737712383270264, "logps/rejected": -4.839001178741455, "loss": 3.9459, "rewards/accuracies": 0.5, "rewards/chosen": -47.37712478637695, "rewards/margins": 1.012887954711914, "rewards/rejected": -48.3900146484375, "step": 6896 }, { "epoch": 0.9391339869281046, "grad_norm": 41.99094211545827, "learning_rate": 8.995777182675546e-09, "logits/chosen": 13.80337905883789, "logits/rejected": 14.407451629638672, "logps/chosen": -4.626100540161133, "logps/rejected": -4.621707916259766, "loss": 4.0655, "rewards/accuracies": 0.25, "rewards/chosen": -46.261009216308594, "rewards/margins": -0.04392528533935547, "rewards/rejected": -46.21708297729492, "step": 6897 }, { "epoch": 0.9392701525054467, "grad_norm": 45.00229926013391, "learning_rate": 8.955723345171806e-09, "logits/chosen": 15.472545623779297, "logits/rejected": 15.169687271118164, "logps/chosen": -4.848649501800537, "logps/rejected": -4.816109657287598, "loss": 4.2939, "rewards/accuracies": 0.5, "rewards/chosen": -48.48649215698242, "rewards/margins": -0.3253927230834961, "rewards/rejected": -48.16109848022461, "step": 6898 }, { "epoch": 0.9394063180827886, "grad_norm": 42.145317323215565, "learning_rate": 8.91575786735479e-09, "logits/chosen": 15.389347076416016, "logits/rejected": 15.515470504760742, "logps/chosen": -4.5502166748046875, "logps/rejected": -4.665454387664795, "loss": 3.8145, "rewards/accuracies": 0.75, "rewards/chosen": -45.502166748046875, "rewards/margins": 1.152374267578125, "rewards/rejected": -46.654541015625, "step": 6899 }, { "epoch": 0.9395424836601307, "grad_norm": 37.93677350958501, "learning_rate": 8.87588075825505e-09, "logits/chosen": 14.647583961486816, "logits/rejected": 14.912717819213867, "logps/chosen": -4.360179901123047, "logps/rejected": -4.399839401245117, "loss": 3.351, "rewards/accuracies": 0.5, "rewards/chosen": -43.60179901123047, "rewards/margins": 0.3965950012207031, "rewards/rejected": -43.99839782714844, "step": 6900 }, { "epoch": 0.9396786492374728, "grad_norm": 48.494112587313055, "learning_rate": 8.836092026883114e-09, "logits/chosen": 14.088515281677246, "logits/rejected": 14.623054504394531, "logps/chosen": -4.481271266937256, "logps/rejected": -4.40327787399292, "loss": 4.1208, "rewards/accuracies": 0.25, "rewards/chosen": -44.812713623046875, "rewards/margins": -0.7799358367919922, "rewards/rejected": -44.032779693603516, "step": 6901 }, { "epoch": 0.9398148148148148, "grad_norm": 40.542151302834995, "learning_rate": 8.796391682229565e-09, "logits/chosen": 14.907560348510742, "logits/rejected": 14.855657577514648, "logps/chosen": -4.7306904792785645, "logps/rejected": -4.830967903137207, "loss": 3.8135, "rewards/accuracies": 0.5, "rewards/chosen": -47.306907653808594, "rewards/margins": 1.0027713775634766, "rewards/rejected": -48.30967712402344, "step": 6902 }, { "epoch": 0.9399509803921569, "grad_norm": 45.34516026228406, "learning_rate": 8.756779733265007e-09, "logits/chosen": 15.18459701538086, "logits/rejected": 14.783921241760254, "logps/chosen": -4.769928932189941, "logps/rejected": -4.88753604888916, "loss": 3.9317, "rewards/accuracies": 0.75, "rewards/chosen": -47.69928741455078, "rewards/margins": 1.1760749816894531, "rewards/rejected": -48.875362396240234, "step": 6903 }, { "epoch": 0.9400871459694989, "grad_norm": 44.51186124037354, "learning_rate": 8.717256188940147e-09, "logits/chosen": 14.6929931640625, "logits/rejected": 14.286859512329102, "logps/chosen": -4.592432975769043, "logps/rejected": -4.664031028747559, "loss": 3.61, "rewards/accuracies": 0.75, "rewards/chosen": -45.92433166503906, "rewards/margins": 0.7159795761108398, "rewards/rejected": -46.64031219482422, "step": 6904 }, { "epoch": 0.9402233115468409, "grad_norm": 41.188072118135445, "learning_rate": 8.677821058185619e-09, "logits/chosen": 14.675224304199219, "logits/rejected": 15.36415958404541, "logps/chosen": -4.808989524841309, "logps/rejected": -5.028199195861816, "loss": 3.5331, "rewards/accuracies": 0.75, "rewards/chosen": -48.08989715576172, "rewards/margins": 2.192091941833496, "rewards/rejected": -50.28199005126953, "step": 6905 }, { "epoch": 0.940359477124183, "grad_norm": 40.89120410632803, "learning_rate": 8.638474349912118e-09, "logits/chosen": 13.35487174987793, "logits/rejected": 14.698532104492188, "logps/chosen": -4.505465507507324, "logps/rejected": -4.94973087310791, "loss": 3.7588, "rewards/accuracies": 0.75, "rewards/chosen": -45.054656982421875, "rewards/margins": 4.442652702331543, "rewards/rejected": -49.49730682373047, "step": 6906 }, { "epoch": 0.9404956427015251, "grad_norm": 39.795275386498645, "learning_rate": 8.59921607301044e-09, "logits/chosen": 14.349146842956543, "logits/rejected": 13.757251739501953, "logps/chosen": -4.588999271392822, "logps/rejected": -4.416774749755859, "loss": 4.158, "rewards/accuracies": 0.5, "rewards/chosen": -45.889991760253906, "rewards/margins": -1.7222423553466797, "rewards/rejected": -44.16775131225586, "step": 6907 }, { "epoch": 0.940631808278867, "grad_norm": 46.093839047499195, "learning_rate": 8.560046236351137e-09, "logits/chosen": 14.299910545349121, "logits/rejected": 14.669179916381836, "logps/chosen": -4.638324737548828, "logps/rejected": -4.617371559143066, "loss": 4.397, "rewards/accuracies": 0.25, "rewards/chosen": -46.38324737548828, "rewards/margins": -0.20952987670898438, "rewards/rejected": -46.1737174987793, "step": 6908 }, { "epoch": 0.9407679738562091, "grad_norm": 41.93747312039732, "learning_rate": 8.520964848785084e-09, "logits/chosen": 14.925947189331055, "logits/rejected": 14.716161727905273, "logps/chosen": -4.8017072677612305, "logps/rejected": -4.692030906677246, "loss": 3.8657, "rewards/accuracies": 0.5, "rewards/chosen": -48.01707077026367, "rewards/margins": -1.0967607498168945, "rewards/rejected": -46.920310974121094, "step": 6909 }, { "epoch": 0.9409041394335512, "grad_norm": 44.670038908189696, "learning_rate": 8.481971919143082e-09, "logits/chosen": 14.616288185119629, "logits/rejected": 15.16282844543457, "logps/chosen": -4.790757656097412, "logps/rejected": -4.97894811630249, "loss": 3.8935, "rewards/accuracies": 0.5, "rewards/chosen": -47.90757369995117, "rewards/margins": 1.8819055557250977, "rewards/rejected": -49.78948211669922, "step": 6910 }, { "epoch": 0.9410403050108932, "grad_norm": 43.185602813759466, "learning_rate": 8.44306745623582e-09, "logits/chosen": 13.481124877929688, "logits/rejected": 14.141372680664062, "logps/chosen": -4.491480350494385, "logps/rejected": -4.538690090179443, "loss": 3.8629, "rewards/accuracies": 0.5, "rewards/chosen": -44.9148063659668, "rewards/margins": 0.47209644317626953, "rewards/rejected": -45.38690185546875, "step": 6911 }, { "epoch": 0.9411764705882353, "grad_norm": 49.56554814987832, "learning_rate": 8.404251468854085e-09, "logits/chosen": 14.683856964111328, "logits/rejected": 14.888498306274414, "logps/chosen": -4.588581562042236, "logps/rejected": -4.8650336265563965, "loss": 4.3795, "rewards/accuracies": 0.75, "rewards/chosen": -45.88581848144531, "rewards/margins": 2.7645206451416016, "rewards/rejected": -48.65033721923828, "step": 6912 }, { "epoch": 0.9413126361655774, "grad_norm": 42.678514444893736, "learning_rate": 8.365523965768728e-09, "logits/chosen": 14.431665420532227, "logits/rejected": 15.448336601257324, "logps/chosen": -4.812992095947266, "logps/rejected": -4.99976110458374, "loss": 3.72, "rewards/accuracies": 0.75, "rewards/chosen": -48.129920959472656, "rewards/margins": 1.867690086364746, "rewards/rejected": -49.99761199951172, "step": 6913 }, { "epoch": 0.9414488017429193, "grad_norm": 110.552554259247, "learning_rate": 8.326884955730484e-09, "logits/chosen": 14.431086540222168, "logits/rejected": 14.973323822021484, "logps/chosen": -4.691864967346191, "logps/rejected": -4.6438493728637695, "loss": 3.4846, "rewards/accuracies": 0.5, "rewards/chosen": -46.91865158081055, "rewards/margins": -0.48015403747558594, "rewards/rejected": -46.43849563598633, "step": 6914 }, { "epoch": 0.9415849673202614, "grad_norm": 68.29666507979704, "learning_rate": 8.288334447470147e-09, "logits/chosen": 13.938057899475098, "logits/rejected": 14.994645118713379, "logps/chosen": -4.406249046325684, "logps/rejected": -4.818389415740967, "loss": 3.8946, "rewards/accuracies": 0.75, "rewards/chosen": -44.06249237060547, "rewards/margins": 4.121403694152832, "rewards/rejected": -48.18389129638672, "step": 6915 }, { "epoch": 0.9417211328976035, "grad_norm": 45.50862947607502, "learning_rate": 8.249872449698659e-09, "logits/chosen": 13.83323860168457, "logits/rejected": 13.708108901977539, "logps/chosen": -4.39415168762207, "logps/rejected": -4.696560382843018, "loss": 3.9614, "rewards/accuracies": 0.75, "rewards/chosen": -43.94151306152344, "rewards/margins": 3.0240888595581055, "rewards/rejected": -46.96560287475586, "step": 6916 }, { "epoch": 0.9418572984749455, "grad_norm": 45.73289293917924, "learning_rate": 8.211498971106667e-09, "logits/chosen": 14.286892890930176, "logits/rejected": 14.849934577941895, "logps/chosen": -4.755159378051758, "logps/rejected": -4.88164758682251, "loss": 3.9625, "rewards/accuracies": 0.75, "rewards/chosen": -47.551597595214844, "rewards/margins": 1.2648811340332031, "rewards/rejected": -48.81647491455078, "step": 6917 }, { "epoch": 0.9419934640522876, "grad_norm": 39.55315072996561, "learning_rate": 8.17321402036506e-09, "logits/chosen": 14.746052742004395, "logits/rejected": 15.26179313659668, "logps/chosen": -4.591978549957275, "logps/rejected": -5.130623817443848, "loss": 3.4965, "rewards/accuracies": 0.75, "rewards/chosen": -45.91978454589844, "rewards/margins": 5.386449813842773, "rewards/rejected": -51.306236267089844, "step": 6918 }, { "epoch": 0.9421296296296297, "grad_norm": 42.37718427876438, "learning_rate": 8.135017606124606e-09, "logits/chosen": 14.68995475769043, "logits/rejected": 14.546009063720703, "logps/chosen": -4.757900714874268, "logps/rejected": -4.56693172454834, "loss": 3.6451, "rewards/accuracies": 0.5, "rewards/chosen": -47.579010009765625, "rewards/margins": -1.9096946716308594, "rewards/rejected": -45.6693115234375, "step": 6919 }, { "epoch": 0.9422657952069716, "grad_norm": 42.74982737456, "learning_rate": 8.096909737016133e-09, "logits/chosen": 14.51902961730957, "logits/rejected": 14.820751190185547, "logps/chosen": -4.606621742248535, "logps/rejected": -4.748512268066406, "loss": 3.9592, "rewards/accuracies": 0.5, "rewards/chosen": -46.066219329833984, "rewards/margins": 1.4189023971557617, "rewards/rejected": -47.48512268066406, "step": 6920 }, { "epoch": 0.9424019607843137, "grad_norm": 42.3511165699426, "learning_rate": 8.058890421650355e-09, "logits/chosen": 15.124823570251465, "logits/rejected": 15.597601890563965, "logps/chosen": -5.022294998168945, "logps/rejected": -5.2470855712890625, "loss": 4.3031, "rewards/accuracies": 1.0, "rewards/chosen": -50.22294616699219, "rewards/margins": 2.2479114532470703, "rewards/rejected": -52.47085952758789, "step": 6921 }, { "epoch": 0.9425381263616558, "grad_norm": 42.90356410517303, "learning_rate": 8.020959668618177e-09, "logits/chosen": 13.805425643920898, "logits/rejected": 14.510021209716797, "logps/chosen": -4.672367095947266, "logps/rejected": -4.781196594238281, "loss": 3.6451, "rewards/accuracies": 0.75, "rewards/chosen": -46.723670959472656, "rewards/margins": 1.0882940292358398, "rewards/rejected": -47.81196594238281, "step": 6922 }, { "epoch": 0.9426742919389978, "grad_norm": 44.715577580715525, "learning_rate": 7.983117486490253e-09, "logits/chosen": 13.922609329223633, "logits/rejected": 14.130220413208008, "logps/chosen": -4.759281158447266, "logps/rejected": -4.959779739379883, "loss": 3.641, "rewards/accuracies": 0.75, "rewards/chosen": -47.592811584472656, "rewards/margins": 2.0049877166748047, "rewards/rejected": -49.597801208496094, "step": 6923 }, { "epoch": 0.9428104575163399, "grad_norm": 41.824379554304116, "learning_rate": 7.94536388381739e-09, "logits/chosen": 14.442699432373047, "logits/rejected": 14.324213027954102, "logps/chosen": -4.651986598968506, "logps/rejected": -4.62660026550293, "loss": 4.4027, "rewards/accuracies": 0.5, "rewards/chosen": -46.51986312866211, "rewards/margins": -0.2538614273071289, "rewards/rejected": -46.2660026550293, "step": 6924 }, { "epoch": 0.9429466230936819, "grad_norm": 41.398994038788814, "learning_rate": 7.90769886913032e-09, "logits/chosen": 14.462051391601562, "logits/rejected": 14.76650333404541, "logps/chosen": -4.70319938659668, "logps/rejected": -4.790493011474609, "loss": 4.2433, "rewards/accuracies": 0.5, "rewards/chosen": -47.03199768066406, "rewards/margins": 0.8729333877563477, "rewards/rejected": -47.904930114746094, "step": 6925 }, { "epoch": 0.943082788671024, "grad_norm": 63.380796007113794, "learning_rate": 7.870122450939742e-09, "logits/chosen": 14.959582328796387, "logits/rejected": 14.074639320373535, "logps/chosen": -4.6506476402282715, "logps/rejected": -4.517961502075195, "loss": 3.8679, "rewards/accuracies": 0.5, "rewards/chosen": -46.50647735595703, "rewards/margins": -1.3268623352050781, "rewards/rejected": -45.17961502075195, "step": 6926 }, { "epoch": 0.943218954248366, "grad_norm": 45.53252289365277, "learning_rate": 7.832634637736379e-09, "logits/chosen": 14.863122940063477, "logits/rejected": 15.303537368774414, "logps/chosen": -4.640608787536621, "logps/rejected": -4.93627405166626, "loss": 4.1706, "rewards/accuracies": 1.0, "rewards/chosen": -46.406089782714844, "rewards/margins": 2.9566516876220703, "rewards/rejected": -49.36274337768555, "step": 6927 }, { "epoch": 0.9433551198257081, "grad_norm": 42.04532286728811, "learning_rate": 7.795235437990922e-09, "logits/chosen": 14.336524963378906, "logits/rejected": 14.098471641540527, "logps/chosen": -4.650326728820801, "logps/rejected": -4.573353290557861, "loss": 4.1425, "rewards/accuracies": 0.5, "rewards/chosen": -46.503265380859375, "rewards/margins": -0.7697315216064453, "rewards/rejected": -45.7335319519043, "step": 6928 }, { "epoch": 0.9434912854030502, "grad_norm": 44.779728337862515, "learning_rate": 7.757924860153985e-09, "logits/chosen": 14.497130393981934, "logits/rejected": 14.52825927734375, "logps/chosen": -4.415020942687988, "logps/rejected": -4.5411577224731445, "loss": 3.6052, "rewards/accuracies": 0.5, "rewards/chosen": -44.150211334228516, "rewards/margins": 1.261368751525879, "rewards/rejected": -45.411582946777344, "step": 6929 }, { "epoch": 0.9436274509803921, "grad_norm": 38.15927607588616, "learning_rate": 7.720702912656252e-09, "logits/chosen": 14.348609924316406, "logits/rejected": 14.570161819458008, "logps/chosen": -4.797967910766602, "logps/rejected": -4.8224334716796875, "loss": 3.6642, "rewards/accuracies": 0.5, "rewards/chosen": -47.979679107666016, "rewards/margins": 0.24465274810791016, "rewards/rejected": -48.224334716796875, "step": 6930 }, { "epoch": 0.9437636165577342, "grad_norm": 43.04314111674474, "learning_rate": 7.683569603908324e-09, "logits/chosen": 13.753853797912598, "logits/rejected": 13.728954315185547, "logps/chosen": -4.375126361846924, "logps/rejected": -4.488002300262451, "loss": 4.1303, "rewards/accuracies": 0.75, "rewards/chosen": -43.75126266479492, "rewards/margins": 1.1287612915039062, "rewards/rejected": -44.88002395629883, "step": 6931 }, { "epoch": 0.9438997821350763, "grad_norm": 48.437767579554624, "learning_rate": 7.646524942300736e-09, "logits/chosen": 14.580052375793457, "logits/rejected": 13.926939010620117, "logps/chosen": -4.467479705810547, "logps/rejected": -4.680415153503418, "loss": 4.7716, "rewards/accuracies": 0.5, "rewards/chosen": -44.67479705810547, "rewards/margins": 2.1293582916259766, "rewards/rejected": -46.80415344238281, "step": 6932 }, { "epoch": 0.9440359477124183, "grad_norm": 40.07638903090437, "learning_rate": 7.60956893620408e-09, "logits/chosen": 14.544331550598145, "logits/rejected": 15.476293563842773, "logps/chosen": -4.448927879333496, "logps/rejected": -5.159093856811523, "loss": 3.6141, "rewards/accuracies": 1.0, "rewards/chosen": -44.489280700683594, "rewards/margins": 7.101658821105957, "rewards/rejected": -51.590938568115234, "step": 6933 }, { "epoch": 0.9441721132897604, "grad_norm": 40.56680430114941, "learning_rate": 7.572701593968877e-09, "logits/chosen": 14.237771034240723, "logits/rejected": 15.806085586547852, "logps/chosen": -4.437560081481934, "logps/rejected": -4.921247482299805, "loss": 3.7424, "rewards/accuracies": 1.0, "rewards/chosen": -44.37560272216797, "rewards/margins": 4.836874008178711, "rewards/rejected": -49.21247863769531, "step": 6934 }, { "epoch": 0.9443082788671024, "grad_norm": 43.53506651758999, "learning_rate": 7.53592292392553e-09, "logits/chosen": 14.958137512207031, "logits/rejected": 14.487018585205078, "logps/chosen": -4.650595188140869, "logps/rejected": -4.621236801147461, "loss": 4.142, "rewards/accuracies": 0.5, "rewards/chosen": -46.505950927734375, "rewards/margins": -0.2935800552368164, "rewards/rejected": -46.212371826171875, "step": 6935 }, { "epoch": 0.9444444444444444, "grad_norm": 39.36567650596533, "learning_rate": 7.499232934384548e-09, "logits/chosen": 14.737680435180664, "logits/rejected": 15.087997436523438, "logps/chosen": -4.600200653076172, "logps/rejected": -5.064713478088379, "loss": 3.5311, "rewards/accuracies": 1.0, "rewards/chosen": -46.00200653076172, "rewards/margins": 4.645127296447754, "rewards/rejected": -50.647132873535156, "step": 6936 }, { "epoch": 0.9445806100217865, "grad_norm": 39.54984456891766, "learning_rate": 7.462631633636407e-09, "logits/chosen": 14.061666488647461, "logits/rejected": 14.489714622497559, "logps/chosen": -4.746756553649902, "logps/rejected": -4.744580268859863, "loss": 3.8022, "rewards/accuracies": 0.5, "rewards/chosen": -47.467567443847656, "rewards/margins": -0.02176380157470703, "rewards/rejected": -47.445804595947266, "step": 6937 }, { "epoch": 0.9447167755991286, "grad_norm": 44.76924827401267, "learning_rate": 7.426119029951294e-09, "logits/chosen": 14.211380958557129, "logits/rejected": 13.986968994140625, "logps/chosen": -4.4200944900512695, "logps/rejected": -4.513622760772705, "loss": 4.1087, "rewards/accuracies": 0.75, "rewards/chosen": -44.20094680786133, "rewards/margins": 0.9352827072143555, "rewards/rejected": -45.13623046875, "step": 6938 }, { "epoch": 0.9448529411764706, "grad_norm": 43.106739478568336, "learning_rate": 7.389695131579676e-09, "logits/chosen": 14.229432106018066, "logits/rejected": 15.659088134765625, "logps/chosen": -4.524499893188477, "logps/rejected": -4.984643936157227, "loss": 4.1598, "rewards/accuracies": 1.0, "rewards/chosen": -45.24500274658203, "rewards/margins": 4.601435661315918, "rewards/rejected": -49.846439361572266, "step": 6939 }, { "epoch": 0.9449891067538126, "grad_norm": 47.81959246182694, "learning_rate": 7.3533599467518134e-09, "logits/chosen": 14.148284912109375, "logits/rejected": 13.870916366577148, "logps/chosen": -4.461548328399658, "logps/rejected": -4.667203426361084, "loss": 3.3251, "rewards/accuracies": 0.75, "rewards/chosen": -44.615482330322266, "rewards/margins": 2.0565500259399414, "rewards/rejected": -46.672035217285156, "step": 6940 }, { "epoch": 0.9451252723311547, "grad_norm": 49.99499184118856, "learning_rate": 7.317113483677894e-09, "logits/chosen": 14.390753746032715, "logits/rejected": 14.199581146240234, "logps/chosen": -4.661349296569824, "logps/rejected": -4.672597885131836, "loss": 4.513, "rewards/accuracies": 0.75, "rewards/chosen": -46.613494873046875, "rewards/margins": 0.11248016357421875, "rewards/rejected": -46.725975036621094, "step": 6941 }, { "epoch": 0.9452614379084967, "grad_norm": 44.71733637361678, "learning_rate": 7.280955750548124e-09, "logits/chosen": 14.288840293884277, "logits/rejected": 15.561464309692383, "logps/chosen": -4.515518665313721, "logps/rejected": -4.860966205596924, "loss": 3.5282, "rewards/accuracies": 0.75, "rewards/chosen": -45.155189514160156, "rewards/margins": 3.4544754028320312, "rewards/rejected": -48.60966110229492, "step": 6942 }, { "epoch": 0.9453976034858388, "grad_norm": 38.964783318980395, "learning_rate": 7.24488675553272e-09, "logits/chosen": 13.898759841918945, "logits/rejected": 15.292709350585938, "logps/chosen": -4.6775922775268555, "logps/rejected": -4.735572814941406, "loss": 3.7441, "rewards/accuracies": 0.5, "rewards/chosen": -46.77592468261719, "rewards/margins": 0.5798091888427734, "rewards/rejected": -47.35573196411133, "step": 6943 }, { "epoch": 0.9455337690631809, "grad_norm": 42.26803493768785, "learning_rate": 7.208906506781609e-09, "logits/chosen": 14.406496047973633, "logits/rejected": 14.747172355651855, "logps/chosen": -4.553023815155029, "logps/rejected": -4.878946304321289, "loss": 4.1402, "rewards/accuracies": 0.75, "rewards/chosen": -45.530235290527344, "rewards/margins": 3.259221076965332, "rewards/rejected": -48.789459228515625, "step": 6944 }, { "epoch": 0.9456699346405228, "grad_norm": 49.383418521914884, "learning_rate": 7.173015012424955e-09, "logits/chosen": 15.15129566192627, "logits/rejected": 15.269275665283203, "logps/chosen": -4.759042739868164, "logps/rejected": -4.747749328613281, "loss": 4.3939, "rewards/accuracies": 0.5, "rewards/chosen": -47.590423583984375, "rewards/margins": -0.1129302978515625, "rewards/rejected": -47.47749328613281, "step": 6945 }, { "epoch": 0.9458061002178649, "grad_norm": 45.06014329616734, "learning_rate": 7.137212280572713e-09, "logits/chosen": 13.998945236206055, "logits/rejected": 14.768545150756836, "logps/chosen": -4.611594200134277, "logps/rejected": -5.001151084899902, "loss": 3.7866, "rewards/accuracies": 0.75, "rewards/chosen": -46.115943908691406, "rewards/margins": 3.895571708679199, "rewards/rejected": -50.011512756347656, "step": 6946 }, { "epoch": 0.945942265795207, "grad_norm": 41.98567432650908, "learning_rate": 7.101498319314769e-09, "logits/chosen": 14.532591819763184, "logits/rejected": 14.496565818786621, "logps/chosen": -4.499133110046387, "logps/rejected": -4.81526517868042, "loss": 3.5741, "rewards/accuracies": 0.75, "rewards/chosen": -44.9913330078125, "rewards/margins": 3.161322593688965, "rewards/rejected": -48.15265655517578, "step": 6947 }, { "epoch": 0.946078431372549, "grad_norm": 43.33889227377276, "learning_rate": 7.0658731367210234e-09, "logits/chosen": 15.07870864868164, "logits/rejected": 14.447515487670898, "logps/chosen": -4.575809478759766, "logps/rejected": -4.692293643951416, "loss": 4.2162, "rewards/accuracies": 0.5, "rewards/chosen": -45.758094787597656, "rewards/margins": 1.1648426055908203, "rewards/rejected": -46.922935485839844, "step": 6948 }, { "epoch": 0.9462145969498911, "grad_norm": 42.51138374633602, "learning_rate": 7.030336740841303e-09, "logits/chosen": 13.874177932739258, "logits/rejected": 15.139093399047852, "logps/chosen": -4.5810346603393555, "logps/rejected": -4.828472137451172, "loss": 3.401, "rewards/accuracies": 0.5, "rewards/chosen": -45.81034469604492, "rewards/margins": 2.4743804931640625, "rewards/rejected": -48.284725189208984, "step": 6949 }, { "epoch": 0.9463507625272332, "grad_norm": 41.0413726765217, "learning_rate": 6.994889139705273e-09, "logits/chosen": 13.507627487182617, "logits/rejected": 14.364938735961914, "logps/chosen": -4.256761074066162, "logps/rejected": -4.723093509674072, "loss": 3.5575, "rewards/accuracies": 0.75, "rewards/chosen": -42.56761169433594, "rewards/margins": 4.663324356079102, "rewards/rejected": -47.230934143066406, "step": 6950 }, { "epoch": 0.9464869281045751, "grad_norm": 40.79128991622369, "learning_rate": 6.959530341322661e-09, "logits/chosen": 14.246177673339844, "logits/rejected": 14.709653854370117, "logps/chosen": -4.274662494659424, "logps/rejected": -4.8485260009765625, "loss": 3.7678, "rewards/accuracies": 1.0, "rewards/chosen": -42.746620178222656, "rewards/margins": 5.738637924194336, "rewards/rejected": -48.485260009765625, "step": 6951 }, { "epoch": 0.9466230936819172, "grad_norm": 46.05193952807945, "learning_rate": 6.924260353683075e-09, "logits/chosen": 14.199907302856445, "logits/rejected": 14.747591018676758, "logps/chosen": -4.370641708374023, "logps/rejected": -4.762795448303223, "loss": 4.4386, "rewards/accuracies": 1.0, "rewards/chosen": -43.7064208984375, "rewards/margins": 3.921539306640625, "rewards/rejected": -47.62795639038086, "step": 6952 }, { "epoch": 0.9467592592592593, "grad_norm": 42.62034050303882, "learning_rate": 6.889079184756052e-09, "logits/chosen": 14.605827331542969, "logits/rejected": 13.808295249938965, "logps/chosen": -4.590973377227783, "logps/rejected": -4.531466484069824, "loss": 4.1744, "rewards/accuracies": 0.5, "rewards/chosen": -45.90973663330078, "rewards/margins": -0.5950651168823242, "rewards/rejected": -45.314666748046875, "step": 6953 }, { "epoch": 0.9468954248366013, "grad_norm": 39.500832223939724, "learning_rate": 6.8539868424911e-09, "logits/chosen": 13.382010459899902, "logits/rejected": 15.152802467346191, "logps/chosen": -4.058610439300537, "logps/rejected": -4.526169776916504, "loss": 3.7207, "rewards/accuracies": 0.75, "rewards/chosen": -40.58610534667969, "rewards/margins": 4.675591468811035, "rewards/rejected": -45.261695861816406, "step": 6954 }, { "epoch": 0.9470315904139434, "grad_norm": 44.04371647737276, "learning_rate": 6.818983334817607e-09, "logits/chosen": 13.633447647094727, "logits/rejected": 13.762678146362305, "logps/chosen": -4.358811378479004, "logps/rejected": -4.5103960037231445, "loss": 3.2579, "rewards/accuracies": 0.75, "rewards/chosen": -43.588111877441406, "rewards/margins": 1.5158452987670898, "rewards/rejected": -45.10395812988281, "step": 6955 }, { "epoch": 0.9471677559912854, "grad_norm": 40.200208662932994, "learning_rate": 6.784068669644849e-09, "logits/chosen": 13.941781997680664, "logits/rejected": 14.034250259399414, "logps/chosen": -4.579524993896484, "logps/rejected": -4.705615043640137, "loss": 3.9705, "rewards/accuracies": 0.75, "rewards/chosen": -45.795249938964844, "rewards/margins": 1.2609004974365234, "rewards/rejected": -47.05615234375, "step": 6956 }, { "epoch": 0.9473039215686274, "grad_norm": 43.67372068704213, "learning_rate": 6.749242854862158e-09, "logits/chosen": 14.416345596313477, "logits/rejected": 15.327253341674805, "logps/chosen": -4.7665300369262695, "logps/rejected": -5.014795303344727, "loss": 3.8517, "rewards/accuracies": 0.75, "rewards/chosen": -47.66530227661133, "rewards/margins": 2.4826536178588867, "rewards/rejected": -50.14795684814453, "step": 6957 }, { "epoch": 0.9474400871459695, "grad_norm": 38.541916549077754, "learning_rate": 6.714505898338707e-09, "logits/chosen": 14.321544647216797, "logits/rejected": 14.350616455078125, "logps/chosen": -4.330519676208496, "logps/rejected": -4.192365646362305, "loss": 3.9366, "rewards/accuracies": 0.25, "rewards/chosen": -43.30519485473633, "rewards/margins": -1.3815383911132812, "rewards/rejected": -41.92366027832031, "step": 6958 }, { "epoch": 0.9475762527233116, "grad_norm": 39.546278950492486, "learning_rate": 6.67985780792355e-09, "logits/chosen": 14.780953407287598, "logits/rejected": 14.160030364990234, "logps/chosen": -4.637388229370117, "logps/rejected": -4.59766960144043, "loss": 3.3166, "rewards/accuracies": 0.5, "rewards/chosen": -46.37388610839844, "rewards/margins": -0.39719200134277344, "rewards/rejected": -45.97669219970703, "step": 6959 }, { "epoch": 0.9477124183006536, "grad_norm": 39.77963402367219, "learning_rate": 6.6452985914457135e-09, "logits/chosen": 14.565561294555664, "logits/rejected": 14.245112419128418, "logps/chosen": -4.9021711349487305, "logps/rejected": -4.7552690505981445, "loss": 4.1993, "rewards/accuracies": 0.5, "rewards/chosen": -49.02170944213867, "rewards/margins": -1.4690189361572266, "rewards/rejected": -47.55269241333008, "step": 6960 }, { "epoch": 0.9478485838779956, "grad_norm": 43.71526859247004, "learning_rate": 6.61082825671424e-09, "logits/chosen": 15.218993186950684, "logits/rejected": 14.741418838500977, "logps/chosen": -4.976576805114746, "logps/rejected": -4.731391429901123, "loss": 4.3802, "rewards/accuracies": 0.0, "rewards/chosen": -49.76576614379883, "rewards/margins": -2.4518508911132812, "rewards/rejected": -47.31391143798828, "step": 6961 }, { "epoch": 0.9479847494553377, "grad_norm": 44.80711055453808, "learning_rate": 6.576446811517833e-09, "logits/chosen": 14.768302917480469, "logits/rejected": 14.988863945007324, "logps/chosen": -4.987821578979492, "logps/rejected": -5.002074718475342, "loss": 4.2453, "rewards/accuracies": 0.5, "rewards/chosen": -49.87821578979492, "rewards/margins": 0.14252758026123047, "rewards/rejected": -50.02074432373047, "step": 6962 }, { "epoch": 0.9481209150326797, "grad_norm": 41.29209454459149, "learning_rate": 6.542154263625388e-09, "logits/chosen": 14.586385726928711, "logits/rejected": 14.789692878723145, "logps/chosen": -4.7763142585754395, "logps/rejected": -4.747068881988525, "loss": 3.8264, "rewards/accuracies": 0.5, "rewards/chosen": -47.763145446777344, "rewards/margins": -0.29245471954345703, "rewards/rejected": -47.47068786621094, "step": 6963 }, { "epoch": 0.9482570806100218, "grad_norm": 48.654773285009405, "learning_rate": 6.507950620785552e-09, "logits/chosen": 14.278518676757812, "logits/rejected": 15.009626388549805, "logps/chosen": -4.490039825439453, "logps/rejected": -4.436645984649658, "loss": 4.1038, "rewards/accuracies": 0.25, "rewards/chosen": -44.90039825439453, "rewards/margins": -0.5339412689208984, "rewards/rejected": -44.366458892822266, "step": 6964 }, { "epoch": 0.9483932461873639, "grad_norm": 39.999847249948, "learning_rate": 6.473835890726853e-09, "logits/chosen": 14.372106552124023, "logits/rejected": 14.815099716186523, "logps/chosen": -4.621794700622559, "logps/rejected": -4.565160751342773, "loss": 3.8545, "rewards/accuracies": 0.5, "rewards/chosen": -46.21794509887695, "rewards/margins": -0.5663375854492188, "rewards/rejected": -45.651607513427734, "step": 6965 }, { "epoch": 0.9485294117647058, "grad_norm": 42.9565820827413, "learning_rate": 6.439810081157882e-09, "logits/chosen": 15.346317291259766, "logits/rejected": 15.142789840698242, "logps/chosen": -4.857905387878418, "logps/rejected": -5.111692428588867, "loss": 3.4332, "rewards/accuracies": 0.25, "rewards/chosen": -48.57905960083008, "rewards/margins": 2.5378665924072266, "rewards/rejected": -51.11692428588867, "step": 6966 }, { "epoch": 0.9486655773420479, "grad_norm": 42.33692134931159, "learning_rate": 6.405873199767065e-09, "logits/chosen": 15.060235977172852, "logits/rejected": 15.170490264892578, "logps/chosen": -4.623383522033691, "logps/rejected": -4.742532253265381, "loss": 3.9781, "rewards/accuracies": 0.5, "rewards/chosen": -46.23383712768555, "rewards/margins": 1.1914873123168945, "rewards/rejected": -47.425323486328125, "step": 6967 }, { "epoch": 0.94880174291939, "grad_norm": 41.61383770673908, "learning_rate": 6.3720252542226235e-09, "logits/chosen": 14.806869506835938, "logits/rejected": 15.127412796020508, "logps/chosen": -4.736032485961914, "logps/rejected": -4.897809982299805, "loss": 4.3449, "rewards/accuracies": 0.5, "rewards/chosen": -47.36032485961914, "rewards/margins": 1.6177759170532227, "rewards/rejected": -48.97810363769531, "step": 6968 }, { "epoch": 0.948937908496732, "grad_norm": 43.07014803658058, "learning_rate": 6.338266252172841e-09, "logits/chosen": 15.03508186340332, "logits/rejected": 15.498041152954102, "logps/chosen": -4.85711669921875, "logps/rejected": -4.708434104919434, "loss": 4.0816, "rewards/accuracies": 0.5, "rewards/chosen": -48.5711669921875, "rewards/margins": -1.4868288040161133, "rewards/rejected": -47.08434295654297, "step": 6969 }, { "epoch": 0.9490740740740741, "grad_norm": 43.33917691114495, "learning_rate": 6.304596201245926e-09, "logits/chosen": 14.40127182006836, "logits/rejected": 14.737129211425781, "logps/chosen": -4.741581916809082, "logps/rejected": -4.642049312591553, "loss": 3.5616, "rewards/accuracies": 0.5, "rewards/chosen": -47.41582107543945, "rewards/margins": -0.995326042175293, "rewards/rejected": -46.420494079589844, "step": 6970 }, { "epoch": 0.9492102396514162, "grad_norm": 41.115669131119645, "learning_rate": 6.271015109049704e-09, "logits/chosen": 14.48383617401123, "logits/rejected": 13.971723556518555, "logps/chosen": -4.640152931213379, "logps/rejected": -4.465721130371094, "loss": 4.3798, "rewards/accuracies": 0.25, "rewards/chosen": -46.401527404785156, "rewards/margins": -1.7443199157714844, "rewards/rejected": -44.65720748901367, "step": 6971 }, { "epoch": 0.9493464052287581, "grad_norm": 41.91841536415329, "learning_rate": 6.237522983172283e-09, "logits/chosen": 13.963882446289062, "logits/rejected": 15.247184753417969, "logps/chosen": -4.32805061340332, "logps/rejected": -4.666436195373535, "loss": 3.4508, "rewards/accuracies": 0.75, "rewards/chosen": -43.28050994873047, "rewards/margins": 3.383854866027832, "rewards/rejected": -46.66436004638672, "step": 6972 }, { "epoch": 0.9494825708061002, "grad_norm": 43.21795498825454, "learning_rate": 6.204119831181432e-09, "logits/chosen": 14.189468383789062, "logits/rejected": 15.125380516052246, "logps/chosen": -4.400627136230469, "logps/rejected": -4.8860673904418945, "loss": 3.8996, "rewards/accuracies": 0.75, "rewards/chosen": -44.00627136230469, "rewards/margins": 4.854403495788574, "rewards/rejected": -48.86067581176758, "step": 6973 }, { "epoch": 0.9496187363834423, "grad_norm": 47.07686717885236, "learning_rate": 6.1708056606248e-09, "logits/chosen": 15.134554862976074, "logits/rejected": 15.487249374389648, "logps/chosen": -4.6785759925842285, "logps/rejected": -4.533255100250244, "loss": 3.8604, "rewards/accuracies": 0.25, "rewards/chosen": -46.78575897216797, "rewards/margins": -1.4532098770141602, "rewards/rejected": -45.332550048828125, "step": 6974 }, { "epoch": 0.9497549019607843, "grad_norm": 42.84160037275255, "learning_rate": 6.137580479030058e-09, "logits/chosen": 13.77250862121582, "logits/rejected": 13.79946517944336, "logps/chosen": -4.594082355499268, "logps/rejected": -4.559666633605957, "loss": 4.22, "rewards/accuracies": 0.5, "rewards/chosen": -45.94082260131836, "rewards/margins": -0.34415531158447266, "rewards/rejected": -45.5966682434082, "step": 6975 }, { "epoch": 0.9498910675381264, "grad_norm": 41.73157023073239, "learning_rate": 6.104444293904753e-09, "logits/chosen": 14.161630630493164, "logits/rejected": 15.151355743408203, "logps/chosen": -4.772249221801758, "logps/rejected": -5.289870738983154, "loss": 3.8926, "rewards/accuracies": 1.0, "rewards/chosen": -47.72249221801758, "rewards/margins": 5.176212310791016, "rewards/rejected": -52.898704528808594, "step": 6976 }, { "epoch": 0.9500272331154684, "grad_norm": 42.87627477511945, "learning_rate": 6.071397112736187e-09, "logits/chosen": 14.39427375793457, "logits/rejected": 14.49004077911377, "logps/chosen": -4.622817516326904, "logps/rejected": -4.768654823303223, "loss": 3.9955, "rewards/accuracies": 0.5, "rewards/chosen": -46.228172302246094, "rewards/margins": 1.4583740234375, "rewards/rejected": -47.686546325683594, "step": 6977 }, { "epoch": 0.9501633986928104, "grad_norm": 43.54228841007603, "learning_rate": 6.038438942991719e-09, "logits/chosen": 14.309925079345703, "logits/rejected": 15.488584518432617, "logps/chosen": -4.6423869132995605, "logps/rejected": -4.866601467132568, "loss": 4.3419, "rewards/accuracies": 0.75, "rewards/chosen": -46.42387390136719, "rewards/margins": 2.242142677307129, "rewards/rejected": -48.666015625, "step": 6978 }, { "epoch": 0.9502995642701525, "grad_norm": 38.71096688796058, "learning_rate": 6.005569792118459e-09, "logits/chosen": 15.191784858703613, "logits/rejected": 15.006288528442383, "logps/chosen": -4.259507179260254, "logps/rejected": -4.176698207855225, "loss": 3.7636, "rewards/accuracies": 0.25, "rewards/chosen": -42.595069885253906, "rewards/margins": -0.8280858993530273, "rewards/rejected": -41.76698303222656, "step": 6979 }, { "epoch": 0.9504357298474946, "grad_norm": 41.406739506269986, "learning_rate": 5.972789667543532e-09, "logits/chosen": 14.972782135009766, "logits/rejected": 15.173914909362793, "logps/chosen": -4.447159767150879, "logps/rejected": -4.7259392738342285, "loss": 3.832, "rewards/accuracies": 0.5, "rewards/chosen": -44.471595764160156, "rewards/margins": 2.7877960205078125, "rewards/rejected": -47.25939178466797, "step": 6980 }, { "epoch": 0.9505718954248366, "grad_norm": 46.20514933640177, "learning_rate": 5.940098576673813e-09, "logits/chosen": 15.041943550109863, "logits/rejected": 14.843149185180664, "logps/chosen": -4.618298053741455, "logps/rejected": -4.813755989074707, "loss": 3.762, "rewards/accuracies": 0.75, "rewards/chosen": -46.182979583740234, "rewards/margins": 1.9545793533325195, "rewards/rejected": -48.13755798339844, "step": 6981 }, { "epoch": 0.9507080610021786, "grad_norm": 40.513529557640204, "learning_rate": 5.90749652689615e-09, "logits/chosen": 14.88939380645752, "logits/rejected": 15.010330200195312, "logps/chosen": -4.684897422790527, "logps/rejected": -4.767431735992432, "loss": 3.8643, "rewards/accuracies": 0.5, "rewards/chosen": -46.84897232055664, "rewards/margins": 0.8253450393676758, "rewards/rejected": -47.67431640625, "step": 6982 }, { "epoch": 0.9508442265795207, "grad_norm": 39.13371176615649, "learning_rate": 5.874983525577315e-09, "logits/chosen": 13.62901496887207, "logits/rejected": 14.692676544189453, "logps/chosen": -4.51517391204834, "logps/rejected": -4.932980537414551, "loss": 3.5971, "rewards/accuracies": 1.0, "rewards/chosen": -45.15174102783203, "rewards/margins": 4.178065299987793, "rewards/rejected": -49.329803466796875, "step": 6983 }, { "epoch": 0.9509803921568627, "grad_norm": 42.82283922099771, "learning_rate": 5.842559580063744e-09, "logits/chosen": 15.099462509155273, "logits/rejected": 14.40814208984375, "logps/chosen": -4.66793966293335, "logps/rejected": -4.51301383972168, "loss": 3.5076, "rewards/accuracies": 0.5, "rewards/chosen": -46.67939758300781, "rewards/margins": -1.5492582321166992, "rewards/rejected": -45.1301383972168, "step": 6984 }, { "epoch": 0.9511165577342048, "grad_norm": 42.70565456837438, "learning_rate": 5.810224697681976e-09, "logits/chosen": 14.690542221069336, "logits/rejected": 14.099907875061035, "logps/chosen": -4.83560037612915, "logps/rejected": -4.524865627288818, "loss": 3.8733, "rewards/accuracies": 0.25, "rewards/chosen": -48.35600280761719, "rewards/margins": -3.1073503494262695, "rewards/rejected": -45.248653411865234, "step": 6985 }, { "epoch": 0.9512527233115469, "grad_norm": 40.92021419013796, "learning_rate": 5.777978885738432e-09, "logits/chosen": 14.705085754394531, "logits/rejected": 15.301847457885742, "logps/chosen": -4.718654155731201, "logps/rejected": -5.3800950050354, "loss": 3.8975, "rewards/accuracies": 0.75, "rewards/chosen": -47.18653869628906, "rewards/margins": 6.614409446716309, "rewards/rejected": -53.80094909667969, "step": 6986 }, { "epoch": 0.9513888888888888, "grad_norm": 43.759364800088434, "learning_rate": 5.745822151519153e-09, "logits/chosen": 13.731067657470703, "logits/rejected": 15.471771240234375, "logps/chosen": -4.407912254333496, "logps/rejected": -4.9621992111206055, "loss": 3.9567, "rewards/accuracies": 0.75, "rewards/chosen": -44.07912063598633, "rewards/margins": 5.542873382568359, "rewards/rejected": -49.62199401855469, "step": 6987 }, { "epoch": 0.9515250544662309, "grad_norm": 39.602298715856485, "learning_rate": 5.71375450229028e-09, "logits/chosen": 14.203006744384766, "logits/rejected": 14.91697883605957, "logps/chosen": -4.9658203125, "logps/rejected": -5.112451553344727, "loss": 3.926, "rewards/accuracies": 0.75, "rewards/chosen": -49.658203125, "rewards/margins": 1.4663124084472656, "rewards/rejected": -51.124515533447266, "step": 6988 }, { "epoch": 0.951661220043573, "grad_norm": 46.96715100055661, "learning_rate": 5.6817759452978394e-09, "logits/chosen": 14.916437149047852, "logits/rejected": 13.276154518127441, "logps/chosen": -4.496297836303711, "logps/rejected": -4.315646171569824, "loss": 4.4769, "rewards/accuracies": 0.25, "rewards/chosen": -44.96297836303711, "rewards/margins": -1.8065185546875, "rewards/rejected": -43.15645980834961, "step": 6989 }, { "epoch": 0.951797385620915, "grad_norm": 39.81901920232712, "learning_rate": 5.649886487767563e-09, "logits/chosen": 14.300582885742188, "logits/rejected": 14.819140434265137, "logps/chosen": -4.880359649658203, "logps/rejected": -5.112282752990723, "loss": 3.9558, "rewards/accuracies": 0.75, "rewards/chosen": -48.803592681884766, "rewards/margins": 2.319232940673828, "rewards/rejected": -51.122825622558594, "step": 6990 }, { "epoch": 0.9519335511982571, "grad_norm": 36.568005246393724, "learning_rate": 5.618086136905154e-09, "logits/chosen": 13.280058860778809, "logits/rejected": 15.002099990844727, "logps/chosen": -4.51517391204834, "logps/rejected": -5.0003509521484375, "loss": 3.7085, "rewards/accuracies": 1.0, "rewards/chosen": -45.1517333984375, "rewards/margins": 4.851771354675293, "rewards/rejected": -50.00350570678711, "step": 6991 }, { "epoch": 0.9520697167755992, "grad_norm": 46.60899015371355, "learning_rate": 5.586374899896195e-09, "logits/chosen": 14.22955322265625, "logits/rejected": 14.382462501525879, "logps/chosen": -4.362181663513184, "logps/rejected": -4.543414115905762, "loss": 4.4507, "rewards/accuracies": 1.0, "rewards/chosen": -43.62181854248047, "rewards/margins": 1.8123197555541992, "rewards/rejected": -45.43413543701172, "step": 6992 }, { "epoch": 0.9522058823529411, "grad_norm": 43.617293762327755, "learning_rate": 5.554752783906114e-09, "logits/chosen": 14.298912048339844, "logits/rejected": 14.082392692565918, "logps/chosen": -5.016393661499023, "logps/rejected": -4.955391883850098, "loss": 3.8886, "rewards/accuracies": 0.25, "rewards/chosen": -50.163936614990234, "rewards/margins": -0.6100187301635742, "rewards/rejected": -49.553916931152344, "step": 6993 }, { "epoch": 0.9523420479302832, "grad_norm": 43.492068858865444, "learning_rate": 5.523219796080081e-09, "logits/chosen": 13.47453784942627, "logits/rejected": 14.783828735351562, "logps/chosen": -4.398484706878662, "logps/rejected": -4.726094722747803, "loss": 3.8921, "rewards/accuracies": 0.5, "rewards/chosen": -43.98484802246094, "rewards/margins": 3.276102066040039, "rewards/rejected": -47.260948181152344, "step": 6994 }, { "epoch": 0.9524782135076253, "grad_norm": 40.97797031930774, "learning_rate": 5.491775943543375e-09, "logits/chosen": 13.046875, "logits/rejected": 14.616471290588379, "logps/chosen": -4.385980606079102, "logps/rejected": -4.653402328491211, "loss": 4.066, "rewards/accuracies": 0.75, "rewards/chosen": -43.85980987548828, "rewards/margins": 2.6742143630981445, "rewards/rejected": -46.53402328491211, "step": 6995 }, { "epoch": 0.9526143790849673, "grad_norm": 41.29784949282841, "learning_rate": 5.460421233400936e-09, "logits/chosen": 14.579451560974121, "logits/rejected": 14.615530014038086, "logps/chosen": -4.529355525970459, "logps/rejected": -4.647148609161377, "loss": 3.9375, "rewards/accuracies": 0.5, "rewards/chosen": -45.293556213378906, "rewards/margins": 1.1779308319091797, "rewards/rejected": -46.47148895263672, "step": 6996 }, { "epoch": 0.9527505446623094, "grad_norm": 40.80355846107031, "learning_rate": 5.429155672737584e-09, "logits/chosen": 14.594123840332031, "logits/rejected": 15.107845306396484, "logps/chosen": -4.562682151794434, "logps/rejected": -4.582668304443359, "loss": 4.0886, "rewards/accuracies": 0.75, "rewards/chosen": -45.62682342529297, "rewards/margins": 0.1998605728149414, "rewards/rejected": -45.826683044433594, "step": 6997 }, { "epoch": 0.9528867102396514, "grad_norm": 42.47160934831304, "learning_rate": 5.397979268618069e-09, "logits/chosen": 13.911663055419922, "logits/rejected": 14.08592414855957, "logps/chosen": -4.312708854675293, "logps/rejected": -4.545731544494629, "loss": 3.9791, "rewards/accuracies": 1.0, "rewards/chosen": -43.12709426879883, "rewards/margins": 2.33022403717041, "rewards/rejected": -45.45731735229492, "step": 6998 }, { "epoch": 0.9530228758169934, "grad_norm": 40.35265610347934, "learning_rate": 5.366892028086933e-09, "logits/chosen": 13.784751892089844, "logits/rejected": 14.530734062194824, "logps/chosen": -4.362643241882324, "logps/rejected": -4.66908073425293, "loss": 3.9589, "rewards/accuracies": 0.75, "rewards/chosen": -43.626426696777344, "rewards/margins": 3.0643796920776367, "rewards/rejected": -46.6908073425293, "step": 6999 }, { "epoch": 0.9531590413943355, "grad_norm": 40.01563357522428, "learning_rate": 5.335893958168647e-09, "logits/chosen": 14.076530456542969, "logits/rejected": 15.673004150390625, "logps/chosen": -4.587364196777344, "logps/rejected": -4.8091020584106445, "loss": 4.1222, "rewards/accuracies": 0.5, "rewards/chosen": -45.87364196777344, "rewards/margins": 2.2173824310302734, "rewards/rejected": -48.09102249145508, "step": 7000 }, { "epoch": 0.9532952069716776, "grad_norm": 42.313512356433314, "learning_rate": 5.304985065867429e-09, "logits/chosen": 14.374670028686523, "logits/rejected": 14.472061157226562, "logps/chosen": -5.131771564483643, "logps/rejected": -5.154403209686279, "loss": 3.8215, "rewards/accuracies": 0.5, "rewards/chosen": -51.31771469116211, "rewards/margins": 0.2263174057006836, "rewards/rejected": -51.54403305053711, "step": 7001 }, { "epoch": 0.9534313725490197, "grad_norm": 42.257741647341966, "learning_rate": 5.274165358167426e-09, "logits/chosen": 14.863038063049316, "logits/rejected": 14.6875, "logps/chosen": -4.684814929962158, "logps/rejected": -4.665395736694336, "loss": 4.3563, "rewards/accuracies": 0.5, "rewards/chosen": -46.848148345947266, "rewards/margins": -0.19419384002685547, "rewards/rejected": -46.653953552246094, "step": 7002 }, { "epoch": 0.9535675381263616, "grad_norm": 43.295200168029716, "learning_rate": 5.2434348420326235e-09, "logits/chosen": 13.776630401611328, "logits/rejected": 14.18430233001709, "logps/chosen": -4.478753089904785, "logps/rejected": -4.680187225341797, "loss": 3.8597, "rewards/accuracies": 0.75, "rewards/chosen": -44.78752899169922, "rewards/margins": 2.014338493347168, "rewards/rejected": -46.80186462402344, "step": 7003 }, { "epoch": 0.9537037037037037, "grad_norm": 42.767538455322494, "learning_rate": 5.212793524406755e-09, "logits/chosen": 14.837004661560059, "logits/rejected": 14.981002807617188, "logps/chosen": -5.038639545440674, "logps/rejected": -4.80793571472168, "loss": 4.2527, "rewards/accuracies": 0.25, "rewards/chosen": -50.38639450073242, "rewards/margins": -2.3070383071899414, "rewards/rejected": -48.0793571472168, "step": 7004 }, { "epoch": 0.9538398692810458, "grad_norm": 44.11893755723517, "learning_rate": 5.182241412213573e-09, "logits/chosen": 14.150614738464355, "logits/rejected": 13.994303703308105, "logps/chosen": -4.499227523803711, "logps/rejected": -4.347856521606445, "loss": 4.1839, "rewards/accuracies": 0.5, "rewards/chosen": -44.992279052734375, "rewards/margins": -1.5137128829956055, "rewards/rejected": -43.47856521606445, "step": 7005 }, { "epoch": 0.9539760348583878, "grad_norm": 40.16758698208063, "learning_rate": 5.151778512356531e-09, "logits/chosen": 14.676980972290039, "logits/rejected": 14.561298370361328, "logps/chosen": -4.902523040771484, "logps/rejected": -4.908805847167969, "loss": 3.6757, "rewards/accuracies": 0.25, "rewards/chosen": -49.025230407714844, "rewards/margins": 0.06283187866210938, "rewards/rejected": -49.08805847167969, "step": 7006 }, { "epoch": 0.9541122004357299, "grad_norm": 45.55799870506938, "learning_rate": 5.1214048317190115e-09, "logits/chosen": 14.961103439331055, "logits/rejected": 14.909468650817871, "logps/chosen": -4.950410842895508, "logps/rejected": -4.980818748474121, "loss": 4.3119, "rewards/accuracies": 0.75, "rewards/chosen": -49.50410461425781, "rewards/margins": 0.30408477783203125, "rewards/rejected": -49.808189392089844, "step": 7007 }, { "epoch": 0.954248366013072, "grad_norm": 46.62758501910857, "learning_rate": 5.0911203771641045e-09, "logits/chosen": 14.236222267150879, "logits/rejected": 14.389326095581055, "logps/chosen": -4.644311904907227, "logps/rejected": -4.675763130187988, "loss": 3.5358, "rewards/accuracies": 0.5, "rewards/chosen": -46.443119049072266, "rewards/margins": 0.3145132064819336, "rewards/rejected": -46.75762939453125, "step": 7008 }, { "epoch": 0.9543845315904139, "grad_norm": 38.491770505211264, "learning_rate": 5.0609251555349566e-09, "logits/chosen": 15.256494522094727, "logits/rejected": 15.53320598602295, "logps/chosen": -4.95723295211792, "logps/rejected": -5.258107662200928, "loss": 3.6858, "rewards/accuracies": 1.0, "rewards/chosen": -49.57232666015625, "rewards/margins": 3.008747100830078, "rewards/rejected": -52.58107376098633, "step": 7009 }, { "epoch": 0.954520697167756, "grad_norm": 41.634718075082, "learning_rate": 5.030819173654333e-09, "logits/chosen": 14.526376724243164, "logits/rejected": 15.269237518310547, "logps/chosen": -4.51746940612793, "logps/rejected": -4.894245147705078, "loss": 4.3884, "rewards/accuracies": 0.5, "rewards/chosen": -45.17469787597656, "rewards/margins": 3.7677555084228516, "rewards/rejected": -48.94245147705078, "step": 7010 }, { "epoch": 0.9546568627450981, "grad_norm": 44.21634805409797, "learning_rate": 5.000802438324969e-09, "logits/chosen": 14.744251251220703, "logits/rejected": 14.214727401733398, "logps/chosen": -4.849291801452637, "logps/rejected": -4.867959022521973, "loss": 3.8598, "rewards/accuracies": 0.25, "rewards/chosen": -48.492919921875, "rewards/margins": 0.18667125701904297, "rewards/rejected": -48.67959213256836, "step": 7011 }, { "epoch": 0.9547930283224401, "grad_norm": 40.213159270478144, "learning_rate": 4.970874956329396e-09, "logits/chosen": 13.677464485168457, "logits/rejected": 13.798107147216797, "logps/chosen": -4.208148956298828, "logps/rejected": -4.65723991394043, "loss": 4.1755, "rewards/accuracies": 1.0, "rewards/chosen": -42.08148956298828, "rewards/margins": 4.490903854370117, "rewards/rejected": -46.57239532470703, "step": 7012 }, { "epoch": 0.9549291938997821, "grad_norm": 40.79829973654979, "learning_rate": 4.941036734430026e-09, "logits/chosen": 13.735669136047363, "logits/rejected": 14.404485702514648, "logps/chosen": -4.363763809204102, "logps/rejected": -4.7060956954956055, "loss": 3.9451, "rewards/accuracies": 0.75, "rewards/chosen": -43.63763427734375, "rewards/margins": 3.423323631286621, "rewards/rejected": -47.06095886230469, "step": 7013 }, { "epoch": 0.9550653594771242, "grad_norm": 42.07589822463518, "learning_rate": 4.9112877793689335e-09, "logits/chosen": 14.804248809814453, "logits/rejected": 14.734652519226074, "logps/chosen": -4.135768413543701, "logps/rejected": -4.671977996826172, "loss": 3.7707, "rewards/accuracies": 1.0, "rewards/chosen": -41.35768508911133, "rewards/margins": 5.362093925476074, "rewards/rejected": -46.71977996826172, "step": 7014 }, { "epoch": 0.9552015250544662, "grad_norm": 43.03222153688147, "learning_rate": 4.881628097868207e-09, "logits/chosen": 14.944070816040039, "logits/rejected": 14.720178604125977, "logps/chosen": -4.769309043884277, "logps/rejected": -4.765944004058838, "loss": 4.1752, "rewards/accuracies": 0.75, "rewards/chosen": -47.69308853149414, "rewards/margins": -0.03364753723144531, "rewards/rejected": -47.65943908691406, "step": 7015 }, { "epoch": 0.9553376906318083, "grad_norm": 41.049991053099134, "learning_rate": 4.85205769662973e-09, "logits/chosen": 14.411645889282227, "logits/rejected": 13.801513671875, "logps/chosen": -4.654879570007324, "logps/rejected": -4.679368495941162, "loss": 3.9262, "rewards/accuracies": 0.25, "rewards/chosen": -46.54879379272461, "rewards/margins": 0.24489307403564453, "rewards/rejected": -46.79368591308594, "step": 7016 }, { "epoch": 0.9554738562091504, "grad_norm": 40.81570501941625, "learning_rate": 4.822576582335092e-09, "logits/chosen": 14.369146347045898, "logits/rejected": 15.007062911987305, "logps/chosen": -4.835173606872559, "logps/rejected": -5.052914619445801, "loss": 4.0423, "rewards/accuracies": 0.75, "rewards/chosen": -48.35173797607422, "rewards/margins": 2.177412986755371, "rewards/rejected": -50.52914810180664, "step": 7017 }, { "epoch": 0.9556100217864923, "grad_norm": 39.699655378972395, "learning_rate": 4.793184761645852e-09, "logits/chosen": 14.050899505615234, "logits/rejected": 14.524557113647461, "logps/chosen": -4.5510358810424805, "logps/rejected": -4.892482280731201, "loss": 3.6065, "rewards/accuracies": 0.75, "rewards/chosen": -45.51036071777344, "rewards/margins": 3.414463996887207, "rewards/rejected": -48.92482376098633, "step": 7018 }, { "epoch": 0.9557461873638344, "grad_norm": 43.89449372783598, "learning_rate": 4.763882241203365e-09, "logits/chosen": 14.749565124511719, "logits/rejected": 14.913629531860352, "logps/chosen": -5.02616548538208, "logps/rejected": -4.971920967102051, "loss": 3.6216, "rewards/accuracies": 0.5, "rewards/chosen": -50.26165008544922, "rewards/margins": -0.5424394607543945, "rewards/rejected": -49.719215393066406, "step": 7019 }, { "epoch": 0.9558823529411765, "grad_norm": 42.114827797068514, "learning_rate": 4.7346690276286905e-09, "logits/chosen": 14.658858299255371, "logits/rejected": 15.364751815795898, "logps/chosen": -4.6320929527282715, "logps/rejected": -4.870147228240967, "loss": 4.1214, "rewards/accuracies": 0.75, "rewards/chosen": -46.320926666259766, "rewards/margins": 2.3805456161499023, "rewards/rejected": -48.701473236083984, "step": 7020 }, { "epoch": 0.9560185185185185, "grad_norm": 44.68972562882494, "learning_rate": 4.705545127522903e-09, "logits/chosen": 15.003009796142578, "logits/rejected": 15.103342056274414, "logps/chosen": -4.5287675857543945, "logps/rejected": -4.796462535858154, "loss": 3.8279, "rewards/accuracies": 0.75, "rewards/chosen": -45.28767776489258, "rewards/margins": 2.6769495010375977, "rewards/rejected": -47.96462631225586, "step": 7021 }, { "epoch": 0.9561546840958606, "grad_norm": 40.931114543855074, "learning_rate": 4.676510547466695e-09, "logits/chosen": 14.636661529541016, "logits/rejected": 15.255781173706055, "logps/chosen": -4.895862102508545, "logps/rejected": -4.962218761444092, "loss": 3.818, "rewards/accuracies": 0.25, "rewards/chosen": -48.9586181640625, "rewards/margins": 0.6635684967041016, "rewards/rejected": -49.622188568115234, "step": 7022 }, { "epoch": 0.9562908496732027, "grad_norm": 41.830009570263904, "learning_rate": 4.6475652940207275e-09, "logits/chosen": 14.280781745910645, "logits/rejected": 14.737386703491211, "logps/chosen": -4.647289276123047, "logps/rejected": -5.1052117347717285, "loss": 3.9569, "rewards/accuracies": 0.75, "rewards/chosen": -46.4728889465332, "rewards/margins": 4.579227447509766, "rewards/rejected": -51.05211639404297, "step": 7023 }, { "epoch": 0.9564270152505446, "grad_norm": 46.52673271512176, "learning_rate": 4.618709373725371e-09, "logits/chosen": 13.420894622802734, "logits/rejected": 13.942646980285645, "logps/chosen": -4.540377616882324, "logps/rejected": -4.517156600952148, "loss": 4.2663, "rewards/accuracies": 0.25, "rewards/chosen": -45.403778076171875, "rewards/margins": -0.23221492767333984, "rewards/rejected": -45.17156219482422, "step": 7024 }, { "epoch": 0.9565631808278867, "grad_norm": 38.60067990680591, "learning_rate": 4.589942793100921e-09, "logits/chosen": 14.7642822265625, "logits/rejected": 13.913741111755371, "logps/chosen": -4.74002742767334, "logps/rejected": -4.670596122741699, "loss": 3.4904, "rewards/accuracies": 0.5, "rewards/chosen": -47.40027618408203, "rewards/margins": -0.6943130493164062, "rewards/rejected": -46.705963134765625, "step": 7025 }, { "epoch": 0.9566993464052288, "grad_norm": 42.67080896986917, "learning_rate": 4.561265558647376e-09, "logits/chosen": 14.389030456542969, "logits/rejected": 15.07581901550293, "logps/chosen": -4.846871376037598, "logps/rejected": -5.024266719818115, "loss": 4.0851, "rewards/accuracies": 1.0, "rewards/chosen": -48.468711853027344, "rewards/margins": 1.773951530456543, "rewards/rejected": -50.2426643371582, "step": 7026 }, { "epoch": 0.9568355119825708, "grad_norm": 43.60478192069251, "learning_rate": 4.5326776768445766e-09, "logits/chosen": 14.611474990844727, "logits/rejected": 14.899497985839844, "logps/chosen": -4.453151226043701, "logps/rejected": -4.620302677154541, "loss": 4.0335, "rewards/accuracies": 0.75, "rewards/chosen": -44.53150939941406, "rewards/margins": 1.6715164184570312, "rewards/rejected": -46.20302963256836, "step": 7027 }, { "epoch": 0.9569716775599129, "grad_norm": 45.56809073178973, "learning_rate": 4.504179154152243e-09, "logits/chosen": 15.104008674621582, "logits/rejected": 14.499438285827637, "logps/chosen": -4.7679338455200195, "logps/rejected": -4.606138229370117, "loss": 4.3226, "rewards/accuracies": 0.25, "rewards/chosen": -47.67933654785156, "rewards/margins": -1.617955207824707, "rewards/rejected": -46.06138610839844, "step": 7028 }, { "epoch": 0.9571078431372549, "grad_norm": 45.26877591840036, "learning_rate": 4.475769997009848e-09, "logits/chosen": 15.296537399291992, "logits/rejected": 14.869546890258789, "logps/chosen": -4.772534370422363, "logps/rejected": -4.676355838775635, "loss": 3.6894, "rewards/accuracies": 0.5, "rewards/chosen": -47.725341796875, "rewards/margins": -0.9617815017700195, "rewards/rejected": -46.76355743408203, "step": 7029 }, { "epoch": 0.9572440087145969, "grad_norm": 43.68453748020629, "learning_rate": 4.447450211836612e-09, "logits/chosen": 14.657971382141113, "logits/rejected": 14.011333465576172, "logps/chosen": -4.939115524291992, "logps/rejected": -4.941697120666504, "loss": 4.1226, "rewards/accuracies": 0.5, "rewards/chosen": -49.39115524291992, "rewards/margins": 0.02581501007080078, "rewards/rejected": -49.416969299316406, "step": 7030 }, { "epoch": 0.957380174291939, "grad_norm": 44.64781926852608, "learning_rate": 4.419219805031727e-09, "logits/chosen": 14.291322708129883, "logits/rejected": 14.309076309204102, "logps/chosen": -4.727736473083496, "logps/rejected": -4.8414411544799805, "loss": 4.0766, "rewards/accuracies": 0.25, "rewards/chosen": -47.27736282348633, "rewards/margins": 1.1370487213134766, "rewards/rejected": -48.41441345214844, "step": 7031 }, { "epoch": 0.9575163398692811, "grad_norm": 43.337554154178555, "learning_rate": 4.3910787829740006e-09, "logits/chosen": 14.52115249633789, "logits/rejected": 14.002080917358398, "logps/chosen": -4.573620796203613, "logps/rejected": -4.877353191375732, "loss": 3.7906, "rewards/accuracies": 0.5, "rewards/chosen": -45.7362060546875, "rewards/margins": 3.0373268127441406, "rewards/rejected": -48.77353286743164, "step": 7032 }, { "epoch": 0.9576525054466231, "grad_norm": 38.877120993225184, "learning_rate": 4.36302715202217e-09, "logits/chosen": 14.65964412689209, "logits/rejected": 14.216850280761719, "logps/chosen": -4.4691925048828125, "logps/rejected": -4.653939723968506, "loss": 3.7585, "rewards/accuracies": 0.75, "rewards/chosen": -44.69192123413086, "rewards/margins": 1.8474760055541992, "rewards/rejected": -46.539398193359375, "step": 7033 }, { "epoch": 0.9577886710239651, "grad_norm": 40.492083664477825, "learning_rate": 4.3350649185147196e-09, "logits/chosen": 13.570650100708008, "logits/rejected": 14.713929176330566, "logps/chosen": -4.225065231323242, "logps/rejected": -4.651118278503418, "loss": 3.4142, "rewards/accuracies": 0.75, "rewards/chosen": -42.25065612792969, "rewards/margins": 4.260524749755859, "rewards/rejected": -46.51118087768555, "step": 7034 }, { "epoch": 0.9579248366013072, "grad_norm": 42.03648679782357, "learning_rate": 4.307192088769973e-09, "logits/chosen": 14.851810455322266, "logits/rejected": 14.978334426879883, "logps/chosen": -4.718562126159668, "logps/rejected": -4.88021993637085, "loss": 3.4761, "rewards/accuracies": 0.75, "rewards/chosen": -47.18562316894531, "rewards/margins": 1.6165781021118164, "rewards/rejected": -48.80220031738281, "step": 7035 }, { "epoch": 0.9580610021786492, "grad_norm": 42.707792582464265, "learning_rate": 4.2794086690859595e-09, "logits/chosen": 14.029930114746094, "logits/rejected": 14.996673583984375, "logps/chosen": -4.529171466827393, "logps/rejected": -4.985057830810547, "loss": 3.4551, "rewards/accuracies": 1.0, "rewards/chosen": -45.291717529296875, "rewards/margins": 4.558859825134277, "rewards/rejected": -49.85057830810547, "step": 7036 }, { "epoch": 0.9581971677559913, "grad_norm": 42.73068071597958, "learning_rate": 4.25171466574068e-09, "logits/chosen": 14.070234298706055, "logits/rejected": 14.590555191040039, "logps/chosen": -4.721556663513184, "logps/rejected": -5.0033745765686035, "loss": 4.2032, "rewards/accuracies": 0.5, "rewards/chosen": -47.2155647277832, "rewards/margins": 2.818182945251465, "rewards/rejected": -50.033748626708984, "step": 7037 }, { "epoch": 0.9583333333333334, "grad_norm": 40.673864165008304, "learning_rate": 4.224110084991705e-09, "logits/chosen": 14.148937225341797, "logits/rejected": 13.859159469604492, "logps/chosen": -4.729463577270508, "logps/rejected": -4.738384246826172, "loss": 3.6288, "rewards/accuracies": 0.5, "rewards/chosen": -47.29463577270508, "rewards/margins": 0.08920860290527344, "rewards/rejected": -47.38384246826172, "step": 7038 }, { "epoch": 0.9584694989106753, "grad_norm": 38.72566282022676, "learning_rate": 4.1965949330765805e-09, "logits/chosen": 14.34174919128418, "logits/rejected": 14.829431533813477, "logps/chosen": -4.948132514953613, "logps/rejected": -4.943820476531982, "loss": 3.4238, "rewards/accuracies": 0.25, "rewards/chosen": -49.4813232421875, "rewards/margins": -0.043122291564941406, "rewards/rejected": -49.438201904296875, "step": 7039 }, { "epoch": 0.9586056644880174, "grad_norm": 48.72077403046283, "learning_rate": 4.169169216212598e-09, "logits/chosen": 14.044349670410156, "logits/rejected": 14.585504531860352, "logps/chosen": -4.462619304656982, "logps/rejected": -4.655645370483398, "loss": 4.2242, "rewards/accuracies": 0.75, "rewards/chosen": -44.62619400024414, "rewards/margins": 1.9302616119384766, "rewards/rejected": -46.55645751953125, "step": 7040 }, { "epoch": 0.9587418300653595, "grad_norm": 41.11921056044804, "learning_rate": 4.141832940596757e-09, "logits/chosen": 14.611187934875488, "logits/rejected": 14.562536239624023, "logps/chosen": -4.676234722137451, "logps/rejected": -4.958641052246094, "loss": 3.9575, "rewards/accuracies": 0.75, "rewards/chosen": -46.76234817504883, "rewards/margins": 2.8240652084350586, "rewards/rejected": -49.5864143371582, "step": 7041 }, { "epoch": 0.9588779956427015, "grad_norm": 43.66899940518007, "learning_rate": 4.114586112405982e-09, "logits/chosen": 14.514248847961426, "logits/rejected": 14.820916175842285, "logps/chosen": -4.672481536865234, "logps/rejected": -5.004436492919922, "loss": 4.3621, "rewards/accuracies": 0.75, "rewards/chosen": -46.724815368652344, "rewards/margins": 3.319551467895508, "rewards/rejected": -50.044368743896484, "step": 7042 }, { "epoch": 0.9590141612200436, "grad_norm": 41.24089916001526, "learning_rate": 4.08742873779695e-09, "logits/chosen": 14.371345520019531, "logits/rejected": 13.945108413696289, "logps/chosen": -4.72335147857666, "logps/rejected": -4.617324352264404, "loss": 3.8257, "rewards/accuracies": 0.25, "rewards/chosen": -47.2335090637207, "rewards/margins": -1.0602684020996094, "rewards/rejected": -46.17324447631836, "step": 7043 }, { "epoch": 0.9591503267973857, "grad_norm": 45.7706229270703, "learning_rate": 4.06036082290595e-09, "logits/chosen": 14.782512664794922, "logits/rejected": 14.951749801635742, "logps/chosen": -4.823547840118408, "logps/rejected": -4.812869071960449, "loss": 4.0469, "rewards/accuracies": 0.5, "rewards/chosen": -48.235477447509766, "rewards/margins": -0.10679054260253906, "rewards/rejected": -48.12868881225586, "step": 7044 }, { "epoch": 0.9592864923747276, "grad_norm": 40.24033753547956, "learning_rate": 4.033382373849337e-09, "logits/chosen": 14.013235092163086, "logits/rejected": 15.3809814453125, "logps/chosen": -4.638931751251221, "logps/rejected": -4.997907638549805, "loss": 3.233, "rewards/accuracies": 0.75, "rewards/chosen": -46.38931655883789, "rewards/margins": 3.5897598266601562, "rewards/rejected": -49.97908020019531, "step": 7045 }, { "epoch": 0.9594226579520697, "grad_norm": 46.13212078262153, "learning_rate": 4.0064933967230766e-09, "logits/chosen": 13.460319519042969, "logits/rejected": 14.325845718383789, "logps/chosen": -4.046621322631836, "logps/rejected": -4.564686298370361, "loss": 3.971, "rewards/accuracies": 1.0, "rewards/chosen": -40.466217041015625, "rewards/margins": 5.1806488037109375, "rewards/rejected": -45.64686584472656, "step": 7046 }, { "epoch": 0.9595588235294118, "grad_norm": 42.36989969679408, "learning_rate": 3.979693897602976e-09, "logits/chosen": 14.681985855102539, "logits/rejected": 14.959945678710938, "logps/chosen": -4.784702301025391, "logps/rejected": -4.9511919021606445, "loss": 3.8687, "rewards/accuracies": 0.5, "rewards/chosen": -47.84702682495117, "rewards/margins": 1.664891242980957, "rewards/rejected": -49.51191711425781, "step": 7047 }, { "epoch": 0.9596949891067538, "grad_norm": 40.99381474642183, "learning_rate": 3.952983882544503e-09, "logits/chosen": 14.00454330444336, "logits/rejected": 14.723295211791992, "logps/chosen": -4.289417266845703, "logps/rejected": -4.810273170471191, "loss": 3.8094, "rewards/accuracies": 1.0, "rewards/chosen": -42.89417266845703, "rewards/margins": 5.20855712890625, "rewards/rejected": -48.10273361206055, "step": 7048 }, { "epoch": 0.9598311546840959, "grad_norm": 51.54152397691373, "learning_rate": 3.92636335758314e-09, "logits/chosen": 13.65425968170166, "logits/rejected": 14.56079387664795, "logps/chosen": -4.248193740844727, "logps/rejected": -4.623396873474121, "loss": 3.6819, "rewards/accuracies": 0.75, "rewards/chosen": -42.48194122314453, "rewards/margins": 3.752030372619629, "rewards/rejected": -46.233970642089844, "step": 7049 }, { "epoch": 0.9599673202614379, "grad_norm": 43.5685579409548, "learning_rate": 3.899832328733943e-09, "logits/chosen": 14.488698959350586, "logits/rejected": 14.375980377197266, "logps/chosen": -4.963606834411621, "logps/rejected": -4.879353046417236, "loss": 3.7468, "rewards/accuracies": 0.5, "rewards/chosen": -49.636070251464844, "rewards/margins": -0.8425369262695312, "rewards/rejected": -48.79352951049805, "step": 7050 }, { "epoch": 0.9601034858387799, "grad_norm": 42.34490442800625, "learning_rate": 3.873390801991805e-09, "logits/chosen": 13.740188598632812, "logits/rejected": 14.455476760864258, "logps/chosen": -4.153120994567871, "logps/rejected": -4.655211448669434, "loss": 3.6364, "rewards/accuracies": 1.0, "rewards/chosen": -41.53120803833008, "rewards/margins": 5.020909309387207, "rewards/rejected": -46.55211639404297, "step": 7051 }, { "epoch": 0.960239651416122, "grad_norm": 37.568313261124146, "learning_rate": 3.8470387833314574e-09, "logits/chosen": 13.748392105102539, "logits/rejected": 13.977795600891113, "logps/chosen": -3.993199348449707, "logps/rejected": -4.603302955627441, "loss": 3.9386, "rewards/accuracies": 1.0, "rewards/chosen": -39.93199157714844, "rewards/margins": 6.101038932800293, "rewards/rejected": -46.03303146362305, "step": 7052 }, { "epoch": 0.9603758169934641, "grad_norm": 42.134768665655926, "learning_rate": 3.820776278707294e-09, "logits/chosen": 14.00191879272461, "logits/rejected": 14.36546516418457, "logps/chosen": -4.256494522094727, "logps/rejected": -4.462099552154541, "loss": 4.2377, "rewards/accuracies": 0.5, "rewards/chosen": -42.56494903564453, "rewards/margins": 2.0560503005981445, "rewards/rejected": -44.62099838256836, "step": 7053 }, { "epoch": 0.960511982570806, "grad_norm": 43.909838179513244, "learning_rate": 3.794603294053633e-09, "logits/chosen": 14.309781074523926, "logits/rejected": 14.314374923706055, "logps/chosen": -4.565537452697754, "logps/rejected": -4.6944193840026855, "loss": 4.1969, "rewards/accuracies": 0.5, "rewards/chosen": -45.655372619628906, "rewards/margins": 1.2888193130493164, "rewards/rejected": -46.94419479370117, "step": 7054 }, { "epoch": 0.9606481481481481, "grad_norm": 39.75261273151364, "learning_rate": 3.768519835284412e-09, "logits/chosen": 15.433466911315918, "logits/rejected": 15.85947036743164, "logps/chosen": -4.67435359954834, "logps/rejected": -5.496532440185547, "loss": 3.7443, "rewards/accuracies": 1.0, "rewards/chosen": -46.74353790283203, "rewards/margins": 8.221784591674805, "rewards/rejected": -54.96532440185547, "step": 7055 }, { "epoch": 0.9607843137254902, "grad_norm": 42.75307359885601, "learning_rate": 3.742525908293403e-09, "logits/chosen": 14.279029846191406, "logits/rejected": 15.258245468139648, "logps/chosen": -4.532276630401611, "logps/rejected": -5.235783576965332, "loss": 3.6976, "rewards/accuracies": 1.0, "rewards/chosen": -45.32276916503906, "rewards/margins": 7.035065650939941, "rewards/rejected": -52.35783386230469, "step": 7056 }, { "epoch": 0.9609204793028322, "grad_norm": 45.173573990503506, "learning_rate": 3.7166215189541328e-09, "logits/chosen": 14.320676803588867, "logits/rejected": 14.64643669128418, "logps/chosen": -4.688381195068359, "logps/rejected": -4.655272960662842, "loss": 3.4304, "rewards/accuracies": 0.5, "rewards/chosen": -46.88380432128906, "rewards/margins": -0.33107471466064453, "rewards/rejected": -46.552734375, "step": 7057 }, { "epoch": 0.9610566448801743, "grad_norm": 44.20191257851308, "learning_rate": 3.690806673120006e-09, "logits/chosen": 14.474319458007812, "logits/rejected": 14.420475006103516, "logps/chosen": -4.514081001281738, "logps/rejected": -4.611351490020752, "loss": 4.0496, "rewards/accuracies": 0.5, "rewards/chosen": -45.14080810546875, "rewards/margins": 0.9727048873901367, "rewards/rejected": -46.1135139465332, "step": 7058 }, { "epoch": 0.9611928104575164, "grad_norm": 44.50966827540307, "learning_rate": 3.6650813766239573e-09, "logits/chosen": 14.061759948730469, "logits/rejected": 13.914855003356934, "logps/chosen": -4.317190170288086, "logps/rejected": -4.428073883056641, "loss": 3.8601, "rewards/accuracies": 0.75, "rewards/chosen": -43.17190170288086, "rewards/margins": 1.1088342666625977, "rewards/rejected": -44.280738830566406, "step": 7059 }, { "epoch": 0.9613289760348583, "grad_norm": 41.713289786346564, "learning_rate": 3.6394456352789815e-09, "logits/chosen": 14.146916389465332, "logits/rejected": 15.154220581054688, "logps/chosen": -4.537842273712158, "logps/rejected": -4.917140007019043, "loss": 3.6983, "rewards/accuracies": 0.75, "rewards/chosen": -45.378421783447266, "rewards/margins": 3.7929773330688477, "rewards/rejected": -49.17140197753906, "step": 7060 }, { "epoch": 0.9614651416122004, "grad_norm": 39.77936453769229, "learning_rate": 3.6138994548776003e-09, "logits/chosen": 14.793115615844727, "logits/rejected": 14.759986877441406, "logps/chosen": -4.475270748138428, "logps/rejected": -4.855771541595459, "loss": 3.7937, "rewards/accuracies": 0.75, "rewards/chosen": -44.752708435058594, "rewards/margins": 3.8050079345703125, "rewards/rejected": -48.557716369628906, "step": 7061 }, { "epoch": 0.9616013071895425, "grad_norm": 47.359130798997526, "learning_rate": 3.588442841192174e-09, "logits/chosen": 14.393903732299805, "logits/rejected": 14.526390075683594, "logps/chosen": -4.411847114562988, "logps/rejected": -4.678920745849609, "loss": 3.509, "rewards/accuracies": 1.0, "rewards/chosen": -44.11846923828125, "rewards/margins": 2.6707324981689453, "rewards/rejected": -46.789207458496094, "step": 7062 }, { "epoch": 0.9617374727668845, "grad_norm": 42.92974643235324, "learning_rate": 3.5630757999748574e-09, "logits/chosen": 13.919954299926758, "logits/rejected": 13.377986907958984, "logps/chosen": -4.494581699371338, "logps/rejected": -4.580447673797607, "loss": 4.2422, "rewards/accuracies": 0.75, "rewards/chosen": -44.94581604003906, "rewards/margins": 0.8586616516113281, "rewards/rejected": -45.80447769165039, "step": 7063 }, { "epoch": 0.9618736383442266, "grad_norm": 40.15525690668956, "learning_rate": 3.5377983369575536e-09, "logits/chosen": 14.92972183227539, "logits/rejected": 15.924886703491211, "logps/chosen": -4.822055816650391, "logps/rejected": -5.084047317504883, "loss": 4.1268, "rewards/accuracies": 0.75, "rewards/chosen": -48.220558166503906, "rewards/margins": 2.6199169158935547, "rewards/rejected": -50.840476989746094, "step": 7064 }, { "epoch": 0.9620098039215687, "grad_norm": 40.95936600832556, "learning_rate": 3.5126104578519165e-09, "logits/chosen": 14.768016815185547, "logits/rejected": 13.994619369506836, "logps/chosen": -4.628249168395996, "logps/rejected": -4.324639320373535, "loss": 4.3371, "rewards/accuracies": 0.25, "rewards/chosen": -46.282493591308594, "rewards/margins": -3.036100387573242, "rewards/rejected": -43.24639129638672, "step": 7065 }, { "epoch": 0.9621459694989106, "grad_norm": 71.31813474268777, "learning_rate": 3.487512168349305e-09, "logits/chosen": 14.599115371704102, "logits/rejected": 14.545291900634766, "logps/chosen": -4.811384201049805, "logps/rejected": -4.531394004821777, "loss": 4.1588, "rewards/accuracies": 0.5, "rewards/chosen": -48.11384201049805, "rewards/margins": -2.79990291595459, "rewards/rejected": -45.31393814086914, "step": 7066 }, { "epoch": 0.9622821350762527, "grad_norm": 41.16697461451451, "learning_rate": 3.4625034741210034e-09, "logits/chosen": 13.693310737609863, "logits/rejected": 14.536230087280273, "logps/chosen": -4.45775842666626, "logps/rejected": -4.761748790740967, "loss": 3.6004, "rewards/accuracies": 1.0, "rewards/chosen": -44.57758331298828, "rewards/margins": 3.039905548095703, "rewards/rejected": -47.617488861083984, "step": 7067 }, { "epoch": 0.9624183006535948, "grad_norm": 38.248370661854864, "learning_rate": 3.437584380817782e-09, "logits/chosen": 14.873672485351562, "logits/rejected": 15.5050048828125, "logps/chosen": -4.493404865264893, "logps/rejected": -4.796329021453857, "loss": 3.5786, "rewards/accuracies": 0.5, "rewards/chosen": -44.934051513671875, "rewards/margins": 3.029240608215332, "rewards/rejected": -47.963287353515625, "step": 7068 }, { "epoch": 0.9625544662309368, "grad_norm": 40.05244845544127, "learning_rate": 3.412754894070424e-09, "logits/chosen": 14.694408416748047, "logits/rejected": 14.697446823120117, "logps/chosen": -4.653977394104004, "logps/rejected": -4.492903709411621, "loss": 4.0061, "rewards/accuracies": 0.25, "rewards/chosen": -46.539772033691406, "rewards/margins": -1.6107358932495117, "rewards/rejected": -44.929039001464844, "step": 7069 }, { "epoch": 0.9626906318082789, "grad_norm": 47.83244583313571, "learning_rate": 3.3880150194892877e-09, "logits/chosen": 13.75202465057373, "logits/rejected": 14.005305290222168, "logps/chosen": -4.553606986999512, "logps/rejected": -4.72721004486084, "loss": 4.5338, "rewards/accuracies": 0.5, "rewards/chosen": -45.53606414794922, "rewards/margins": 1.7360334396362305, "rewards/rejected": -47.27210235595703, "step": 7070 }, { "epoch": 0.9628267973856209, "grad_norm": 55.80712845076658, "learning_rate": 3.363364762664611e-09, "logits/chosen": 14.445858001708984, "logits/rejected": 15.727787971496582, "logps/chosen": -4.720205783843994, "logps/rejected": -5.415408611297607, "loss": 3.4436, "rewards/accuracies": 1.0, "rewards/chosen": -47.20206069946289, "rewards/margins": 6.952025413513184, "rewards/rejected": -54.15408706665039, "step": 7071 }, { "epoch": 0.9629629629629629, "grad_norm": 44.08007975660023, "learning_rate": 3.338804129166295e-09, "logits/chosen": 14.95791244506836, "logits/rejected": 14.7930269241333, "logps/chosen": -4.670375823974609, "logps/rejected": -4.518646240234375, "loss": 3.753, "rewards/accuracies": 0.5, "rewards/chosen": -46.703758239746094, "rewards/margins": -1.5172996520996094, "rewards/rejected": -45.186458587646484, "step": 7072 }, { "epoch": 0.963099128540305, "grad_norm": 41.904413375508966, "learning_rate": 3.314333124544033e-09, "logits/chosen": 14.00251579284668, "logits/rejected": 14.42509651184082, "logps/chosen": -4.449495315551758, "logps/rejected": -4.490551471710205, "loss": 4.505, "rewards/accuracies": 0.75, "rewards/chosen": -44.49495315551758, "rewards/margins": 0.41056060791015625, "rewards/rejected": -44.905513763427734, "step": 7073 }, { "epoch": 0.9632352941176471, "grad_norm": 42.90234769593447, "learning_rate": 3.289951754327225e-09, "logits/chosen": 14.51513671875, "logits/rejected": 15.263749122619629, "logps/chosen": -4.482399940490723, "logps/rejected": -4.692218780517578, "loss": 4.1472, "rewards/accuracies": 1.0, "rewards/chosen": -44.823997497558594, "rewards/margins": 2.098186492919922, "rewards/rejected": -46.92218780517578, "step": 7074 }, { "epoch": 0.963371459694989, "grad_norm": 42.002316425251, "learning_rate": 3.265660024025063e-09, "logits/chosen": 14.207275390625, "logits/rejected": 14.75855827331543, "logps/chosen": -4.515276908874512, "logps/rejected": -4.61429500579834, "loss": 3.7747, "rewards/accuracies": 0.75, "rewards/chosen": -45.152767181396484, "rewards/margins": 0.9901828765869141, "rewards/rejected": -46.142948150634766, "step": 7075 }, { "epoch": 0.9635076252723311, "grad_norm": 54.40096063418388, "learning_rate": 3.2414579391264464e-09, "logits/chosen": 13.70440673828125, "logits/rejected": 14.760407447814941, "logps/chosen": -4.318727016448975, "logps/rejected": -4.84607458114624, "loss": 4.2325, "rewards/accuracies": 1.0, "rewards/chosen": -43.18727111816406, "rewards/margins": 5.2734785079956055, "rewards/rejected": -48.46074676513672, "step": 7076 }, { "epoch": 0.9636437908496732, "grad_norm": 39.899995275586804, "learning_rate": 3.2173455051000665e-09, "logits/chosen": 14.957525253295898, "logits/rejected": 14.828365325927734, "logps/chosen": -4.805487632751465, "logps/rejected": -4.4823150634765625, "loss": 3.8051, "rewards/accuracies": 0.25, "rewards/chosen": -48.054874420166016, "rewards/margins": -3.2317237854003906, "rewards/rejected": -44.823150634765625, "step": 7077 }, { "epoch": 0.9637799564270153, "grad_norm": 43.474398627690206, "learning_rate": 3.1933227273942763e-09, "logits/chosen": 14.226122856140137, "logits/rejected": 14.846246719360352, "logps/chosen": -4.5276312828063965, "logps/rejected": -4.661974906921387, "loss": 3.8037, "rewards/accuracies": 1.0, "rewards/chosen": -45.276309967041016, "rewards/margins": 1.343435287475586, "rewards/rejected": -46.619747161865234, "step": 7078 }, { "epoch": 0.9639161220043573, "grad_norm": 50.325436752914854, "learning_rate": 3.1693896114372677e-09, "logits/chosen": 13.659761428833008, "logits/rejected": 14.4765043258667, "logps/chosen": -4.406280517578125, "logps/rejected": -4.715958118438721, "loss": 3.8156, "rewards/accuracies": 0.5, "rewards/chosen": -44.06280517578125, "rewards/margins": 3.09677791595459, "rewards/rejected": -47.15958023071289, "step": 7079 }, { "epoch": 0.9640522875816994, "grad_norm": 48.114669177593285, "learning_rate": 3.145546162636936e-09, "logits/chosen": 14.198383331298828, "logits/rejected": 14.540923118591309, "logps/chosen": -4.6346917152404785, "logps/rejected": -4.698084354400635, "loss": 4.3041, "rewards/accuracies": 0.5, "rewards/chosen": -46.34691619873047, "rewards/margins": 0.6339273452758789, "rewards/rejected": -46.98084259033203, "step": 7080 }, { "epoch": 0.9641884531590414, "grad_norm": 42.76286258633947, "learning_rate": 3.1217923863808395e-09, "logits/chosen": 14.727523803710938, "logits/rejected": 15.197582244873047, "logps/chosen": -4.683215141296387, "logps/rejected": -4.992509841918945, "loss": 4.0539, "rewards/accuracies": 1.0, "rewards/chosen": -46.8321533203125, "rewards/margins": 3.092947006225586, "rewards/rejected": -49.92509841918945, "step": 7081 }, { "epoch": 0.9643246187363834, "grad_norm": 38.08575717787863, "learning_rate": 3.0981282880364167e-09, "logits/chosen": 13.772994041442871, "logits/rejected": 14.35035228729248, "logps/chosen": -4.181456565856934, "logps/rejected": -4.592408657073975, "loss": 4.0374, "rewards/accuracies": 1.0, "rewards/chosen": -41.814571380615234, "rewards/margins": 4.109518051147461, "rewards/rejected": -45.92408752441406, "step": 7082 }, { "epoch": 0.9644607843137255, "grad_norm": 39.57103565375952, "learning_rate": 3.074553872950725e-09, "logits/chosen": 15.133394241333008, "logits/rejected": 15.076221466064453, "logps/chosen": -5.160364151000977, "logps/rejected": -5.423098564147949, "loss": 3.7163, "rewards/accuracies": 1.0, "rewards/chosen": -51.603641510009766, "rewards/margins": 2.627347946166992, "rewards/rejected": -54.23099136352539, "step": 7083 }, { "epoch": 0.9645969498910676, "grad_norm": 41.44033392774437, "learning_rate": 3.051069146450569e-09, "logits/chosen": 14.015722274780273, "logits/rejected": 14.527863502502441, "logps/chosen": -4.864362716674805, "logps/rejected": -4.933658123016357, "loss": 3.9446, "rewards/accuracies": 0.5, "rewards/chosen": -48.64363098144531, "rewards/margins": 0.6929502487182617, "rewards/rejected": -49.33658218383789, "step": 7084 }, { "epoch": 0.9647331154684096, "grad_norm": 41.17996488739331, "learning_rate": 3.027674113842593e-09, "logits/chosen": 14.412840843200684, "logits/rejected": 14.73527717590332, "logps/chosen": -4.457481384277344, "logps/rejected": -4.900445461273193, "loss": 3.6873, "rewards/accuracies": 1.0, "rewards/chosen": -44.57481384277344, "rewards/margins": 4.42963981628418, "rewards/rejected": -49.00445556640625, "step": 7085 }, { "epoch": 0.9648692810457516, "grad_norm": 39.34287733107515, "learning_rate": 3.004368780413058e-09, "logits/chosen": 14.32677173614502, "logits/rejected": 15.15896224975586, "logps/chosen": -4.552278995513916, "logps/rejected": -4.968876361846924, "loss": 3.7233, "rewards/accuracies": 0.75, "rewards/chosen": -45.522789001464844, "rewards/margins": 4.165973663330078, "rewards/rejected": -49.68876266479492, "step": 7086 }, { "epoch": 0.9650054466230937, "grad_norm": 43.23459872215825, "learning_rate": 2.981153151427973e-09, "logits/chosen": 13.953283309936523, "logits/rejected": 13.784238815307617, "logps/chosen": -4.421811103820801, "logps/rejected": -4.374880790710449, "loss": 3.8159, "rewards/accuracies": 0.25, "rewards/chosen": -44.218109130859375, "rewards/margins": -0.46930503845214844, "rewards/rejected": -43.748802185058594, "step": 7087 }, { "epoch": 0.9651416122004357, "grad_norm": 50.764655650670825, "learning_rate": 2.9580272321331423e-09, "logits/chosen": 14.701981544494629, "logits/rejected": 14.244572639465332, "logps/chosen": -4.945412635803223, "logps/rejected": -4.884810447692871, "loss": 3.9879, "rewards/accuracies": 0.5, "rewards/chosen": -49.45412826538086, "rewards/margins": -0.60601806640625, "rewards/rejected": -48.84811019897461, "step": 7088 }, { "epoch": 0.9652777777777778, "grad_norm": 62.51791068800091, "learning_rate": 2.9349910277540304e-09, "logits/chosen": 14.310486793518066, "logits/rejected": 14.687053680419922, "logps/chosen": -4.42403507232666, "logps/rejected": -4.485983371734619, "loss": 3.5911, "rewards/accuracies": 0.5, "rewards/chosen": -44.24034881591797, "rewards/margins": 0.6194877624511719, "rewards/rejected": -44.859832763671875, "step": 7089 }, { "epoch": 0.9654139433551199, "grad_norm": 42.29282494051011, "learning_rate": 2.9120445434958507e-09, "logits/chosen": 14.307806968688965, "logits/rejected": 14.82752799987793, "logps/chosen": -4.541686058044434, "logps/rejected": -4.85767126083374, "loss": 4.0518, "rewards/accuracies": 0.5, "rewards/chosen": -45.4168586730957, "rewards/margins": 3.1598548889160156, "rewards/rejected": -48.57671356201172, "step": 7090 }, { "epoch": 0.9655501089324618, "grad_norm": 41.302904184800745, "learning_rate": 2.8891877845436118e-09, "logits/chosen": 13.632391929626465, "logits/rejected": 14.010004043579102, "logps/chosen": -4.302488327026367, "logps/rejected": -4.582427978515625, "loss": 3.9825, "rewards/accuracies": 0.75, "rewards/chosen": -43.02488708496094, "rewards/margins": 2.7993955612182617, "rewards/rejected": -45.82427978515625, "step": 7091 }, { "epoch": 0.9656862745098039, "grad_norm": 38.080072702276624, "learning_rate": 2.866420756061938e-09, "logits/chosen": 14.06102180480957, "logits/rejected": 15.056252479553223, "logps/chosen": -4.493657112121582, "logps/rejected": -4.832159519195557, "loss": 3.6379, "rewards/accuracies": 1.0, "rewards/chosen": -44.93657684326172, "rewards/margins": 3.3850231170654297, "rewards/rejected": -48.321598052978516, "step": 7092 }, { "epoch": 0.965822440087146, "grad_norm": 40.64386807318303, "learning_rate": 2.8437434631952027e-09, "logits/chosen": 14.310587882995605, "logits/rejected": 14.733489990234375, "logps/chosen": -4.646145820617676, "logps/rejected": -4.867083549499512, "loss": 3.4812, "rewards/accuracies": 0.75, "rewards/chosen": -46.461456298828125, "rewards/margins": 2.209378242492676, "rewards/rejected": -48.670833587646484, "step": 7093 }, { "epoch": 0.965958605664488, "grad_norm": 42.75051972704709, "learning_rate": 2.821155911067574e-09, "logits/chosen": 13.249034881591797, "logits/rejected": 15.568136215209961, "logps/chosen": -4.24029541015625, "logps/rejected": -5.2051472663879395, "loss": 3.7492, "rewards/accuracies": 1.0, "rewards/chosen": -42.4029541015625, "rewards/margins": 9.648517608642578, "rewards/rejected": -52.051475524902344, "step": 7094 }, { "epoch": 0.9660947712418301, "grad_norm": 42.9012187164448, "learning_rate": 2.7986581047828805e-09, "logits/chosen": 13.956212997436523, "logits/rejected": 13.93739128112793, "logps/chosen": -4.2775421142578125, "logps/rejected": -4.446355819702148, "loss": 4.2629, "rewards/accuracies": 0.5, "rewards/chosen": -42.775421142578125, "rewards/margins": 1.6881380081176758, "rewards/rejected": -44.46356201171875, "step": 7095 }, { "epoch": 0.9662309368191722, "grad_norm": 40.2655713864857, "learning_rate": 2.7762500494247e-09, "logits/chosen": 13.80813217163086, "logits/rejected": 14.411226272583008, "logps/chosen": -4.345829486846924, "logps/rejected": -4.8540849685668945, "loss": 3.6578, "rewards/accuracies": 0.75, "rewards/chosen": -43.45829391479492, "rewards/margins": 5.08255672454834, "rewards/rejected": -48.54084777832031, "step": 7096 }, { "epoch": 0.9663671023965141, "grad_norm": 44.04849712619931, "learning_rate": 2.753931750056271e-09, "logits/chosen": 14.640970230102539, "logits/rejected": 15.31875228881836, "logps/chosen": -4.933810234069824, "logps/rejected": -5.053133487701416, "loss": 3.9708, "rewards/accuracies": 0.5, "rewards/chosen": -49.338104248046875, "rewards/margins": 1.1932315826416016, "rewards/rejected": -50.531333923339844, "step": 7097 }, { "epoch": 0.9665032679738562, "grad_norm": 45.62982597445511, "learning_rate": 2.7317032117206705e-09, "logits/chosen": 14.635029792785645, "logits/rejected": 15.01516342163086, "logps/chosen": -4.776239395141602, "logps/rejected": -5.046988487243652, "loss": 3.7586, "rewards/accuracies": 0.75, "rewards/chosen": -47.76239013671875, "rewards/margins": 2.7074947357177734, "rewards/rejected": -50.469886779785156, "step": 7098 }, { "epoch": 0.9666394335511983, "grad_norm": 40.13029173230568, "learning_rate": 2.7095644394405925e-09, "logits/chosen": 13.226343154907227, "logits/rejected": 14.204392433166504, "logps/chosen": -4.3898420333862305, "logps/rejected": -4.8583598136901855, "loss": 3.4558, "rewards/accuracies": 1.0, "rewards/chosen": -43.89841842651367, "rewards/margins": 4.685177803039551, "rewards/rejected": -48.583595275878906, "step": 7099 }, { "epoch": 0.9667755991285403, "grad_norm": 52.573355910745086, "learning_rate": 2.687515438218435e-09, "logits/chosen": 14.37402629852295, "logits/rejected": 15.168527603149414, "logps/chosen": -4.511951446533203, "logps/rejected": -4.746719837188721, "loss": 3.691, "rewards/accuracies": 0.75, "rewards/chosen": -45.11951446533203, "rewards/margins": 2.3476858139038086, "rewards/rejected": -47.467201232910156, "step": 7100 }, { "epoch": 0.9669117647058824, "grad_norm": 43.11722814543561, "learning_rate": 2.6655562130363463e-09, "logits/chosen": 14.206501960754395, "logits/rejected": 14.345620155334473, "logps/chosen": -4.630185127258301, "logps/rejected": -4.5037994384765625, "loss": 4.1302, "rewards/accuracies": 0.5, "rewards/chosen": -46.301856994628906, "rewards/margins": -1.2638626098632812, "rewards/rejected": -45.03799057006836, "step": 7101 }, { "epoch": 0.9670479302832244, "grad_norm": 44.079869822674226, "learning_rate": 2.6436867688563127e-09, "logits/chosen": 13.79653549194336, "logits/rejected": 13.921939849853516, "logps/chosen": -4.815099716186523, "logps/rejected": -4.854936122894287, "loss": 4.5846, "rewards/accuracies": 0.75, "rewards/chosen": -48.150997161865234, "rewards/margins": 0.3983650207519531, "rewards/rejected": -48.54936218261719, "step": 7102 }, { "epoch": 0.9671840958605664, "grad_norm": 50.586700115526334, "learning_rate": 2.6219071106197587e-09, "logits/chosen": 13.924324035644531, "logits/rejected": 14.556936264038086, "logps/chosen": -4.64649772644043, "logps/rejected": -4.637506484985352, "loss": 4.0195, "rewards/accuracies": 0.25, "rewards/chosen": -46.46497344970703, "rewards/margins": -0.08990669250488281, "rewards/rejected": -46.37506866455078, "step": 7103 }, { "epoch": 0.9673202614379085, "grad_norm": 49.71257440632314, "learning_rate": 2.600217243248082e-09, "logits/chosen": 13.396427154541016, "logits/rejected": 13.630594253540039, "logps/chosen": -4.478499889373779, "logps/rejected": -4.458662033081055, "loss": 4.1143, "rewards/accuracies": 0.5, "rewards/chosen": -44.784996032714844, "rewards/margins": -0.19837665557861328, "rewards/rejected": -44.58662033081055, "step": 7104 }, { "epoch": 0.9674564270152506, "grad_norm": 42.48866730450471, "learning_rate": 2.5786171716422943e-09, "logits/chosen": 14.662189483642578, "logits/rejected": 15.500986099243164, "logps/chosen": -4.6839094161987305, "logps/rejected": -4.941668510437012, "loss": 3.4589, "rewards/accuracies": 0.75, "rewards/chosen": -46.83909606933594, "rewards/margins": 2.5775861740112305, "rewards/rejected": -49.41668701171875, "step": 7105 }, { "epoch": 0.9675925925925926, "grad_norm": 40.86309436413444, "learning_rate": 2.5571069006830704e-09, "logits/chosen": 14.818483352661133, "logits/rejected": 14.970197677612305, "logps/chosen": -4.725795745849609, "logps/rejected": -4.878110885620117, "loss": 3.7231, "rewards/accuracies": 0.75, "rewards/chosen": -47.257957458496094, "rewards/margins": 1.5231523513793945, "rewards/rejected": -48.78111267089844, "step": 7106 }, { "epoch": 0.9677287581699346, "grad_norm": 41.22275176323207, "learning_rate": 2.5356864352307882e-09, "logits/chosen": 14.400809288024902, "logits/rejected": 14.440799713134766, "logps/chosen": -4.060738563537598, "logps/rejected": -4.206683158874512, "loss": 3.2558, "rewards/accuracies": 0.75, "rewards/chosen": -40.607391357421875, "rewards/margins": 1.459442138671875, "rewards/rejected": -42.06683349609375, "step": 7107 }, { "epoch": 0.9678649237472767, "grad_norm": 47.42373719368978, "learning_rate": 2.5143557801256655e-09, "logits/chosen": 13.221487045288086, "logits/rejected": 13.479705810546875, "logps/chosen": -4.388036251068115, "logps/rejected": -4.547194480895996, "loss": 4.4765, "rewards/accuracies": 0.75, "rewards/chosen": -43.88036346435547, "rewards/margins": 1.5915851593017578, "rewards/rejected": -45.471946716308594, "step": 7108 }, { "epoch": 0.9680010893246187, "grad_norm": 46.49792472284796, "learning_rate": 2.493114940187491e-09, "logits/chosen": 14.842589378356934, "logits/rejected": 14.90407943725586, "logps/chosen": -4.676999092102051, "logps/rejected": -4.769240379333496, "loss": 4.5073, "rewards/accuracies": 0.5, "rewards/chosen": -46.769989013671875, "rewards/margins": 0.9224176406860352, "rewards/rejected": -47.692405700683594, "step": 7109 }, { "epoch": 0.9681372549019608, "grad_norm": 42.35413429301767, "learning_rate": 2.4719639202158026e-09, "logits/chosen": 15.21617603302002, "logits/rejected": 15.419197082519531, "logps/chosen": -4.432374477386475, "logps/rejected": -4.795697212219238, "loss": 3.3823, "rewards/accuracies": 0.75, "rewards/chosen": -44.3237419128418, "rewards/margins": 3.6332340240478516, "rewards/rejected": -47.95697784423828, "step": 7110 }, { "epoch": 0.9682734204793029, "grad_norm": 42.886075667899505, "learning_rate": 2.4509027249898893e-09, "logits/chosen": 14.693151473999023, "logits/rejected": 15.007375717163086, "logps/chosen": -4.915154457092285, "logps/rejected": -4.948508262634277, "loss": 4.1879, "rewards/accuracies": 0.5, "rewards/chosen": -49.15154266357422, "rewards/margins": 0.3335380554199219, "rewards/rejected": -49.48508071899414, "step": 7111 }, { "epoch": 0.9684095860566448, "grad_norm": 46.74261590579072, "learning_rate": 2.4299313592687e-09, "logits/chosen": 14.572105407714844, "logits/rejected": 14.504358291625977, "logps/chosen": -4.942367076873779, "logps/rejected": -5.009864330291748, "loss": 3.9116, "rewards/accuracies": 0.5, "rewards/chosen": -49.423667907714844, "rewards/margins": 0.6749734878540039, "rewards/rejected": -50.09864044189453, "step": 7112 }, { "epoch": 0.9685457516339869, "grad_norm": 43.39530496413891, "learning_rate": 2.4090498277908433e-09, "logits/chosen": 13.71632194519043, "logits/rejected": 13.767282485961914, "logps/chosen": -4.630580425262451, "logps/rejected": -4.6197896003723145, "loss": 4.443, "rewards/accuracies": 0.25, "rewards/chosen": -46.30580520629883, "rewards/margins": -0.10790824890136719, "rewards/rejected": -46.197898864746094, "step": 7113 }, { "epoch": 0.968681917211329, "grad_norm": 47.92113982950413, "learning_rate": 2.3882581352747235e-09, "logits/chosen": 14.001731872558594, "logits/rejected": 15.171209335327148, "logps/chosen": -4.7307844161987305, "logps/rejected": -4.955398082733154, "loss": 3.7331, "rewards/accuracies": 1.0, "rewards/chosen": -47.30784225463867, "rewards/margins": 2.246140480041504, "rewards/rejected": -49.55398178100586, "step": 7114 }, { "epoch": 0.968818082788671, "grad_norm": 43.915856055960646, "learning_rate": 2.3675562864183595e-09, "logits/chosen": 14.64286994934082, "logits/rejected": 15.238530158996582, "logps/chosen": -4.66603946685791, "logps/rejected": -4.991769313812256, "loss": 4.1883, "rewards/accuracies": 0.5, "rewards/chosen": -46.66039276123047, "rewards/margins": 3.257298469543457, "rewards/rejected": -49.917694091796875, "step": 7115 }, { "epoch": 0.9689542483660131, "grad_norm": 39.40936251142186, "learning_rate": 2.346944285899477e-09, "logits/chosen": 14.986907958984375, "logits/rejected": 15.346336364746094, "logps/chosen": -4.536545276641846, "logps/rejected": -4.920560836791992, "loss": 3.1476, "rewards/accuracies": 0.75, "rewards/chosen": -45.365455627441406, "rewards/margins": 3.840160369873047, "rewards/rejected": -49.20561218261719, "step": 7116 }, { "epoch": 0.9690904139433552, "grad_norm": 51.05614782044083, "learning_rate": 2.3264221383755942e-09, "logits/chosen": 14.78061294555664, "logits/rejected": 14.565975189208984, "logps/chosen": -4.781469345092773, "logps/rejected": -4.657299995422363, "loss": 4.5505, "rewards/accuracies": 0.25, "rewards/chosen": -47.81468963623047, "rewards/margins": -1.2416925430297852, "rewards/rejected": -46.572998046875, "step": 7117 }, { "epoch": 0.9692265795206971, "grad_norm": 42.71802329100855, "learning_rate": 2.3059898484838468e-09, "logits/chosen": 14.712686538696289, "logits/rejected": 16.004138946533203, "logps/chosen": -4.744091510772705, "logps/rejected": -5.1935038566589355, "loss": 3.4214, "rewards/accuracies": 1.0, "rewards/chosen": -47.44091796875, "rewards/margins": 4.4941253662109375, "rewards/rejected": -51.93503952026367, "step": 7118 }, { "epoch": 0.9693627450980392, "grad_norm": 40.085110663694046, "learning_rate": 2.2856474208410305e-09, "logits/chosen": 13.644947052001953, "logits/rejected": 15.414766311645508, "logps/chosen": -4.497641563415527, "logps/rejected": -5.014978408813477, "loss": 3.4621, "rewards/accuracies": 0.75, "rewards/chosen": -44.97641372680664, "rewards/margins": 5.173372268676758, "rewards/rejected": -50.14978790283203, "step": 7119 }, { "epoch": 0.9694989106753813, "grad_norm": 42.011747264098496, "learning_rate": 2.2653948600437346e-09, "logits/chosen": 14.478486061096191, "logits/rejected": 14.052099227905273, "logps/chosen": -4.619316577911377, "logps/rejected": -4.555191516876221, "loss": 3.9412, "rewards/accuracies": 0.5, "rewards/chosen": -46.19316482543945, "rewards/margins": -0.6412506103515625, "rewards/rejected": -45.551918029785156, "step": 7120 }, { "epoch": 0.9696350762527233, "grad_norm": 47.01544107613311, "learning_rate": 2.24523217066821e-09, "logits/chosen": 14.715812683105469, "logits/rejected": 14.873067855834961, "logps/chosen": -4.968823432922363, "logps/rejected": -4.9022722244262695, "loss": 3.8155, "rewards/accuracies": 0.75, "rewards/chosen": -49.688232421875, "rewards/margins": -0.6655101776123047, "rewards/rejected": -49.02272415161133, "step": 7121 }, { "epoch": 0.9697712418300654, "grad_norm": 42.7609088751224, "learning_rate": 2.2251593572703233e-09, "logits/chosen": 14.515137672424316, "logits/rejected": 14.476262092590332, "logps/chosen": -4.730037689208984, "logps/rejected": -5.016536712646484, "loss": 3.6065, "rewards/accuracies": 0.75, "rewards/chosen": -47.30038070678711, "rewards/margins": 2.8649911880493164, "rewards/rejected": -50.16537094116211, "step": 7122 }, { "epoch": 0.9699074074074074, "grad_norm": 47.42891857262955, "learning_rate": 2.2051764243856907e-09, "logits/chosen": 14.318571090698242, "logits/rejected": 14.526371955871582, "logps/chosen": -4.4172773361206055, "logps/rejected": -4.665940284729004, "loss": 3.9756, "rewards/accuracies": 0.75, "rewards/chosen": -44.17277145385742, "rewards/margins": 2.48663330078125, "rewards/rejected": -46.65940475463867, "step": 7123 }, { "epoch": 0.9700435729847494, "grad_norm": 43.59962659957336, "learning_rate": 2.185283376529723e-09, "logits/chosen": 14.57081413269043, "logits/rejected": 14.649862289428711, "logps/chosen": -4.61293363571167, "logps/rejected": -4.861837387084961, "loss": 3.6953, "rewards/accuracies": 0.75, "rewards/chosen": -46.129337310791016, "rewards/margins": 2.4890356063842773, "rewards/rejected": -48.618370056152344, "step": 7124 }, { "epoch": 0.9701797385620915, "grad_norm": 62.279530062631686, "learning_rate": 2.1654802181972686e-09, "logits/chosen": 14.510108947753906, "logits/rejected": 15.347465515136719, "logps/chosen": -4.57657527923584, "logps/rejected": -5.13224983215332, "loss": 3.8077, "rewards/accuracies": 0.75, "rewards/chosen": -45.765750885009766, "rewards/margins": 5.556744575500488, "rewards/rejected": -51.32249450683594, "step": 7125 }, { "epoch": 0.9703159041394336, "grad_norm": 43.01378035422824, "learning_rate": 2.1457669538631485e-09, "logits/chosen": 15.536296844482422, "logits/rejected": 15.467283248901367, "logps/chosen": -4.627258777618408, "logps/rejected": -4.863006591796875, "loss": 4.2944, "rewards/accuracies": 0.5, "rewards/chosen": -46.272586822509766, "rewards/margins": 2.3574771881103516, "rewards/rejected": -48.63006591796875, "step": 7126 }, { "epoch": 0.9704520697167756, "grad_norm": 44.09136989461945, "learning_rate": 2.126143587981666e-09, "logits/chosen": 13.821565628051758, "logits/rejected": 14.098649024963379, "logps/chosen": -4.455996513366699, "logps/rejected": -4.641064643859863, "loss": 3.6455, "rewards/accuracies": 0.75, "rewards/chosen": -44.559967041015625, "rewards/margins": 1.8506851196289062, "rewards/rejected": -46.41065216064453, "step": 7127 }, { "epoch": 0.9705882352941176, "grad_norm": 40.085322436612216, "learning_rate": 2.106610124986874e-09, "logits/chosen": 14.294252395629883, "logits/rejected": 14.623414993286133, "logps/chosen": -4.553576469421387, "logps/rejected": -4.805048942565918, "loss": 3.4979, "rewards/accuracies": 0.75, "rewards/chosen": -45.5357666015625, "rewards/margins": 2.514725685119629, "rewards/rejected": -48.05049133300781, "step": 7128 }, { "epoch": 0.9707244008714597, "grad_norm": 41.3366480599398, "learning_rate": 2.0871665692925755e-09, "logits/chosen": 14.764219284057617, "logits/rejected": 15.07241439819336, "logps/chosen": -4.681474208831787, "logps/rejected": -5.021571159362793, "loss": 4.0417, "rewards/accuracies": 0.75, "rewards/chosen": -46.81474304199219, "rewards/margins": 3.400970458984375, "rewards/rejected": -50.21571350097656, "step": 7129 }, { "epoch": 0.9708605664488017, "grad_norm": 39.05856530124993, "learning_rate": 2.0678129252921894e-09, "logits/chosen": 14.879119873046875, "logits/rejected": 14.794239044189453, "logps/chosen": -4.508502006530762, "logps/rejected": -4.6996750831604, "loss": 3.1146, "rewards/accuracies": 0.5, "rewards/chosen": -45.08501434326172, "rewards/margins": 1.911733627319336, "rewards/rejected": -46.99674987792969, "step": 7130 }, { "epoch": 0.9709967320261438, "grad_norm": 44.42038326376441, "learning_rate": 2.048549197358751e-09, "logits/chosen": 13.585464477539062, "logits/rejected": 14.268145561218262, "logps/chosen": -4.4511590003967285, "logps/rejected": -4.984190464019775, "loss": 4.1406, "rewards/accuracies": 1.0, "rewards/chosen": -44.51158905029297, "rewards/margins": 5.330316543579102, "rewards/rejected": -49.8419075012207, "step": 7131 }, { "epoch": 0.9711328976034859, "grad_norm": 44.77864541450904, "learning_rate": 2.029375389845178e-09, "logits/chosen": 14.938688278198242, "logits/rejected": 15.09091854095459, "logps/chosen": -5.011487007141113, "logps/rejected": -4.881806373596191, "loss": 4.0528, "rewards/accuracies": 0.5, "rewards/chosen": -50.1148681640625, "rewards/margins": -1.2968015670776367, "rewards/rejected": -48.81806564331055, "step": 7132 }, { "epoch": 0.9712690631808278, "grad_norm": 38.300742088472354, "learning_rate": 2.0102915070838724e-09, "logits/chosen": 14.187963485717773, "logits/rejected": 14.536251068115234, "logps/chosen": -4.118884086608887, "logps/rejected": -4.39102840423584, "loss": 3.5698, "rewards/accuracies": 0.75, "rewards/chosen": -41.188838958740234, "rewards/margins": 2.721445083618164, "rewards/rejected": -43.91028594970703, "step": 7133 }, { "epoch": 0.9714052287581699, "grad_norm": 43.90446556187542, "learning_rate": 1.9912975533869836e-09, "logits/chosen": 15.04551887512207, "logits/rejected": 14.589008331298828, "logps/chosen": -4.999577045440674, "logps/rejected": -4.91987943649292, "loss": 4.1, "rewards/accuracies": 0.25, "rewards/chosen": -49.99577331542969, "rewards/margins": -0.7969789505004883, "rewards/rejected": -49.198795318603516, "step": 7134 }, { "epoch": 0.971541394335512, "grad_norm": 45.40697833307719, "learning_rate": 1.9723935330464126e-09, "logits/chosen": 15.23654842376709, "logits/rejected": 15.226943969726562, "logps/chosen": -4.964811325073242, "logps/rejected": -4.6504597663879395, "loss": 4.1571, "rewards/accuracies": 0.25, "rewards/chosen": -49.648109436035156, "rewards/margins": -3.143514633178711, "rewards/rejected": -46.504600524902344, "step": 7135 }, { "epoch": 0.971677559912854, "grad_norm": 42.5834599978989, "learning_rate": 1.9535794503336756e-09, "logits/chosen": 14.642784118652344, "logits/rejected": 14.593161582946777, "logps/chosen": -4.554338455200195, "logps/rejected": -4.750238418579102, "loss": 4.2468, "rewards/accuracies": 0.75, "rewards/chosen": -45.54338455200195, "rewards/margins": 1.9589967727661133, "rewards/rejected": -47.50238037109375, "step": 7136 }, { "epoch": 0.9718137254901961, "grad_norm": 40.93918545055447, "learning_rate": 1.93485530949995e-09, "logits/chosen": 14.215929985046387, "logits/rejected": 14.324244499206543, "logps/chosen": -4.561873912811279, "logps/rejected": -4.822235107421875, "loss": 4.0573, "rewards/accuracies": 0.75, "rewards/chosen": -45.618736267089844, "rewards/margins": 2.6036148071289062, "rewards/rejected": -48.222354888916016, "step": 7137 }, { "epoch": 0.9719498910675382, "grad_norm": 54.18489093612012, "learning_rate": 1.916221114776073e-09, "logits/chosen": 14.889281272888184, "logits/rejected": 14.70053482055664, "logps/chosen": -4.447513580322266, "logps/rejected": -4.818851470947266, "loss": 4.0771, "rewards/accuracies": 0.75, "rewards/chosen": -44.475135803222656, "rewards/margins": 3.713383674621582, "rewards/rejected": -48.18851852416992, "step": 7138 }, { "epoch": 0.9720860566448801, "grad_norm": 37.46426921056975, "learning_rate": 1.897676870372633e-09, "logits/chosen": 15.013711929321289, "logits/rejected": 15.087023735046387, "logps/chosen": -4.466437816619873, "logps/rejected": -4.958740234375, "loss": 3.35, "rewards/accuracies": 1.0, "rewards/chosen": -44.66437911987305, "rewards/margins": 4.923022270202637, "rewards/rejected": -49.58740234375, "step": 7139 }, { "epoch": 0.9722222222222222, "grad_norm": 42.16931954459169, "learning_rate": 1.8792225804798776e-09, "logits/chosen": 14.629310607910156, "logits/rejected": 14.29534912109375, "logps/chosen": -4.557735919952393, "logps/rejected": -4.77787446975708, "loss": 3.6749, "rewards/accuracies": 0.5, "rewards/chosen": -45.57735824584961, "rewards/margins": 2.201387405395508, "rewards/rejected": -47.77874755859375, "step": 7140 }, { "epoch": 0.9723583877995643, "grad_norm": 48.39993821967561, "learning_rate": 1.8608582492676716e-09, "logits/chosen": 15.256779670715332, "logits/rejected": 14.655866622924805, "logps/chosen": -4.875906944274902, "logps/rejected": -4.661937236785889, "loss": 4.4388, "rewards/accuracies": 0.5, "rewards/chosen": -48.75906753540039, "rewards/margins": -2.1396970748901367, "rewards/rejected": -46.6193733215332, "step": 7141 }, { "epoch": 0.9724945533769063, "grad_norm": 44.62295827327664, "learning_rate": 1.8425838808855843e-09, "logits/chosen": 14.011846542358398, "logits/rejected": 14.156156539916992, "logps/chosen": -4.086548328399658, "logps/rejected": -4.510232448577881, "loss": 3.9328, "rewards/accuracies": 0.75, "rewards/chosen": -40.86548614501953, "rewards/margins": 4.236841201782227, "rewards/rejected": -45.102325439453125, "step": 7142 }, { "epoch": 0.9726307189542484, "grad_norm": 42.23582598284305, "learning_rate": 1.8243994794628459e-09, "logits/chosen": 14.491130828857422, "logits/rejected": 14.703123092651367, "logps/chosen": -4.381142616271973, "logps/rejected": -4.676360130310059, "loss": 3.4543, "rewards/accuracies": 0.75, "rewards/chosen": -43.811431884765625, "rewards/margins": 2.952174186706543, "rewards/rejected": -46.76360321044922, "step": 7143 }, { "epoch": 0.9727668845315904, "grad_norm": 43.4981029558544, "learning_rate": 1.8063050491084364e-09, "logits/chosen": 14.934195518493652, "logits/rejected": 15.118816375732422, "logps/chosen": -4.58082389831543, "logps/rejected": -4.846776962280273, "loss": 3.7339, "rewards/accuracies": 0.5, "rewards/chosen": -45.80823516845703, "rewards/margins": 2.6595325469970703, "rewards/rejected": -48.467769622802734, "step": 7144 }, { "epoch": 0.9729030501089324, "grad_norm": 37.55899782710381, "learning_rate": 1.7883005939109075e-09, "logits/chosen": 14.294942855834961, "logits/rejected": 14.719281196594238, "logps/chosen": -4.802069664001465, "logps/rejected": -5.0988616943359375, "loss": 3.3494, "rewards/accuracies": 1.0, "rewards/chosen": -48.020694732666016, "rewards/margins": 2.9679250717163086, "rewards/rejected": -50.988616943359375, "step": 7145 }, { "epoch": 0.9730392156862745, "grad_norm": 46.999840540827684, "learning_rate": 1.770386117938516e-09, "logits/chosen": 13.543563842773438, "logits/rejected": 14.276496887207031, "logps/chosen": -4.6118879318237305, "logps/rejected": -4.859107971191406, "loss": 4.0765, "rewards/accuracies": 0.75, "rewards/chosen": -46.11887741088867, "rewards/margins": 2.47220516204834, "rewards/rejected": -48.59107971191406, "step": 7146 }, { "epoch": 0.9731753812636166, "grad_norm": 41.25483434239996, "learning_rate": 1.7525616252391351e-09, "logits/chosen": 14.271137237548828, "logits/rejected": 14.265985488891602, "logps/chosen": -4.671872138977051, "logps/rejected": -4.597903251647949, "loss": 3.6061, "rewards/accuracies": 0.5, "rewards/chosen": -46.71872329711914, "rewards/margins": -0.739689826965332, "rewards/rejected": -45.979034423828125, "step": 7147 }, { "epoch": 0.9733115468409586, "grad_norm": 42.23439516516231, "learning_rate": 1.7348271198404318e-09, "logits/chosen": 14.716672897338867, "logits/rejected": 15.068014144897461, "logps/chosen": -4.844237327575684, "logps/rejected": -4.807552814483643, "loss": 3.547, "rewards/accuracies": 0.5, "rewards/chosen": -48.4423713684082, "rewards/margins": -0.3668403625488281, "rewards/rejected": -48.075531005859375, "step": 7148 }, { "epoch": 0.9734477124183006, "grad_norm": 50.857423809627285, "learning_rate": 1.7171826057496452e-09, "logits/chosen": 14.556034088134766, "logits/rejected": 14.777247428894043, "logps/chosen": -4.498805046081543, "logps/rejected": -4.485599040985107, "loss": 3.9934, "rewards/accuracies": 0.5, "rewards/chosen": -44.98805236816406, "rewards/margins": -0.13206005096435547, "rewards/rejected": -44.85599136352539, "step": 7149 }, { "epoch": 0.9735838779956427, "grad_norm": 43.75791437411388, "learning_rate": 1.699628086953675e-09, "logits/chosen": 14.860610008239746, "logits/rejected": 14.55418586730957, "logps/chosen": -4.641781806945801, "logps/rejected": -4.7387189865112305, "loss": 4.173, "rewards/accuracies": 0.75, "rewards/chosen": -46.41781997680664, "rewards/margins": 0.9693670272827148, "rewards/rejected": -47.387184143066406, "step": 7150 }, { "epoch": 0.9737200435729847, "grad_norm": 42.548351655637056, "learning_rate": 1.6821635674191259e-09, "logits/chosen": 14.319608688354492, "logits/rejected": 14.407089233398438, "logps/chosen": -4.33917236328125, "logps/rejected": -4.484174728393555, "loss": 3.714, "rewards/accuracies": 0.75, "rewards/chosen": -43.3917236328125, "rewards/margins": 1.4500226974487305, "rewards/rejected": -44.84174728393555, "step": 7151 }, { "epoch": 0.9738562091503268, "grad_norm": 40.71701540249799, "learning_rate": 1.6647890510922191e-09, "logits/chosen": 14.826522827148438, "logits/rejected": 14.214927673339844, "logps/chosen": -4.521034240722656, "logps/rejected": -4.463746070861816, "loss": 3.7606, "rewards/accuracies": 0.5, "rewards/chosen": -45.21034240722656, "rewards/margins": -0.5728836059570312, "rewards/rejected": -44.63745880126953, "step": 7152 }, { "epoch": 0.9739923747276689, "grad_norm": 44.041802971839644, "learning_rate": 1.6475045418989253e-09, "logits/chosen": 13.844193458557129, "logits/rejected": 14.192276000976562, "logps/chosen": -4.541720867156982, "logps/rejected": -4.623291492462158, "loss": 4.1031, "rewards/accuracies": 0.5, "rewards/chosen": -45.417205810546875, "rewards/margins": 0.8157072067260742, "rewards/rejected": -46.232913970947266, "step": 7153 }, { "epoch": 0.974128540305011, "grad_norm": 42.79123664251555, "learning_rate": 1.630310043744787e-09, "logits/chosen": 13.882575035095215, "logits/rejected": 15.354660034179688, "logps/chosen": -4.738892555236816, "logps/rejected": -5.171380043029785, "loss": 4.0613, "rewards/accuracies": 0.75, "rewards/chosen": -47.38892364501953, "rewards/margins": 4.324873924255371, "rewards/rejected": -51.71379852294922, "step": 7154 }, { "epoch": 0.9742647058823529, "grad_norm": 44.445192793501896, "learning_rate": 1.6132055605150518e-09, "logits/chosen": 14.169042587280273, "logits/rejected": 14.384124755859375, "logps/chosen": -4.517810821533203, "logps/rejected": -4.6316752433776855, "loss": 3.8528, "rewards/accuracies": 0.5, "rewards/chosen": -45.1781005859375, "rewards/margins": 1.1386499404907227, "rewards/rejected": -46.31675338745117, "step": 7155 }, { "epoch": 0.974400871459695, "grad_norm": 44.84018230492319, "learning_rate": 1.5961910960746282e-09, "logits/chosen": 13.991888046264648, "logits/rejected": 14.939178466796875, "logps/chosen": -4.655577182769775, "logps/rejected": -4.687848091125488, "loss": 3.9597, "rewards/accuracies": 0.5, "rewards/chosen": -46.55577087402344, "rewards/margins": 0.32271289825439453, "rewards/rejected": -46.87848663330078, "step": 7156 }, { "epoch": 0.9745370370370371, "grad_norm": 46.42310253274851, "learning_rate": 1.5792666542680855e-09, "logits/chosen": 13.804238319396973, "logits/rejected": 13.850959777832031, "logps/chosen": -4.495687007904053, "logps/rejected": -4.546723365783691, "loss": 4.2479, "rewards/accuracies": 0.5, "rewards/chosen": -44.956871032714844, "rewards/margins": 0.5103616714477539, "rewards/rejected": -45.46723175048828, "step": 7157 }, { "epoch": 0.9746732026143791, "grad_norm": 42.82373762556783, "learning_rate": 1.5624322389196087e-09, "logits/chosen": 14.045270919799805, "logits/rejected": 15.454627990722656, "logps/chosen": -4.723238945007324, "logps/rejected": -5.17160177230835, "loss": 4.0711, "rewards/accuracies": 1.0, "rewards/chosen": -47.232391357421875, "rewards/margins": 4.483624458312988, "rewards/rejected": -51.71601867675781, "step": 7158 }, { "epoch": 0.9748093681917211, "grad_norm": 40.70810906278032, "learning_rate": 1.5456878538330443e-09, "logits/chosen": 15.208176612854004, "logits/rejected": 15.600872039794922, "logps/chosen": -4.603753089904785, "logps/rejected": -4.98935604095459, "loss": 3.8733, "rewards/accuracies": 1.0, "rewards/chosen": -46.037532806396484, "rewards/margins": 3.856022834777832, "rewards/rejected": -49.8935546875, "step": 7159 }, { "epoch": 0.9749455337690632, "grad_norm": 45.31491936876412, "learning_rate": 1.529033502792032e-09, "logits/chosen": 15.220900535583496, "logits/rejected": 14.779440879821777, "logps/chosen": -4.549892902374268, "logps/rejected": -4.631160259246826, "loss": 3.9776, "rewards/accuracies": 0.5, "rewards/chosen": -45.49892807006836, "rewards/margins": 0.8126745223999023, "rewards/rejected": -46.31160354614258, "step": 7160 }, { "epoch": 0.9750816993464052, "grad_norm": 48.4973990748265, "learning_rate": 1.5124691895596508e-09, "logits/chosen": 14.087226867675781, "logits/rejected": 15.362005233764648, "logps/chosen": -4.432682514190674, "logps/rejected": -4.939569473266602, "loss": 3.9802, "rewards/accuracies": 1.0, "rewards/chosen": -44.326820373535156, "rewards/margins": 5.068869590759277, "rewards/rejected": -49.395694732666016, "step": 7161 }, { "epoch": 0.9752178649237473, "grad_norm": 40.607445556327015, "learning_rate": 1.4959949178787734e-09, "logits/chosen": 13.952444076538086, "logits/rejected": 14.269405364990234, "logps/chosen": -4.507169723510742, "logps/rejected": -4.4538469314575195, "loss": 3.8528, "rewards/accuracies": 0.5, "rewards/chosen": -45.07169723510742, "rewards/margins": -0.5332307815551758, "rewards/rejected": -44.53846740722656, "step": 7162 }, { "epoch": 0.9753540305010894, "grad_norm": 39.01773393622844, "learning_rate": 1.4796106914719332e-09, "logits/chosen": 14.14326286315918, "logits/rejected": 14.714676856994629, "logps/chosen": -4.570995330810547, "logps/rejected": -4.780261993408203, "loss": 3.7491, "rewards/accuracies": 0.5, "rewards/chosen": -45.7099494934082, "rewards/margins": 2.092665672302246, "rewards/rejected": -47.802616119384766, "step": 7163 }, { "epoch": 0.9754901960784313, "grad_norm": 42.025636379096994, "learning_rate": 1.4633165140412796e-09, "logits/chosen": 14.579032897949219, "logits/rejected": 14.13280963897705, "logps/chosen": -4.513910293579102, "logps/rejected": -4.791258811950684, "loss": 3.9383, "rewards/accuracies": 0.75, "rewards/chosen": -45.13910675048828, "rewards/margins": 2.773484230041504, "rewards/rejected": -47.91259002685547, "step": 7164 }, { "epoch": 0.9756263616557734, "grad_norm": 45.119379300592456, "learning_rate": 1.4471123892685789e-09, "logits/chosen": 14.31767463684082, "logits/rejected": 14.807840347290039, "logps/chosen": -4.8476996421813965, "logps/rejected": -4.808188438415527, "loss": 3.6412, "rewards/accuracies": 0.25, "rewards/chosen": -48.47699737548828, "rewards/margins": -0.3951139450073242, "rewards/rejected": -48.081886291503906, "step": 7165 }, { "epoch": 0.9757625272331155, "grad_norm": 42.05774778051229, "learning_rate": 1.430998320815302e-09, "logits/chosen": 14.431402206420898, "logits/rejected": 15.209832191467285, "logps/chosen": -4.483309268951416, "logps/rejected": -4.531298637390137, "loss": 3.9482, "rewards/accuracies": 0.75, "rewards/chosen": -44.833091735839844, "rewards/margins": 0.47989368438720703, "rewards/rejected": -45.31298828125, "step": 7166 }, { "epoch": 0.9758986928104575, "grad_norm": 41.149162007164726, "learning_rate": 1.4149743123225365e-09, "logits/chosen": 15.054035186767578, "logits/rejected": 15.100536346435547, "logps/chosen": -4.654532432556152, "logps/rejected": -4.894460678100586, "loss": 3.5302, "rewards/accuracies": 0.75, "rewards/chosen": -46.54532241821289, "rewards/margins": 2.399285316467285, "rewards/rejected": -48.944610595703125, "step": 7167 }, { "epoch": 0.9760348583877996, "grad_norm": 42.28598744087459, "learning_rate": 1.3990403674111194e-09, "logits/chosen": 14.132620811462402, "logits/rejected": 14.177186965942383, "logps/chosen": -4.052197456359863, "logps/rejected": -4.283940315246582, "loss": 3.4531, "rewards/accuracies": 0.75, "rewards/chosen": -40.52198028564453, "rewards/margins": 2.3174285888671875, "rewards/rejected": -42.83940887451172, "step": 7168 }, { "epoch": 0.9761710239651417, "grad_norm": 36.74980148725597, "learning_rate": 1.3831964896813709e-09, "logits/chosen": 14.583282470703125, "logits/rejected": 15.046940803527832, "logps/chosen": -4.528311729431152, "logps/rejected": -4.763509750366211, "loss": 3.5882, "rewards/accuracies": 0.5, "rewards/chosen": -45.283119201660156, "rewards/margins": 2.351983070373535, "rewards/rejected": -47.635101318359375, "step": 7169 }, { "epoch": 0.9763071895424836, "grad_norm": 42.75658670875799, "learning_rate": 1.36744268271336e-09, "logits/chosen": 15.053680419921875, "logits/rejected": 13.788383483886719, "logps/chosen": -4.764090538024902, "logps/rejected": -4.313595771789551, "loss": 4.3551, "rewards/accuracies": 0.25, "rewards/chosen": -47.64090347290039, "rewards/margins": -4.504946708679199, "rewards/rejected": -43.135955810546875, "step": 7170 }, { "epoch": 0.9764433551198257, "grad_norm": 45.66407250127188, "learning_rate": 1.3517789500668175e-09, "logits/chosen": 14.61823844909668, "logits/rejected": 15.066329002380371, "logps/chosen": -4.698844909667969, "logps/rejected": -4.8298516273498535, "loss": 3.9832, "rewards/accuracies": 0.75, "rewards/chosen": -46.98844909667969, "rewards/margins": 1.3100690841674805, "rewards/rejected": -48.298519134521484, "step": 7171 }, { "epoch": 0.9765795206971678, "grad_norm": 37.891485438539064, "learning_rate": 1.3362052952810897e-09, "logits/chosen": 14.215524673461914, "logits/rejected": 14.603670120239258, "logps/chosen": -4.2774176597595215, "logps/rejected": -4.767916679382324, "loss": 3.7185, "rewards/accuracies": 0.75, "rewards/chosen": -42.774173736572266, "rewards/margins": 4.904994964599609, "rewards/rejected": -47.679168701171875, "step": 7172 }, { "epoch": 0.9767156862745098, "grad_norm": 42.085160949439185, "learning_rate": 1.3207217218751398e-09, "logits/chosen": 14.86506462097168, "logits/rejected": 15.21995735168457, "logps/chosen": -4.357190132141113, "logps/rejected": -4.719053745269775, "loss": 3.9589, "rewards/accuracies": 0.75, "rewards/chosen": -43.5718994140625, "rewards/margins": 3.6186389923095703, "rewards/rejected": -47.19053649902344, "step": 7173 }, { "epoch": 0.9768518518518519, "grad_norm": 41.02510469415948, "learning_rate": 1.30532823334768e-09, "logits/chosen": 14.574943542480469, "logits/rejected": 15.057193756103516, "logps/chosen": -4.606398105621338, "logps/rejected": -4.9968695640563965, "loss": 3.924, "rewards/accuracies": 1.0, "rewards/chosen": -46.06398010253906, "rewards/margins": 3.9047117233276367, "rewards/rejected": -49.968692779541016, "step": 7174 }, { "epoch": 0.976988017429194, "grad_norm": 44.07415161238275, "learning_rate": 1.290024833176906e-09, "logits/chosen": 14.849283218383789, "logits/rejected": 14.7418794631958, "logps/chosen": -4.501081466674805, "logps/rejected": -4.739335536956787, "loss": 3.8538, "rewards/accuracies": 0.5, "rewards/chosen": -45.01081466674805, "rewards/margins": 2.382539749145508, "rewards/rejected": -47.39335250854492, "step": 7175 }, { "epoch": 0.9771241830065359, "grad_norm": 41.049854344723954, "learning_rate": 1.274811524820807e-09, "logits/chosen": 13.597858428955078, "logits/rejected": 14.24397087097168, "logps/chosen": -4.42764949798584, "logps/rejected": -4.7261152267456055, "loss": 3.8243, "rewards/accuracies": 1.0, "rewards/chosen": -44.2764892578125, "rewards/margins": 2.984661102294922, "rewards/rejected": -47.26115417480469, "step": 7176 }, { "epoch": 0.977260348583878, "grad_norm": 57.61786630555412, "learning_rate": 1.2596883117169444e-09, "logits/chosen": 14.014394760131836, "logits/rejected": 14.624567031860352, "logps/chosen": -4.471122741699219, "logps/rejected": -4.759783744812012, "loss": 4.1793, "rewards/accuracies": 1.0, "rewards/chosen": -44.71122741699219, "rewards/margins": 2.886612892150879, "rewards/rejected": -47.59783935546875, "step": 7177 }, { "epoch": 0.9773965141612201, "grad_norm": 45.479919092108666, "learning_rate": 1.2446551972825403e-09, "logits/chosen": 14.16493034362793, "logits/rejected": 14.471864700317383, "logps/chosen": -4.257986068725586, "logps/rejected": -4.506229400634766, "loss": 3.6966, "rewards/accuracies": 1.0, "rewards/chosen": -42.57986068725586, "rewards/margins": 2.482431411743164, "rewards/rejected": -45.06229019165039, "step": 7178 }, { "epoch": 0.9775326797385621, "grad_norm": 43.45161642367394, "learning_rate": 1.2297121849143889e-09, "logits/chosen": 14.563066482543945, "logits/rejected": 13.874919891357422, "logps/chosen": -4.3576741218566895, "logps/rejected": -4.031214714050293, "loss": 4.1179, "rewards/accuracies": 0.5, "rewards/chosen": -43.576744079589844, "rewards/margins": -3.2645950317382812, "rewards/rejected": -40.31214904785156, "step": 7179 }, { "epoch": 0.9776688453159041, "grad_norm": 40.69488275452422, "learning_rate": 1.2148592779890776e-09, "logits/chosen": 14.296548843383789, "logits/rejected": 14.023983001708984, "logps/chosen": -4.597309112548828, "logps/rejected": -4.722697734832764, "loss": 3.9235, "rewards/accuracies": 0.5, "rewards/chosen": -45.97309112548828, "rewards/margins": 1.2538824081420898, "rewards/rejected": -47.22697448730469, "step": 7180 }, { "epoch": 0.9778050108932462, "grad_norm": 41.71193621259835, "learning_rate": 1.2000964798627222e-09, "logits/chosen": 14.316944122314453, "logits/rejected": 14.380399703979492, "logps/chosen": -4.503451347351074, "logps/rejected": -4.644280910491943, "loss": 4.1347, "rewards/accuracies": 0.75, "rewards/chosen": -45.034515380859375, "rewards/margins": 1.4082984924316406, "rewards/rejected": -46.44281005859375, "step": 7181 }, { "epoch": 0.9779411764705882, "grad_norm": 42.78605437495148, "learning_rate": 1.1854237938710098e-09, "logits/chosen": 13.888629913330078, "logits/rejected": 14.759658813476562, "logps/chosen": -4.608386039733887, "logps/rejected": -4.8795976638793945, "loss": 3.8981, "rewards/accuracies": 1.0, "rewards/chosen": -46.0838623046875, "rewards/margins": 2.712113380432129, "rewards/rejected": -48.79597473144531, "step": 7182 }, { "epoch": 0.9780773420479303, "grad_norm": 44.72826054609198, "learning_rate": 1.1708412233294662e-09, "logits/chosen": 14.99716567993164, "logits/rejected": 14.994016647338867, "logps/chosen": -4.4700822830200195, "logps/rejected": -4.668953895568848, "loss": 3.9739, "rewards/accuracies": 0.5, "rewards/chosen": -44.70082092285156, "rewards/margins": 1.9887170791625977, "rewards/rejected": -46.689537048339844, "step": 7183 }, { "epoch": 0.9782135076252724, "grad_norm": 48.42129368275881, "learning_rate": 1.1563487715331445e-09, "logits/chosen": 13.760320663452148, "logits/rejected": 14.883222579956055, "logps/chosen": -4.479561805725098, "logps/rejected": -4.817505836486816, "loss": 3.7886, "rewards/accuracies": 0.75, "rewards/chosen": -44.795616149902344, "rewards/margins": 3.379441261291504, "rewards/rejected": -48.17505645751953, "step": 7184 }, { "epoch": 0.9783496732026143, "grad_norm": 42.57338294296737, "learning_rate": 1.1419464417566692e-09, "logits/chosen": 14.59253215789795, "logits/rejected": 14.744556427001953, "logps/chosen": -4.27870512008667, "logps/rejected": -4.3029890060424805, "loss": 4.2272, "rewards/accuracies": 0.5, "rewards/chosen": -42.787052154541016, "rewards/margins": 0.24283885955810547, "rewards/rejected": -43.02988815307617, "step": 7185 }, { "epoch": 0.9784858387799564, "grad_norm": 40.64605388727203, "learning_rate": 1.1276342372543713e-09, "logits/chosen": 14.276912689208984, "logits/rejected": 14.450838088989258, "logps/chosen": -4.447286605834961, "logps/rejected": -4.76546573638916, "loss": 3.0795, "rewards/accuracies": 0.5, "rewards/chosen": -44.472862243652344, "rewards/margins": 3.181791305541992, "rewards/rejected": -47.65465545654297, "step": 7186 }, { "epoch": 0.9786220043572985, "grad_norm": 41.44327163421483, "learning_rate": 1.113412161260241e-09, "logits/chosen": 14.880327224731445, "logits/rejected": 14.895283699035645, "logps/chosen": -4.775871276855469, "logps/rejected": -4.817775249481201, "loss": 4.1326, "rewards/accuracies": 0.25, "rewards/chosen": -47.75871276855469, "rewards/margins": 0.419036865234375, "rewards/rejected": -48.17774963378906, "step": 7187 }, { "epoch": 0.9787581699346405, "grad_norm": 45.156190538208655, "learning_rate": 1.0992802169878411e-09, "logits/chosen": 14.292778015136719, "logits/rejected": 14.965255737304688, "logps/chosen": -4.4781575202941895, "logps/rejected": -4.790894508361816, "loss": 4.1836, "rewards/accuracies": 0.75, "rewards/chosen": -44.781578063964844, "rewards/margins": 3.1273679733276367, "rewards/rejected": -47.90894317626953, "step": 7188 }, { "epoch": 0.9788943355119826, "grad_norm": 44.597741214424865, "learning_rate": 1.08523840763044e-09, "logits/chosen": 14.782434463500977, "logits/rejected": 15.420936584472656, "logps/chosen": -4.490204334259033, "logps/rejected": -4.9834418296813965, "loss": 4.0694, "rewards/accuracies": 1.0, "rewards/chosen": -44.902042388916016, "rewards/margins": 4.932376861572266, "rewards/rejected": -49.83441925048828, "step": 7189 }, { "epoch": 0.9790305010893247, "grad_norm": 44.25408835323311, "learning_rate": 1.0712867363608769e-09, "logits/chosen": 15.709610939025879, "logits/rejected": 14.827616691589355, "logps/chosen": -5.000181674957275, "logps/rejected": -5.050011157989502, "loss": 3.3848, "rewards/accuracies": 0.5, "rewards/chosen": -50.00181579589844, "rewards/margins": 0.49829769134521484, "rewards/rejected": -50.50011444091797, "step": 7190 }, { "epoch": 0.9791666666666666, "grad_norm": 36.965282709806125, "learning_rate": 1.0574252063316524e-09, "logits/chosen": 14.063529968261719, "logits/rejected": 14.206390380859375, "logps/chosen": -4.4897871017456055, "logps/rejected": -4.9029927253723145, "loss": 3.6732, "rewards/accuracies": 1.0, "rewards/chosen": -44.89786911010742, "rewards/margins": 4.132057189941406, "rewards/rejected": -49.029930114746094, "step": 7191 }, { "epoch": 0.9793028322440087, "grad_norm": 48.15089166874823, "learning_rate": 1.043653820674928e-09, "logits/chosen": 14.566238403320312, "logits/rejected": 14.61284065246582, "logps/chosen": -4.658936500549316, "logps/rejected": -4.770604133605957, "loss": 4.1455, "rewards/accuracies": 0.25, "rewards/chosen": -46.58936309814453, "rewards/margins": 1.1166801452636719, "rewards/rejected": -47.7060432434082, "step": 7192 }, { "epoch": 0.9794389978213508, "grad_norm": 39.23734398327527, "learning_rate": 1.029972582502392e-09, "logits/chosen": 14.76882553100586, "logits/rejected": 15.254486083984375, "logps/chosen": -4.317734718322754, "logps/rejected": -4.966555595397949, "loss": 3.6905, "rewards/accuracies": 1.0, "rewards/chosen": -43.177345275878906, "rewards/margins": 6.488211631774902, "rewards/rejected": -49.66555404663086, "step": 7193 }, { "epoch": 0.9795751633986928, "grad_norm": 43.22718046010592, "learning_rate": 1.0163814949054827e-09, "logits/chosen": 14.276565551757812, "logits/rejected": 14.705474853515625, "logps/chosen": -4.662367343902588, "logps/rejected": -4.866231918334961, "loss": 3.7753, "rewards/accuracies": 0.75, "rewards/chosen": -46.62367248535156, "rewards/margins": 2.038646697998047, "rewards/rejected": -48.66231918334961, "step": 7194 }, { "epoch": 0.9797113289760349, "grad_norm": 48.40555132090191, "learning_rate": 1.0028805609552104e-09, "logits/chosen": 14.124374389648438, "logits/rejected": 14.540956497192383, "logps/chosen": -4.626398086547852, "logps/rejected": -4.751834869384766, "loss": 4.1771, "rewards/accuracies": 0.75, "rewards/chosen": -46.263980865478516, "rewards/margins": 1.2543706893920898, "rewards/rejected": -47.518348693847656, "step": 7195 }, { "epoch": 0.9798474945533769, "grad_norm": 42.63013245277882, "learning_rate": 9.894697837022015e-10, "logits/chosen": 14.18327522277832, "logits/rejected": 14.035929679870605, "logps/chosen": -4.46937370300293, "logps/rejected": -4.399913787841797, "loss": 3.7196, "rewards/accuracies": 0.5, "rewards/chosen": -44.69373321533203, "rewards/margins": -0.6945991516113281, "rewards/rejected": -43.9991340637207, "step": 7196 }, { "epoch": 0.9799836601307189, "grad_norm": 52.38546158910907, "learning_rate": 9.761491661767429e-10, "logits/chosen": 15.532035827636719, "logits/rejected": 15.114723205566406, "logps/chosen": -5.173826217651367, "logps/rejected": -4.9258503913879395, "loss": 3.4974, "rewards/accuracies": 0.25, "rewards/chosen": -51.738258361816406, "rewards/margins": -2.479755401611328, "rewards/rejected": -49.25850296020508, "step": 7197 }, { "epoch": 0.980119825708061, "grad_norm": 42.666404931629486, "learning_rate": 9.629187113887828e-10, "logits/chosen": 14.842844009399414, "logits/rejected": 14.869457244873047, "logps/chosen": -4.5522308349609375, "logps/rejected": -4.7272796630859375, "loss": 3.7458, "rewards/accuracies": 0.75, "rewards/chosen": -45.52231216430664, "rewards/margins": 1.7504844665527344, "rewards/rejected": -47.272796630859375, "step": 7198 }, { "epoch": 0.9802559912854031, "grad_norm": 41.81511398584427, "learning_rate": 9.49778422327796e-10, "logits/chosen": 13.870834350585938, "logits/rejected": 14.736499786376953, "logps/chosen": -4.357479095458984, "logps/rejected": -4.719716548919678, "loss": 3.8692, "rewards/accuracies": 1.0, "rewards/chosen": -43.574790954589844, "rewards/margins": 3.62237548828125, "rewards/rejected": -47.197166442871094, "step": 7199 }, { "epoch": 0.9803921568627451, "grad_norm": 43.160708275758836, "learning_rate": 9.367283019629635e-10, "logits/chosen": 14.17038631439209, "logits/rejected": 14.175146102905273, "logps/chosen": -4.544046401977539, "logps/rejected": -4.463016986846924, "loss": 4.4298, "rewards/accuracies": 0.25, "rewards/chosen": -45.440460205078125, "rewards/margins": -0.8102922439575195, "rewards/rejected": -44.63016891479492, "step": 7200 }, { "epoch": 0.9805283224400871, "grad_norm": 42.01440219941931, "learning_rate": 9.237683532430817e-10, "logits/chosen": 14.245447158813477, "logits/rejected": 14.485538482666016, "logps/chosen": -4.861508369445801, "logps/rejected": -4.7645263671875, "loss": 3.7322, "rewards/accuracies": 0.5, "rewards/chosen": -48.61508560180664, "rewards/margins": -0.9698238372802734, "rewards/rejected": -47.645259857177734, "step": 7201 }, { "epoch": 0.9806644880174292, "grad_norm": 51.527617078004354, "learning_rate": 9.108985790964752e-10, "logits/chosen": 14.778924942016602, "logits/rejected": 14.94361400604248, "logps/chosen": -4.636678218841553, "logps/rejected": -4.987039566040039, "loss": 4.5312, "rewards/accuracies": 1.0, "rewards/chosen": -46.366783142089844, "rewards/margins": 3.5036115646362305, "rewards/rejected": -49.87039566040039, "step": 7202 }, { "epoch": 0.9808006535947712, "grad_norm": 50.264871991680344, "learning_rate": 8.981189824313062e-10, "logits/chosen": 14.1149320602417, "logits/rejected": 14.652225494384766, "logps/chosen": -4.389007568359375, "logps/rejected": -4.702177047729492, "loss": 4.4864, "rewards/accuracies": 0.75, "rewards/chosen": -43.89007568359375, "rewards/margins": 3.1316957473754883, "rewards/rejected": -47.02177047729492, "step": 7203 }, { "epoch": 0.9809368191721133, "grad_norm": 40.813704895847216, "learning_rate": 8.854295661351318e-10, "logits/chosen": 15.039283752441406, "logits/rejected": 15.199007034301758, "logps/chosen": -4.729979515075684, "logps/rejected": -4.79365348815918, "loss": 4.1475, "rewards/accuracies": 0.5, "rewards/chosen": -47.29979705810547, "rewards/margins": 0.6367368698120117, "rewards/rejected": -47.93653106689453, "step": 7204 }, { "epoch": 0.9810729847494554, "grad_norm": 45.11113922201398, "learning_rate": 8.728303330752585e-10, "logits/chosen": 14.704181671142578, "logits/rejected": 14.70672607421875, "logps/chosen": -4.803310394287109, "logps/rejected": -4.7961883544921875, "loss": 3.3804, "rewards/accuracies": 0.75, "rewards/chosen": -48.033103942871094, "rewards/margins": -0.07121944427490234, "rewards/rejected": -47.961883544921875, "step": 7205 }, { "epoch": 0.9812091503267973, "grad_norm": 40.385456954697624, "learning_rate": 8.603212860985642e-10, "logits/chosen": 13.869927406311035, "logits/rejected": 14.519493103027344, "logps/chosen": -4.578398704528809, "logps/rejected": -4.810449600219727, "loss": 3.7094, "rewards/accuracies": 0.75, "rewards/chosen": -45.78398513793945, "rewards/margins": 2.3205108642578125, "rewards/rejected": -48.104496002197266, "step": 7206 }, { "epoch": 0.9813453159041394, "grad_norm": 42.24178542735173, "learning_rate": 8.479024280316328e-10, "logits/chosen": 14.232900619506836, "logits/rejected": 14.201144218444824, "logps/chosen": -4.614112854003906, "logps/rejected": -4.469881057739258, "loss": 4.0471, "rewards/accuracies": 0.25, "rewards/chosen": -46.14112854003906, "rewards/margins": -1.4423151016235352, "rewards/rejected": -44.698814392089844, "step": 7207 }, { "epoch": 0.9814814814814815, "grad_norm": 46.32309864978834, "learning_rate": 8.355737616805747e-10, "logits/chosen": 14.665815353393555, "logits/rejected": 15.265539169311523, "logps/chosen": -4.56893253326416, "logps/rejected": -4.90376091003418, "loss": 4.3013, "rewards/accuracies": 0.75, "rewards/chosen": -45.68932342529297, "rewards/margins": 3.3482837677001953, "rewards/rejected": -49.0376091003418, "step": 7208 }, { "epoch": 0.9816176470588235, "grad_norm": 40.997537040761635, "learning_rate": 8.233352898311619e-10, "logits/chosen": 15.065906524658203, "logits/rejected": 15.260832786560059, "logps/chosen": -4.650645732879639, "logps/rejected": -4.702498435974121, "loss": 3.5124, "rewards/accuracies": 0.5, "rewards/chosen": -46.5064582824707, "rewards/margins": 0.5185298919677734, "rewards/rejected": -47.02499008178711, "step": 7209 }, { "epoch": 0.9817538126361656, "grad_norm": 45.79993541322065, "learning_rate": 8.111870152487377e-10, "logits/chosen": 14.551130294799805, "logits/rejected": 14.8034029006958, "logps/chosen": -4.72697639465332, "logps/rejected": -4.602492809295654, "loss": 3.7823, "rewards/accuracies": 0.25, "rewards/chosen": -47.2697639465332, "rewards/margins": -1.2448348999023438, "rewards/rejected": -46.02492904663086, "step": 7210 }, { "epoch": 0.9818899782135077, "grad_norm": 41.173934115409835, "learning_rate": 7.991289406783508e-10, "logits/chosen": 14.33090877532959, "logits/rejected": 13.93243408203125, "logps/chosen": -4.573546886444092, "logps/rejected": -4.591505527496338, "loss": 3.7741, "rewards/accuracies": 0.5, "rewards/chosen": -45.735469818115234, "rewards/margins": 0.17958736419677734, "rewards/rejected": -45.91505813598633, "step": 7211 }, { "epoch": 0.9820261437908496, "grad_norm": 41.087552710784024, "learning_rate": 7.871610688446217e-10, "logits/chosen": 14.879425048828125, "logits/rejected": 14.747568130493164, "logps/chosen": -5.128414630889893, "logps/rejected": -4.9927544593811035, "loss": 3.5976, "rewards/accuracies": 0.25, "rewards/chosen": -51.284149169921875, "rewards/margins": -1.3566036224365234, "rewards/rejected": -49.92754364013672, "step": 7212 }, { "epoch": 0.9821623093681917, "grad_norm": 42.60934226979112, "learning_rate": 7.75283402451743e-10, "logits/chosen": 13.004714965820312, "logits/rejected": 13.870620727539062, "logps/chosen": -4.2782182693481445, "logps/rejected": -4.5395073890686035, "loss": 3.9873, "rewards/accuracies": 0.75, "rewards/chosen": -42.78218078613281, "rewards/margins": 2.61289119720459, "rewards/rejected": -45.39507293701172, "step": 7213 }, { "epoch": 0.9822984749455338, "grad_norm": 42.75340251930164, "learning_rate": 7.63495944183612e-10, "logits/chosen": 15.403654098510742, "logits/rejected": 14.688497543334961, "logps/chosen": -4.922180652618408, "logps/rejected": -4.647676944732666, "loss": 3.9304, "rewards/accuracies": 0.25, "rewards/chosen": -49.221805572509766, "rewards/margins": -2.7450332641601562, "rewards/rejected": -46.476768493652344, "step": 7214 }, { "epoch": 0.9824346405228758, "grad_norm": 41.35036189850292, "learning_rate": 7.517986967036982e-10, "logits/chosen": 14.397150039672852, "logits/rejected": 13.230123519897461, "logps/chosen": -4.584294319152832, "logps/rejected": -4.357297897338867, "loss": 4.3126, "rewards/accuracies": 0.5, "rewards/chosen": -45.84294128417969, "rewards/margins": -2.2699575424194336, "rewards/rejected": -43.57298278808594, "step": 7215 }, { "epoch": 0.9825708061002179, "grad_norm": 40.62805354907614, "learning_rate": 7.401916626550875e-10, "logits/chosen": 13.593214988708496, "logits/rejected": 14.784883499145508, "logps/chosen": -4.136505126953125, "logps/rejected": -4.625655174255371, "loss": 3.586, "rewards/accuracies": 1.0, "rewards/chosen": -41.365055084228516, "rewards/margins": 4.891501426696777, "rewards/rejected": -46.256553649902344, "step": 7216 }, { "epoch": 0.9827069716775599, "grad_norm": 43.29009085676215, "learning_rate": 7.286748446605262e-10, "logits/chosen": 14.70755386352539, "logits/rejected": 14.707427024841309, "logps/chosen": -4.642580986022949, "logps/rejected": -4.884692192077637, "loss": 3.2397, "rewards/accuracies": 0.5, "rewards/chosen": -46.42580795288086, "rewards/margins": 2.421114921569824, "rewards/rejected": -48.846923828125, "step": 7217 }, { "epoch": 0.9828431372549019, "grad_norm": 40.40217916055163, "learning_rate": 7.172482453222439e-10, "logits/chosen": 14.89937686920166, "logits/rejected": 14.847968101501465, "logps/chosen": -4.853049278259277, "logps/rejected": -4.759450912475586, "loss": 4.2226, "rewards/accuracies": 0.5, "rewards/chosen": -48.53049087524414, "rewards/margins": -0.9359798431396484, "rewards/rejected": -47.594512939453125, "step": 7218 }, { "epoch": 0.982979302832244, "grad_norm": 44.80388771793194, "learning_rate": 7.059118672222642e-10, "logits/chosen": 14.712442398071289, "logits/rejected": 15.333334922790527, "logps/chosen": -4.735134124755859, "logps/rejected": -5.081876277923584, "loss": 3.9897, "rewards/accuracies": 0.75, "rewards/chosen": -47.351341247558594, "rewards/margins": 3.4674253463745117, "rewards/rejected": -50.818763732910156, "step": 7219 }, { "epoch": 0.9831154684095861, "grad_norm": 44.07535856070683, "learning_rate": 6.94665712922049e-10, "logits/chosen": 13.18553352355957, "logits/rejected": 14.527877807617188, "logps/chosen": -4.441241264343262, "logps/rejected": -5.008611679077148, "loss": 3.5037, "rewards/accuracies": 1.0, "rewards/chosen": -44.412410736083984, "rewards/margins": 5.673709869384766, "rewards/rejected": -50.08612060546875, "step": 7220 }, { "epoch": 0.983251633986928, "grad_norm": 40.30809557847954, "learning_rate": 6.835097849628547e-10, "logits/chosen": 14.83468246459961, "logits/rejected": 14.572975158691406, "logps/chosen": -4.648375034332275, "logps/rejected": -4.496240615844727, "loss": 3.7113, "rewards/accuracies": 0.5, "rewards/chosen": -46.48374938964844, "rewards/margins": -1.521341323852539, "rewards/rejected": -44.96240997314453, "step": 7221 }, { "epoch": 0.9833877995642701, "grad_norm": 46.736354224347004, "learning_rate": 6.724440858653757e-10, "logits/chosen": 15.26645278930664, "logits/rejected": 15.29263973236084, "logps/chosen": -4.559162139892578, "logps/rejected": -4.668193817138672, "loss": 3.6057, "rewards/accuracies": 0.75, "rewards/chosen": -45.59162139892578, "rewards/margins": 1.0903186798095703, "rewards/rejected": -46.68193817138672, "step": 7222 }, { "epoch": 0.9835239651416122, "grad_norm": 41.79440852152949, "learning_rate": 6.614686181300566e-10, "logits/chosen": 14.553507804870605, "logits/rejected": 14.596904754638672, "logps/chosen": -4.916682243347168, "logps/rejected": -5.056558609008789, "loss": 3.4251, "rewards/accuracies": 0.5, "rewards/chosen": -49.16682052612305, "rewards/margins": 1.3987646102905273, "rewards/rejected": -50.56558609008789, "step": 7223 }, { "epoch": 0.9836601307189542, "grad_norm": 41.5462180128661, "learning_rate": 6.505833842368247e-10, "logits/chosen": 14.218082427978516, "logits/rejected": 14.079473495483398, "logps/chosen": -4.3043365478515625, "logps/rejected": -4.4313859939575195, "loss": 3.7003, "rewards/accuracies": 0.5, "rewards/chosen": -43.04336929321289, "rewards/margins": 1.2704887390136719, "rewards/rejected": -44.31385803222656, "step": 7224 }, { "epoch": 0.9837962962962963, "grad_norm": 49.62308125167841, "learning_rate": 6.397883866453568e-10, "logits/chosen": 13.80372428894043, "logits/rejected": 14.915813446044922, "logps/chosen": -4.57582950592041, "logps/rejected": -4.8147430419921875, "loss": 3.8051, "rewards/accuracies": 0.75, "rewards/chosen": -45.75829315185547, "rewards/margins": 2.3891334533691406, "rewards/rejected": -48.147430419921875, "step": 7225 }, { "epoch": 0.9839324618736384, "grad_norm": 44.39020688603248, "learning_rate": 6.290836277948575e-10, "logits/chosen": 13.930990219116211, "logits/rejected": 14.333646774291992, "logps/chosen": -4.502124786376953, "logps/rejected": -4.7718048095703125, "loss": 3.671, "rewards/accuracies": 0.75, "rewards/chosen": -45.02124786376953, "rewards/margins": 2.6968002319335938, "rewards/rejected": -47.718048095703125, "step": 7226 }, { "epoch": 0.9840686274509803, "grad_norm": 49.98390124543322, "learning_rate": 6.184691101041473e-10, "logits/chosen": 15.13696575164795, "logits/rejected": 15.474411010742188, "logps/chosen": -4.8900837898254395, "logps/rejected": -4.940345764160156, "loss": 4.3231, "rewards/accuracies": 0.5, "rewards/chosen": -48.900840759277344, "rewards/margins": 0.5026159286499023, "rewards/rejected": -49.4034538269043, "step": 7227 }, { "epoch": 0.9842047930283224, "grad_norm": 38.950246316375924, "learning_rate": 6.079448359716632e-10, "logits/chosen": 14.209512710571289, "logits/rejected": 15.087064743041992, "logps/chosen": -4.355354309082031, "logps/rejected": -4.568345069885254, "loss": 3.4617, "rewards/accuracies": 0.75, "rewards/chosen": -43.55354309082031, "rewards/margins": 2.1299142837524414, "rewards/rejected": -45.68345260620117, "step": 7228 }, { "epoch": 0.9843409586056645, "grad_norm": 40.63687419109313, "learning_rate": 5.975108077754587e-10, "logits/chosen": 15.110082626342773, "logits/rejected": 15.229814529418945, "logps/chosen": -4.826103210449219, "logps/rejected": -5.254683494567871, "loss": 3.8477, "rewards/accuracies": 1.0, "rewards/chosen": -48.26103210449219, "rewards/margins": 4.285799980163574, "rewards/rejected": -52.546836853027344, "step": 7229 }, { "epoch": 0.9844771241830066, "grad_norm": 42.91266364674721, "learning_rate": 5.871670278731588e-10, "logits/chosen": 14.107233047485352, "logits/rejected": 14.479574203491211, "logps/chosen": -4.535799026489258, "logps/rejected": -4.674118518829346, "loss": 4.1225, "rewards/accuracies": 0.5, "rewards/chosen": -45.357994079589844, "rewards/margins": 1.383193016052246, "rewards/rejected": -46.741188049316406, "step": 7230 }, { "epoch": 0.9846132897603486, "grad_norm": 48.333886293688465, "learning_rate": 5.769134986020497e-10, "logits/chosen": 13.717013359069824, "logits/rejected": 14.240921020507812, "logps/chosen": -4.132879734039307, "logps/rejected": -4.60621452331543, "loss": 3.1081, "rewards/accuracies": 0.75, "rewards/chosen": -41.328800201416016, "rewards/margins": 4.733343124389648, "rewards/rejected": -46.06214141845703, "step": 7231 }, { "epoch": 0.9847494553376906, "grad_norm": 45.2166821999858, "learning_rate": 5.667502222789889e-10, "logits/chosen": 14.107348442077637, "logits/rejected": 14.654426574707031, "logps/chosen": -4.480496406555176, "logps/rejected": -4.929165363311768, "loss": 3.385, "rewards/accuracies": 1.0, "rewards/chosen": -44.804962158203125, "rewards/margins": 4.486690521240234, "rewards/rejected": -49.29165267944336, "step": 7232 }, { "epoch": 0.9848856209150327, "grad_norm": 62.29412660311725, "learning_rate": 5.56677201200495e-10, "logits/chosen": 13.642318725585938, "logits/rejected": 13.97280216217041, "logps/chosen": -3.954149007797241, "logps/rejected": -4.150843620300293, "loss": 3.9925, "rewards/accuracies": 0.75, "rewards/chosen": -39.5414924621582, "rewards/margins": 1.9669485092163086, "rewards/rejected": -41.50843811035156, "step": 7233 }, { "epoch": 0.9850217864923747, "grad_norm": 43.1283384468425, "learning_rate": 5.466944376426142e-10, "logits/chosen": 13.938177108764648, "logits/rejected": 14.598180770874023, "logps/chosen": -4.608526706695557, "logps/rejected": -4.5761003494262695, "loss": 3.7957, "rewards/accuracies": 0.5, "rewards/chosen": -46.08526611328125, "rewards/margins": -0.3242626190185547, "rewards/rejected": -45.76100540161133, "step": 7234 }, { "epoch": 0.9851579520697168, "grad_norm": 49.256179251945625, "learning_rate": 5.36801933861053e-10, "logits/chosen": 14.402383804321289, "logits/rejected": 14.164960861206055, "logps/chosen": -4.5675482749938965, "logps/rejected": -4.523139953613281, "loss": 4.4141, "rewards/accuracies": 0.25, "rewards/chosen": -45.67548370361328, "rewards/margins": -0.44408226013183594, "rewards/rejected": -45.23139953613281, "step": 7235 }, { "epoch": 0.9852941176470589, "grad_norm": 42.01586543855556, "learning_rate": 5.269996920910458e-10, "logits/chosen": 14.902488708496094, "logits/rejected": 15.51765251159668, "logps/chosen": -4.895108222961426, "logps/rejected": -5.003050804138184, "loss": 4.5321, "rewards/accuracies": 0.5, "rewards/chosen": -48.95108413696289, "rewards/margins": 1.0794219970703125, "rewards/rejected": -50.0305061340332, "step": 7236 }, { "epoch": 0.9854302832244008, "grad_norm": 44.075953156706476, "learning_rate": 5.172877145475763e-10, "logits/chosen": 15.026121139526367, "logits/rejected": 14.838817596435547, "logps/chosen": -4.835588455200195, "logps/rejected": -4.666252136230469, "loss": 4.3383, "rewards/accuracies": 0.25, "rewards/chosen": -48.35588836669922, "rewards/margins": -1.6933622360229492, "rewards/rejected": -46.66252517700195, "step": 7237 }, { "epoch": 0.9855664488017429, "grad_norm": 42.24114462444386, "learning_rate": 5.076660034250668e-10, "logits/chosen": 15.007787704467773, "logits/rejected": 15.109350204467773, "logps/chosen": -4.8798322677612305, "logps/rejected": -4.848993301391602, "loss": 3.9578, "rewards/accuracies": 0.5, "rewards/chosen": -48.79832458496094, "rewards/margins": -0.30838680267333984, "rewards/rejected": -48.48993682861328, "step": 7238 }, { "epoch": 0.985702614379085, "grad_norm": 40.967287952579746, "learning_rate": 4.981345608976894e-10, "logits/chosen": 13.989419937133789, "logits/rejected": 13.811534881591797, "logps/chosen": -4.1874494552612305, "logps/rejected": -4.303246021270752, "loss": 3.8284, "rewards/accuracies": 0.75, "rewards/chosen": -41.87449645996094, "rewards/margins": 1.157963752746582, "rewards/rejected": -43.03246307373047, "step": 7239 }, { "epoch": 0.985838779956427, "grad_norm": 41.949263069553226, "learning_rate": 4.886933891191436e-10, "logits/chosen": 14.464334487915039, "logits/rejected": 14.88990592956543, "logps/chosen": -4.639736175537109, "logps/rejected": -4.612403392791748, "loss": 3.7405, "rewards/accuracies": 0.75, "rewards/chosen": -46.39735794067383, "rewards/margins": -0.27332496643066406, "rewards/rejected": -46.12403106689453, "step": 7240 }, { "epoch": 0.9859749455337691, "grad_norm": 41.30865783613776, "learning_rate": 4.793424902227005e-10, "logits/chosen": 14.322280883789062, "logits/rejected": 14.867706298828125, "logps/chosen": -4.613984107971191, "logps/rejected": -4.912989139556885, "loss": 4.4046, "rewards/accuracies": 0.5, "rewards/chosen": -46.13984298706055, "rewards/margins": 2.990046501159668, "rewards/rejected": -49.12989044189453, "step": 7241 }, { "epoch": 0.9861111111111112, "grad_norm": 43.50443005106056, "learning_rate": 4.700818663212924e-10, "logits/chosen": 14.116004943847656, "logits/rejected": 14.536033630371094, "logps/chosen": -4.492137908935547, "logps/rejected": -4.67624568939209, "loss": 3.2309, "rewards/accuracies": 0.75, "rewards/chosen": -44.92137908935547, "rewards/margins": 1.8410758972167969, "rewards/rejected": -46.762454986572266, "step": 7242 }, { "epoch": 0.9862472766884531, "grad_norm": 46.77741475503164, "learning_rate": 4.609115195074231e-10, "logits/chosen": 14.322057723999023, "logits/rejected": 14.945808410644531, "logps/chosen": -4.480511665344238, "logps/rejected": -5.066292762756348, "loss": 3.9302, "rewards/accuracies": 1.0, "rewards/chosen": -44.80511474609375, "rewards/margins": 5.857809066772461, "rewards/rejected": -50.662925720214844, "step": 7243 }, { "epoch": 0.9863834422657952, "grad_norm": 43.1546583777877, "learning_rate": 4.5183145185321294e-10, "logits/chosen": 14.05816650390625, "logits/rejected": 14.223899841308594, "logps/chosen": -4.4272589683532715, "logps/rejected": -4.750728130340576, "loss": 3.6843, "rewards/accuracies": 0.75, "rewards/chosen": -44.27259063720703, "rewards/margins": 3.2346935272216797, "rewards/rejected": -47.50728225708008, "step": 7244 }, { "epoch": 0.9865196078431373, "grad_norm": 40.86599532222327, "learning_rate": 4.4284166541039834e-10, "logits/chosen": 14.346277236938477, "logits/rejected": 14.605749130249023, "logps/chosen": -4.266050338745117, "logps/rejected": -4.733774185180664, "loss": 4.1179, "rewards/accuracies": 1.0, "rewards/chosen": -42.66050720214844, "rewards/margins": 4.677239418029785, "rewards/rejected": -47.337745666503906, "step": 7245 }, { "epoch": 0.9866557734204793, "grad_norm": 44.53410538144678, "learning_rate": 4.339421622102879e-10, "logits/chosen": 14.009929656982422, "logits/rejected": 14.748763084411621, "logps/chosen": -4.3889875411987305, "logps/rejected": -4.670636177062988, "loss": 4.1554, "rewards/accuracies": 0.5, "rewards/chosen": -43.88987731933594, "rewards/margins": 2.8164825439453125, "rewards/rejected": -46.70635986328125, "step": 7246 }, { "epoch": 0.9867919389978214, "grad_norm": 42.52296782007245, "learning_rate": 4.251329442638063e-10, "logits/chosen": 14.494402885437012, "logits/rejected": 14.539176940917969, "logps/chosen": -4.8325395584106445, "logps/rejected": -4.991843223571777, "loss": 3.5108, "rewards/accuracies": 0.75, "rewards/chosen": -48.32539367675781, "rewards/margins": 1.593038558959961, "rewards/rejected": -49.91843032836914, "step": 7247 }, { "epoch": 0.9869281045751634, "grad_norm": 41.38293702675937, "learning_rate": 4.164140135614058e-10, "logits/chosen": 15.045736312866211, "logits/rejected": 15.20033073425293, "logps/chosen": -5.022365093231201, "logps/rejected": -5.024925708770752, "loss": 3.4759, "rewards/accuracies": 0.5, "rewards/chosen": -50.22365188598633, "rewards/margins": 0.025605201721191406, "rewards/rejected": -50.24925994873047, "step": 7248 }, { "epoch": 0.9870642701525054, "grad_norm": 43.16486245244274, "learning_rate": 4.0778537207328824e-10, "logits/chosen": 14.642545700073242, "logits/rejected": 14.939413070678711, "logps/chosen": -4.755178451538086, "logps/rejected": -4.885089874267578, "loss": 4.114, "rewards/accuracies": 0.5, "rewards/chosen": -47.551788330078125, "rewards/margins": 1.2991104125976562, "rewards/rejected": -48.85089874267578, "step": 7249 }, { "epoch": 0.9872004357298475, "grad_norm": 51.6399625058755, "learning_rate": 3.992470217491384e-10, "logits/chosen": 14.892965316772461, "logits/rejected": 14.851698875427246, "logps/chosen": -4.919746398925781, "logps/rejected": -4.776956081390381, "loss": 4.0253, "rewards/accuracies": 0.25, "rewards/chosen": -49.19746398925781, "rewards/margins": -1.427903175354004, "rewards/rejected": -47.769561767578125, "step": 7250 }, { "epoch": 0.9873366013071896, "grad_norm": 38.595078669808196, "learning_rate": 3.90798964518213e-10, "logits/chosen": 14.432095527648926, "logits/rejected": 15.044347763061523, "logps/chosen": -4.42668342590332, "logps/rejected": -4.791448593139648, "loss": 3.6988, "rewards/accuracies": 1.0, "rewards/chosen": -44.26683807373047, "rewards/margins": 3.647650718688965, "rewards/rejected": -47.914485931396484, "step": 7251 }, { "epoch": 0.9874727668845316, "grad_norm": 54.06962086007711, "learning_rate": 3.8244120228951847e-10, "logits/chosen": 13.907770156860352, "logits/rejected": 14.65298080444336, "logps/chosen": -4.749021530151367, "logps/rejected": -4.8644819259643555, "loss": 4.3451, "rewards/accuracies": 0.5, "rewards/chosen": -47.49021911621094, "rewards/margins": 1.1546001434326172, "rewards/rejected": -48.64481735229492, "step": 7252 }, { "epoch": 0.9876089324618736, "grad_norm": 46.462417697249805, "learning_rate": 3.7417373695149965e-10, "logits/chosen": 14.32697868347168, "logits/rejected": 15.024957656860352, "logps/chosen": -4.611491680145264, "logps/rejected": -4.699817657470703, "loss": 3.7743, "rewards/accuracies": 0.5, "rewards/chosen": -46.11491394042969, "rewards/margins": 0.8832597732543945, "rewards/rejected": -46.99817657470703, "step": 7253 }, { "epoch": 0.9877450980392157, "grad_norm": 46.270872185075774, "learning_rate": 3.659965703722179e-10, "logits/chosen": 14.422266006469727, "logits/rejected": 15.255195617675781, "logps/chosen": -4.521587371826172, "logps/rejected": -4.8736467361450195, "loss": 3.6065, "rewards/accuracies": 0.75, "rewards/chosen": -45.215877532958984, "rewards/margins": 3.52059268951416, "rewards/rejected": -48.73646926879883, "step": 7254 }, { "epoch": 0.9878812636165577, "grad_norm": 46.437545261721944, "learning_rate": 3.5790970439943946e-10, "logits/chosen": 14.287247657775879, "logits/rejected": 14.555791854858398, "logps/chosen": -4.642294406890869, "logps/rejected": -4.71820068359375, "loss": 3.5867, "rewards/accuracies": 0.75, "rewards/chosen": -46.422943115234375, "rewards/margins": 0.7590579986572266, "rewards/rejected": -47.182003021240234, "step": 7255 }, { "epoch": 0.9880174291938998, "grad_norm": 40.22809254608386, "learning_rate": 3.499131408604583e-10, "logits/chosen": 14.668724060058594, "logits/rejected": 15.472286224365234, "logps/chosen": -4.574697017669678, "logps/rejected": -5.24709415435791, "loss": 3.7743, "rewards/accuracies": 1.0, "rewards/chosen": -45.746971130371094, "rewards/margins": 6.723973274230957, "rewards/rejected": -52.470943450927734, "step": 7256 }, { "epoch": 0.9881535947712419, "grad_norm": 39.05546092679976, "learning_rate": 3.420068815621402e-10, "logits/chosen": 13.826303482055664, "logits/rejected": 13.704216003417969, "logps/chosen": -4.493769645690918, "logps/rejected": -4.522030830383301, "loss": 3.527, "rewards/accuracies": 0.5, "rewards/chosen": -44.93769073486328, "rewards/margins": 0.28261280059814453, "rewards/rejected": -45.220306396484375, "step": 7257 }, { "epoch": 0.9882897603485838, "grad_norm": 46.2716931238967, "learning_rate": 3.3419092829096717e-10, "logits/chosen": 14.67243480682373, "logits/rejected": 14.983156204223633, "logps/chosen": -4.723879337310791, "logps/rejected": -4.847497940063477, "loss": 4.414, "rewards/accuracies": 0.75, "rewards/chosen": -47.238792419433594, "rewards/margins": 1.2361869812011719, "rewards/rejected": -48.474979400634766, "step": 7258 }, { "epoch": 0.9884259259259259, "grad_norm": 39.645223145835345, "learning_rate": 3.2646528281303765e-10, "logits/chosen": 14.055571556091309, "logits/rejected": 14.990278244018555, "logps/chosen": -4.38828706741333, "logps/rejected": -4.772980690002441, "loss": 3.4104, "rewards/accuracies": 0.75, "rewards/chosen": -43.88287353515625, "rewards/margins": 3.846933364868164, "rewards/rejected": -47.72980499267578, "step": 7259 }, { "epoch": 0.988562091503268, "grad_norm": 40.689973880707406, "learning_rate": 3.1882994687397746e-10, "logits/chosen": 16.144384384155273, "logits/rejected": 15.483253479003906, "logps/chosen": -5.082430362701416, "logps/rejected": -5.077328205108643, "loss": 4.2086, "rewards/accuracies": 0.25, "rewards/chosen": -50.824302673339844, "rewards/margins": -0.05102062225341797, "rewards/rejected": -50.773284912109375, "step": 7260 }, { "epoch": 0.98869825708061, "grad_norm": 46.26754600883835, "learning_rate": 3.112849221991176e-10, "logits/chosen": 14.958227157592773, "logits/rejected": 14.233932495117188, "logps/chosen": -4.611149787902832, "logps/rejected": -4.636874675750732, "loss": 3.823, "rewards/accuracies": 0.75, "rewards/chosen": -46.11149597167969, "rewards/margins": 0.2572479248046875, "rewards/rejected": -46.36874771118164, "step": 7261 }, { "epoch": 0.9888344226579521, "grad_norm": 39.50817927481332, "learning_rate": 3.038302104932722e-10, "logits/chosen": 14.381149291992188, "logits/rejected": 14.414823532104492, "logps/chosen": -4.413771629333496, "logps/rejected": -4.484889507293701, "loss": 3.6001, "rewards/accuracies": 0.5, "rewards/chosen": -44.137718200683594, "rewards/margins": 0.7111787796020508, "rewards/rejected": -44.84889602661133, "step": 7262 }, { "epoch": 0.9889705882352942, "grad_norm": 41.890028226297446, "learning_rate": 2.964658134409159e-10, "logits/chosen": 14.871809005737305, "logits/rejected": 14.788421630859375, "logps/chosen": -4.645544528961182, "logps/rejected": -4.770815849304199, "loss": 3.2722, "rewards/accuracies": 0.5, "rewards/chosen": -46.4554443359375, "rewards/margins": 1.2527170181274414, "rewards/rejected": -47.708160400390625, "step": 7263 }, { "epoch": 0.9891067538126361, "grad_norm": 40.45404041817735, "learning_rate": 2.8919173270609554e-10, "logits/chosen": 14.723896980285645, "logits/rejected": 15.212662696838379, "logps/chosen": -4.710420608520508, "logps/rejected": -4.942553997039795, "loss": 4.2496, "rewards/accuracies": 0.75, "rewards/chosen": -47.10420608520508, "rewards/margins": 2.321329116821289, "rewards/rejected": -49.425537109375, "step": 7264 }, { "epoch": 0.9892429193899782, "grad_norm": 43.50872296686852, "learning_rate": 2.82007969932474e-10, "logits/chosen": 15.053637504577637, "logits/rejected": 14.562591552734375, "logps/chosen": -5.025782585144043, "logps/rejected": -4.83619499206543, "loss": 4.542, "rewards/accuracies": 0.25, "rewards/chosen": -50.2578239440918, "rewards/margins": -1.8958711624145508, "rewards/rejected": -48.36195373535156, "step": 7265 }, { "epoch": 0.9893790849673203, "grad_norm": 41.03336427753579, "learning_rate": 2.7491452674324176e-10, "logits/chosen": 15.066841125488281, "logits/rejected": 14.816156387329102, "logps/chosen": -4.817575454711914, "logps/rejected": -4.7592549324035645, "loss": 3.4636, "rewards/accuracies": 0.25, "rewards/chosen": -48.17575454711914, "rewards/margins": -0.5832071304321289, "rewards/rejected": -47.59254837036133, "step": 7266 }, { "epoch": 0.9895152505446623, "grad_norm": 45.718336708332224, "learning_rate": 2.6791140474120565e-10, "logits/chosen": 14.645851135253906, "logits/rejected": 14.694539070129395, "logps/chosen": -4.638707160949707, "logps/rejected": -4.594983100891113, "loss": 3.5729, "rewards/accuracies": 0.5, "rewards/chosen": -46.38707733154297, "rewards/margins": -0.4372406005859375, "rewards/rejected": -45.949832916259766, "step": 7267 }, { "epoch": 0.9896514161220044, "grad_norm": 40.42579878313764, "learning_rate": 2.6099860550883314e-10, "logits/chosen": 14.575248718261719, "logits/rejected": 15.113565444946289, "logps/chosen": -4.6132917404174805, "logps/rejected": -4.837782859802246, "loss": 4.183, "rewards/accuracies": 0.5, "rewards/chosen": -46.13291549682617, "rewards/margins": 2.2449169158935547, "rewards/rejected": -48.377830505371094, "step": 7268 }, { "epoch": 0.9897875816993464, "grad_norm": 42.083141880777454, "learning_rate": 2.5417613060807476e-10, "logits/chosen": 14.498130798339844, "logits/rejected": 14.307971000671387, "logps/chosen": -4.701562881469727, "logps/rejected": -4.435595512390137, "loss": 3.9208, "rewards/accuracies": 0.25, "rewards/chosen": -47.01563262939453, "rewards/margins": -2.659674644470215, "rewards/rejected": -44.35595703125, "step": 7269 }, { "epoch": 0.9899237472766884, "grad_norm": 42.199752279461734, "learning_rate": 2.474439815805862e-10, "logits/chosen": 13.517477035522461, "logits/rejected": 14.374784469604492, "logps/chosen": -4.458761215209961, "logps/rejected": -4.8194122314453125, "loss": 3.8908, "rewards/accuracies": 1.0, "rewards/chosen": -44.587615966796875, "rewards/margins": 3.606509208679199, "rewards/rejected": -48.19412612915039, "step": 7270 }, { "epoch": 0.9900599128540305, "grad_norm": 40.81287872390959, "learning_rate": 2.408021599475063e-10, "logits/chosen": 14.78392219543457, "logits/rejected": 14.952838897705078, "logps/chosen": -4.521145820617676, "logps/rejected": -4.907604217529297, "loss": 4.1703, "rewards/accuracies": 0.75, "rewards/chosen": -45.211456298828125, "rewards/margins": 3.86458683013916, "rewards/rejected": -49.07604217529297, "step": 7271 }, { "epoch": 0.9901960784313726, "grad_norm": 42.40571487258339, "learning_rate": 2.342506672095901e-10, "logits/chosen": 14.601806640625, "logits/rejected": 14.620616912841797, "logps/chosen": -4.517739295959473, "logps/rejected": -4.495951175689697, "loss": 3.7311, "rewards/accuracies": 0.5, "rewards/chosen": -45.17739486694336, "rewards/margins": -0.2178802490234375, "rewards/rejected": -44.95951461791992, "step": 7272 }, { "epoch": 0.9903322440087146, "grad_norm": 45.092877412559055, "learning_rate": 2.2778950484725333e-10, "logits/chosen": 14.511154174804688, "logits/rejected": 14.529029846191406, "logps/chosen": -4.673768043518066, "logps/rejected": -4.922098636627197, "loss": 3.884, "rewards/accuracies": 0.5, "rewards/chosen": -46.73767852783203, "rewards/margins": 2.4833059310913086, "rewards/rejected": -49.220985412597656, "step": 7273 }, { "epoch": 0.9904684095860566, "grad_norm": 42.84907340100509, "learning_rate": 2.2141867432043937e-10, "logits/chosen": 14.155654907226562, "logits/rejected": 15.076769828796387, "logps/chosen": -4.700996398925781, "logps/rejected": -4.99443244934082, "loss": 3.8131, "rewards/accuracies": 0.75, "rewards/chosen": -47.00996017456055, "rewards/margins": 2.93436336517334, "rewards/rejected": -49.94432067871094, "step": 7274 }, { "epoch": 0.9906045751633987, "grad_norm": 42.0304267200015, "learning_rate": 2.1513817706870774e-10, "logits/chosen": 14.832246780395508, "logits/rejected": 14.330486297607422, "logps/chosen": -4.760947227478027, "logps/rejected": -4.74235725402832, "loss": 3.7163, "rewards/accuracies": 0.5, "rewards/chosen": -47.60947036743164, "rewards/margins": -0.1858997344970703, "rewards/rejected": -47.4235725402832, "step": 7275 }, { "epoch": 0.9907407407407407, "grad_norm": 42.77728425045105, "learning_rate": 2.0894801451110111e-10, "logits/chosen": 14.166614532470703, "logits/rejected": 14.687030792236328, "logps/chosen": -4.638099670410156, "logps/rejected": -4.858315944671631, "loss": 3.5002, "rewards/accuracies": 0.5, "rewards/chosen": -46.38099670410156, "rewards/margins": 2.2021656036376953, "rewards/rejected": -48.583160400390625, "step": 7276 }, { "epoch": 0.9908769063180828, "grad_norm": 45.46669702781905, "learning_rate": 2.0284818804641167e-10, "logits/chosen": 14.006879806518555, "logits/rejected": 14.533540725708008, "logps/chosen": -4.501070022583008, "logps/rejected": -4.642042636871338, "loss": 4.3474, "rewards/accuracies": 0.75, "rewards/chosen": -45.01069641113281, "rewards/margins": 1.4097299575805664, "rewards/rejected": -46.42042541503906, "step": 7277 }, { "epoch": 0.9910130718954249, "grad_norm": 42.01351551659377, "learning_rate": 1.9683869905295912e-10, "logits/chosen": 13.891292572021484, "logits/rejected": 14.098800659179688, "logps/chosen": -3.941697359085083, "logps/rejected": -4.345005989074707, "loss": 3.1745, "rewards/accuracies": 0.75, "rewards/chosen": -39.41697311401367, "rewards/margins": 4.033086776733398, "rewards/rejected": -43.45005798339844, "step": 7278 }, { "epoch": 0.9911492374727668, "grad_norm": 41.186808454441824, "learning_rate": 1.9091954888859063e-10, "logits/chosen": 14.490995407104492, "logits/rejected": 14.750076293945312, "logps/chosen": -4.294306755065918, "logps/rejected": -4.7650909423828125, "loss": 3.9275, "rewards/accuracies": 1.0, "rewards/chosen": -42.94306945800781, "rewards/margins": 4.7078447341918945, "rewards/rejected": -47.65091323852539, "step": 7279 }, { "epoch": 0.9912854030501089, "grad_norm": 43.711501608598745, "learning_rate": 1.8509073889081407e-10, "logits/chosen": 14.861867904663086, "logits/rejected": 14.968542098999023, "logps/chosen": -4.867401599884033, "logps/rejected": -5.037788391113281, "loss": 4.5832, "rewards/accuracies": 0.5, "rewards/chosen": -48.674015045166016, "rewards/margins": 1.7038660049438477, "rewards/rejected": -50.37788391113281, "step": 7280 }, { "epoch": 0.991421568627451, "grad_norm": 60.54643655417531, "learning_rate": 1.793522703766648e-10, "logits/chosen": 14.353123664855957, "logits/rejected": 14.514762878417969, "logps/chosen": -4.476648330688477, "logps/rejected": -4.8075056076049805, "loss": 3.5669, "rewards/accuracies": 1.0, "rewards/chosen": -44.76648712158203, "rewards/margins": 3.3085737228393555, "rewards/rejected": -48.07505798339844, "step": 7281 }, { "epoch": 0.991557734204793, "grad_norm": 38.33759498911105, "learning_rate": 1.7370414464283888e-10, "logits/chosen": 14.368223190307617, "logits/rejected": 14.497116088867188, "logps/chosen": -4.4559221267700195, "logps/rejected": -4.638207912445068, "loss": 3.8361, "rewards/accuracies": 0.5, "rewards/chosen": -44.55922317504883, "rewards/margins": 1.822854995727539, "rewards/rejected": -46.382080078125, "step": 7282 }, { "epoch": 0.9916938997821351, "grad_norm": 44.542439086158424, "learning_rate": 1.6814636296555996e-10, "logits/chosen": 14.937647819519043, "logits/rejected": 15.196290969848633, "logps/chosen": -4.752452850341797, "logps/rejected": -4.855595588684082, "loss": 3.7997, "rewards/accuracies": 1.0, "rewards/chosen": -47.52452850341797, "rewards/margins": 1.0314264297485352, "rewards/rejected": -48.55595397949219, "step": 7283 }, { "epoch": 0.9918300653594772, "grad_norm": 49.14112966748407, "learning_rate": 1.6267892660066784e-10, "logits/chosen": 13.924354553222656, "logits/rejected": 14.46507740020752, "logps/chosen": -4.43000602722168, "logps/rejected": -4.834552764892578, "loss": 4.2815, "rewards/accuracies": 1.0, "rewards/chosen": -44.30006408691406, "rewards/margins": 4.045463562011719, "rewards/rejected": -48.34552764892578, "step": 7284 }, { "epoch": 0.9919662309368191, "grad_norm": 41.14759358387752, "learning_rate": 1.5730183678352992e-10, "logits/chosen": 14.692075729370117, "logits/rejected": 15.21319580078125, "logps/chosen": -4.34373664855957, "logps/rejected": -4.847507476806641, "loss": 4.5092, "rewards/accuracies": 1.0, "rewards/chosen": -43.4373664855957, "rewards/margins": 5.037708282470703, "rewards/rejected": -48.475074768066406, "step": 7285 }, { "epoch": 0.9921023965141612, "grad_norm": 41.54389712176199, "learning_rate": 1.520150947292187e-10, "logits/chosen": 14.493515014648438, "logits/rejected": 15.203969955444336, "logps/chosen": -4.354101181030273, "logps/rejected": -4.980232238769531, "loss": 4.1298, "rewards/accuracies": 1.0, "rewards/chosen": -43.54100799560547, "rewards/margins": 6.261308670043945, "rewards/rejected": -49.80231857299805, "step": 7286 }, { "epoch": 0.9922385620915033, "grad_norm": 44.86161058996788, "learning_rate": 1.468187016322453e-10, "logits/chosen": 14.490140914916992, "logits/rejected": 14.188806533813477, "logps/chosen": -4.3562331199646, "logps/rejected": -4.519021511077881, "loss": 3.8607, "rewards/accuracies": 0.75, "rewards/chosen": -43.56233215332031, "rewards/margins": 1.627882957458496, "rewards/rejected": -45.190216064453125, "step": 7287 }, { "epoch": 0.9923747276688453, "grad_norm": 42.75229073405933, "learning_rate": 1.4171265866678162e-10, "logits/chosen": 15.162726402282715, "logits/rejected": 15.191638946533203, "logps/chosen": -4.696713447570801, "logps/rejected": -4.859685897827148, "loss": 4.344, "rewards/accuracies": 0.5, "rewards/chosen": -46.967132568359375, "rewards/margins": 1.6297225952148438, "rewards/rejected": -48.59685516357422, "step": 7288 }, { "epoch": 0.9925108932461874, "grad_norm": 46.30243160794938, "learning_rate": 1.3669696698661582e-10, "logits/chosen": 15.18628978729248, "logits/rejected": 14.591333389282227, "logps/chosen": -4.885358810424805, "logps/rejected": -4.851207733154297, "loss": 4.3706, "rewards/accuracies": 0.5, "rewards/chosen": -48.85359191894531, "rewards/margins": -0.3415098190307617, "rewards/rejected": -48.51207733154297, "step": 7289 }, { "epoch": 0.9926470588235294, "grad_norm": 45.687245218541456, "learning_rate": 1.3177162772510796e-10, "logits/chosen": 14.707023620605469, "logits/rejected": 14.724331855773926, "logps/chosen": -4.600610733032227, "logps/rejected": -4.868246555328369, "loss": 3.4715, "rewards/accuracies": 0.75, "rewards/chosen": -46.00611114501953, "rewards/margins": 2.6763572692871094, "rewards/rejected": -48.682464599609375, "step": 7290 }, { "epoch": 0.9927832244008714, "grad_norm": 42.12804184824128, "learning_rate": 1.2693664199514566e-10, "logits/chosen": 14.366182327270508, "logits/rejected": 14.76589584350586, "logps/chosen": -4.397214889526367, "logps/rejected": -4.500732421875, "loss": 4.1731, "rewards/accuracies": 0.5, "rewards/chosen": -43.97214889526367, "rewards/margins": 1.035177230834961, "rewards/rejected": -45.00732421875, "step": 7291 }, { "epoch": 0.9929193899782135, "grad_norm": 40.62293453791411, "learning_rate": 1.221920108891883e-10, "logits/chosen": 14.933399200439453, "logits/rejected": 14.398072242736816, "logps/chosen": -4.828950881958008, "logps/rejected": -4.6504716873168945, "loss": 3.8266, "rewards/accuracies": 0.25, "rewards/chosen": -48.28950881958008, "rewards/margins": -1.784794807434082, "rewards/rejected": -46.50471496582031, "step": 7292 }, { "epoch": 0.9930555555555556, "grad_norm": 42.36464208929063, "learning_rate": 1.175377354794005e-10, "logits/chosen": 14.831825256347656, "logits/rejected": 15.246794700622559, "logps/chosen": -4.5379462242126465, "logps/rejected": -4.771734714508057, "loss": 3.9878, "rewards/accuracies": 0.75, "rewards/chosen": -45.379459381103516, "rewards/margins": 2.3378896713256836, "rewards/rejected": -47.717350006103516, "step": 7293 }, { "epoch": 0.9931917211328976, "grad_norm": 40.84550883278761, "learning_rate": 1.129738168174299e-10, "logits/chosen": 15.388971328735352, "logits/rejected": 14.932018280029297, "logps/chosen": -4.985682964324951, "logps/rejected": -4.776716709136963, "loss": 3.9018, "rewards/accuracies": 0.25, "rewards/chosen": -49.85683059692383, "rewards/margins": -2.0896615982055664, "rewards/rejected": -47.76716613769531, "step": 7294 }, { "epoch": 0.9933278867102396, "grad_norm": 50.94481693372673, "learning_rate": 1.0850025593449608e-10, "logits/chosen": 14.813628196716309, "logits/rejected": 14.596721649169922, "logps/chosen": -5.078139305114746, "logps/rejected": -4.965797424316406, "loss": 3.8622, "rewards/accuracies": 0.5, "rewards/chosen": -50.78138732910156, "rewards/margins": -1.1234130859375, "rewards/rejected": -49.65797424316406, "step": 7295 }, { "epoch": 0.9934640522875817, "grad_norm": 42.077361183098965, "learning_rate": 1.0411705384147928e-10, "logits/chosen": 15.044167518615723, "logits/rejected": 15.535318374633789, "logps/chosen": -4.835095405578613, "logps/rejected": -5.275148868560791, "loss": 4.1307, "rewards/accuracies": 0.75, "rewards/chosen": -48.3509521484375, "rewards/margins": 4.400534629821777, "rewards/rejected": -52.751487731933594, "step": 7296 }, { "epoch": 0.9936002178649237, "grad_norm": 42.29016437345222, "learning_rate": 9.982421152878728e-11, "logits/chosen": 14.654800415039062, "logits/rejected": 14.478681564331055, "logps/chosen": -4.559860706329346, "logps/rejected": -4.5813446044921875, "loss": 3.6778, "rewards/accuracies": 0.75, "rewards/chosen": -45.59860610961914, "rewards/margins": 0.21483802795410156, "rewards/rejected": -45.813446044921875, "step": 7297 }, { "epoch": 0.9937363834422658, "grad_norm": 43.571977122200515, "learning_rate": 9.562172996644413e-11, "logits/chosen": 14.239376068115234, "logits/rejected": 14.159887313842773, "logps/chosen": -4.4051666259765625, "logps/rejected": -4.576923847198486, "loss": 3.9491, "rewards/accuracies": 0.75, "rewards/chosen": -44.051666259765625, "rewards/margins": 1.7175750732421875, "rewards/rejected": -45.76923751831055, "step": 7298 }, { "epoch": 0.9938725490196079, "grad_norm": 40.61185452996969, "learning_rate": 9.150961010400138e-11, "logits/chosen": 14.302521705627441, "logits/rejected": 15.359872817993164, "logps/chosen": -4.425434112548828, "logps/rejected": -4.906346321105957, "loss": 3.7787, "rewards/accuracies": 0.75, "rewards/chosen": -44.25434112548828, "rewards/margins": 4.809116363525391, "rewards/rejected": -49.06346130371094, "step": 7299 }, { "epoch": 0.9940087145969498, "grad_norm": 41.474315504882924, "learning_rate": 8.748785287062688e-11, "logits/chosen": 14.76093864440918, "logits/rejected": 15.02573013305664, "logps/chosen": -4.63037633895874, "logps/rejected": -4.789061069488525, "loss": 3.9903, "rewards/accuracies": 0.5, "rewards/chosen": -46.30376052856445, "rewards/margins": 1.5868492126464844, "rewards/rejected": -47.89060974121094, "step": 7300 }, { "epoch": 0.9941448801742919, "grad_norm": 37.67478018167054, "learning_rate": 8.355645917506038e-11, "logits/chosen": 14.413894653320312, "logits/rejected": 14.776878356933594, "logps/chosen": -4.560164928436279, "logps/rejected": -4.7894463539123535, "loss": 3.7033, "rewards/accuracies": 0.75, "rewards/chosen": -45.601646423339844, "rewards/margins": 2.2928152084350586, "rewards/rejected": -47.89446258544922, "step": 7301 }, { "epoch": 0.994281045751634, "grad_norm": 44.077951136331144, "learning_rate": 7.971542990570235e-11, "logits/chosen": 13.613300323486328, "logits/rejected": 14.147769927978516, "logps/chosen": -4.3137969970703125, "logps/rejected": -4.621232032775879, "loss": 3.8839, "rewards/accuracies": 0.75, "rewards/chosen": -43.13796615600586, "rewards/margins": 3.074355125427246, "rewards/rejected": -46.21232223510742, "step": 7302 }, { "epoch": 0.994417211328976, "grad_norm": 41.24627432697785, "learning_rate": 7.596476593039191e-11, "logits/chosen": 14.241453170776367, "logits/rejected": 14.482646942138672, "logps/chosen": -4.636848449707031, "logps/rejected": -4.671359539031982, "loss": 4.2487, "rewards/accuracies": 0.5, "rewards/chosen": -46.36848449707031, "rewards/margins": 0.3451099395751953, "rewards/rejected": -46.713592529296875, "step": 7303 }, { "epoch": 0.9945533769063181, "grad_norm": 40.32128842503203, "learning_rate": 7.230446809667334e-11, "logits/chosen": 13.914484024047852, "logits/rejected": 14.744307518005371, "logps/chosen": -4.39236307144165, "logps/rejected": -4.842856407165527, "loss": 3.9178, "rewards/accuracies": 0.75, "rewards/chosen": -43.92362976074219, "rewards/margins": 4.504929542541504, "rewards/rejected": -48.42856216430664, "step": 7304 }, { "epoch": 0.9946895424836601, "grad_norm": 41.24521710795756, "learning_rate": 6.873453723157396e-11, "logits/chosen": 14.988468170166016, "logits/rejected": 15.170219421386719, "logps/chosen": -4.715982437133789, "logps/rejected": -4.6899237632751465, "loss": 3.9504, "rewards/accuracies": 0.5, "rewards/chosen": -47.159820556640625, "rewards/margins": -0.26058483123779297, "rewards/rejected": -46.89923858642578, "step": 7305 }, { "epoch": 0.9948257080610022, "grad_norm": 44.43123134043314, "learning_rate": 6.525497414178183e-11, "logits/chosen": 13.248950958251953, "logits/rejected": 14.334138870239258, "logps/chosen": -4.183372497558594, "logps/rejected": -4.461639404296875, "loss": 3.914, "rewards/accuracies": 0.75, "rewards/chosen": -41.83372497558594, "rewards/margins": 2.7826662063598633, "rewards/rejected": -44.61639404296875, "step": 7306 }, { "epoch": 0.9949618736383442, "grad_norm": 40.22492258649295, "learning_rate": 6.186577961351247e-11, "logits/chosen": 14.745549201965332, "logits/rejected": 14.285819053649902, "logps/chosen": -4.447511672973633, "logps/rejected": -4.810108661651611, "loss": 3.9858, "rewards/accuracies": 0.75, "rewards/chosen": -44.47511672973633, "rewards/margins": 3.6259679794311523, "rewards/rejected": -48.10108947753906, "step": 7307 }, { "epoch": 0.9950980392156863, "grad_norm": 42.51848193870174, "learning_rate": 5.856695441259774e-11, "logits/chosen": 15.028572082519531, "logits/rejected": 13.288220405578613, "logps/chosen": -4.896346092224121, "logps/rejected": -4.445211410522461, "loss": 4.2531, "rewards/accuracies": 0.25, "rewards/chosen": -48.96346664428711, "rewards/margins": -4.5113525390625, "rewards/rejected": -44.452110290527344, "step": 7308 }, { "epoch": 0.9952342047930284, "grad_norm": 38.923556019801445, "learning_rate": 5.5358499284396956e-11, "logits/chosen": 15.166305541992188, "logits/rejected": 15.216875076293945, "logps/chosen": -4.702885627746582, "logps/rejected": -4.897588729858398, "loss": 3.7198, "rewards/accuracies": 0.75, "rewards/chosen": -47.02886199951172, "rewards/margins": 1.947026252746582, "rewards/rejected": -48.97588348388672, "step": 7309 }, { "epoch": 0.9953703703703703, "grad_norm": 44.21795304515875, "learning_rate": 5.224041495397458e-11, "logits/chosen": 13.915794372558594, "logits/rejected": 13.536720275878906, "logps/chosen": -4.394595146179199, "logps/rejected": -4.422166347503662, "loss": 4.4728, "rewards/accuracies": 0.5, "rewards/chosen": -43.94594955444336, "rewards/margins": 0.2757139205932617, "rewards/rejected": -44.22166442871094, "step": 7310 }, { "epoch": 0.9955065359477124, "grad_norm": 45.999793222063865, "learning_rate": 4.9212702125789317e-11, "logits/chosen": 14.31032943725586, "logits/rejected": 14.582359313964844, "logps/chosen": -4.4801836013793945, "logps/rejected": -5.028555870056152, "loss": 3.9199, "rewards/accuracies": 1.0, "rewards/chosen": -44.80183410644531, "rewards/margins": 5.48372745513916, "rewards/rejected": -50.285560607910156, "step": 7311 }, { "epoch": 0.9956427015250545, "grad_norm": 39.533829711129755, "learning_rate": 4.6275361484049426e-11, "logits/chosen": 14.518308639526367, "logits/rejected": 14.199475288391113, "logps/chosen": -4.772062301635742, "logps/rejected": -4.753512859344482, "loss": 4.072, "rewards/accuracies": 0.5, "rewards/chosen": -47.72062301635742, "rewards/margins": -0.18549251556396484, "rewards/rejected": -47.535133361816406, "step": 7312 }, { "epoch": 0.9957788671023965, "grad_norm": 44.22001831549663, "learning_rate": 4.342839369244622e-11, "logits/chosen": 14.819031715393066, "logits/rejected": 14.344589233398438, "logps/chosen": -4.584964275360107, "logps/rejected": -4.462494850158691, "loss": 4.3135, "rewards/accuracies": 0.25, "rewards/chosen": -45.84964370727539, "rewards/margins": -1.224691390991211, "rewards/rejected": -44.62495422363281, "step": 7313 }, { "epoch": 0.9959150326797386, "grad_norm": 40.67452606105292, "learning_rate": 4.0671799394242925e-11, "logits/chosen": 14.786626815795898, "logits/rejected": 14.681493759155273, "logps/chosen": -4.659071922302246, "logps/rejected": -4.58351993560791, "loss": 3.5051, "rewards/accuracies": 0.25, "rewards/chosen": -46.590721130371094, "rewards/margins": -0.7555246353149414, "rewards/rejected": -45.83519744873047, "step": 7314 }, { "epoch": 0.9960511982570807, "grad_norm": 45.482171516918086, "learning_rate": 3.800557921236347e-11, "logits/chosen": 14.461505889892578, "logits/rejected": 14.586444854736328, "logps/chosen": -4.6512451171875, "logps/rejected": -4.828425407409668, "loss": 3.7019, "rewards/accuracies": 0.75, "rewards/chosen": -46.512451171875, "rewards/margins": 1.7718009948730469, "rewards/rejected": -48.28425598144531, "step": 7315 }, { "epoch": 0.9961873638344226, "grad_norm": 44.963451661670675, "learning_rate": 3.542973374925928e-11, "logits/chosen": 14.168487548828125, "logits/rejected": 15.115005493164062, "logps/chosen": -4.666411876678467, "logps/rejected": -5.067404270172119, "loss": 3.8043, "rewards/accuracies": 1.0, "rewards/chosen": -46.664119720458984, "rewards/margins": 4.009920120239258, "rewards/rejected": -50.674041748046875, "step": 7316 }, { "epoch": 0.9963235294117647, "grad_norm": 37.837587885584945, "learning_rate": 3.294426358690927e-11, "logits/chosen": 14.148103713989258, "logits/rejected": 13.802494049072266, "logps/chosen": -4.45308780670166, "logps/rejected": -4.455543518066406, "loss": 3.4718, "rewards/accuracies": 0.5, "rewards/chosen": -44.530879974365234, "rewards/margins": 0.024552345275878906, "rewards/rejected": -44.5554313659668, "step": 7317 }, { "epoch": 0.9964596949891068, "grad_norm": 35.09320931871939, "learning_rate": 3.0549169286997466e-11, "logits/chosen": 14.48712158203125, "logits/rejected": 14.584648132324219, "logps/chosen": -4.022791862487793, "logps/rejected": -4.551067352294922, "loss": 3.8203, "rewards/accuracies": 0.75, "rewards/chosen": -40.22792053222656, "rewards/margins": 5.282757759094238, "rewards/rejected": -45.51068115234375, "step": 7318 }, { "epoch": 0.9965958605664488, "grad_norm": 38.11182573627303, "learning_rate": 2.8244451390646573e-11, "logits/chosen": 14.018697738647461, "logits/rejected": 14.497066497802734, "logps/chosen": -4.649561882019043, "logps/rejected": -4.804888725280762, "loss": 3.8733, "rewards/accuracies": 0.75, "rewards/chosen": -46.49562072753906, "rewards/margins": 1.5532636642456055, "rewards/rejected": -48.04888153076172, "step": 7319 }, { "epoch": 0.9967320261437909, "grad_norm": 45.71526006230739, "learning_rate": 2.6030110418684416e-11, "logits/chosen": 14.465940475463867, "logits/rejected": 14.457223892211914, "logps/chosen": -4.70231819152832, "logps/rejected": -4.751959800720215, "loss": 4.1941, "rewards/accuracies": 0.5, "rewards/chosen": -47.02318572998047, "rewards/margins": 0.4964160919189453, "rewards/rejected": -47.51959991455078, "step": 7320 }, { "epoch": 0.996868191721133, "grad_norm": 38.50869433546324, "learning_rate": 2.3906146871421895e-11, "logits/chosen": 13.876495361328125, "logits/rejected": 15.562443733215332, "logps/chosen": -4.427539348602295, "logps/rejected": -5.057878494262695, "loss": 3.7805, "rewards/accuracies": 1.0, "rewards/chosen": -44.275394439697266, "rewards/margins": 6.30339241027832, "rewards/rejected": -50.57878494262695, "step": 7321 }, { "epoch": 0.9970043572984749, "grad_norm": 43.14415157419278, "learning_rate": 2.1872561228830634e-11, "logits/chosen": 14.887065887451172, "logits/rejected": 15.490257263183594, "logps/chosen": -4.415157318115234, "logps/rejected": -4.626655578613281, "loss": 3.9325, "rewards/accuracies": 0.5, "rewards/chosen": -44.151573181152344, "rewards/margins": 2.114980697631836, "rewards/rejected": -46.26655197143555, "step": 7322 }, { "epoch": 0.997140522875817, "grad_norm": 41.56225398514425, "learning_rate": 1.9929353950365323e-11, "logits/chosen": 14.805241584777832, "logits/rejected": 14.580211639404297, "logps/chosen": -5.0243048667907715, "logps/rejected": -4.828773021697998, "loss": 4.1864, "rewards/accuracies": 0.25, "rewards/chosen": -50.24304962158203, "rewards/margins": -1.955322265625, "rewards/rejected": -48.28772735595703, "step": 7323 }, { "epoch": 0.9972766884531591, "grad_norm": 41.9466411954515, "learning_rate": 1.807652547514138e-11, "logits/chosen": 14.84545612335205, "logits/rejected": 14.797086715698242, "logps/chosen": -4.758580207824707, "logps/rejected": -4.77530574798584, "loss": 3.6694, "rewards/accuracies": 0.75, "rewards/chosen": -47.58580017089844, "rewards/margins": 0.16725444793701172, "rewards/rejected": -47.753055572509766, "step": 7324 }, { "epoch": 0.9974128540305011, "grad_norm": 41.054184035182644, "learning_rate": 1.63140762218017e-11, "logits/chosen": 14.539942741394043, "logits/rejected": 15.82823371887207, "logps/chosen": -4.515037536621094, "logps/rejected": -4.93485164642334, "loss": 3.5764, "rewards/accuracies": 0.75, "rewards/chosen": -45.15037536621094, "rewards/margins": 4.198139190673828, "rewards/rejected": -49.34851837158203, "step": 7325 }, { "epoch": 0.9975490196078431, "grad_norm": 59.16774570822346, "learning_rate": 1.4642006588605483e-11, "logits/chosen": 14.552373886108398, "logits/rejected": 15.063289642333984, "logps/chosen": -4.651102066040039, "logps/rejected": -5.106640815734863, "loss": 4.2093, "rewards/accuracies": 0.75, "rewards/chosen": -46.511024475097656, "rewards/margins": 4.555388450622559, "rewards/rejected": -51.066410064697266, "step": 7326 }, { "epoch": 0.9976851851851852, "grad_norm": 90.05941383448534, "learning_rate": 1.3060316953339424e-11, "logits/chosen": 13.650270462036133, "logits/rejected": 14.647073745727539, "logps/chosen": -4.335283279418945, "logps/rejected": -4.79555606842041, "loss": 4.7308, "rewards/accuracies": 1.0, "rewards/chosen": -43.35283279418945, "rewards/margins": 4.602729797363281, "rewards/rejected": -47.955562591552734, "step": 7327 }, { "epoch": 0.9978213507625272, "grad_norm": 42.36702945876343, "learning_rate": 1.1569007673450925e-11, "logits/chosen": 15.037437438964844, "logits/rejected": 15.761396408081055, "logps/chosen": -4.725219249725342, "logps/rejected": -5.20014762878418, "loss": 3.2813, "rewards/accuracies": 0.75, "rewards/chosen": -47.252193450927734, "rewards/margins": 4.749283790588379, "rewards/rejected": -52.00148010253906, "step": 7328 }, { "epoch": 0.9979575163398693, "grad_norm": 44.701552807132785, "learning_rate": 1.0168079085870474e-11, "logits/chosen": 14.072952270507812, "logits/rejected": 14.022027015686035, "logps/chosen": -4.384071350097656, "logps/rejected": -4.618271827697754, "loss": 4.1325, "rewards/accuracies": 0.75, "rewards/chosen": -43.84071350097656, "rewards/margins": 2.342000961303711, "rewards/rejected": -46.182716369628906, "step": 7329 }, { "epoch": 0.9980936819172114, "grad_norm": 39.350103123397794, "learning_rate": 8.857531507144856e-12, "logits/chosen": 13.639074325561523, "logits/rejected": 14.862932205200195, "logps/chosen": -4.468686580657959, "logps/rejected": -4.81068229675293, "loss": 4.4322, "rewards/accuracies": 0.75, "rewards/chosen": -44.686866760253906, "rewards/margins": 3.419961929321289, "rewards/rejected": -48.10682678222656, "step": 7330 }, { "epoch": 0.9982298474945533, "grad_norm": 43.40491946211946, "learning_rate": 7.637365233437165e-12, "logits/chosen": 14.92971420288086, "logits/rejected": 14.79781723022461, "logps/chosen": -4.5181169509887695, "logps/rejected": -4.511020183563232, "loss": 3.7032, "rewards/accuracies": 0.5, "rewards/chosen": -45.18117141723633, "rewards/margins": -0.07097339630126953, "rewards/rejected": -45.110198974609375, "step": 7331 }, { "epoch": 0.9983660130718954, "grad_norm": 40.69813779077641, "learning_rate": 6.507580540437985e-12, "logits/chosen": 15.052729606628418, "logits/rejected": 15.889217376708984, "logps/chosen": -4.709288597106934, "logps/rejected": -5.146324157714844, "loss": 4.3934, "rewards/accuracies": 1.0, "rewards/chosen": -47.09288787841797, "rewards/margins": 4.3703508377075195, "rewards/rejected": -51.46324157714844, "step": 7332 }, { "epoch": 0.9985021786492375, "grad_norm": 42.14395868573152, "learning_rate": 5.4681776834097915e-12, "logits/chosen": 14.304018020629883, "logits/rejected": 15.857608795166016, "logps/chosen": -4.267925262451172, "logps/rejected": -4.661862373352051, "loss": 3.7223, "rewards/accuracies": 0.75, "rewards/chosen": -42.67924880981445, "rewards/margins": 3.9393739700317383, "rewards/rejected": -46.61862564086914, "step": 7333 }, { "epoch": 0.9986383442265795, "grad_norm": 44.03936918497589, "learning_rate": 4.519156897275777e-12, "logits/chosen": 14.85158634185791, "logits/rejected": 14.09189224243164, "logps/chosen": -4.499422073364258, "logps/rejected": -4.331978797912598, "loss": 3.5964, "rewards/accuracies": 0.5, "rewards/chosen": -44.994224548339844, "rewards/margins": -1.6744375228881836, "rewards/rejected": -43.319786071777344, "step": 7334 }, { "epoch": 0.9987745098039216, "grad_norm": 48.88516749188685, "learning_rate": 3.660518396397805e-12, "logits/chosen": 14.08591365814209, "logits/rejected": 13.823138236999512, "logps/chosen": -4.424304008483887, "logps/rejected": -4.385266304016113, "loss": 3.4397, "rewards/accuracies": 0.5, "rewards/chosen": -44.243038177490234, "rewards/margins": -0.39037418365478516, "rewards/rejected": -43.8526611328125, "step": 7335 }, { "epoch": 0.9989106753812637, "grad_norm": 44.911463728488, "learning_rate": 2.8922623748428577e-12, "logits/chosen": 14.23560905456543, "logits/rejected": 14.37529182434082, "logps/chosen": -4.514297962188721, "logps/rejected": -4.41602897644043, "loss": 4.486, "rewards/accuracies": 0.5, "rewards/chosen": -45.14297866821289, "rewards/margins": -0.9826869964599609, "rewards/rejected": -44.1602897644043, "step": 7336 }, { "epoch": 0.9990468409586056, "grad_norm": 46.528781377619204, "learning_rate": 2.2143890062054084e-12, "logits/chosen": 14.737150192260742, "logits/rejected": 15.262754440307617, "logps/chosen": -4.762523651123047, "logps/rejected": -4.917323112487793, "loss": 4.5577, "rewards/accuracies": 0.5, "rewards/chosen": -47.62523651123047, "rewards/margins": 1.5479917526245117, "rewards/rejected": -49.17322540283203, "step": 7337 }, { "epoch": 0.9991830065359477, "grad_norm": 40.2456534821328, "learning_rate": 1.6268984436074163e-12, "logits/chosen": 14.067743301391602, "logits/rejected": 14.096807479858398, "logps/chosen": -4.435381889343262, "logps/rejected": -4.619939804077148, "loss": 3.6517, "rewards/accuracies": 0.75, "rewards/chosen": -44.35381317138672, "rewards/margins": 1.845585823059082, "rewards/rejected": -46.19940185546875, "step": 7338 }, { "epoch": 0.9993191721132898, "grad_norm": 40.45144877494219, "learning_rate": 1.1297908198315553e-12, "logits/chosen": 14.036136627197266, "logits/rejected": 15.323738098144531, "logps/chosen": -4.400915622711182, "logps/rejected": -4.769132614135742, "loss": 3.5095, "rewards/accuracies": 0.75, "rewards/chosen": -44.0091552734375, "rewards/margins": 3.682170867919922, "rewards/rejected": -47.69132995605469, "step": 7339 }, { "epoch": 0.9994553376906318, "grad_norm": 45.338208595326876, "learning_rate": 7.230662472323957e-13, "logits/chosen": 14.825456619262695, "logits/rejected": 14.97541332244873, "logps/chosen": -4.738114356994629, "logps/rejected": -5.062869548797607, "loss": 4.2057, "rewards/accuracies": 0.75, "rewards/chosen": -47.38114547729492, "rewards/margins": 3.247547149658203, "rewards/rejected": -50.628692626953125, "step": 7340 }, { "epoch": 0.9995915032679739, "grad_norm": 42.001549036911605, "learning_rate": 4.067248176919946e-13, "logits/chosen": 14.57059097290039, "logits/rejected": 15.261387825012207, "logps/chosen": -4.613569259643555, "logps/rejected": -5.027078628540039, "loss": 4.3531, "rewards/accuracies": 1.0, "rewards/chosen": -46.13569641113281, "rewards/margins": 4.1350908279418945, "rewards/rejected": -50.27078628540039, "step": 7341 }, { "epoch": 0.9997276688453159, "grad_norm": 41.707215787998344, "learning_rate": 1.8076660266430622e-13, "logits/chosen": 14.690299987792969, "logits/rejected": 14.7698974609375, "logps/chosen": -4.666154861450195, "logps/rejected": -4.711336612701416, "loss": 4.19, "rewards/accuracies": 0.75, "rewards/chosen": -46.66154479980469, "rewards/margins": 0.45181941986083984, "rewards/rejected": -47.113365173339844, "step": 7342 }, { "epoch": 0.9998638344226579, "grad_norm": 44.52168111573998, "learning_rate": 4.519165321958951e-14, "logits/chosen": 13.932487487792969, "logits/rejected": 14.464506149291992, "logps/chosen": -4.819092750549316, "logps/rejected": -4.991519927978516, "loss": 4.3909, "rewards/accuracies": 1.0, "rewards/chosen": -48.1909294128418, "rewards/margins": 1.7242708206176758, "rewards/rejected": -49.915199279785156, "step": 7343 }, { "epoch": 1.0, "grad_norm": 44.869754461249585, "learning_rate": 0.0, "logits/chosen": 13.534160614013672, "logits/rejected": 14.768027305603027, "logps/chosen": -4.339266300201416, "logps/rejected": -4.692265033721924, "loss": 3.9873, "rewards/accuracies": 1.0, "rewards/chosen": -43.392662048339844, "rewards/margins": 3.5299882888793945, "rewards/rejected": -46.92264938354492, "step": 7344 }, { "epoch": 1.0, "eval_logits/chosen": 14.445918083190918, "eval_logits/rejected": 14.679619789123535, "eval_logps/chosen": -4.605433940887451, "eval_logps/rejected": -4.770407676696777, "eval_loss": 3.9434103965759277, "eval_rewards/accuracies": 0.6472868323326111, "eval_rewards/chosen": -46.05434036254883, "eval_rewards/margins": 1.6497403383255005, "eval_rewards/rejected": -47.704078674316406, "eval_runtime": 461.5443, "eval_samples_per_second": 107.194, "eval_steps_per_second": 1.677, "step": 7344 }, { "epoch": 1.0, "step": 7344, "total_flos": 4181538261958656.0, "train_loss": 4.104885466542899, "train_runtime": 31282.7939, "train_samples_per_second": 30.049, "train_steps_per_second": 0.235 } ], "logging_steps": 1.0, "max_steps": 7344, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4181538261958656.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }