{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 800000000, "global_step": 835, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.9375, "learning_rate": 5.952380952380953e-08, "logits/chosen": -3.4845848083496094, "logits/rejected": -3.85036301612854, "logps/chosen": -306.50885009765625, "logps/rejected": -197.74395751953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/diff": -0.625, "rewards/diff_abs": 0.625, "rewards/rejected": 0.0, "rewards/student_margin": 0.0, "rewards/teacher_margin": 0.625, "step": 1 }, { "epoch": 0.01, "grad_norm": 8.8125, "learning_rate": 5.952380952380953e-07, "logits/chosen": -3.454127788543701, "logits/rejected": -3.5237815380096436, "logps/chosen": -201.42767333984375, "logps/rejected": -183.9016571044922, "loss": 0.7039, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.005284797865897417, "rewards/diff": -0.6895497441291809, "rewards/diff_abs": 0.7025125026702881, "rewards/rejected": 0.01312162820249796, "rewards/student_margin": -0.007836826145648956, "rewards/teacher_margin": 0.6817129254341125, "step": 10 }, { "epoch": 0.02, "grad_norm": 8.5, "learning_rate": 1.1904761904761906e-06, "logits/chosen": -3.5940723419189453, "logits/rejected": -3.5770275592803955, "logps/chosen": -218.02499389648438, "logps/rejected": -209.6902313232422, "loss": 0.7101, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": -0.025884132832288742, "rewards/diff": -0.9974073171615601, "rewards/diff_abs": 1.025899887084961, "rewards/rejected": -0.01753927394747734, "rewards/student_margin": -0.00834486074745655, "rewards/teacher_margin": 0.9890626072883606, "step": 20 }, { "epoch": 0.04, "grad_norm": 7.9375, "learning_rate": 1.7857142857142859e-06, "logits/chosen": -3.4890503883361816, "logits/rejected": -3.6032581329345703, "logps/chosen": -259.52838134765625, "logps/rejected": -200.54518127441406, "loss": 0.6961, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 0.11190159618854523, "rewards/diff": -0.870490550994873, "rewards/diff_abs": 0.910025417804718, "rewards/rejected": 0.07041291892528534, "rewards/student_margin": 0.04148866608738899, "rewards/teacher_margin": 0.911979079246521, "step": 30 }, { "epoch": 0.05, "grad_norm": 7.8125, "learning_rate": 2.380952380952381e-06, "logits/chosen": -3.4495646953582764, "logits/rejected": -3.5306625366210938, "logps/chosen": -296.196044921875, "logps/rejected": -205.72494506835938, "loss": 0.6763, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 0.27481913566589355, "rewards/diff": -0.928841233253479, "rewards/diff_abs": 0.9340232610702515, "rewards/rejected": 0.1948060244321823, "rewards/student_margin": 0.08001308888196945, "rewards/teacher_margin": 1.0088541507720947, "step": 40 }, { "epoch": 0.06, "grad_norm": 7.46875, "learning_rate": 2.9761904761904763e-06, "logits/chosen": -3.6242897510528564, "logits/rejected": -3.6223366260528564, "logps/chosen": -232.1892852783203, "logps/rejected": -218.8447265625, "loss": 0.6524, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.5324563980102539, "rewards/diff": -1.090423583984375, "rewards/diff_abs": 1.1463050842285156, "rewards/rejected": 0.3765257000923157, "rewards/student_margin": 0.15593069791793823, "rewards/teacher_margin": 1.2463542222976685, "step": 50 }, { "epoch": 0.07, "grad_norm": 7.5, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -3.52375864982605, "logits/rejected": -3.5178802013397217, "logps/chosen": -278.49578857421875, "logps/rejected": -227.9744110107422, "loss": 0.6501, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 0.9247525930404663, "rewards/diff": -0.6484954953193665, "rewards/diff_abs": 0.8814946413040161, "rewards/rejected": 0.6779355406761169, "rewards/student_margin": 0.24681702256202698, "rewards/teacher_margin": 0.895312488079071, "step": 60 }, { "epoch": 0.08, "grad_norm": 7.28125, "learning_rate": 4.166666666666667e-06, "logits/chosen": -3.57179594039917, "logits/rejected": -3.573483943939209, "logps/chosen": -299.92742919921875, "logps/rejected": -295.48846435546875, "loss": 0.6298, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.0992387533187866, "rewards/diff": -0.5216845273971558, "rewards/diff_abs": 0.7238657474517822, "rewards/rejected": 0.8657148480415344, "rewards/student_margin": 0.23352384567260742, "rewards/teacher_margin": 0.7552083730697632, "step": 70 }, { "epoch": 0.1, "grad_norm": 7.15625, "learning_rate": 4.761904761904762e-06, "logits/chosen": -3.3535995483398438, "logits/rejected": -3.4229187965393066, "logps/chosen": -306.759521484375, "logps/rejected": -193.92160034179688, "loss": 0.6005, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 1.2742681503295898, "rewards/diff": -0.2789258360862732, "rewards/diff_abs": 0.8034403920173645, "rewards/rejected": 0.5938189625740051, "rewards/student_margin": 0.6804491281509399, "rewards/teacher_margin": 0.9593750238418579, "step": 80 }, { "epoch": 0.11, "grad_norm": 6.3125, "learning_rate": 4.9992125742993825e-06, "logits/chosen": -3.5169739723205566, "logits/rejected": -3.478895664215088, "logps/chosen": -305.4494323730469, "logps/rejected": -259.5570373535156, "loss": 0.5942, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.40164053440094, "rewards/diff": -0.47005853056907654, "rewards/diff_abs": 0.7208673357963562, "rewards/rejected": 1.1060739755630493, "rewards/student_margin": 0.29556649923324585, "rewards/teacher_margin": 0.765625, "step": 90 }, { "epoch": 0.12, "grad_norm": 7.0625, "learning_rate": 4.994402324561469e-06, "logits/chosen": -3.475271701812744, "logits/rejected": -3.4696757793426514, "logps/chosen": -290.073974609375, "logps/rejected": -212.7600860595703, "loss": 0.6009, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.396482229232788, "rewards/diff": -0.12815351784229279, "rewards/diff_abs": 0.5424238443374634, "rewards/rejected": 0.6892191171646118, "rewards/student_margin": 0.7072631120681763, "rewards/teacher_margin": 0.8354166746139526, "step": 100 }, { "epoch": 0.13, "grad_norm": 7.03125, "learning_rate": 4.985227689958313e-06, "logits/chosen": -3.4492225646972656, "logits/rejected": -3.490285873413086, "logps/chosen": -309.30743408203125, "logps/rejected": -202.38356018066406, "loss": 0.5763, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.2701516151428223, "rewards/diff": -0.4000614583492279, "rewards/diff_abs": 0.8188215494155884, "rewards/rejected": 0.8035463094711304, "rewards/student_margin": 0.46660518646240234, "rewards/teacher_margin": 0.8666666746139526, "step": 110 }, { "epoch": 0.14, "grad_norm": 6.625, "learning_rate": 4.97170472308737e-06, "logits/chosen": -3.537369966506958, "logits/rejected": -3.5341758728027344, "logps/chosen": -238.89035034179688, "logps/rejected": -219.8264617919922, "loss": 0.5923, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 1.1539257764816284, "rewards/diff": -0.407745361328125, "rewards/diff_abs": 0.8213578462600708, "rewards/rejected": 0.6814627051353455, "rewards/student_margin": 0.47246304154396057, "rewards/teacher_margin": 0.8802083134651184, "step": 120 }, { "epoch": 0.16, "grad_norm": 6.3125, "learning_rate": 4.953857084699501e-06, "logits/chosen": -3.3898227214813232, "logits/rejected": -3.445030689239502, "logps/chosen": -237.6462860107422, "logps/rejected": -189.9452667236328, "loss": 0.5986, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.2968804836273193, "rewards/diff": -0.1380801498889923, "rewards/diff_abs": 0.6938132643699646, "rewards/rejected": 0.6688148379325867, "rewards/student_margin": 0.6280657649040222, "rewards/teacher_margin": 0.7661458253860474, "step": 130 }, { "epoch": 0.17, "grad_norm": 6.9375, "learning_rate": 4.931716002300424e-06, "logits/chosen": -3.4307568073272705, "logits/rejected": -3.428516387939453, "logps/chosen": -303.8690490722656, "logps/rejected": -268.0577087402344, "loss": 0.5674, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 1.502533197402954, "rewards/diff": -0.11084864288568497, "rewards/diff_abs": 0.7653725743293762, "rewards/rejected": 0.7842152714729309, "rewards/student_margin": 0.7183180451393127, "rewards/teacher_margin": 0.8291667103767395, "step": 140 }, { "epoch": 0.18, "grad_norm": 5.90625, "learning_rate": 4.905320215512843e-06, "logits/chosen": -3.3582215309143066, "logits/rejected": -3.445798397064209, "logps/chosen": -272.33465576171875, "logps/rejected": -241.3258819580078, "loss": 0.5839, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.9148601293563843, "rewards/diff": -0.40072593092918396, "rewards/diff_abs": 0.7023404240608215, "rewards/rejected": 0.5593360662460327, "rewards/student_margin": 0.35552406311035156, "rewards/teacher_margin": 0.7562500238418579, "step": 150 }, { "epoch": 0.19, "grad_norm": 6.5625, "learning_rate": 4.874715908294827e-06, "logits/chosen": -3.4379913806915283, "logits/rejected": -3.411599636077881, "logps/chosen": -235.85824584960938, "logps/rejected": -200.01751708984375, "loss": 0.5673, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.91960209608078, "rewards/diff": -0.3414815068244934, "rewards/diff_abs": 0.8401057124137878, "rewards/rejected": 0.38087528944015503, "rewards/student_margin": 0.5387269258499146, "rewards/teacher_margin": 0.8802083730697632, "step": 160 }, { "epoch": 0.2, "grad_norm": 6.875, "learning_rate": 4.839956628133049e-06, "logits/chosen": -3.3944404125213623, "logits/rejected": -3.4562854766845703, "logps/chosen": -236.4658203125, "logps/rejected": -207.5730438232422, "loss": 0.5312, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.1229525804519653, "rewards/diff": -0.3498944342136383, "rewards/diff_abs": 0.8671269416809082, "rewards/rejected": 0.43118032813072205, "rewards/student_margin": 0.6917722821235657, "rewards/teacher_margin": 1.0416667461395264, "step": 170 }, { "epoch": 0.22, "grad_norm": 6.8125, "learning_rate": 4.801103192352272e-06, "logits/chosen": -3.5573208332061768, "logits/rejected": -3.619119167327881, "logps/chosen": -342.3301086425781, "logps/rejected": -242.18148803710938, "loss": 0.5428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7506067752838135, "rewards/diff": -0.05490243434906006, "rewards/diff_abs": 1.2669219970703125, "rewards/rejected": 0.852384090423584, "rewards/student_margin": 0.8982225656509399, "rewards/teacher_margin": 0.9531251192092896, "step": 180 }, { "epoch": 0.23, "grad_norm": 6.75, "learning_rate": 4.758223581705006e-06, "logits/chosen": -3.493630886077881, "logits/rejected": -3.531799793243408, "logps/chosen": -242.2713623046875, "logps/rejected": -195.7411346435547, "loss": 0.5605, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 1.3145856857299805, "rewards/diff": -0.18605072796344757, "rewards/diff_abs": 0.9721413850784302, "rewards/rejected": 0.7516780495643616, "rewards/student_margin": 0.5629075765609741, "rewards/teacher_margin": 0.7489583492279053, "step": 190 }, { "epoch": 0.24, "grad_norm": 7.375, "learning_rate": 4.711392821427515e-06, "logits/chosen": -3.5924346446990967, "logits/rejected": -3.6110892295837402, "logps/chosen": -231.8784942626953, "logps/rejected": -160.18458557128906, "loss": 0.5528, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 1.1364099979400635, "rewards/diff": -0.20100148022174835, "rewards/diff_abs": 0.8547786474227905, "rewards/rejected": 0.13949476182460785, "rewards/student_margin": 0.9969152212142944, "rewards/teacher_margin": 1.1979167461395264, "step": 200 }, { "epoch": 0.25, "grad_norm": 6.09375, "learning_rate": 4.6606928499702905e-06, "logits/chosen": -3.583310604095459, "logits/rejected": -3.646390914916992, "logps/chosen": -236.2954864501953, "logps/rejected": -226.30050659179688, "loss": 0.5455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1413193941116333, "rewards/diff": -0.6167432069778442, "rewards/diff_abs": 0.9270604252815247, "rewards/rejected": 0.8591042757034302, "rewards/student_margin": 0.2822151482105255, "rewards/teacher_margin": 0.8989583849906921, "step": 210 }, { "epoch": 0.26, "grad_norm": 6.0625, "learning_rate": 4.606212375632682e-06, "logits/chosen": -3.3313984870910645, "logits/rejected": -3.4001998901367188, "logps/chosen": -241.50430297851562, "logps/rejected": -185.7382049560547, "loss": 0.5455, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 1.245307207107544, "rewards/diff": -0.19214758276939392, "rewards/diff_abs": 0.9847003817558289, "rewards/rejected": 0.4134964942932129, "rewards/student_margin": 0.8318107724189758, "rewards/teacher_margin": 1.023958444595337, "step": 220 }, { "epoch": 0.28, "grad_norm": 7.59375, "learning_rate": 4.5480467213524935e-06, "logits/chosen": -3.4316277503967285, "logits/rejected": -3.4833552837371826, "logps/chosen": -258.82781982421875, "logps/rejected": -248.5332489013672, "loss": 0.5478, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.38356614112854, "rewards/diff": -0.1715225726366043, "rewards/diff_abs": 0.7107259631156921, "rewards/rejected": 0.7259219288825989, "rewards/student_margin": 0.6576440930366516, "rewards/teacher_margin": 0.8291667699813843, "step": 230 }, { "epoch": 0.29, "grad_norm": 6.40625, "learning_rate": 4.4862976579221605e-06, "logits/chosen": -3.3932158946990967, "logits/rejected": -3.4250049591064453, "logps/chosen": -303.9991149902344, "logps/rejected": -221.3593292236328, "loss": 0.5413, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.6610606908798218, "rewards/diff": -0.16523823142051697, "rewards/diff_abs": 0.8983039855957031, "rewards/rejected": 0.6669239401817322, "rewards/student_margin": 0.9941369295120239, "rewards/teacher_margin": 1.1593749523162842, "step": 240 }, { "epoch": 0.3, "grad_norm": 6.125, "learning_rate": 4.421073225923276e-06, "logits/chosen": -3.4080328941345215, "logits/rejected": -3.545672655105591, "logps/chosen": -302.5151062011719, "logps/rejected": -223.77474975585938, "loss": 0.5379, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.6596599817276, "rewards/diff": -0.011551931500434875, "rewards/diff_abs": 0.9808802604675293, "rewards/rejected": 0.7274617552757263, "rewards/student_margin": 0.9321980476379395, "rewards/teacher_margin": 0.9437500238418579, "step": 250 }, { "epoch": 0.31, "grad_norm": 7.28125, "learning_rate": 4.3524875466910634e-06, "logits/chosen": -3.377882719039917, "logits/rejected": -3.380521059036255, "logps/chosen": -247.70703125, "logps/rejected": -240.69363403320312, "loss": 0.5479, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.9718164205551147, "rewards/diff": -0.17575177550315857, "rewards/diff_abs": 0.8040043711662292, "rewards/rejected": 0.5954850316047668, "rewards/student_margin": 0.3763315677642822, "rewards/teacher_margin": 0.5520833730697632, "step": 260 }, { "epoch": 0.32, "grad_norm": 6.4375, "learning_rate": 4.280660622639513e-06, "logits/chosen": -3.5067367553710938, "logits/rejected": -3.5205013751983643, "logps/chosen": -237.2410125732422, "logps/rejected": -190.3438720703125, "loss": 0.5352, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.3214976787567139, "rewards/diff": -0.030149614438414574, "rewards/diff_abs": 0.9027697443962097, "rewards/rejected": 0.4870639443397522, "rewards/student_margin": 0.8344337344169617, "rewards/teacher_margin": 0.8645833730697632, "step": 270 }, { "epoch": 0.34, "grad_norm": 6.0625, "learning_rate": 4.205718127296574e-06, "logits/chosen": -3.5430946350097656, "logits/rejected": -3.5217278003692627, "logps/chosen": -239.96188354492188, "logps/rejected": -210.2650604248047, "loss": 0.5277, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.366132140159607, "rewards/diff": -0.2834340035915375, "rewards/diff_abs": 1.1607670783996582, "rewards/rejected": 0.7917537689208984, "rewards/student_margin": 0.5743785500526428, "rewards/teacher_margin": 0.8578125238418579, "step": 280 }, { "epoch": 0.35, "grad_norm": 6.90625, "learning_rate": 4.127791185416747e-06, "logits/chosen": -3.410996675491333, "logits/rejected": -3.428239345550537, "logps/chosen": -218.684326171875, "logps/rejected": -173.12939453125, "loss": 0.5492, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 1.1556932926177979, "rewards/diff": -0.2741561830043793, "rewards/diff_abs": 0.9800466299057007, "rewards/rejected": 0.501724362373352, "rewards/student_margin": 0.6539688110351562, "rewards/teacher_margin": 0.9281250834465027, "step": 290 }, { "epoch": 0.36, "grad_norm": 5.6875, "learning_rate": 4.047016143555834e-06, "logits/chosen": -3.4146499633789062, "logits/rejected": -3.4334769248962402, "logps/chosen": -246.2406005859375, "logps/rejected": -207.5984344482422, "loss": 0.5396, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.4659597873687744, "rewards/diff": -0.0349099263548851, "rewards/diff_abs": 0.7459529042243958, "rewards/rejected": 0.5878490209579468, "rewards/student_margin": 0.8781110048294067, "rewards/teacher_margin": 0.91302090883255, "step": 300 }, { "epoch": 0.37, "grad_norm": 6.75, "learning_rate": 3.9635343315092374e-06, "logits/chosen": -3.3409626483917236, "logits/rejected": -3.4818501472473145, "logps/chosen": -242.3018341064453, "logps/rejected": -209.86740112304688, "loss": 0.5499, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.246741533279419, "rewards/diff": -0.13802729547023773, "rewards/diff_abs": 0.9692344665527344, "rewards/rejected": 0.40664371848106384, "rewards/student_margin": 0.8400977253913879, "rewards/teacher_margin": 0.9781249165534973, "step": 310 }, { "epoch": 0.38, "grad_norm": 6.78125, "learning_rate": 3.877491815031241e-06, "logits/chosen": -3.50838041305542, "logits/rejected": -3.6322741508483887, "logps/chosen": -257.099609375, "logps/rejected": -179.91046142578125, "loss": 0.526, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 1.3539568185806274, "rewards/diff": 0.1423492729663849, "rewards/diff_abs": 0.7491869926452637, "rewards/rejected": 0.354315847158432, "rewards/student_margin": 0.999640941619873, "rewards/teacher_margin": 0.8572916984558105, "step": 320 }, { "epoch": 0.4, "grad_norm": 6.4375, "learning_rate": 3.789039140267903e-06, "logits/chosen": -3.6152091026306152, "logits/rejected": -3.6335906982421875, "logps/chosen": -238.04483032226562, "logps/rejected": -203.68545532226562, "loss": 0.5211, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 1.1535335779190063, "rewards/diff": -0.22724106907844543, "rewards/diff_abs": 0.8418729901313782, "rewards/rejected": 0.3599412739276886, "rewards/student_margin": 0.7935922741889954, "rewards/teacher_margin": 1.0208333730697632, "step": 330 }, { "epoch": 0.41, "grad_norm": 6.25, "learning_rate": 3.6983310703507475e-06, "logits/chosen": -3.474027633666992, "logits/rejected": -3.618129253387451, "logps/chosen": -314.650390625, "logps/rejected": -292.22796630859375, "loss": 0.5083, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.6438385248184204, "rewards/diff": -0.008717024698853493, "rewards/diff_abs": 0.9126062393188477, "rewards/rejected": 0.937972366809845, "rewards/student_margin": 0.7058663368225098, "rewards/teacher_margin": 0.7145833969116211, "step": 340 }, { "epoch": 0.42, "grad_norm": 6.84375, "learning_rate": 3.6055263146121062e-06, "logits/chosen": -3.4695258140563965, "logits/rejected": -3.544586658477783, "logps/chosen": -241.87686157226562, "logps/rejected": -190.71157836914062, "loss": 0.5241, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 1.3957545757293701, "rewards/diff": -0.020256221294403076, "rewards/diff_abs": 1.0595004558563232, "rewards/rejected": 0.6003857851028442, "rewards/student_margin": 0.7953688502311707, "rewards/teacher_margin": 0.815625011920929, "step": 350 }, { "epoch": 0.43, "grad_norm": 6.9375, "learning_rate": 3.5107872508959144e-06, "logits/chosen": -3.5332858562469482, "logits/rejected": -3.658419370651245, "logps/chosen": -301.9664001464844, "logps/rejected": -229.6895294189453, "loss": 0.5268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3080447912216187, "rewards/diff": 0.03206339478492737, "rewards/diff_abs": 1.0055023431777954, "rewards/rejected": 0.4827522337436676, "rewards/student_margin": 0.8252925872802734, "rewards/teacher_margin": 0.7932292222976685, "step": 360 }, { "epoch": 0.44, "grad_norm": 6.5625, "learning_rate": 3.414279641449809e-06, "logits/chosen": -3.4194533824920654, "logits/rejected": -3.459688901901245, "logps/chosen": -293.80865478515625, "logps/rejected": -236.77560424804688, "loss": 0.512, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.4415782690048218, "rewards/diff": -0.22951290011405945, "rewards/diff_abs": 0.9373496174812317, "rewards/rejected": 0.7585911750793457, "rewards/student_margin": 0.6829870939254761, "rewards/teacher_margin": 0.9125000238418579, "step": 370 }, { "epoch": 0.46, "grad_norm": 6.125, "learning_rate": 3.3161723428956356e-06, "logits/chosen": -3.329397201538086, "logits/rejected": -3.4820456504821777, "logps/chosen": -303.4757385253906, "logps/rejected": -242.2552032470703, "loss": 0.5127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.638362169265747, "rewards/diff": -0.17444480955600739, "rewards/diff_abs": 0.9962782859802246, "rewards/rejected": 0.7367652654647827, "rewards/student_margin": 0.9015968441963196, "rewards/teacher_margin": 1.0760416984558105, "step": 380 }, { "epoch": 0.47, "grad_norm": 6.21875, "learning_rate": 3.216637010785813e-06, "logits/chosen": -3.547212600708008, "logits/rejected": -3.542712688446045, "logps/chosen": -321.4063720703125, "logps/rejected": -284.3674621582031, "loss": 0.5144, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.7464603185653687, "rewards/diff": 0.06983740627765656, "rewards/diff_abs": 0.938123881816864, "rewards/rejected": 0.7797478437423706, "rewards/student_margin": 0.9667123556137085, "rewards/teacher_margin": 0.8968750238418579, "step": 390 }, { "epoch": 0.48, "grad_norm": 7.40625, "learning_rate": 3.115847799262494e-06, "logits/chosen": -3.4556503295898438, "logits/rejected": -3.5828518867492676, "logps/chosen": -256.46868896484375, "logps/rejected": -220.39010620117188, "loss": 0.5092, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 1.3820217847824097, "rewards/diff": 0.07980125397443771, "rewards/diff_abs": 0.8547611236572266, "rewards/rejected": 0.42253294587135315, "rewards/student_margin": 0.9594887495040894, "rewards/teacher_margin": 0.879687488079071, "step": 400 }, { "epoch": 0.49, "grad_norm": 6.03125, "learning_rate": 3.0139810563450094e-06, "logits/chosen": -3.592397689819336, "logits/rejected": -3.6688952445983887, "logps/chosen": -292.0425720214844, "logps/rejected": -234.98208618164062, "loss": 0.5161, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.5078362226486206, "rewards/diff": 0.07032543420791626, "rewards/diff_abs": 0.7481400370597839, "rewards/rejected": 0.6864690184593201, "rewards/student_margin": 0.821367084980011, "rewards/teacher_margin": 0.7510417103767395, "step": 410 }, { "epoch": 0.5, "grad_norm": 5.65625, "learning_rate": 2.911215015378752e-06, "logits/chosen": -3.552057981491089, "logits/rejected": -3.6183040142059326, "logps/chosen": -224.40554809570312, "logps/rejected": -186.09158325195312, "loss": 0.5053, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.171048641204834, "rewards/diff": 0.06066560745239258, "rewards/diff_abs": 1.058672547340393, "rewards/rejected": 0.3551747798919678, "rewards/student_margin": 0.8158739805221558, "rewards/teacher_margin": 0.7552083730697632, "step": 420 }, { "epoch": 0.51, "grad_norm": 6.1875, "learning_rate": 2.8077294831853547e-06, "logits/chosen": -3.4315121173858643, "logits/rejected": -3.4911365509033203, "logps/chosen": -285.84918212890625, "logps/rejected": -214.66140747070312, "loss": 0.5183, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.376267671585083, "rewards/diff": -0.3253711462020874, "rewards/diff_abs": 0.9324856996536255, "rewards/rejected": 0.7354929447174072, "rewards/student_margin": 0.6407747268676758, "rewards/teacher_margin": 0.9661458134651184, "step": 430 }, { "epoch": 0.53, "grad_norm": 6.40625, "learning_rate": 2.703705525459806e-06, "logits/chosen": -3.5061888694763184, "logits/rejected": -3.5336086750030518, "logps/chosen": -219.6090087890625, "logps/rejected": -203.97415161132812, "loss": 0.5235, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 1.4297001361846924, "rewards/diff": 0.08995727449655533, "rewards/diff_abs": 0.5762092471122742, "rewards/rejected": 0.5720344185829163, "rewards/student_margin": 0.8576656579971313, "rewards/teacher_margin": 0.767708420753479, "step": 440 }, { "epoch": 0.54, "grad_norm": 6.78125, "learning_rate": 2.599325149964946e-06, "logits/chosen": -3.4120395183563232, "logits/rejected": -3.5835208892822266, "logps/chosen": -336.3391418457031, "logps/rejected": -304.1842956542969, "loss": 0.5215, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.8122284412384033, "rewards/diff": -0.12476543337106705, "rewards/diff_abs": 0.7393094897270203, "rewards/rejected": 1.2078273296356201, "rewards/student_margin": 0.604401171207428, "rewards/teacher_margin": 0.7291667461395264, "step": 450 }, { "epoch": 0.55, "grad_norm": 6.53125, "learning_rate": 2.4947709880776607e-06, "logits/chosen": -3.4514999389648438, "logits/rejected": -3.581846237182617, "logps/chosen": -248.5549774169922, "logps/rejected": -214.66116333007812, "loss": 0.5098, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 1.2313129901885986, "rewards/diff": 0.08004424721002579, "rewards/diff_abs": 1.226075530052185, "rewards/rejected": 0.3127269446849823, "rewards/student_margin": 0.9185859560966492, "rewards/teacher_margin": 0.8385416865348816, "step": 460 }, { "epoch": 0.56, "grad_norm": 6.125, "learning_rate": 2.3902259752439462e-06, "logits/chosen": -3.492166042327881, "logits/rejected": -3.5663814544677734, "logps/chosen": -278.4722595214844, "logps/rejected": -242.76022338867188, "loss": 0.5051, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.3407200574874878, "rewards/diff": -0.000378596771042794, "rewards/diff_abs": 1.141404390335083, "rewards/rejected": 0.5020361542701721, "rewards/student_margin": 0.8386839628219604, "rewards/teacher_margin": 0.839062511920929, "step": 470 }, { "epoch": 0.57, "grad_norm": 6.25, "learning_rate": 2.2858730309019594e-06, "logits/chosen": -3.388932704925537, "logits/rejected": -3.441415309906006, "logps/chosen": -331.4084777832031, "logps/rejected": -241.6312713623047, "loss": 0.5096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8032405376434326, "rewards/diff": 0.10180602222681046, "rewards/diff_abs": 1.0163486003875732, "rewards/rejected": 0.8545595407485962, "rewards/student_margin": 0.9486810564994812, "rewards/teacher_margin": 0.846875011920929, "step": 480 }, { "epoch": 0.59, "grad_norm": 7.28125, "learning_rate": 2.181894738433076e-06, "logits/chosen": -3.532305955886841, "logits/rejected": -3.5801339149475098, "logps/chosen": -246.8173065185547, "logps/rejected": -220.6660614013672, "loss": 0.5419, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.4681113958358765, "rewards/diff": -0.13037186861038208, "rewards/diff_abs": 0.9364659190177917, "rewards/rejected": 0.711243748664856, "rewards/student_margin": 0.7568677067756653, "rewards/teacher_margin": 0.8872395753860474, "step": 490 }, { "epoch": 0.6, "grad_norm": 6.3125, "learning_rate": 2.078473025700937e-06, "logits/chosen": -3.536620616912842, "logits/rejected": -3.610663652420044, "logps/chosen": -196.84896850585938, "logps/rejected": -168.04319763183594, "loss": 0.5438, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.8065615892410278, "rewards/diff": -0.20639605820178986, "rewards/diff_abs": 1.1502354145050049, "rewards/rejected": 0.2853534519672394, "rewards/student_margin": 0.5212081670761108, "rewards/teacher_margin": 0.7276042103767395, "step": 500 }, { "epoch": 0.61, "grad_norm": 6.625, "learning_rate": 1.975788846737431e-06, "logits/chosen": -3.4818530082702637, "logits/rejected": -3.5163490772247314, "logps/chosen": -223.56863403320312, "logps/rejected": -223.87515258789062, "loss": 0.5176, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.066042423248291, "rewards/diff": -0.1881529539823532, "rewards/diff_abs": 0.9577949643135071, "rewards/rejected": 0.4776328504085541, "rewards/student_margin": 0.5884095430374146, "rewards/teacher_margin": 0.7765625715255737, "step": 510 }, { "epoch": 0.62, "grad_norm": 6.03125, "learning_rate": 1.8740218651325714e-06, "logits/chosen": -3.465400218963623, "logits/rejected": -3.4614810943603516, "logps/chosen": -256.56890869140625, "logps/rejected": -236.2727813720703, "loss": 0.5154, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.6230605840682983, "rewards/diff": 0.13750340044498444, "rewards/diff_abs": 0.9272276163101196, "rewards/rejected": 0.6319113969802856, "rewards/student_margin": 0.9911492466926575, "rewards/teacher_margin": 0.853645920753479, "step": 520 }, { "epoch": 0.63, "grad_norm": 6.9375, "learning_rate": 1.7733501396822178e-06, "logits/chosen": -3.588365077972412, "logits/rejected": -3.5591952800750732, "logps/chosen": -199.75267028808594, "logps/rejected": -181.21066284179688, "loss": 0.5305, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.041146993637085, "rewards/diff": -0.35296258330345154, "rewards/diff_abs": 1.0002224445343018, "rewards/rejected": 0.4019221365451813, "rewards/student_margin": 0.6392248868942261, "rewards/teacher_margin": 0.9921875, "step": 530 }, { "epoch": 0.65, "grad_norm": 6.375, "learning_rate": 1.6739498128436563e-06, "logits/chosen": -3.5126869678497314, "logits/rejected": -3.5697379112243652, "logps/chosen": -275.712158203125, "logps/rejected": -249.92800903320312, "loss": 0.5093, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.5757232904434204, "rewards/diff": 0.19753125309944153, "rewards/diff_abs": 1.0118718147277832, "rewards/rejected": 0.4016294479370117, "rewards/student_margin": 1.1740937232971191, "rewards/teacher_margin": 0.9765625, "step": 540 }, { "epoch": 0.66, "grad_norm": 6.0, "learning_rate": 1.5759948025441535e-06, "logits/chosen": -3.370077610015869, "logits/rejected": -3.4373347759246826, "logps/chosen": -266.87689208984375, "logps/rejected": -229.03158569335938, "loss": 0.5216, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.2909305095672607, "rewards/diff": -0.09488488733768463, "rewards/diff_abs": 1.032594084739685, "rewards/rejected": 0.4243570864200592, "rewards/student_margin": 0.8665734529495239, "rewards/teacher_margin": 0.9614583849906921, "step": 550 }, { "epoch": 0.67, "grad_norm": 5.75, "learning_rate": 1.479656497881698e-06, "logits/chosen": -3.55267071723938, "logits/rejected": -3.6114087104797363, "logps/chosen": -230.6641082763672, "logps/rejected": -188.7877655029297, "loss": 0.4984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1030203104019165, "rewards/diff": -0.43049484491348267, "rewards/diff_abs": 1.0176421403884888, "rewards/rejected": 0.7288275957107544, "rewards/student_margin": 0.37419265508651733, "rewards/teacher_margin": 0.8046875, "step": 560 }, { "epoch": 0.68, "grad_norm": 6.28125, "learning_rate": 1.3851034592503648e-06, "logits/chosen": -3.3889052867889404, "logits/rejected": -3.5159294605255127, "logps/chosen": -272.55511474609375, "logps/rejected": -199.54537963867188, "loss": 0.5254, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.3238239288330078, "rewards/diff": 0.10217878967523575, "rewards/diff_abs": 0.8275870084762573, "rewards/rejected": 0.3945617377758026, "rewards/student_margin": 0.9292620420455933, "rewards/teacher_margin": 0.82708340883255, "step": 570 }, { "epoch": 0.69, "grad_norm": 6.78125, "learning_rate": 1.2925011234149859e-06, "logits/chosen": -3.478515148162842, "logits/rejected": -3.606118679046631, "logps/chosen": -204.07174682617188, "logps/rejected": -156.8729705810547, "loss": 0.5088, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.1958485841751099, "rewards/diff": 0.014614415355026722, "rewards/diff_abs": 1.1147606372833252, "rewards/rejected": 0.2895674705505371, "rewards/student_margin": 0.9062811136245728, "rewards/teacher_margin": 0.8916667699813843, "step": 580 }, { "epoch": 0.71, "grad_norm": 6.21875, "learning_rate": 1.2020115140511436e-06, "logits/chosen": -3.372546434402466, "logits/rejected": -3.3879222869873047, "logps/chosen": -285.7796936035156, "logps/rejected": -257.11016845703125, "loss": 0.5148, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.2352849245071411, "rewards/diff": -0.10239236056804657, "rewards/diff_abs": 0.7884107828140259, "rewards/rejected": 0.5449690818786621, "rewards/student_margin": 0.6903160214424133, "rewards/teacher_margin": 0.7927082777023315, "step": 590 }, { "epoch": 0.72, "grad_norm": 6.25, "learning_rate": 1.11379295825695e-06, "logits/chosen": -3.4046216011047363, "logits/rejected": -3.449857711791992, "logps/chosen": -274.1463317871094, "logps/rejected": -247.4075927734375, "loss": 0.5252, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 1.4221832752227783, "rewards/diff": -0.02768692374229431, "rewards/diff_abs": 0.8126093745231628, "rewards/rejected": 0.7457036972045898, "rewards/student_margin": 0.676479697227478, "rewards/teacher_margin": 0.7041667699813843, "step": 600 }, { "epoch": 0.73, "grad_norm": 6.625, "learning_rate": 1.0279998095326188e-06, "logits/chosen": -3.5202414989471436, "logits/rejected": -3.6290194988250732, "logps/chosen": -281.3245849609375, "logps/rejected": -231.3522186279297, "loss": 0.5181, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 1.2789822816848755, "rewards/diff": -0.12169651687145233, "rewards/diff_abs": 0.7308156490325928, "rewards/rejected": 0.6079703569412231, "rewards/student_margin": 0.6710118055343628, "rewards/teacher_margin": 0.7927082777023315, "step": 610 }, { "epoch": 0.74, "grad_norm": 6.15625, "learning_rate": 9.447821777125376e-07, "logits/chosen": -3.484200954437256, "logits/rejected": -3.4762959480285645, "logps/chosen": -234.85791015625, "logps/rejected": -222.82534790039062, "loss": 0.5155, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9543665647506714, "rewards/diff": -0.3350816071033478, "rewards/diff_abs": 1.0735111236572266, "rewards/rejected": 0.3592398166656494, "rewards/student_margin": 0.595126748085022, "rewards/teacher_margin": 0.9302083849906921, "step": 620 }, { "epoch": 0.75, "grad_norm": 7.375, "learning_rate": 8.642856663223537e-07, "logits/chosen": -3.6152985095977783, "logits/rejected": -3.6913936138153076, "logps/chosen": -278.0227355957031, "logps/rejected": -192.86460876464844, "loss": 0.5314, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.4040509462356567, "rewards/diff": -0.1374289095401764, "rewards/diff_abs": 0.7850462198257446, "rewards/rejected": 0.5149174928665161, "rewards/student_margin": 0.889133632183075, "rewards/teacher_margin": 1.0265624523162842, "step": 630 }, { "epoch": 0.77, "grad_norm": 5.71875, "learning_rate": 7.866511178206202e-07, "logits/chosen": -3.5455310344696045, "logits/rejected": -3.4960360527038574, "logps/chosen": -288.90374755859375, "logps/rejected": -259.2998046875, "loss": 0.5068, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 1.5496443510055542, "rewards/diff": -0.2831823229789734, "rewards/diff_abs": 1.0701242685317993, "rewards/rejected": 0.9109517335891724, "rewards/student_margin": 0.6386927366256714, "rewards/teacher_margin": 0.921875, "step": 640 }, { "epoch": 0.78, "grad_norm": 6.5625, "learning_rate": 7.120143671707535e-07, "logits/chosen": -3.624680995941162, "logits/rejected": -3.571241855621338, "logps/chosen": -238.3637237548828, "logps/rejected": -190.933349609375, "loss": 0.5136, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 1.272955060005188, "rewards/diff": -0.05736231803894043, "rewards/diff_abs": 0.828034520149231, "rewards/rejected": 0.6131298542022705, "rewards/student_margin": 0.6598252058029175, "rewards/teacher_margin": 0.7171874642372131, "step": 650 }, { "epoch": 0.79, "grad_norm": 5.9375, "learning_rate": 6.405060041744557e-07, "logits/chosen": -3.3889694213867188, "logits/rejected": -3.4272830486297607, "logps/chosen": -314.39337158203125, "logps/rejected": -279.32037353515625, "loss": 0.5242, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 1.6729543209075928, "rewards/diff": -0.12356214225292206, "rewards/diff_abs": 1.1017476320266724, "rewards/rejected": 0.8991208076477051, "rewards/student_margin": 0.7738337516784668, "rewards/teacher_margin": 0.8973957896232605, "step": 660 }, { "epoch": 0.8, "grad_norm": 7.0, "learning_rate": 5.72251144982447e-07, "logits/chosen": -3.5143237113952637, "logits/rejected": -3.4414215087890625, "logps/chosen": -255.2972869873047, "logps/rejected": -279.54595947265625, "loss": 0.4898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4696900844573975, "rewards/diff": 0.25774964690208435, "rewards/diff_abs": 1.2428072690963745, "rewards/rejected": 0.48954445123672485, "rewards/student_margin": 0.9801454544067383, "rewards/teacher_margin": 0.7223958969116211, "step": 670 }, { "epoch": 0.81, "grad_norm": 6.4375, "learning_rate": 5.07369213182295e-07, "logits/chosen": -3.437652111053467, "logits/rejected": -3.513336181640625, "logps/chosen": -256.0563659667969, "logps/rejected": -192.80081176757812, "loss": 0.5181, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.9916725158691406, "rewards/diff": -0.04445856809616089, "rewards/diff_abs": 1.0562130212783813, "rewards/rejected": 0.035089436918497086, "rewards/student_margin": 0.9565832018852234, "rewards/teacher_margin": 1.0010416507720947, "step": 680 }, { "epoch": 0.83, "grad_norm": 5.75, "learning_rate": 4.4597373084635717e-07, "logits/chosen": -3.4108052253723145, "logits/rejected": -3.403064727783203, "logps/chosen": -295.178955078125, "logps/rejected": -241.88851928710938, "loss": 0.5054, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 1.1539413928985596, "rewards/diff": -0.40385159850120544, "rewards/diff_abs": 1.0642129182815552, "rewards/rejected": 0.7130011320114136, "rewards/student_margin": 0.44094014167785645, "rewards/teacher_margin": 0.8447917103767395, "step": 690 }, { "epoch": 0.84, "grad_norm": 6.4375, "learning_rate": 3.88172119905435e-07, "logits/chosen": -3.562473773956299, "logits/rejected": -3.469137668609619, "logps/chosen": -264.6433410644531, "logps/rejected": -231.51602172851562, "loss": 0.5061, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 1.1783157587051392, "rewards/diff": -0.00968353170901537, "rewards/diff_abs": 0.7437331080436707, "rewards/rejected": 0.3218533396720886, "rewards/student_margin": 0.856462299823761, "rewards/teacher_margin": 0.86614590883255, "step": 700 }, { "epoch": 0.85, "grad_norm": 5.25, "learning_rate": 3.3406551419567584e-07, "logits/chosen": -3.484909772872925, "logits/rejected": -3.444756269454956, "logps/chosen": -285.2689208984375, "logps/rejected": -289.5472106933594, "loss": 0.4931, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.4713407754898071, "rewards/diff": 0.38614755868911743, "rewards/diff_abs": 1.1154059171676636, "rewards/rejected": 0.4737350344657898, "rewards/student_margin": 0.9976059198379517, "rewards/teacher_margin": 0.6114583611488342, "step": 710 }, { "epoch": 0.86, "grad_norm": 6.15625, "learning_rate": 2.837485825075728e-07, "logits/chosen": -3.577286958694458, "logits/rejected": -3.652881145477295, "logps/chosen": -301.7745361328125, "logps/rejected": -229.53173828125, "loss": 0.5191, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.0710813999176025, "rewards/diff": -0.43389981985092163, "rewards/diff_abs": 1.1230199337005615, "rewards/rejected": 0.5716478228569031, "rewards/student_margin": 0.4994335174560547, "rewards/teacher_margin": 0.9333332777023315, "step": 720 }, { "epoch": 0.87, "grad_norm": 5.90625, "learning_rate": 2.37309362946673e-07, "logits/chosen": -3.4588115215301514, "logits/rejected": -3.5218307971954346, "logps/chosen": -200.84402465820312, "logps/rejected": -166.37826538085938, "loss": 0.513, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8319117426872253, "rewards/diff": -0.07107441127300262, "rewards/diff_abs": 0.7676871418952942, "rewards/rejected": 0.08527780324220657, "rewards/student_margin": 0.7466338872909546, "rewards/teacher_margin": 0.8177083134651184, "step": 730 }, { "epoch": 0.89, "grad_norm": 5.375, "learning_rate": 1.948291088958032e-07, "logits/chosen": -3.379662275314331, "logits/rejected": -3.4146881103515625, "logps/chosen": -259.4773864746094, "logps/rejected": -210.60165405273438, "loss": 0.5071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9233649373054504, "rewards/diff": -0.2959494888782501, "rewards/diff_abs": 1.0808264017105103, "rewards/rejected": 0.5380643010139465, "rewards/student_margin": 0.38530051708221436, "rewards/teacher_margin": 0.6812499761581421, "step": 740 }, { "epoch": 0.9, "grad_norm": 7.15625, "learning_rate": 1.5638214684833923e-07, "logits/chosen": -3.3812708854675293, "logits/rejected": -3.489490032196045, "logps/chosen": -282.34906005859375, "logps/rejected": -206.3470458984375, "loss": 0.5175, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.5291283130645752, "rewards/diff": -0.04967302083969116, "rewards/diff_abs": 1.0276174545288086, "rewards/rejected": 0.6121346354484558, "rewards/student_margin": 0.9169937372207642, "rewards/teacher_margin": 0.9666666984558105, "step": 750 }, { "epoch": 0.91, "grad_norm": 7.0625, "learning_rate": 1.220357463612501e-07, "logits/chosen": -3.5278987884521484, "logits/rejected": -3.4870636463165283, "logps/chosen": -262.7372131347656, "logps/rejected": -204.89157104492188, "loss": 0.5368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4189398288726807, "rewards/diff": 0.041193410754203796, "rewards/diff_abs": 0.6765682101249695, "rewards/rejected": 0.6954547166824341, "rewards/student_margin": 0.7234851121902466, "rewards/teacher_margin": 0.6822917461395264, "step": 760 }, { "epoch": 0.92, "grad_norm": 6.34375, "learning_rate": 9.185000235546443e-08, "logits/chosen": -3.531663417816162, "logits/rejected": -3.5207691192626953, "logps/chosen": -221.78579711914062, "logps/rejected": -199.05947875976562, "loss": 0.5111, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 1.0741617679595947, "rewards/diff": -0.33916252851486206, "rewards/diff_abs": 0.7832191586494446, "rewards/rejected": 0.7206159830093384, "rewards/student_margin": 0.35354581475257874, "rewards/teacher_margin": 0.6927083730697632, "step": 770 }, { "epoch": 0.93, "grad_norm": 6.78125, "learning_rate": 6.587772996949876e-08, "logits/chosen": -3.4602973461151123, "logits/rejected": -3.5840487480163574, "logps/chosen": -273.8219299316406, "logps/rejected": -187.69760131835938, "loss": 0.5151, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.2822003364562988, "rewards/diff": 0.012847900390625, "rewards/diff_abs": 0.7760865688323975, "rewards/rejected": 0.34331077337265015, "rewards/student_margin": 0.9388895034790039, "rewards/teacher_margin": 0.9260417819023132, "step": 780 }, { "epoch": 0.95, "grad_norm": 6.5625, "learning_rate": 4.416437215030628e-08, "logits/chosen": -3.357858657836914, "logits/rejected": -3.428370237350464, "logps/chosen": -231.9785614013672, "logps/rejected": -208.6261749267578, "loss": 0.5225, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 1.1795800924301147, "rewards/diff": -0.32492926716804504, "rewards/diff_abs": 1.1657856702804565, "rewards/rejected": 0.5086759328842163, "rewards/student_margin": 0.6709040403366089, "rewards/teacher_margin": 0.9958333969116211, "step": 790 }, { "epoch": 0.96, "grad_norm": 6.84375, "learning_rate": 2.6747920143047056e-08, "logits/chosen": -3.574307680130005, "logits/rejected": -3.662809371948242, "logps/chosen": -242.3976593017578, "logps/rejected": -184.373291015625, "loss": 0.4997, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.215039610862732, "rewards/diff": 0.025632739067077637, "rewards/diff_abs": 0.862860381603241, "rewards/rejected": 0.06649022549390793, "rewards/student_margin": 1.1485494375228882, "rewards/teacher_margin": 1.1229166984558105, "step": 800 }, { "epoch": 0.97, "grad_norm": 7.0, "learning_rate": 1.3658847018884758e-08, "logits/chosen": -3.3837954998016357, "logits/rejected": -3.477294445037842, "logps/chosen": -303.0797119140625, "logps/rejected": -258.445068359375, "loss": 0.5211, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 1.3037405014038086, "rewards/diff": -0.3252946734428406, "rewards/diff_abs": 1.0890836715698242, "rewards/rejected": 0.933201789855957, "rewards/student_margin": 0.3705386519432068, "rewards/teacher_margin": 0.6958333849906921, "step": 810 }, { "epoch": 0.98, "grad_norm": 5.96875, "learning_rate": 4.920054357119841e-09, "logits/chosen": -3.4326694011688232, "logits/rejected": -3.4905331134796143, "logps/chosen": -251.43948364257812, "logps/rejected": -198.3247833251953, "loss": 0.512, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 1.450547695159912, "rewards/diff": -0.012740576639771461, "rewards/diff_abs": 0.7532329559326172, "rewards/rejected": 0.5716216564178467, "rewards/student_margin": 0.878926157951355, "rewards/teacher_margin": 0.8916667699813843, "step": 820 }, { "epoch": 0.99, "grad_norm": 5.96875, "learning_rate": 5.468321749468875e-10, "logits/chosen": -3.446951389312744, "logits/rejected": -3.5641331672668457, "logps/chosen": -233.23678588867188, "logps/rejected": -200.827880859375, "loss": 0.5075, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.8117042779922485, "rewards/diff": -0.23592355847358704, "rewards/diff_abs": 0.7656908631324768, "rewards/rejected": 0.2184610813856125, "rewards/student_margin": 0.5932431817054749, "rewards/teacher_margin": 0.8291667103767395, "step": 830 }, { "epoch": 1.0, "step": 835, "total_flos": 0.0, "train_loss": 0.5412804069633256, "train_runtime": 5959.7316, "train_samples_per_second": 26.891, "train_steps_per_second": 0.14 } ], "logging_steps": 10, "max_steps": 835, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000000000000000000000000000000, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }