diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4905 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9990309034133737, + "eval_steps": 500, + "global_step": 6963, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004307095940562076, + "grad_norm": 9.035698890686035, + "learning_rate": 5.730659025787966e-07, + "loss": 1.2789, + "step": 10 + }, + { + "epoch": 0.008614191881124151, + "grad_norm": 6.42362642288208, + "learning_rate": 1.1461318051575932e-06, + "loss": 1.1797, + "step": 20 + }, + { + "epoch": 0.012921287821686228, + "grad_norm": 1.6813161373138428, + "learning_rate": 1.7191977077363897e-06, + "loss": 0.9736, + "step": 30 + }, + { + "epoch": 0.017228383762248303, + "grad_norm": 0.9371042251586914, + "learning_rate": 2.2922636103151864e-06, + "loss": 0.871, + "step": 40 + }, + { + "epoch": 0.02153547970281038, + "grad_norm": 0.7695503234863281, + "learning_rate": 2.865329512893983e-06, + "loss": 0.8025, + "step": 50 + }, + { + "epoch": 0.025842575643372456, + "grad_norm": 0.716374397277832, + "learning_rate": 3.4383954154727795e-06, + "loss": 0.7698, + "step": 60 + }, + { + "epoch": 0.030149671583934532, + "grad_norm": 0.6864265203475952, + "learning_rate": 4.011461318051576e-06, + "loss": 0.7701, + "step": 70 + }, + { + "epoch": 0.034456767524496605, + "grad_norm": 0.7581704258918762, + "learning_rate": 4.584527220630373e-06, + "loss": 0.7619, + "step": 80 + }, + { + "epoch": 0.03876386346505868, + "grad_norm": 0.6616791486740112, + "learning_rate": 5.157593123209169e-06, + "loss": 0.7296, + "step": 90 + }, + { + "epoch": 0.04307095940562076, + "grad_norm": 0.6397051811218262, + "learning_rate": 5.730659025787966e-06, + "loss": 0.7453, + "step": 100 + }, + { + "epoch": 0.047378055346182835, + "grad_norm": 0.6572911143302917, + "learning_rate": 6.303724928366762e-06, + "loss": 0.767, + "step": 110 + }, + { + "epoch": 0.05168515128674491, + "grad_norm": 0.669222354888916, + "learning_rate": 6.876790830945559e-06, + "loss": 0.7369, + "step": 120 + }, + { + "epoch": 0.05599224722730699, + "grad_norm": 0.6517964601516724, + "learning_rate": 7.449856733524356e-06, + "loss": 0.7186, + "step": 130 + }, + { + "epoch": 0.060299343167869064, + "grad_norm": 0.6209223866462708, + "learning_rate": 8.022922636103152e-06, + "loss": 0.7155, + "step": 140 + }, + { + "epoch": 0.06460643910843114, + "grad_norm": 0.6591508388519287, + "learning_rate": 8.595988538681949e-06, + "loss": 0.7289, + "step": 150 + }, + { + "epoch": 0.06891353504899321, + "grad_norm": 0.5842370390892029, + "learning_rate": 9.169054441260746e-06, + "loss": 0.7183, + "step": 160 + }, + { + "epoch": 0.0732206309895553, + "grad_norm": 0.7117204070091248, + "learning_rate": 9.742120343839543e-06, + "loss": 0.7192, + "step": 170 + }, + { + "epoch": 0.07752772693011736, + "grad_norm": 0.6163178086280823, + "learning_rate": 1.0315186246418338e-05, + "loss": 0.7193, + "step": 180 + }, + { + "epoch": 0.08183482287067945, + "grad_norm": 0.5932906270027161, + "learning_rate": 1.0888252148997137e-05, + "loss": 0.714, + "step": 190 + }, + { + "epoch": 0.08614191881124152, + "grad_norm": 0.5982919335365295, + "learning_rate": 1.1461318051575932e-05, + "loss": 0.7058, + "step": 200 + }, + { + "epoch": 0.0904490147518036, + "grad_norm": 0.6208463907241821, + "learning_rate": 1.2034383954154729e-05, + "loss": 0.7189, + "step": 210 + }, + { + "epoch": 0.09475611069236567, + "grad_norm": 0.5887411236763, + "learning_rate": 1.2607449856733524e-05, + "loss": 0.7249, + "step": 220 + }, + { + "epoch": 0.09906320663292775, + "grad_norm": 0.5963988900184631, + "learning_rate": 1.3180515759312323e-05, + "loss": 0.7293, + "step": 230 + }, + { + "epoch": 0.10337030257348982, + "grad_norm": 0.5715692043304443, + "learning_rate": 1.3753581661891118e-05, + "loss": 0.6845, + "step": 240 + }, + { + "epoch": 0.1076773985140519, + "grad_norm": 0.639398455619812, + "learning_rate": 1.4326647564469915e-05, + "loss": 0.6994, + "step": 250 + }, + { + "epoch": 0.11198449445461398, + "grad_norm": 0.6884477734565735, + "learning_rate": 1.4899713467048712e-05, + "loss": 0.7126, + "step": 260 + }, + { + "epoch": 0.11629159039517606, + "grad_norm": 0.6021578907966614, + "learning_rate": 1.5472779369627507e-05, + "loss": 0.7215, + "step": 270 + }, + { + "epoch": 0.12059868633573813, + "grad_norm": 0.6716468930244446, + "learning_rate": 1.6045845272206304e-05, + "loss": 0.6969, + "step": 280 + }, + { + "epoch": 0.1249057822763002, + "grad_norm": 0.5783571600914001, + "learning_rate": 1.66189111747851e-05, + "loss": 0.7111, + "step": 290 + }, + { + "epoch": 0.12921287821686228, + "grad_norm": 0.5546681880950928, + "learning_rate": 1.7191977077363898e-05, + "loss": 0.7, + "step": 300 + }, + { + "epoch": 0.13351997415742436, + "grad_norm": 0.5409330129623413, + "learning_rate": 1.7765042979942695e-05, + "loss": 0.696, + "step": 310 + }, + { + "epoch": 0.13782707009798642, + "grad_norm": 0.5752865672111511, + "learning_rate": 1.833810888252149e-05, + "loss": 0.6883, + "step": 320 + }, + { + "epoch": 0.1421341660385485, + "grad_norm": 0.6340565085411072, + "learning_rate": 1.891117478510029e-05, + "loss": 0.6881, + "step": 330 + }, + { + "epoch": 0.1464412619791106, + "grad_norm": 0.5298891067504883, + "learning_rate": 1.9484240687679085e-05, + "loss": 0.6935, + "step": 340 + }, + { + "epoch": 0.15074835791967267, + "grad_norm": 0.5659753680229187, + "learning_rate": 1.9999998871916207e-05, + "loss": 0.7103, + "step": 350 + }, + { + "epoch": 0.15505545386023473, + "grad_norm": 0.6017744541168213, + "learning_rate": 1.999986350216883e-05, + "loss": 0.6855, + "step": 360 + }, + { + "epoch": 0.1593625498007968, + "grad_norm": 0.5426760911941528, + "learning_rate": 1.999950251916212e-05, + "loss": 0.6914, + "step": 370 + }, + { + "epoch": 0.1636696457413589, + "grad_norm": 0.5532637238502502, + "learning_rate": 1.999891593104044e-05, + "loss": 0.6895, + "step": 380 + }, + { + "epoch": 0.16797674168192098, + "grad_norm": 0.5581168532371521, + "learning_rate": 1.9998103751038177e-05, + "loss": 0.6897, + "step": 390 + }, + { + "epoch": 0.17228383762248303, + "grad_norm": 0.5208210945129395, + "learning_rate": 1.9997065997479442e-05, + "loss": 0.6889, + "step": 400 + }, + { + "epoch": 0.17659093356304512, + "grad_norm": 0.5863595604896545, + "learning_rate": 1.9995802693777644e-05, + "loss": 0.6905, + "step": 410 + }, + { + "epoch": 0.1808980295036072, + "grad_norm": 0.5605342984199524, + "learning_rate": 1.9994313868434988e-05, + "loss": 0.6815, + "step": 420 + }, + { + "epoch": 0.18520512544416926, + "grad_norm": 0.5580301880836487, + "learning_rate": 1.9992599555041798e-05, + "loss": 0.7067, + "step": 430 + }, + { + "epoch": 0.18951222138473134, + "grad_norm": 0.558312177658081, + "learning_rate": 1.999065979227579e-05, + "loss": 0.7061, + "step": 440 + }, + { + "epoch": 0.19381931732529342, + "grad_norm": 0.5273975133895874, + "learning_rate": 1.998849462390118e-05, + "loss": 0.6905, + "step": 450 + }, + { + "epoch": 0.1981264132658555, + "grad_norm": 0.4772217571735382, + "learning_rate": 1.9986104098767703e-05, + "loss": 0.686, + "step": 460 + }, + { + "epoch": 0.20243350920641756, + "grad_norm": 0.5336763858795166, + "learning_rate": 1.9983488270809515e-05, + "loss": 0.6861, + "step": 470 + }, + { + "epoch": 0.20674060514697964, + "grad_norm": 0.4961983859539032, + "learning_rate": 1.9980647199043966e-05, + "loss": 0.6882, + "step": 480 + }, + { + "epoch": 0.21104770108754173, + "grad_norm": 0.5408128499984741, + "learning_rate": 1.9977580947570275e-05, + "loss": 0.7001, + "step": 490 + }, + { + "epoch": 0.2153547970281038, + "grad_norm": 0.5350680351257324, + "learning_rate": 1.997428958556809e-05, + "loss": 0.6931, + "step": 500 + }, + { + "epoch": 0.21966189296866587, + "grad_norm": 0.5455281734466553, + "learning_rate": 1.9970773187295917e-05, + "loss": 0.6919, + "step": 510 + }, + { + "epoch": 0.22396898890922795, + "grad_norm": 0.524664580821991, + "learning_rate": 1.9967031832089438e-05, + "loss": 0.6738, + "step": 520 + }, + { + "epoch": 0.22827608484979003, + "grad_norm": 0.48598727583885193, + "learning_rate": 1.9963065604359746e-05, + "loss": 0.6678, + "step": 530 + }, + { + "epoch": 0.23258318079035212, + "grad_norm": 0.5560494065284729, + "learning_rate": 1.9958874593591418e-05, + "loss": 0.694, + "step": 540 + }, + { + "epoch": 0.23689027673091417, + "grad_norm": 0.5516777038574219, + "learning_rate": 1.99544588943405e-05, + "loss": 0.6715, + "step": 550 + }, + { + "epoch": 0.24119737267147626, + "grad_norm": 0.5097941756248474, + "learning_rate": 1.9949818606232393e-05, + "loss": 0.6782, + "step": 560 + }, + { + "epoch": 0.24550446861203834, + "grad_norm": 0.5353350639343262, + "learning_rate": 1.9944953833959567e-05, + "loss": 0.6904, + "step": 570 + }, + { + "epoch": 0.2498115645526004, + "grad_norm": 0.5160298943519592, + "learning_rate": 1.9939864687279237e-05, + "loss": 0.6756, + "step": 580 + }, + { + "epoch": 0.2541186604931625, + "grad_norm": 0.5377163887023926, + "learning_rate": 1.993455128101087e-05, + "loss": 0.712, + "step": 590 + }, + { + "epoch": 0.25842575643372456, + "grad_norm": 0.47318100929260254, + "learning_rate": 1.992901373503359e-05, + "loss": 0.6648, + "step": 600 + }, + { + "epoch": 0.2627328523742866, + "grad_norm": 0.4977729916572571, + "learning_rate": 1.992325217428348e-05, + "loss": 0.6893, + "step": 610 + }, + { + "epoch": 0.26703994831484873, + "grad_norm": 0.5569038391113281, + "learning_rate": 1.991726672875077e-05, + "loss": 0.6876, + "step": 620 + }, + { + "epoch": 0.2713470442554108, + "grad_norm": 0.544884443283081, + "learning_rate": 1.9911057533476884e-05, + "loss": 0.6736, + "step": 630 + }, + { + "epoch": 0.27565414019597284, + "grad_norm": 0.5159808993339539, + "learning_rate": 1.9904624728551417e-05, + "loss": 0.674, + "step": 640 + }, + { + "epoch": 0.27996123613653495, + "grad_norm": 0.48680537939071655, + "learning_rate": 1.989796845910896e-05, + "loss": 0.6903, + "step": 650 + }, + { + "epoch": 0.284268332077097, + "grad_norm": 0.527867317199707, + "learning_rate": 1.9891088875325827e-05, + "loss": 0.6693, + "step": 660 + }, + { + "epoch": 0.2885754280176591, + "grad_norm": 0.5441365838050842, + "learning_rate": 1.988398613241666e-05, + "loss": 0.6721, + "step": 670 + }, + { + "epoch": 0.2928825239582212, + "grad_norm": 0.5693966150283813, + "learning_rate": 1.9876660390630954e-05, + "loss": 0.6684, + "step": 680 + }, + { + "epoch": 0.29718961989878323, + "grad_norm": 0.5607503652572632, + "learning_rate": 1.986911181524941e-05, + "loss": 0.6783, + "step": 690 + }, + { + "epoch": 0.30149671583934534, + "grad_norm": 0.5421719551086426, + "learning_rate": 1.9861340576580225e-05, + "loss": 0.6658, + "step": 700 + }, + { + "epoch": 0.3058038117799074, + "grad_norm": 0.497612863779068, + "learning_rate": 1.9853346849955236e-05, + "loss": 0.6816, + "step": 710 + }, + { + "epoch": 0.31011090772046945, + "grad_norm": 0.5503632426261902, + "learning_rate": 1.984513081572598e-05, + "loss": 0.6663, + "step": 720 + }, + { + "epoch": 0.31441800366103156, + "grad_norm": 0.5319767594337463, + "learning_rate": 1.983669265925961e-05, + "loss": 0.6513, + "step": 730 + }, + { + "epoch": 0.3187250996015936, + "grad_norm": 0.5350950956344604, + "learning_rate": 1.9828032570934726e-05, + "loss": 0.6699, + "step": 740 + }, + { + "epoch": 0.3230321955421557, + "grad_norm": 0.5330127477645874, + "learning_rate": 1.9819150746137067e-05, + "loss": 0.6786, + "step": 750 + }, + { + "epoch": 0.3273392914827178, + "grad_norm": 0.4740910232067108, + "learning_rate": 1.981004738525512e-05, + "loss": 0.6867, + "step": 760 + }, + { + "epoch": 0.33164638742327984, + "grad_norm": 0.5131900906562805, + "learning_rate": 1.980072269367557e-05, + "loss": 0.6618, + "step": 770 + }, + { + "epoch": 0.33595348336384195, + "grad_norm": 0.4712623059749603, + "learning_rate": 1.97911768817787e-05, + "loss": 0.6863, + "step": 780 + }, + { + "epoch": 0.340260579304404, + "grad_norm": 0.5240254998207092, + "learning_rate": 1.9781410164933626e-05, + "loss": 0.6941, + "step": 790 + }, + { + "epoch": 0.34456767524496607, + "grad_norm": 0.5192612409591675, + "learning_rate": 1.9771422763493434e-05, + "loss": 0.6726, + "step": 800 + }, + { + "epoch": 0.3488747711855282, + "grad_norm": 0.4864448010921478, + "learning_rate": 1.9761214902790217e-05, + "loss": 0.6541, + "step": 810 + }, + { + "epoch": 0.35318186712609023, + "grad_norm": 0.5248873829841614, + "learning_rate": 1.9750786813129995e-05, + "loss": 0.6713, + "step": 820 + }, + { + "epoch": 0.3574889630666523, + "grad_norm": 0.5010212659835815, + "learning_rate": 1.9740138729787505e-05, + "loss": 0.6793, + "step": 830 + }, + { + "epoch": 0.3617960590072144, + "grad_norm": 0.4966225326061249, + "learning_rate": 1.9729270893000913e-05, + "loss": 0.6692, + "step": 840 + }, + { + "epoch": 0.36610315494777645, + "grad_norm": 0.48576685786247253, + "learning_rate": 1.9718183547966366e-05, + "loss": 0.6812, + "step": 850 + }, + { + "epoch": 0.3704102508883385, + "grad_norm": 0.5232109427452087, + "learning_rate": 1.9706876944832486e-05, + "loss": 0.6567, + "step": 860 + }, + { + "epoch": 0.3747173468289006, + "grad_norm": 0.4847777485847473, + "learning_rate": 1.9695351338694713e-05, + "loss": 0.6638, + "step": 870 + }, + { + "epoch": 0.3790244427694627, + "grad_norm": 0.49412795901298523, + "learning_rate": 1.9683606989589553e-05, + "loss": 0.6731, + "step": 880 + }, + { + "epoch": 0.3833315387100248, + "grad_norm": 0.5143546462059021, + "learning_rate": 1.9671644162488716e-05, + "loss": 0.6779, + "step": 890 + }, + { + "epoch": 0.38763863465058684, + "grad_norm": 0.5516107082366943, + "learning_rate": 1.965946312729312e-05, + "loss": 0.6798, + "step": 900 + }, + { + "epoch": 0.3919457305911489, + "grad_norm": 0.5140990018844604, + "learning_rate": 1.9647064158826825e-05, + "loss": 0.6473, + "step": 910 + }, + { + "epoch": 0.396252826531711, + "grad_norm": 0.4911974370479584, + "learning_rate": 1.9634447536830815e-05, + "loss": 0.6565, + "step": 920 + }, + { + "epoch": 0.40055992247227307, + "grad_norm": 0.4995877742767334, + "learning_rate": 1.9621613545956703e-05, + "loss": 0.6514, + "step": 930 + }, + { + "epoch": 0.4048670184128351, + "grad_norm": 0.48752328753471375, + "learning_rate": 1.9608562475760287e-05, + "loss": 0.6751, + "step": 940 + }, + { + "epoch": 0.40917411435339723, + "grad_norm": 0.4956004321575165, + "learning_rate": 1.9595294620695036e-05, + "loss": 0.6492, + "step": 950 + }, + { + "epoch": 0.4134812102939593, + "grad_norm": 0.48215603828430176, + "learning_rate": 1.958181028010544e-05, + "loss": 0.6741, + "step": 960 + }, + { + "epoch": 0.4177883062345214, + "grad_norm": 0.48835939168930054, + "learning_rate": 1.9568109758220253e-05, + "loss": 0.6638, + "step": 970 + }, + { + "epoch": 0.42209540217508346, + "grad_norm": 0.47754788398742676, + "learning_rate": 1.9554193364145635e-05, + "loss": 0.6657, + "step": 980 + }, + { + "epoch": 0.4264024981156455, + "grad_norm": 0.5080917477607727, + "learning_rate": 1.9540061411858172e-05, + "loss": 0.6675, + "step": 990 + }, + { + "epoch": 0.4307095940562076, + "grad_norm": 0.4634297788143158, + "learning_rate": 1.9525714220197802e-05, + "loss": 0.6693, + "step": 1000 + }, + { + "epoch": 0.4350166899967697, + "grad_norm": 0.4760366678237915, + "learning_rate": 1.951115211286061e-05, + "loss": 0.6721, + "step": 1010 + }, + { + "epoch": 0.43932378593733173, + "grad_norm": 0.5227916836738586, + "learning_rate": 1.9496375418391525e-05, + "loss": 0.6691, + "step": 1020 + }, + { + "epoch": 0.44363088187789385, + "grad_norm": 0.5157990455627441, + "learning_rate": 1.948138447017692e-05, + "loss": 0.6774, + "step": 1030 + }, + { + "epoch": 0.4479379778184559, + "grad_norm": 0.49596408009529114, + "learning_rate": 1.9466179606437087e-05, + "loss": 0.6313, + "step": 1040 + }, + { + "epoch": 0.45224507375901796, + "grad_norm": 0.47041237354278564, + "learning_rate": 1.945076117021859e-05, + "loss": 0.6724, + "step": 1050 + }, + { + "epoch": 0.45655216969958007, + "grad_norm": 0.5206364989280701, + "learning_rate": 1.9435129509386538e-05, + "loss": 0.6843, + "step": 1060 + }, + { + "epoch": 0.4608592656401421, + "grad_norm": 0.5067657828330994, + "learning_rate": 1.9419284976616745e-05, + "loss": 0.6649, + "step": 1070 + }, + { + "epoch": 0.46516636158070424, + "grad_norm": 1.3445152044296265, + "learning_rate": 1.9403227929387756e-05, + "loss": 0.6548, + "step": 1080 + }, + { + "epoch": 0.4694734575212663, + "grad_norm": 0.5465224385261536, + "learning_rate": 1.93869587299728e-05, + "loss": 0.6427, + "step": 1090 + }, + { + "epoch": 0.47378055346182835, + "grad_norm": 0.49137911200523376, + "learning_rate": 1.9370477745431587e-05, + "loss": 0.6519, + "step": 1100 + }, + { + "epoch": 0.47808764940239046, + "grad_norm": 0.48190736770629883, + "learning_rate": 1.935378534760206e-05, + "loss": 0.6615, + "step": 1110 + }, + { + "epoch": 0.4823947453429525, + "grad_norm": 0.4869353771209717, + "learning_rate": 1.9336881913091992e-05, + "loss": 0.65, + "step": 1120 + }, + { + "epoch": 0.48670184128351457, + "grad_norm": 0.4473590552806854, + "learning_rate": 1.931976782327048e-05, + "loss": 0.6821, + "step": 1130 + }, + { + "epoch": 0.4910089372240767, + "grad_norm": 0.4703207314014435, + "learning_rate": 1.9302443464259352e-05, + "loss": 0.657, + "step": 1140 + }, + { + "epoch": 0.49531603316463874, + "grad_norm": 0.48172295093536377, + "learning_rate": 1.9284909226924457e-05, + "loss": 0.6581, + "step": 1150 + }, + { + "epoch": 0.4996231291052008, + "grad_norm": 0.4986841082572937, + "learning_rate": 1.9267165506866835e-05, + "loss": 0.664, + "step": 1160 + }, + { + "epoch": 0.5039302250457629, + "grad_norm": 0.4936910569667816, + "learning_rate": 1.9249212704413803e-05, + "loss": 0.6409, + "step": 1170 + }, + { + "epoch": 0.508237320986325, + "grad_norm": 0.48618724942207336, + "learning_rate": 1.9231051224609918e-05, + "loss": 0.6566, + "step": 1180 + }, + { + "epoch": 0.512544416926887, + "grad_norm": 0.5300356149673462, + "learning_rate": 1.921268147720784e-05, + "loss": 0.6533, + "step": 1190 + }, + { + "epoch": 0.5168515128674491, + "grad_norm": 0.4799743890762329, + "learning_rate": 1.919410387665908e-05, + "loss": 0.6677, + "step": 1200 + }, + { + "epoch": 0.5211586088080112, + "grad_norm": 0.5317394137382507, + "learning_rate": 1.9175318842104667e-05, + "loss": 0.6464, + "step": 1210 + }, + { + "epoch": 0.5254657047485732, + "grad_norm": 0.49199768900871277, + "learning_rate": 1.9156326797365665e-05, + "loss": 0.6655, + "step": 1220 + }, + { + "epoch": 0.5297728006891353, + "grad_norm": 0.4916874170303345, + "learning_rate": 1.913712817093364e-05, + "loss": 0.6372, + "step": 1230 + }, + { + "epoch": 0.5340798966296975, + "grad_norm": 0.48562970757484436, + "learning_rate": 1.9117723395960972e-05, + "loss": 0.6639, + "step": 1240 + }, + { + "epoch": 0.5383869925702595, + "grad_norm": 0.5152992010116577, + "learning_rate": 1.909811291025109e-05, + "loss": 0.6609, + "step": 1250 + }, + { + "epoch": 0.5426940885108216, + "grad_norm": 0.48352181911468506, + "learning_rate": 1.907829715624859e-05, + "loss": 0.6726, + "step": 1260 + }, + { + "epoch": 0.5470011844513837, + "grad_norm": 0.5064017176628113, + "learning_rate": 1.905827658102926e-05, + "loss": 0.6698, + "step": 1270 + }, + { + "epoch": 0.5513082803919457, + "grad_norm": 0.46494290232658386, + "learning_rate": 1.9038051636289997e-05, + "loss": 0.68, + "step": 1280 + }, + { + "epoch": 0.5556153763325078, + "grad_norm": 0.4788792133331299, + "learning_rate": 1.9017622778338585e-05, + "loss": 0.6501, + "step": 1290 + }, + { + "epoch": 0.5599224722730699, + "grad_norm": 0.4712987542152405, + "learning_rate": 1.8996990468083448e-05, + "loss": 0.6488, + "step": 1300 + }, + { + "epoch": 0.5642295682136319, + "grad_norm": 0.4997137784957886, + "learning_rate": 1.8976155171023216e-05, + "loss": 0.6518, + "step": 1310 + }, + { + "epoch": 0.568536664154194, + "grad_norm": 0.5003030896186829, + "learning_rate": 1.895511735723623e-05, + "loss": 0.6317, + "step": 1320 + }, + { + "epoch": 0.5728437600947561, + "grad_norm": 0.4551664888858795, + "learning_rate": 1.8933877501369944e-05, + "loss": 0.6634, + "step": 1330 + }, + { + "epoch": 0.5771508560353182, + "grad_norm": 0.532534122467041, + "learning_rate": 1.891243608263021e-05, + "loss": 0.6656, + "step": 1340 + }, + { + "epoch": 0.5814579519758802, + "grad_norm": 0.47166600823402405, + "learning_rate": 1.889079358477047e-05, + "loss": 0.657, + "step": 1350 + }, + { + "epoch": 0.5857650479164423, + "grad_norm": 0.45552805066108704, + "learning_rate": 1.8868950496080832e-05, + "loss": 0.6652, + "step": 1360 + }, + { + "epoch": 0.5900721438570045, + "grad_norm": 0.5267536044120789, + "learning_rate": 1.884690730937707e-05, + "loss": 0.6463, + "step": 1370 + }, + { + "epoch": 0.5943792397975665, + "grad_norm": 0.49093228578567505, + "learning_rate": 1.882466452198949e-05, + "loss": 0.6604, + "step": 1380 + }, + { + "epoch": 0.5986863357381286, + "grad_norm": 0.5105960369110107, + "learning_rate": 1.880222263575172e-05, + "loss": 0.6457, + "step": 1390 + }, + { + "epoch": 0.6029934316786907, + "grad_norm": 0.47326135635375977, + "learning_rate": 1.8779582156989384e-05, + "loss": 0.6464, + "step": 1400 + }, + { + "epoch": 0.6073005276192527, + "grad_norm": 0.4910115599632263, + "learning_rate": 1.875674359650867e-05, + "loss": 0.6547, + "step": 1410 + }, + { + "epoch": 0.6116076235598148, + "grad_norm": 0.48352956771850586, + "learning_rate": 1.873370746958482e-05, + "loss": 0.654, + "step": 1420 + }, + { + "epoch": 0.6159147195003769, + "grad_norm": 0.4722056984901428, + "learning_rate": 1.871047429595049e-05, + "loss": 0.6372, + "step": 1430 + }, + { + "epoch": 0.6202218154409389, + "grad_norm": 0.4340212345123291, + "learning_rate": 1.868704459978405e-05, + "loss": 0.6507, + "step": 1440 + }, + { + "epoch": 0.624528911381501, + "grad_norm": 0.48497867584228516, + "learning_rate": 1.8663418909697723e-05, + "loss": 0.6349, + "step": 1450 + }, + { + "epoch": 0.6288360073220631, + "grad_norm": 0.4707370102405548, + "learning_rate": 1.863959775872567e-05, + "loss": 0.6445, + "step": 1460 + }, + { + "epoch": 0.6331431032626251, + "grad_norm": 0.5151925683021545, + "learning_rate": 1.861558168431199e-05, + "loss": 0.6493, + "step": 1470 + }, + { + "epoch": 0.6374501992031872, + "grad_norm": 0.47226110100746155, + "learning_rate": 1.8591371228298554e-05, + "loss": 0.6211, + "step": 1480 + }, + { + "epoch": 0.6417572951437494, + "grad_norm": 0.48166829347610474, + "learning_rate": 1.856696693691281e-05, + "loss": 0.6476, + "step": 1490 + }, + { + "epoch": 0.6460643910843114, + "grad_norm": 0.5039719343185425, + "learning_rate": 1.8542369360755448e-05, + "loss": 0.636, + "step": 1500 + }, + { + "epoch": 0.6503714870248735, + "grad_norm": 0.45818519592285156, + "learning_rate": 1.8517579054787974e-05, + "loss": 0.658, + "step": 1510 + }, + { + "epoch": 0.6546785829654356, + "grad_norm": 0.4803057014942169, + "learning_rate": 1.8492596578320194e-05, + "loss": 0.6468, + "step": 1520 + }, + { + "epoch": 0.6589856789059977, + "grad_norm": 0.480227530002594, + "learning_rate": 1.8467422494997593e-05, + "loss": 0.641, + "step": 1530 + }, + { + "epoch": 0.6632927748465597, + "grad_norm": 0.49187588691711426, + "learning_rate": 1.844205737278863e-05, + "loss": 0.6572, + "step": 1540 + }, + { + "epoch": 0.6675998707871218, + "grad_norm": 0.49701517820358276, + "learning_rate": 1.84165017839719e-05, + "loss": 0.6567, + "step": 1550 + }, + { + "epoch": 0.6719069667276839, + "grad_norm": 0.48368483781814575, + "learning_rate": 1.8390756305123246e-05, + "loss": 0.669, + "step": 1560 + }, + { + "epoch": 0.6762140626682459, + "grad_norm": 0.5007254481315613, + "learning_rate": 1.836482151710273e-05, + "loss": 0.6448, + "step": 1570 + }, + { + "epoch": 0.680521158608808, + "grad_norm": 0.44526585936546326, + "learning_rate": 1.8338698005041556e-05, + "loss": 0.6386, + "step": 1580 + }, + { + "epoch": 0.6848282545493701, + "grad_norm": 0.4812663197517395, + "learning_rate": 1.8312386358328828e-05, + "loss": 0.6447, + "step": 1590 + }, + { + "epoch": 0.6891353504899321, + "grad_norm": 0.4910503029823303, + "learning_rate": 1.828588717059829e-05, + "loss": 0.6449, + "step": 1600 + }, + { + "epoch": 0.6934424464304942, + "grad_norm": 0.47431930899620056, + "learning_rate": 1.8259201039714914e-05, + "loss": 0.6372, + "step": 1610 + }, + { + "epoch": 0.6977495423710564, + "grad_norm": 0.5024338364601135, + "learning_rate": 1.8232328567761416e-05, + "loss": 0.6433, + "step": 1620 + }, + { + "epoch": 0.7020566383116184, + "grad_norm": 0.47510799765586853, + "learning_rate": 1.820527036102467e-05, + "loss": 0.6601, + "step": 1630 + }, + { + "epoch": 0.7063637342521805, + "grad_norm": 0.47990313172340393, + "learning_rate": 1.8178027029982027e-05, + "loss": 0.6463, + "step": 1640 + }, + { + "epoch": 0.7106708301927426, + "grad_norm": 0.5117030739784241, + "learning_rate": 1.8150599189287553e-05, + "loss": 0.6455, + "step": 1650 + }, + { + "epoch": 0.7149779261333046, + "grad_norm": 0.4917861819267273, + "learning_rate": 1.8122987457758147e-05, + "loss": 0.6688, + "step": 1660 + }, + { + "epoch": 0.7192850220738667, + "grad_norm": 0.49872297048568726, + "learning_rate": 1.8095192458359588e-05, + "loss": 0.6513, + "step": 1670 + }, + { + "epoch": 0.7235921180144288, + "grad_norm": 0.47510796785354614, + "learning_rate": 1.806721481819247e-05, + "loss": 0.649, + "step": 1680 + }, + { + "epoch": 0.7278992139549908, + "grad_norm": 0.4924173057079315, + "learning_rate": 1.8039055168478074e-05, + "loss": 0.6177, + "step": 1690 + }, + { + "epoch": 0.7322063098955529, + "grad_norm": 0.4918348789215088, + "learning_rate": 1.8010714144544104e-05, + "loss": 0.6543, + "step": 1700 + }, + { + "epoch": 0.736513405836115, + "grad_norm": 0.45298415422439575, + "learning_rate": 1.7982192385810372e-05, + "loss": 0.6367, + "step": 1710 + }, + { + "epoch": 0.740820501776677, + "grad_norm": 0.46879851818084717, + "learning_rate": 1.795349053577435e-05, + "loss": 0.6414, + "step": 1720 + }, + { + "epoch": 0.7451275977172391, + "grad_norm": 0.4573706388473511, + "learning_rate": 1.7924609241996672e-05, + "loss": 0.628, + "step": 1730 + }, + { + "epoch": 0.7494346936578012, + "grad_norm": 0.46929094195365906, + "learning_rate": 1.7895549156086514e-05, + "loss": 0.6478, + "step": 1740 + }, + { + "epoch": 0.7537417895983634, + "grad_norm": 0.5428628325462341, + "learning_rate": 1.78663109336869e-05, + "loss": 0.6405, + "step": 1750 + }, + { + "epoch": 0.7580488855389254, + "grad_norm": 0.47853079438209534, + "learning_rate": 1.78368952344599e-05, + "loss": 0.6442, + "step": 1760 + }, + { + "epoch": 0.7623559814794875, + "grad_norm": 0.46747061610221863, + "learning_rate": 1.7807302722071742e-05, + "loss": 0.6369, + "step": 1770 + }, + { + "epoch": 0.7666630774200496, + "grad_norm": 0.5107671022415161, + "learning_rate": 1.7777534064177864e-05, + "loss": 0.6322, + "step": 1780 + }, + { + "epoch": 0.7709701733606116, + "grad_norm": 0.5013517141342163, + "learning_rate": 1.7747589932407826e-05, + "loss": 0.6384, + "step": 1790 + }, + { + "epoch": 0.7752772693011737, + "grad_norm": 0.5039073824882507, + "learning_rate": 1.7717471002350162e-05, + "loss": 0.6504, + "step": 1800 + }, + { + "epoch": 0.7795843652417358, + "grad_norm": 0.4767347276210785, + "learning_rate": 1.7687177953537148e-05, + "loss": 0.645, + "step": 1810 + }, + { + "epoch": 0.7838914611822978, + "grad_norm": 0.4766087532043457, + "learning_rate": 1.7656711469429464e-05, + "loss": 0.6249, + "step": 1820 + }, + { + "epoch": 0.7881985571228599, + "grad_norm": 0.5031486749649048, + "learning_rate": 1.7626072237400764e-05, + "loss": 0.6263, + "step": 1830 + }, + { + "epoch": 0.792505653063422, + "grad_norm": 0.444658488035202, + "learning_rate": 1.759526094872219e-05, + "loss": 0.6561, + "step": 1840 + }, + { + "epoch": 0.796812749003984, + "grad_norm": 0.5070600509643555, + "learning_rate": 1.7564278298546758e-05, + "loss": 0.6477, + "step": 1850 + }, + { + "epoch": 0.8011198449445461, + "grad_norm": 0.45487794280052185, + "learning_rate": 1.753312498589367e-05, + "loss": 0.6257, + "step": 1860 + }, + { + "epoch": 0.8054269408851082, + "grad_norm": 0.4745471477508545, + "learning_rate": 1.7501801713632568e-05, + "loss": 0.6586, + "step": 1870 + }, + { + "epoch": 0.8097340368256702, + "grad_norm": 0.4743909537792206, + "learning_rate": 1.7470309188467645e-05, + "loss": 0.6255, + "step": 1880 + }, + { + "epoch": 0.8140411327662324, + "grad_norm": 0.5165956020355225, + "learning_rate": 1.7438648120921736e-05, + "loss": 0.6592, + "step": 1890 + }, + { + "epoch": 0.8183482287067945, + "grad_norm": 0.455861359834671, + "learning_rate": 1.740681922532025e-05, + "loss": 0.6467, + "step": 1900 + }, + { + "epoch": 0.8226553246473565, + "grad_norm": 0.468013733625412, + "learning_rate": 1.7374823219775073e-05, + "loss": 0.6382, + "step": 1910 + }, + { + "epoch": 0.8269624205879186, + "grad_norm": 0.46119919419288635, + "learning_rate": 1.7342660826168374e-05, + "loss": 0.6437, + "step": 1920 + }, + { + "epoch": 0.8312695165284807, + "grad_norm": 0.4399983286857605, + "learning_rate": 1.73103327701363e-05, + "loss": 0.6379, + "step": 1930 + }, + { + "epoch": 0.8355766124690428, + "grad_norm": 0.46829739212989807, + "learning_rate": 1.7277839781052617e-05, + "loss": 0.6402, + "step": 1940 + }, + { + "epoch": 0.8398837084096048, + "grad_norm": 0.5193459987640381, + "learning_rate": 1.7245182592012248e-05, + "loss": 0.6348, + "step": 1950 + }, + { + "epoch": 0.8441908043501669, + "grad_norm": 0.5310715436935425, + "learning_rate": 1.7212361939814735e-05, + "loss": 0.6351, + "step": 1960 + }, + { + "epoch": 0.848497900290729, + "grad_norm": 0.4883059561252594, + "learning_rate": 1.7179378564947615e-05, + "loss": 0.6401, + "step": 1970 + }, + { + "epoch": 0.852804996231291, + "grad_norm": 0.5028474926948547, + "learning_rate": 1.7146233211569723e-05, + "loss": 0.6559, + "step": 1980 + }, + { + "epoch": 0.8571120921718531, + "grad_norm": 0.48668941855430603, + "learning_rate": 1.7112926627494385e-05, + "loss": 0.6572, + "step": 1990 + }, + { + "epoch": 0.8614191881124152, + "grad_norm": 0.4668605327606201, + "learning_rate": 1.7079459564172555e-05, + "loss": 0.6321, + "step": 2000 + }, + { + "epoch": 0.8657262840529772, + "grad_norm": 0.4556910991668701, + "learning_rate": 1.7045832776675863e-05, + "loss": 0.6268, + "step": 2010 + }, + { + "epoch": 0.8700333799935394, + "grad_norm": 0.45260846614837646, + "learning_rate": 1.701204702367958e-05, + "loss": 0.6271, + "step": 2020 + }, + { + "epoch": 0.8743404759341015, + "grad_norm": 0.4828309714794159, + "learning_rate": 1.6978103067445494e-05, + "loss": 0.6351, + "step": 2030 + }, + { + "epoch": 0.8786475718746635, + "grad_norm": 0.4691152274608612, + "learning_rate": 1.6944001673804723e-05, + "loss": 0.6512, + "step": 2040 + }, + { + "epoch": 0.8829546678152256, + "grad_norm": 0.4812765419483185, + "learning_rate": 1.6909743612140417e-05, + "loss": 0.6335, + "step": 2050 + }, + { + "epoch": 0.8872617637557877, + "grad_norm": 0.4415755867958069, + "learning_rate": 1.687532965537043e-05, + "loss": 0.6541, + "step": 2060 + }, + { + "epoch": 0.8915688596963497, + "grad_norm": 0.4993227422237396, + "learning_rate": 1.6840760579929846e-05, + "loss": 0.6318, + "step": 2070 + }, + { + "epoch": 0.8958759556369118, + "grad_norm": 0.4628779888153076, + "learning_rate": 1.6806037165753498e-05, + "loss": 0.6369, + "step": 2080 + }, + { + "epoch": 0.9001830515774739, + "grad_norm": 0.5235878229141235, + "learning_rate": 1.677116019625834e-05, + "loss": 0.6415, + "step": 2090 + }, + { + "epoch": 0.9044901475180359, + "grad_norm": 0.4750138819217682, + "learning_rate": 1.6736130458325793e-05, + "loss": 0.6101, + "step": 2100 + }, + { + "epoch": 0.908797243458598, + "grad_norm": 0.5292583107948303, + "learning_rate": 1.6700948742283977e-05, + "loss": 0.6248, + "step": 2110 + }, + { + "epoch": 0.9131043393991601, + "grad_norm": 0.45959070324897766, + "learning_rate": 1.6665615841889885e-05, + "loss": 0.6339, + "step": 2120 + }, + { + "epoch": 0.9174114353397222, + "grad_norm": 0.48287901282310486, + "learning_rate": 1.6630132554311486e-05, + "loss": 0.6161, + "step": 2130 + }, + { + "epoch": 0.9217185312802842, + "grad_norm": 0.4725618064403534, + "learning_rate": 1.6594499680109722e-05, + "loss": 0.627, + "step": 2140 + }, + { + "epoch": 0.9260256272208464, + "grad_norm": 0.4820912778377533, + "learning_rate": 1.6558718023220457e-05, + "loss": 0.6399, + "step": 2150 + }, + { + "epoch": 0.9303327231614085, + "grad_norm": 0.48815685510635376, + "learning_rate": 1.6522788390936328e-05, + "loss": 0.6437, + "step": 2160 + }, + { + "epoch": 0.9346398191019705, + "grad_norm": 0.4747340679168701, + "learning_rate": 1.648671159388855e-05, + "loss": 0.6455, + "step": 2170 + }, + { + "epoch": 0.9389469150425326, + "grad_norm": 0.4894673526287079, + "learning_rate": 1.6450488446028612e-05, + "loss": 0.6545, + "step": 2180 + }, + { + "epoch": 0.9432540109830947, + "grad_norm": 0.4756160080432892, + "learning_rate": 1.641411976460991e-05, + "loss": 0.6498, + "step": 2190 + }, + { + "epoch": 0.9475611069236567, + "grad_norm": 0.45228078961372375, + "learning_rate": 1.637760637016932e-05, + "loss": 0.6438, + "step": 2200 + }, + { + "epoch": 0.9518682028642188, + "grad_norm": 0.49898287653923035, + "learning_rate": 1.6340949086508676e-05, + "loss": 0.6518, + "step": 2210 + }, + { + "epoch": 0.9561752988047809, + "grad_norm": 0.4354493021965027, + "learning_rate": 1.6304148740676204e-05, + "loss": 0.6125, + "step": 2220 + }, + { + "epoch": 0.9604823947453429, + "grad_norm": 0.45118704438209534, + "learning_rate": 1.6267206162947823e-05, + "loss": 0.6146, + "step": 2230 + }, + { + "epoch": 0.964789490685905, + "grad_norm": 0.4822487533092499, + "learning_rate": 1.6230122186808443e-05, + "loss": 0.6425, + "step": 2240 + }, + { + "epoch": 0.9690965866264671, + "grad_norm": 0.490903377532959, + "learning_rate": 1.619289764893317e-05, + "loss": 0.6353, + "step": 2250 + }, + { + "epoch": 0.9734036825670291, + "grad_norm": 0.4738866686820984, + "learning_rate": 1.615553338916839e-05, + "loss": 0.6315, + "step": 2260 + }, + { + "epoch": 0.9777107785075912, + "grad_norm": 0.46285027265548706, + "learning_rate": 1.6118030250512863e-05, + "loss": 0.6501, + "step": 2270 + }, + { + "epoch": 0.9820178744481534, + "grad_norm": 0.46414172649383545, + "learning_rate": 1.6080389079098657e-05, + "loss": 0.6501, + "step": 2280 + }, + { + "epoch": 0.9863249703887154, + "grad_norm": 0.5042113661766052, + "learning_rate": 1.604261072417211e-05, + "loss": 0.6319, + "step": 2290 + }, + { + "epoch": 0.9906320663292775, + "grad_norm": 0.43653419613838196, + "learning_rate": 1.600469603807464e-05, + "loss": 0.6461, + "step": 2300 + }, + { + "epoch": 0.9949391622698396, + "grad_norm": 0.4572006165981293, + "learning_rate": 1.5966645876223505e-05, + "loss": 0.6477, + "step": 2310 + }, + { + "epoch": 0.9992462582104016, + "grad_norm": 0.43867436051368713, + "learning_rate": 1.5928461097092532e-05, + "loss": 0.6288, + "step": 2320 + }, + { + "epoch": 1.0035533541509638, + "grad_norm": 0.5620077848434448, + "learning_rate": 1.589014256219273e-05, + "loss": 0.5378, + "step": 2330 + }, + { + "epoch": 1.0078604500915258, + "grad_norm": 0.4836018681526184, + "learning_rate": 1.5851691136052842e-05, + "loss": 0.5421, + "step": 2340 + }, + { + "epoch": 1.0121675460320878, + "grad_norm": 0.49632197618484497, + "learning_rate": 1.581310768619988e-05, + "loss": 0.5237, + "step": 2350 + }, + { + "epoch": 1.01647464197265, + "grad_norm": 0.49445948004722595, + "learning_rate": 1.5774393083139513e-05, + "loss": 0.5313, + "step": 2360 + }, + { + "epoch": 1.020781737913212, + "grad_norm": 0.5299666523933411, + "learning_rate": 1.5735548200336435e-05, + "loss": 0.5326, + "step": 2370 + }, + { + "epoch": 1.025088833853774, + "grad_norm": 0.5012844204902649, + "learning_rate": 1.569657391419468e-05, + "loss": 0.5401, + "step": 2380 + }, + { + "epoch": 1.0293959297943363, + "grad_norm": 0.4741289019584656, + "learning_rate": 1.565747110403781e-05, + "loss": 0.5052, + "step": 2390 + }, + { + "epoch": 1.0337030257348983, + "grad_norm": 0.4950823485851288, + "learning_rate": 1.5618240652089123e-05, + "loss": 0.5294, + "step": 2400 + }, + { + "epoch": 1.0380101216754603, + "grad_norm": 0.4934958517551422, + "learning_rate": 1.557888344345171e-05, + "loss": 0.5278, + "step": 2410 + }, + { + "epoch": 1.0423172176160225, + "grad_norm": 0.467101514339447, + "learning_rate": 1.5539400366088503e-05, + "loss": 0.504, + "step": 2420 + }, + { + "epoch": 1.0466243135565845, + "grad_norm": 0.5479716062545776, + "learning_rate": 1.5499792310802238e-05, + "loss": 0.5256, + "step": 2430 + }, + { + "epoch": 1.0509314094971465, + "grad_norm": 0.4706737697124481, + "learning_rate": 1.5460060171215362e-05, + "loss": 0.5251, + "step": 2440 + }, + { + "epoch": 1.0552385054377087, + "grad_norm": 0.5142565965652466, + "learning_rate": 1.5420204843749857e-05, + "loss": 0.5333, + "step": 2450 + }, + { + "epoch": 1.0595456013782707, + "grad_norm": 0.5430694222450256, + "learning_rate": 1.5380227227607032e-05, + "loss": 0.5391, + "step": 2460 + }, + { + "epoch": 1.0638526973188327, + "grad_norm": 0.4780258536338806, + "learning_rate": 1.5340128224747225e-05, + "loss": 0.5338, + "step": 2470 + }, + { + "epoch": 1.068159793259395, + "grad_norm": 0.47647717595100403, + "learning_rate": 1.5299908739869464e-05, + "loss": 0.5178, + "step": 2480 + }, + { + "epoch": 1.072466889199957, + "grad_norm": 0.5330241918563843, + "learning_rate": 1.525956968039103e-05, + "loss": 0.5027, + "step": 2490 + }, + { + "epoch": 1.076773985140519, + "grad_norm": 0.4681854546070099, + "learning_rate": 1.5219111956427027e-05, + "loss": 0.5315, + "step": 2500 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5060921311378479, + "learning_rate": 1.5178536480769803e-05, + "loss": 0.5103, + "step": 2510 + }, + { + "epoch": 1.0853881770216431, + "grad_norm": 0.497199147939682, + "learning_rate": 1.5137844168868391e-05, + "loss": 0.5302, + "step": 2520 + }, + { + "epoch": 1.0896952729622051, + "grad_norm": 0.4658927321434021, + "learning_rate": 1.5097035938807834e-05, + "loss": 0.5196, + "step": 2530 + }, + { + "epoch": 1.0940023689027674, + "grad_norm": 0.5109249353408813, + "learning_rate": 1.5056112711288475e-05, + "loss": 0.5099, + "step": 2540 + }, + { + "epoch": 1.0983094648433294, + "grad_norm": 0.5212246775627136, + "learning_rate": 1.5015075409605189e-05, + "loss": 0.4911, + "step": 2550 + }, + { + "epoch": 1.1026165607838914, + "grad_norm": 0.47850698232650757, + "learning_rate": 1.497392495962656e-05, + "loss": 0.5225, + "step": 2560 + }, + { + "epoch": 1.1069236567244536, + "grad_norm": 0.4982755184173584, + "learning_rate": 1.4932662289773969e-05, + "loss": 0.5278, + "step": 2570 + }, + { + "epoch": 1.1112307526650156, + "grad_norm": 0.49975791573524475, + "learning_rate": 1.4891288331000668e-05, + "loss": 0.5261, + "step": 2580 + }, + { + "epoch": 1.1155378486055776, + "grad_norm": 0.5002388954162598, + "learning_rate": 1.484980401677077e-05, + "loss": 0.5313, + "step": 2590 + }, + { + "epoch": 1.1198449445461398, + "grad_norm": 0.4950617253780365, + "learning_rate": 1.4808210283038183e-05, + "loss": 0.5286, + "step": 2600 + }, + { + "epoch": 1.1241520404867018, + "grad_norm": 0.49831753969192505, + "learning_rate": 1.47665080682255e-05, + "loss": 0.5133, + "step": 2610 + }, + { + "epoch": 1.128459136427264, + "grad_norm": 0.6730148792266846, + "learning_rate": 1.4724698313202825e-05, + "loss": 0.5224, + "step": 2620 + }, + { + "epoch": 1.132766232367826, + "grad_norm": 0.5355139374732971, + "learning_rate": 1.4682781961266546e-05, + "loss": 0.5188, + "step": 2630 + }, + { + "epoch": 1.137073328308388, + "grad_norm": 0.5199829936027527, + "learning_rate": 1.4640759958118045e-05, + "loss": 0.5121, + "step": 2640 + }, + { + "epoch": 1.14138042424895, + "grad_norm": 0.5292408466339111, + "learning_rate": 1.4598633251842373e-05, + "loss": 0.5267, + "step": 2650 + }, + { + "epoch": 1.1456875201895123, + "grad_norm": 0.5363121032714844, + "learning_rate": 1.4556402792886856e-05, + "loss": 0.5147, + "step": 2660 + }, + { + "epoch": 1.1499946161300743, + "grad_norm": 0.5359490513801575, + "learning_rate": 1.4514069534039649e-05, + "loss": 0.5155, + "step": 2670 + }, + { + "epoch": 1.1543017120706365, + "grad_norm": 0.4707220792770386, + "learning_rate": 1.4471634430408244e-05, + "loss": 0.5419, + "step": 2680 + }, + { + "epoch": 1.1586088080111985, + "grad_norm": 0.4798811376094818, + "learning_rate": 1.4429098439397901e-05, + "loss": 0.5152, + "step": 2690 + }, + { + "epoch": 1.1629159039517605, + "grad_norm": 0.4730081260204315, + "learning_rate": 1.4386462520690087e-05, + "loss": 0.5283, + "step": 2700 + }, + { + "epoch": 1.1672229998923225, + "grad_norm": 0.524276614189148, + "learning_rate": 1.4343727636220785e-05, + "loss": 0.5087, + "step": 2710 + }, + { + "epoch": 1.1715300958328847, + "grad_norm": 0.5093454122543335, + "learning_rate": 1.430089475015882e-05, + "loss": 0.5371, + "step": 2720 + }, + { + "epoch": 1.1758371917734467, + "grad_norm": 0.5228180289268494, + "learning_rate": 1.4257964828884077e-05, + "loss": 0.5121, + "step": 2730 + }, + { + "epoch": 1.180144287714009, + "grad_norm": 0.5263434052467346, + "learning_rate": 1.4214938840965729e-05, + "loss": 0.5104, + "step": 2740 + }, + { + "epoch": 1.184451383654571, + "grad_norm": 0.5519675612449646, + "learning_rate": 1.417181775714036e-05, + "loss": 0.5081, + "step": 2750 + }, + { + "epoch": 1.188758479595133, + "grad_norm": 0.48901626467704773, + "learning_rate": 1.4128602550290078e-05, + "loss": 0.5332, + "step": 2760 + }, + { + "epoch": 1.1930655755356951, + "grad_norm": 0.5022098422050476, + "learning_rate": 1.4085294195420563e-05, + "loss": 0.5267, + "step": 2770 + }, + { + "epoch": 1.1973726714762571, + "grad_norm": 0.5244942307472229, + "learning_rate": 1.4041893669639053e-05, + "loss": 0.5309, + "step": 2780 + }, + { + "epoch": 1.2016797674168191, + "grad_norm": 0.5060109496116638, + "learning_rate": 1.399840195213233e-05, + "loss": 0.509, + "step": 2790 + }, + { + "epoch": 1.2059868633573814, + "grad_norm": 0.48709142208099365, + "learning_rate": 1.3954820024144595e-05, + "loss": 0.5249, + "step": 2800 + }, + { + "epoch": 1.2102939592979434, + "grad_norm": 0.48755279183387756, + "learning_rate": 1.3911148868955357e-05, + "loss": 0.5216, + "step": 2810 + }, + { + "epoch": 1.2146010552385054, + "grad_norm": 0.4871668219566345, + "learning_rate": 1.3867389471857229e-05, + "loss": 0.5199, + "step": 2820 + }, + { + "epoch": 1.2189081511790676, + "grad_norm": 0.5313363671302795, + "learning_rate": 1.3823542820133706e-05, + "loss": 0.5146, + "step": 2830 + }, + { + "epoch": 1.2232152471196296, + "grad_norm": 0.48473960161209106, + "learning_rate": 1.3779609903036894e-05, + "loss": 0.5126, + "step": 2840 + }, + { + "epoch": 1.2275223430601916, + "grad_norm": 0.5411814451217651, + "learning_rate": 1.3735591711765189e-05, + "loss": 0.5186, + "step": 2850 + }, + { + "epoch": 1.2318294390007538, + "grad_norm": 0.5286210775375366, + "learning_rate": 1.3691489239440899e-05, + "loss": 0.513, + "step": 2860 + }, + { + "epoch": 1.2361365349413158, + "grad_norm": 0.47112423181533813, + "learning_rate": 1.3647303481087858e-05, + "loss": 0.5268, + "step": 2870 + }, + { + "epoch": 1.2404436308818778, + "grad_norm": 0.5465208888053894, + "learning_rate": 1.3603035433608977e-05, + "loss": 0.5109, + "step": 2880 + }, + { + "epoch": 1.24475072682244, + "grad_norm": 0.4758882522583008, + "learning_rate": 1.3558686095763732e-05, + "loss": 0.5307, + "step": 2890 + }, + { + "epoch": 1.249057822763002, + "grad_norm": 0.5721794962882996, + "learning_rate": 1.3514256468145645e-05, + "loss": 0.5104, + "step": 2900 + }, + { + "epoch": 1.2533649187035643, + "grad_norm": 0.5125982761383057, + "learning_rate": 1.3469747553159714e-05, + "loss": 0.5278, + "step": 2910 + }, + { + "epoch": 1.2576720146441263, + "grad_norm": 0.5272653698921204, + "learning_rate": 1.342516035499978e-05, + "loss": 0.5276, + "step": 2920 + }, + { + "epoch": 1.2619791105846883, + "grad_norm": 0.5423816442489624, + "learning_rate": 1.3380495879625884e-05, + "loss": 0.5408, + "step": 2930 + }, + { + "epoch": 1.2662862065252503, + "grad_norm": 0.4817509055137634, + "learning_rate": 1.333575513474157e-05, + "loss": 0.5152, + "step": 2940 + }, + { + "epoch": 1.2705933024658125, + "grad_norm": 0.5113592147827148, + "learning_rate": 1.3290939129771143e-05, + "loss": 0.5397, + "step": 2950 + }, + { + "epoch": 1.2749003984063745, + "grad_norm": 0.5106224417686462, + "learning_rate": 1.3246048875836898e-05, + "loss": 0.5269, + "step": 2960 + }, + { + "epoch": 1.2792074943469367, + "grad_norm": 0.5446826219558716, + "learning_rate": 1.3201085385736313e-05, + "loss": 0.5252, + "step": 2970 + }, + { + "epoch": 1.2835145902874987, + "grad_norm": 0.484943151473999, + "learning_rate": 1.3156049673919184e-05, + "loss": 0.525, + "step": 2980 + }, + { + "epoch": 1.2878216862280607, + "grad_norm": 0.5692194700241089, + "learning_rate": 1.3110942756464764e-05, + "loss": 0.5197, + "step": 2990 + }, + { + "epoch": 1.2921287821686227, + "grad_norm": 0.5009827017784119, + "learning_rate": 1.3065765651058802e-05, + "loss": 0.5325, + "step": 3000 + }, + { + "epoch": 1.296435878109185, + "grad_norm": 0.4953298568725586, + "learning_rate": 1.3020519376970613e-05, + "loss": 0.5095, + "step": 3010 + }, + { + "epoch": 1.300742974049747, + "grad_norm": 0.5116891264915466, + "learning_rate": 1.2975204955030068e-05, + "loss": 0.5263, + "step": 3020 + }, + { + "epoch": 1.3050500699903091, + "grad_norm": 0.4844088554382324, + "learning_rate": 1.2929823407604567e-05, + "loss": 0.5113, + "step": 3030 + }, + { + "epoch": 1.3093571659308711, + "grad_norm": 0.4732029438018799, + "learning_rate": 1.2884375758575967e-05, + "loss": 0.532, + "step": 3040 + }, + { + "epoch": 1.3136642618714331, + "grad_norm": 0.5469485521316528, + "learning_rate": 1.2838863033317484e-05, + "loss": 0.519, + "step": 3050 + }, + { + "epoch": 1.3179713578119951, + "grad_norm": 0.4888254702091217, + "learning_rate": 1.2793286258670565e-05, + "loss": 0.5097, + "step": 3060 + }, + { + "epoch": 1.3222784537525574, + "grad_norm": 0.5359517335891724, + "learning_rate": 1.2747646462921717e-05, + "loss": 0.5246, + "step": 3070 + }, + { + "epoch": 1.3265855496931194, + "grad_norm": 0.5013801455497742, + "learning_rate": 1.2701944675779299e-05, + "loss": 0.524, + "step": 3080 + }, + { + "epoch": 1.3308926456336816, + "grad_norm": 0.49307557940483093, + "learning_rate": 1.2656181928350301e-05, + "loss": 0.5403, + "step": 3090 + }, + { + "epoch": 1.3351997415742436, + "grad_norm": 0.47625210881233215, + "learning_rate": 1.2610359253117078e-05, + "loss": 0.5275, + "step": 3100 + }, + { + "epoch": 1.3395068375148056, + "grad_norm": 0.5096368789672852, + "learning_rate": 1.2564477683914053e-05, + "loss": 0.5231, + "step": 3110 + }, + { + "epoch": 1.3438139334553676, + "grad_norm": 0.4992668926715851, + "learning_rate": 1.2518538255904389e-05, + "loss": 0.5235, + "step": 3120 + }, + { + "epoch": 1.3481210293959298, + "grad_norm": 0.491062194108963, + "learning_rate": 1.2472542005556647e-05, + "loss": 0.5432, + "step": 3130 + }, + { + "epoch": 1.3524281253364918, + "grad_norm": 0.48666131496429443, + "learning_rate": 1.2426489970621385e-05, + "loss": 0.531, + "step": 3140 + }, + { + "epoch": 1.356735221277054, + "grad_norm": 0.4706876575946808, + "learning_rate": 1.2380383190107757e-05, + "loss": 0.5188, + "step": 3150 + }, + { + "epoch": 1.361042317217616, + "grad_norm": 0.4910385310649872, + "learning_rate": 1.2334222704260063e-05, + "loss": 0.5106, + "step": 3160 + }, + { + "epoch": 1.365349413158178, + "grad_norm": 0.506514847278595, + "learning_rate": 1.2288009554534291e-05, + "loss": 0.5292, + "step": 3170 + }, + { + "epoch": 1.36965650909874, + "grad_norm": 0.49671700596809387, + "learning_rate": 1.2241744783574596e-05, + "loss": 0.5284, + "step": 3180 + }, + { + "epoch": 1.3739636050393023, + "grad_norm": 0.4892718195915222, + "learning_rate": 1.219542943518981e-05, + "loss": 0.5215, + "step": 3190 + }, + { + "epoch": 1.3782707009798643, + "grad_norm": 0.5412102937698364, + "learning_rate": 1.2149064554329864e-05, + "loss": 0.5256, + "step": 3200 + }, + { + "epoch": 1.3825777969204265, + "grad_norm": 0.4869970679283142, + "learning_rate": 1.2102651187062227e-05, + "loss": 0.5218, + "step": 3210 + }, + { + "epoch": 1.3868848928609885, + "grad_norm": 0.5195066332817078, + "learning_rate": 1.2056190380548299e-05, + "loss": 0.5269, + "step": 3220 + }, + { + "epoch": 1.3911919888015505, + "grad_norm": 0.5343438982963562, + "learning_rate": 1.2009683183019788e-05, + "loss": 0.5301, + "step": 3230 + }, + { + "epoch": 1.3954990847421127, + "grad_norm": 0.522270679473877, + "learning_rate": 1.1963130643755055e-05, + "loss": 0.545, + "step": 3240 + }, + { + "epoch": 1.3998061806826747, + "grad_norm": 0.501485288143158, + "learning_rate": 1.191653381305545e-05, + "loss": 0.5253, + "step": 3250 + }, + { + "epoch": 1.4041132766232367, + "grad_norm": 0.5288712382316589, + "learning_rate": 1.186989374222161e-05, + "loss": 0.5181, + "step": 3260 + }, + { + "epoch": 1.408420372563799, + "grad_norm": 0.5131502151489258, + "learning_rate": 1.1823211483529733e-05, + "loss": 0.5138, + "step": 3270 + }, + { + "epoch": 1.412727468504361, + "grad_norm": 0.4853404462337494, + "learning_rate": 1.1776488090207852e-05, + "loss": 0.5319, + "step": 3280 + }, + { + "epoch": 1.417034564444923, + "grad_norm": 0.5093010663986206, + "learning_rate": 1.1729724616412062e-05, + "loss": 0.5155, + "step": 3290 + }, + { + "epoch": 1.4213416603854852, + "grad_norm": 0.5078168511390686, + "learning_rate": 1.1682922117202736e-05, + "loss": 0.5206, + "step": 3300 + }, + { + "epoch": 1.4256487563260472, + "grad_norm": 0.5315324664115906, + "learning_rate": 1.163608164852073e-05, + "loss": 0.5314, + "step": 3310 + }, + { + "epoch": 1.4299558522666094, + "grad_norm": 0.4705192446708679, + "learning_rate": 1.1589204267163545e-05, + "loss": 0.4966, + "step": 3320 + }, + { + "epoch": 1.4342629482071714, + "grad_norm": 0.48757535219192505, + "learning_rate": 1.15422910307615e-05, + "loss": 0.5299, + "step": 3330 + }, + { + "epoch": 1.4385700441477334, + "grad_norm": 0.5582148432731628, + "learning_rate": 1.1495342997753864e-05, + "loss": 0.5201, + "step": 3340 + }, + { + "epoch": 1.4428771400882954, + "grad_norm": 0.5134326219558716, + "learning_rate": 1.1448361227364963e-05, + "loss": 0.5061, + "step": 3350 + }, + { + "epoch": 1.4471842360288576, + "grad_norm": 0.5316387414932251, + "learning_rate": 1.1401346779580303e-05, + "loss": 0.5145, + "step": 3360 + }, + { + "epoch": 1.4514913319694196, + "grad_norm": 0.5328738689422607, + "learning_rate": 1.1354300715122637e-05, + "loss": 0.5288, + "step": 3370 + }, + { + "epoch": 1.4557984279099818, + "grad_norm": 0.5279168486595154, + "learning_rate": 1.1307224095428058e-05, + "loss": 0.5031, + "step": 3380 + }, + { + "epoch": 1.4601055238505438, + "grad_norm": 0.5049686431884766, + "learning_rate": 1.1260117982622021e-05, + "loss": 0.5004, + "step": 3390 + }, + { + "epoch": 1.4644126197911058, + "grad_norm": 0.47000184655189514, + "learning_rate": 1.1212983439495392e-05, + "loss": 0.5267, + "step": 3400 + }, + { + "epoch": 1.4687197157316678, + "grad_norm": 0.49505382776260376, + "learning_rate": 1.1165821529480483e-05, + "loss": 0.5278, + "step": 3410 + }, + { + "epoch": 1.47302681167223, + "grad_norm": 0.568454384803772, + "learning_rate": 1.1118633316627037e-05, + "loss": 0.5116, + "step": 3420 + }, + { + "epoch": 1.477333907612792, + "grad_norm": 0.5094279646873474, + "learning_rate": 1.1071419865578241e-05, + "loss": 0.5181, + "step": 3430 + }, + { + "epoch": 1.4816410035533543, + "grad_norm": 0.5605435371398926, + "learning_rate": 1.1024182241546686e-05, + "loss": 0.5191, + "step": 3440 + }, + { + "epoch": 1.4859480994939163, + "grad_norm": 0.49941274523735046, + "learning_rate": 1.097692151029036e-05, + "loss": 0.5036, + "step": 3450 + }, + { + "epoch": 1.4902551954344783, + "grad_norm": 0.5064433813095093, + "learning_rate": 1.0929638738088571e-05, + "loss": 0.5195, + "step": 3460 + }, + { + "epoch": 1.4945622913750403, + "grad_norm": 0.5021061301231384, + "learning_rate": 1.088233499171792e-05, + "loss": 0.522, + "step": 3470 + }, + { + "epoch": 1.4988693873156025, + "grad_norm": 0.5188096761703491, + "learning_rate": 1.0835011338428217e-05, + "loss": 0.5156, + "step": 3480 + }, + { + "epoch": 1.5031764832561645, + "grad_norm": 0.6124559640884399, + "learning_rate": 1.0787668845918393e-05, + "loss": 0.5145, + "step": 3490 + }, + { + "epoch": 1.5074835791967267, + "grad_norm": 0.48937344551086426, + "learning_rate": 1.074030858231244e-05, + "loss": 0.515, + "step": 3500 + }, + { + "epoch": 1.5117906751372887, + "grad_norm": 0.518526017665863, + "learning_rate": 1.0692931616135283e-05, + "loss": 0.505, + "step": 3510 + }, + { + "epoch": 1.5160977710778507, + "grad_norm": 0.5395667552947998, + "learning_rate": 1.0645539016288686e-05, + "loss": 0.5076, + "step": 3520 + }, + { + "epoch": 1.5204048670184127, + "grad_norm": 0.495190292596817, + "learning_rate": 1.059813185202714e-05, + "loss": 0.523, + "step": 3530 + }, + { + "epoch": 1.524711962958975, + "grad_norm": 0.49644342064857483, + "learning_rate": 1.055071119293373e-05, + "loss": 0.5038, + "step": 3540 + }, + { + "epoch": 1.5290190588995372, + "grad_norm": 0.483696848154068, + "learning_rate": 1.0503278108896e-05, + "loss": 0.5103, + "step": 3550 + }, + { + "epoch": 1.5333261548400992, + "grad_norm": 0.5149986147880554, + "learning_rate": 1.0455833670081831e-05, + "loss": 0.5402, + "step": 3560 + }, + { + "epoch": 1.5376332507806612, + "grad_norm": 0.4734952449798584, + "learning_rate": 1.0408378946915282e-05, + "loss": 0.5292, + "step": 3570 + }, + { + "epoch": 1.5419403467212232, + "grad_norm": 0.5490080118179321, + "learning_rate": 1.0360915010052443e-05, + "loss": 0.5155, + "step": 3580 + }, + { + "epoch": 1.5462474426617852, + "grad_norm": 0.5176838636398315, + "learning_rate": 1.0313442930357278e-05, + "loss": 0.5111, + "step": 3590 + }, + { + "epoch": 1.5505545386023474, + "grad_norm": 0.5659157633781433, + "learning_rate": 1.026596377887747e-05, + "loss": 0.5152, + "step": 3600 + }, + { + "epoch": 1.5548616345429096, + "grad_norm": 0.5195504426956177, + "learning_rate": 1.0218478626820256e-05, + "loss": 0.5178, + "step": 3610 + }, + { + "epoch": 1.5591687304834716, + "grad_norm": 0.533338189125061, + "learning_rate": 1.0170988545528248e-05, + "loss": 0.5138, + "step": 3620 + }, + { + "epoch": 1.5634758264240336, + "grad_norm": 0.5108840465545654, + "learning_rate": 1.0123494606455278e-05, + "loss": 0.5273, + "step": 3630 + }, + { + "epoch": 1.5677829223645956, + "grad_norm": 0.4785379469394684, + "learning_rate": 1.0075997881142208e-05, + "loss": 0.5071, + "step": 3640 + }, + { + "epoch": 1.5720900183051576, + "grad_norm": 0.49497827887535095, + "learning_rate": 1.0028499441192765e-05, + "loss": 0.5132, + "step": 3650 + }, + { + "epoch": 1.5763971142457198, + "grad_norm": 0.5214102864265442, + "learning_rate": 9.981000358249368e-06, + "loss": 0.5133, + "step": 3660 + }, + { + "epoch": 1.580704210186282, + "grad_norm": 0.47462400794029236, + "learning_rate": 9.933501703968928e-06, + "loss": 0.5226, + "step": 3670 + }, + { + "epoch": 1.585011306126844, + "grad_norm": 0.4743979275226593, + "learning_rate": 9.8860045499987e-06, + "loss": 0.5219, + "step": 3680 + }, + { + "epoch": 1.589318402067406, + "grad_norm": 0.5265910625457764, + "learning_rate": 9.838509967952076e-06, + "loss": 0.4945, + "step": 3690 + }, + { + "epoch": 1.593625498007968, + "grad_norm": 0.5075172185897827, + "learning_rate": 9.791019029384437e-06, + "loss": 0.5175, + "step": 3700 + }, + { + "epoch": 1.59793259394853, + "grad_norm": 0.5206677913665771, + "learning_rate": 9.743532805768948e-06, + "loss": 0.5188, + "step": 3710 + }, + { + "epoch": 1.6022396898890923, + "grad_norm": 0.4802674651145935, + "learning_rate": 9.696052368472406e-06, + "loss": 0.5064, + "step": 3720 + }, + { + "epoch": 1.6065467858296545, + "grad_norm": 0.5289535522460938, + "learning_rate": 9.648578788731044e-06, + "loss": 0.5281, + "step": 3730 + }, + { + "epoch": 1.6108538817702165, + "grad_norm": 0.47722700238227844, + "learning_rate": 9.601113137626394e-06, + "loss": 0.5151, + "step": 3740 + }, + { + "epoch": 1.6151609777107785, + "grad_norm": 0.4994152784347534, + "learning_rate": 9.553656486061098e-06, + "loss": 0.52, + "step": 3750 + }, + { + "epoch": 1.6194680736513405, + "grad_norm": 0.48130089044570923, + "learning_rate": 9.506209904734753e-06, + "loss": 0.5336, + "step": 3760 + }, + { + "epoch": 1.6237751695919027, + "grad_norm": 0.48449528217315674, + "learning_rate": 9.45877446411976e-06, + "loss": 0.5252, + "step": 3770 + }, + { + "epoch": 1.6280822655324647, + "grad_norm": 0.5411643981933594, + "learning_rate": 9.411351234437163e-06, + "loss": 0.5187, + "step": 3780 + }, + { + "epoch": 1.632389361473027, + "grad_norm": 0.5133873820304871, + "learning_rate": 9.363941285632507e-06, + "loss": 0.5217, + "step": 3790 + }, + { + "epoch": 1.636696457413589, + "grad_norm": 0.5814666748046875, + "learning_rate": 9.3165456873517e-06, + "loss": 0.5, + "step": 3800 + }, + { + "epoch": 1.641003553354151, + "grad_norm": 0.52715665102005, + "learning_rate": 9.269165508916883e-06, + "loss": 0.5184, + "step": 3810 + }, + { + "epoch": 1.645310649294713, + "grad_norm": 0.48196879029273987, + "learning_rate": 9.221801819302288e-06, + "loss": 0.5191, + "step": 3820 + }, + { + "epoch": 1.6496177452352752, + "grad_norm": 0.49397778511047363, + "learning_rate": 9.174455687110142e-06, + "loss": 0.5013, + "step": 3830 + }, + { + "epoch": 1.6539248411758372, + "grad_norm": 0.5037091970443726, + "learning_rate": 9.127128180546548e-06, + "loss": 0.5298, + "step": 3840 + }, + { + "epoch": 1.6582319371163994, + "grad_norm": 0.5031833052635193, + "learning_rate": 9.079820367397384e-06, + "loss": 0.4929, + "step": 3850 + }, + { + "epoch": 1.6625390330569614, + "grad_norm": 0.5380353927612305, + "learning_rate": 9.032533315004207e-06, + "loss": 0.4968, + "step": 3860 + }, + { + "epoch": 1.6668461289975234, + "grad_norm": 0.5191226005554199, + "learning_rate": 8.98526809024018e-06, + "loss": 0.5267, + "step": 3870 + }, + { + "epoch": 1.6711532249380854, + "grad_norm": 0.5179468393325806, + "learning_rate": 8.938025759486007e-06, + "loss": 0.5159, + "step": 3880 + }, + { + "epoch": 1.6754603208786476, + "grad_norm": 0.4779166579246521, + "learning_rate": 8.89080738860585e-06, + "loss": 0.5211, + "step": 3890 + }, + { + "epoch": 1.6797674168192096, + "grad_norm": 0.5136571526527405, + "learning_rate": 8.843614042923318e-06, + "loss": 0.5003, + "step": 3900 + }, + { + "epoch": 1.6840745127597718, + "grad_norm": 0.540773332118988, + "learning_rate": 8.796446787197383e-06, + "loss": 0.5131, + "step": 3910 + }, + { + "epoch": 1.6883816087003338, + "grad_norm": 0.5126665234565735, + "learning_rate": 8.749306685598409e-06, + "loss": 0.5093, + "step": 3920 + }, + { + "epoch": 1.6926887046408958, + "grad_norm": 0.47659188508987427, + "learning_rate": 8.702194801684112e-06, + "loss": 0.5158, + "step": 3930 + }, + { + "epoch": 1.6969958005814578, + "grad_norm": 0.47945475578308105, + "learning_rate": 8.655112198375564e-06, + "loss": 0.5026, + "step": 3940 + }, + { + "epoch": 1.70130289652202, + "grad_norm": 0.4939498007297516, + "learning_rate": 8.60805993793323e-06, + "loss": 0.5099, + "step": 3950 + }, + { + "epoch": 1.7056099924625823, + "grad_norm": 0.5328351259231567, + "learning_rate": 8.561039081932975e-06, + "loss": 0.52, + "step": 3960 + }, + { + "epoch": 1.7099170884031443, + "grad_norm": 0.49865198135375977, + "learning_rate": 8.514050691242145e-06, + "loss": 0.5077, + "step": 3970 + }, + { + "epoch": 1.7142241843437063, + "grad_norm": 0.49807870388031006, + "learning_rate": 8.467095825995605e-06, + "loss": 0.4976, + "step": 3980 + }, + { + "epoch": 1.7185312802842683, + "grad_norm": 0.5023031234741211, + "learning_rate": 8.420175545571837e-06, + "loss": 0.5233, + "step": 3990 + }, + { + "epoch": 1.7228383762248303, + "grad_norm": 0.49054110050201416, + "learning_rate": 8.373290908569026e-06, + "loss": 0.5115, + "step": 4000 + }, + { + "epoch": 1.7271454721653925, + "grad_norm": 0.47637811303138733, + "learning_rate": 8.32644297278119e-06, + "loss": 0.5103, + "step": 4010 + }, + { + "epoch": 1.7314525681059547, + "grad_norm": 0.5239661931991577, + "learning_rate": 8.279632795174304e-06, + "loss": 0.5161, + "step": 4020 + }, + { + "epoch": 1.7357596640465167, + "grad_norm": 0.5000544190406799, + "learning_rate": 8.232861431862457e-06, + "loss": 0.5113, + "step": 4030 + }, + { + "epoch": 1.7400667599870787, + "grad_norm": 0.5361005067825317, + "learning_rate": 8.186129938084028e-06, + "loss": 0.5137, + "step": 4040 + }, + { + "epoch": 1.7443738559276407, + "grad_norm": 0.48270535469055176, + "learning_rate": 8.139439368177868e-06, + "loss": 0.5116, + "step": 4050 + }, + { + "epoch": 1.7486809518682027, + "grad_norm": 0.48645904660224915, + "learning_rate": 8.092790775559522e-06, + "loss": 0.517, + "step": 4060 + }, + { + "epoch": 1.752988047808765, + "grad_norm": 0.4865799844264984, + "learning_rate": 8.046185212697459e-06, + "loss": 0.5202, + "step": 4070 + }, + { + "epoch": 1.7572951437493272, + "grad_norm": 0.5095897912979126, + "learning_rate": 7.999623731089327e-06, + "loss": 0.5186, + "step": 4080 + }, + { + "epoch": 1.7616022396898892, + "grad_norm": 0.49918055534362793, + "learning_rate": 7.953107381238226e-06, + "loss": 0.5091, + "step": 4090 + }, + { + "epoch": 1.7659093356304512, + "grad_norm": 0.5209227204322815, + "learning_rate": 7.906637212629011e-06, + "loss": 0.5098, + "step": 4100 + }, + { + "epoch": 1.7702164315710132, + "grad_norm": 0.5320930480957031, + "learning_rate": 7.860214273704614e-06, + "loss": 0.5172, + "step": 4110 + }, + { + "epoch": 1.7745235275115752, + "grad_norm": 0.4841155707836151, + "learning_rate": 7.813839611842387e-06, + "loss": 0.4851, + "step": 4120 + }, + { + "epoch": 1.7788306234521374, + "grad_norm": 0.5300472378730774, + "learning_rate": 7.767514273330473e-06, + "loss": 0.4953, + "step": 4130 + }, + { + "epoch": 1.7831377193926996, + "grad_norm": 0.5021957159042358, + "learning_rate": 7.721239303344201e-06, + "loss": 0.5112, + "step": 4140 + }, + { + "epoch": 1.7874448153332616, + "grad_norm": 0.498737096786499, + "learning_rate": 7.675015745922499e-06, + "loss": 0.5045, + "step": 4150 + }, + { + "epoch": 1.7917519112738236, + "grad_norm": 0.4690532684326172, + "learning_rate": 7.628844643944349e-06, + "loss": 0.5102, + "step": 4160 + }, + { + "epoch": 1.7960590072143856, + "grad_norm": 0.5077162384986877, + "learning_rate": 7.582727039105255e-06, + "loss": 0.5105, + "step": 4170 + }, + { + "epoch": 1.8003661031549478, + "grad_norm": 0.47492554783821106, + "learning_rate": 7.536663971893724e-06, + "loss": 0.5008, + "step": 4180 + }, + { + "epoch": 1.8046731990955098, + "grad_norm": 0.5036799907684326, + "learning_rate": 7.4906564815678205e-06, + "loss": 0.5179, + "step": 4190 + }, + { + "epoch": 1.808980295036072, + "grad_norm": 0.5044455528259277, + "learning_rate": 7.444705606131697e-06, + "loss": 0.5171, + "step": 4200 + }, + { + "epoch": 1.813287390976634, + "grad_norm": 0.5645790696144104, + "learning_rate": 7.39881238231218e-06, + "loss": 0.5111, + "step": 4210 + }, + { + "epoch": 1.817594486917196, + "grad_norm": 0.4966265857219696, + "learning_rate": 7.352977845535387e-06, + "loss": 0.5144, + "step": 4220 + }, + { + "epoch": 1.821901582857758, + "grad_norm": 0.5225628614425659, + "learning_rate": 7.307203029903354e-06, + "loss": 0.5115, + "step": 4230 + }, + { + "epoch": 1.8262086787983203, + "grad_norm": 0.5282090902328491, + "learning_rate": 7.261488968170713e-06, + "loss": 0.5251, + "step": 4240 + }, + { + "epoch": 1.8305157747388823, + "grad_norm": 0.5346629023551941, + "learning_rate": 7.21583669172139e-06, + "loss": 0.5042, + "step": 4250 + }, + { + "epoch": 1.8348228706794445, + "grad_norm": 0.5141210556030273, + "learning_rate": 7.170247230545335e-06, + "loss": 0.5199, + "step": 4260 + }, + { + "epoch": 1.8391299666200065, + "grad_norm": 0.5251668691635132, + "learning_rate": 7.124721613215275e-06, + "loss": 0.4936, + "step": 4270 + }, + { + "epoch": 1.8434370625605685, + "grad_norm": 0.5125293731689453, + "learning_rate": 7.079260866863523e-06, + "loss": 0.5161, + "step": 4280 + }, + { + "epoch": 1.8477441585011305, + "grad_norm": 0.4881208837032318, + "learning_rate": 7.033866017158797e-06, + "loss": 0.5142, + "step": 4290 + }, + { + "epoch": 1.8520512544416927, + "grad_norm": 0.5215027928352356, + "learning_rate": 6.9885380882830735e-06, + "loss": 0.5097, + "step": 4300 + }, + { + "epoch": 1.8563583503822547, + "grad_norm": 0.4931368827819824, + "learning_rate": 6.943278102908491e-06, + "loss": 0.5123, + "step": 4310 + }, + { + "epoch": 1.860665446322817, + "grad_norm": 0.5080362558364868, + "learning_rate": 6.898087082174267e-06, + "loss": 0.5093, + "step": 4320 + }, + { + "epoch": 1.864972542263379, + "grad_norm": 0.537807285785675, + "learning_rate": 6.852966045663671e-06, + "loss": 0.5245, + "step": 4330 + }, + { + "epoch": 1.869279638203941, + "grad_norm": 0.5395597815513611, + "learning_rate": 6.807916011381008e-06, + "loss": 0.5016, + "step": 4340 + }, + { + "epoch": 1.873586734144503, + "grad_norm": 0.48623430728912354, + "learning_rate": 6.762937995728663e-06, + "loss": 0.4962, + "step": 4350 + }, + { + "epoch": 1.8778938300850652, + "grad_norm": 0.5058403611183167, + "learning_rate": 6.718033013484147e-06, + "loss": 0.5401, + "step": 4360 + }, + { + "epoch": 1.8822009260256274, + "grad_norm": 0.5220633149147034, + "learning_rate": 6.673202077777239e-06, + "loss": 0.5112, + "step": 4370 + }, + { + "epoch": 1.8865080219661894, + "grad_norm": 0.5163370966911316, + "learning_rate": 6.6284462000670924e-06, + "loss": 0.5231, + "step": 4380 + }, + { + "epoch": 1.8908151179067514, + "grad_norm": 0.508660614490509, + "learning_rate": 6.583766390119437e-06, + "loss": 0.5304, + "step": 4390 + }, + { + "epoch": 1.8951222138473134, + "grad_norm": 0.568144679069519, + "learning_rate": 6.539163655983786e-06, + "loss": 0.5086, + "step": 4400 + }, + { + "epoch": 1.8994293097878754, + "grad_norm": 0.5001341700553894, + "learning_rate": 6.494639003970701e-06, + "loss": 0.5084, + "step": 4410 + }, + { + "epoch": 1.9037364057284376, + "grad_norm": 0.5228297710418701, + "learning_rate": 6.450193438629078e-06, + "loss": 0.504, + "step": 4420 + }, + { + "epoch": 1.9080435016689998, + "grad_norm": 0.4816001057624817, + "learning_rate": 6.40582796272349e-06, + "loss": 0.5102, + "step": 4430 + }, + { + "epoch": 1.9123505976095618, + "grad_norm": 0.5058324933052063, + "learning_rate": 6.361543577211566e-06, + "loss": 0.524, + "step": 4440 + }, + { + "epoch": 1.9166576935501238, + "grad_norm": 0.5428106188774109, + "learning_rate": 6.317341281221392e-06, + "loss": 0.5082, + "step": 4450 + }, + { + "epoch": 1.9209647894906858, + "grad_norm": 0.5131290555000305, + "learning_rate": 6.273222072028991e-06, + "loss": 0.5316, + "step": 4460 + }, + { + "epoch": 1.9252718854312478, + "grad_norm": 0.5238609910011292, + "learning_rate": 6.2291869450358074e-06, + "loss": 0.5021, + "step": 4470 + }, + { + "epoch": 1.92957898137181, + "grad_norm": 0.4843258261680603, + "learning_rate": 6.1852368937462585e-06, + "loss": 0.5048, + "step": 4480 + }, + { + "epoch": 1.9338860773123723, + "grad_norm": 0.5138316750526428, + "learning_rate": 6.141372909745307e-06, + "loss": 0.5352, + "step": 4490 + }, + { + "epoch": 1.9381931732529343, + "grad_norm": 0.49319642782211304, + "learning_rate": 6.097595982676103e-06, + "loss": 0.5065, + "step": 4500 + }, + { + "epoch": 1.9425002691934963, + "grad_norm": 0.5176106095314026, + "learning_rate": 6.053907100217648e-06, + "loss": 0.5155, + "step": 4510 + }, + { + "epoch": 1.9468073651340583, + "grad_norm": 0.4772352874279022, + "learning_rate": 6.010307248062514e-06, + "loss": 0.5056, + "step": 4520 + }, + { + "epoch": 1.9511144610746203, + "grad_norm": 0.5366437435150146, + "learning_rate": 5.966797409894607e-06, + "loss": 0.4888, + "step": 4530 + }, + { + "epoch": 1.9554215570151825, + "grad_norm": 0.4917809069156647, + "learning_rate": 5.923378567366956e-06, + "loss": 0.5221, + "step": 4540 + }, + { + "epoch": 1.9597286529557447, + "grad_norm": 0.5597509741783142, + "learning_rate": 5.880051700079596e-06, + "loss": 0.5225, + "step": 4550 + }, + { + "epoch": 1.9640357488963067, + "grad_norm": 0.5258151888847351, + "learning_rate": 5.836817785557448e-06, + "loss": 0.5031, + "step": 4560 + }, + { + "epoch": 1.9683428448368687, + "grad_norm": 0.5679864287376404, + "learning_rate": 5.7936777992282565e-06, + "loss": 0.5074, + "step": 4570 + }, + { + "epoch": 1.9726499407774307, + "grad_norm": 0.5309889912605286, + "learning_rate": 5.750632714400607e-06, + "loss": 0.521, + "step": 4580 + }, + { + "epoch": 1.976957036717993, + "grad_norm": 0.5293132662773132, + "learning_rate": 5.707683502241936e-06, + "loss": 0.5133, + "step": 4590 + }, + { + "epoch": 1.981264132658555, + "grad_norm": 0.5223381519317627, + "learning_rate": 5.664831131756652e-06, + "loss": 0.5129, + "step": 4600 + }, + { + "epoch": 1.9855712285991172, + "grad_norm": 0.5365522503852844, + "learning_rate": 5.622076569764247e-06, + "loss": 0.504, + "step": 4610 + }, + { + "epoch": 1.9898783245396792, + "grad_norm": 0.5084212422370911, + "learning_rate": 5.5794207808774904e-06, + "loss": 0.488, + "step": 4620 + }, + { + "epoch": 1.9941854204802412, + "grad_norm": 0.4913804531097412, + "learning_rate": 5.536864727480683e-06, + "loss": 0.5098, + "step": 4630 + }, + { + "epoch": 1.9984925164208032, + "grad_norm": 0.5197212100028992, + "learning_rate": 5.4944093697079136e-06, + "loss": 0.5066, + "step": 4640 + }, + { + "epoch": 2.002799612361365, + "grad_norm": 0.51143479347229, + "learning_rate": 5.45205566542143e-06, + "loss": 0.4521, + "step": 4650 + }, + { + "epoch": 2.0071067083019276, + "grad_norm": 0.5107315182685852, + "learning_rate": 5.4098045701899934e-06, + "loss": 0.3968, + "step": 4660 + }, + { + "epoch": 2.0114138042424896, + "grad_norm": 0.5407351851463318, + "learning_rate": 5.367657037267354e-06, + "loss": 0.3933, + "step": 4670 + }, + { + "epoch": 2.0157209001830516, + "grad_norm": 0.5835046172142029, + "learning_rate": 5.325614017570712e-06, + "loss": 0.3897, + "step": 4680 + }, + { + "epoch": 2.0200279961236136, + "grad_norm": 0.5047739744186401, + "learning_rate": 5.283676459659288e-06, + "loss": 0.3992, + "step": 4690 + }, + { + "epoch": 2.0243350920641756, + "grad_norm": 0.5422953963279724, + "learning_rate": 5.241845309712921e-06, + "loss": 0.4131, + "step": 4700 + }, + { + "epoch": 2.0286421880047376, + "grad_norm": 0.5471384525299072, + "learning_rate": 5.2001215115106814e-06, + "loss": 0.3955, + "step": 4710 + }, + { + "epoch": 2.0329492839453, + "grad_norm": 0.5800908803939819, + "learning_rate": 5.158506006409644e-06, + "loss": 0.397, + "step": 4720 + }, + { + "epoch": 2.037256379885862, + "grad_norm": 0.5329377055168152, + "learning_rate": 5.116999733323591e-06, + "loss": 0.4017, + "step": 4730 + }, + { + "epoch": 2.041563475826424, + "grad_norm": 0.556845486164093, + "learning_rate": 5.075603628701869e-06, + "loss": 0.4009, + "step": 4740 + }, + { + "epoch": 2.045870571766986, + "grad_norm": 0.5501790642738342, + "learning_rate": 5.034318626508223e-06, + "loss": 0.3969, + "step": 4750 + }, + { + "epoch": 2.050177667707548, + "grad_norm": 0.5467825531959534, + "learning_rate": 4.993145658199766e-06, + "loss": 0.3996, + "step": 4760 + }, + { + "epoch": 2.05448476364811, + "grad_norm": 0.5644121766090393, + "learning_rate": 4.952085652705938e-06, + "loss": 0.3926, + "step": 4770 + }, + { + "epoch": 2.0587918595886725, + "grad_norm": 0.5279033780097961, + "learning_rate": 4.911139536407542e-06, + "loss": 0.3742, + "step": 4780 + }, + { + "epoch": 2.0630989555292345, + "grad_norm": 0.5283676981925964, + "learning_rate": 4.870308233115876e-06, + "loss": 0.3893, + "step": 4790 + }, + { + "epoch": 2.0674060514697965, + "grad_norm": 0.5302291512489319, + "learning_rate": 4.82959266405184e-06, + "loss": 0.3956, + "step": 4800 + }, + { + "epoch": 2.0717131474103585, + "grad_norm": 0.5381713509559631, + "learning_rate": 4.788993747825209e-06, + "loss": 0.4124, + "step": 4810 + }, + { + "epoch": 2.0760202433509205, + "grad_norm": 0.5772622227668762, + "learning_rate": 4.748512400413861e-06, + "loss": 0.405, + "step": 4820 + }, + { + "epoch": 2.0803273392914825, + "grad_norm": 0.5383191704750061, + "learning_rate": 4.708149535143138e-06, + "loss": 0.3874, + "step": 4830 + }, + { + "epoch": 2.084634435232045, + "grad_norm": 0.5546970963478088, + "learning_rate": 4.667906062665234e-06, + "loss": 0.3994, + "step": 4840 + }, + { + "epoch": 2.088941531172607, + "grad_norm": 0.5541481375694275, + "learning_rate": 4.627782890938632e-06, + "loss": 0.4073, + "step": 4850 + }, + { + "epoch": 2.093248627113169, + "grad_norm": 0.5656886100769043, + "learning_rate": 4.587780925207654e-06, + "loss": 0.3986, + "step": 4860 + }, + { + "epoch": 2.097555723053731, + "grad_norm": 0.5167860984802246, + "learning_rate": 4.5479010679819965e-06, + "loss": 0.3994, + "step": 4870 + }, + { + "epoch": 2.101862818994293, + "grad_norm": 0.585415780544281, + "learning_rate": 4.50814421901641e-06, + "loss": 0.3959, + "step": 4880 + }, + { + "epoch": 2.1061699149348554, + "grad_norm": 0.5390037894248962, + "learning_rate": 4.46851127529035e-06, + "loss": 0.393, + "step": 4890 + }, + { + "epoch": 2.1104770108754174, + "grad_norm": 0.5685362815856934, + "learning_rate": 4.42900313098779e-06, + "loss": 0.4031, + "step": 4900 + }, + { + "epoch": 2.1147841068159794, + "grad_norm": 0.5294394493103027, + "learning_rate": 4.389620677477023e-06, + "loss": 0.3926, + "step": 4910 + }, + { + "epoch": 2.1190912027565414, + "grad_norm": 0.5693227648735046, + "learning_rate": 4.3503648032905384e-06, + "loss": 0.3909, + "step": 4920 + }, + { + "epoch": 2.1233982986971034, + "grad_norm": 0.6294069886207581, + "learning_rate": 4.311236394105006e-06, + "loss": 0.3908, + "step": 4930 + }, + { + "epoch": 2.1277053946376654, + "grad_norm": 0.566862165927887, + "learning_rate": 4.27223633272126e-06, + "loss": 0.4019, + "step": 4940 + }, + { + "epoch": 2.132012490578228, + "grad_norm": 0.5680539608001709, + "learning_rate": 4.233365499044416e-06, + "loss": 0.3957, + "step": 4950 + }, + { + "epoch": 2.13631958651879, + "grad_norm": 0.5697780251502991, + "learning_rate": 4.194624770063985e-06, + "loss": 0.3876, + "step": 4960 + }, + { + "epoch": 2.140626682459352, + "grad_norm": 0.5857852697372437, + "learning_rate": 4.1560150198341174e-06, + "loss": 0.3986, + "step": 4970 + }, + { + "epoch": 2.144933778399914, + "grad_norm": 0.5707722306251526, + "learning_rate": 4.11753711945386e-06, + "loss": 0.4165, + "step": 4980 + }, + { + "epoch": 2.149240874340476, + "grad_norm": 0.5498836040496826, + "learning_rate": 4.079191937047511e-06, + "loss": 0.4236, + "step": 4990 + }, + { + "epoch": 2.153547970281038, + "grad_norm": 0.6008414626121521, + "learning_rate": 4.040980337745044e-06, + "loss": 0.3955, + "step": 5000 + }, + { + "epoch": 2.1578550662216003, + "grad_norm": 0.5871570110321045, + "learning_rate": 4.002903183662566e-06, + "loss": 0.3939, + "step": 5010 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.5556260347366333, + "learning_rate": 3.964961333882893e-06, + "loss": 0.4005, + "step": 5020 + }, + { + "epoch": 2.1664692581027243, + "grad_norm": 0.5592585206031799, + "learning_rate": 3.927155644436144e-06, + "loss": 0.4035, + "step": 5030 + }, + { + "epoch": 2.1707763540432863, + "grad_norm": 0.5638931393623352, + "learning_rate": 3.889486968280448e-06, + "loss": 0.3961, + "step": 5040 + }, + { + "epoch": 2.1750834499838483, + "grad_norm": 0.5473156571388245, + "learning_rate": 3.851956155282682e-06, + "loss": 0.3999, + "step": 5050 + }, + { + "epoch": 2.1793905459244103, + "grad_norm": 0.7088154554367065, + "learning_rate": 3.814564052199313e-06, + "loss": 0.3919, + "step": 5060 + }, + { + "epoch": 2.1836976418649727, + "grad_norm": 0.569315493106842, + "learning_rate": 3.777311502657279e-06, + "loss": 0.3924, + "step": 5070 + }, + { + "epoch": 2.1880047378055347, + "grad_norm": 0.6128218770027161, + "learning_rate": 3.7401993471349616e-06, + "loss": 0.4094, + "step": 5080 + }, + { + "epoch": 2.1923118337460967, + "grad_norm": 0.5971004962921143, + "learning_rate": 3.7032284229432325e-06, + "loss": 0.3786, + "step": 5090 + }, + { + "epoch": 2.1966189296866587, + "grad_norm": 0.5701526999473572, + "learning_rate": 3.666399564206541e-06, + "loss": 0.3912, + "step": 5100 + }, + { + "epoch": 2.2009260256272207, + "grad_norm": 0.5547009706497192, + "learning_rate": 3.6297136018441215e-06, + "loss": 0.3866, + "step": 5110 + }, + { + "epoch": 2.2052331215677827, + "grad_norm": 0.5613463521003723, + "learning_rate": 3.59317136355122e-06, + "loss": 0.3926, + "step": 5120 + }, + { + "epoch": 2.209540217508345, + "grad_norm": 0.6126610040664673, + "learning_rate": 3.556773673780446e-06, + "loss": 0.389, + "step": 5130 + }, + { + "epoch": 2.213847313448907, + "grad_norm": 0.5699272751808167, + "learning_rate": 3.520521353723142e-06, + "loss": 0.3982, + "step": 5140 + }, + { + "epoch": 2.218154409389469, + "grad_norm": 0.593333899974823, + "learning_rate": 3.484415221290889e-06, + "loss": 0.3826, + "step": 5150 + }, + { + "epoch": 2.222461505330031, + "grad_norm": 0.6188777685165405, + "learning_rate": 3.448456091097023e-06, + "loss": 0.4, + "step": 5160 + }, + { + "epoch": 2.226768601270593, + "grad_norm": 0.5949888825416565, + "learning_rate": 3.4126447744382753e-06, + "loss": 0.4062, + "step": 5170 + }, + { + "epoch": 2.231075697211155, + "grad_norm": 0.5788257718086243, + "learning_rate": 3.376982079276464e-06, + "loss": 0.3881, + "step": 5180 + }, + { + "epoch": 2.2353827931517176, + "grad_norm": 0.5726456642150879, + "learning_rate": 3.3414688102202564e-06, + "loss": 0.3968, + "step": 5190 + }, + { + "epoch": 2.2396898890922796, + "grad_norm": 0.5855600833892822, + "learning_rate": 3.3061057685070354e-06, + "loss": 0.3925, + "step": 5200 + }, + { + "epoch": 2.2439969850328416, + "grad_norm": 0.5823237299919128, + "learning_rate": 3.2708937519847916e-06, + "loss": 0.3875, + "step": 5210 + }, + { + "epoch": 2.2483040809734036, + "grad_norm": 0.5852989554405212, + "learning_rate": 3.23583355509416e-06, + "loss": 0.3985, + "step": 5220 + }, + { + "epoch": 2.2526111769139656, + "grad_norm": 0.5461825728416443, + "learning_rate": 3.200925968850459e-06, + "loss": 0.3917, + "step": 5230 + }, + { + "epoch": 2.256918272854528, + "grad_norm": 0.5536659359931946, + "learning_rate": 3.166171780825876e-06, + "loss": 0.3963, + "step": 5240 + }, + { + "epoch": 2.26122536879509, + "grad_norm": 0.5736192464828491, + "learning_rate": 3.1315717751316755e-06, + "loss": 0.4114, + "step": 5250 + }, + { + "epoch": 2.265532464735652, + "grad_norm": 0.5808764100074768, + "learning_rate": 3.097126732400515e-06, + "loss": 0.3795, + "step": 5260 + }, + { + "epoch": 2.269839560676214, + "grad_norm": 0.5790621042251587, + "learning_rate": 3.0628374297688436e-06, + "loss": 0.3991, + "step": 5270 + }, + { + "epoch": 2.274146656616776, + "grad_norm": 0.5211635231971741, + "learning_rate": 3.0287046408593478e-06, + "loss": 0.3796, + "step": 5280 + }, + { + "epoch": 2.278453752557338, + "grad_norm": 0.6152241230010986, + "learning_rate": 2.994729135763522e-06, + "loss": 0.3976, + "step": 5290 + }, + { + "epoch": 2.2827608484979, + "grad_norm": 0.6017261147499084, + "learning_rate": 2.9609116810242677e-06, + "loss": 0.4031, + "step": 5300 + }, + { + "epoch": 2.2870679444384625, + "grad_norm": 0.5612776279449463, + "learning_rate": 2.9272530396186194e-06, + "loss": 0.3985, + "step": 5310 + }, + { + "epoch": 2.2913750403790245, + "grad_norm": 0.6065710186958313, + "learning_rate": 2.893753970940525e-06, + "loss": 0.3975, + "step": 5320 + }, + { + "epoch": 2.2956821363195865, + "grad_norm": 0.5793972611427307, + "learning_rate": 2.8604152307837064e-06, + "loss": 0.3889, + "step": 5330 + }, + { + "epoch": 2.2999892322601485, + "grad_norm": 0.5591062307357788, + "learning_rate": 2.8272375713246125e-06, + "loss": 0.3903, + "step": 5340 + }, + { + "epoch": 2.3042963282007105, + "grad_norm": 0.5505937337875366, + "learning_rate": 2.794221741105446e-06, + "loss": 0.397, + "step": 5350 + }, + { + "epoch": 2.308603424141273, + "grad_norm": 0.6174246668815613, + "learning_rate": 2.7613684850172882e-06, + "loss": 0.3966, + "step": 5360 + }, + { + "epoch": 2.312910520081835, + "grad_norm": 0.6093124747276306, + "learning_rate": 2.7286785442832685e-06, + "loss": 0.3902, + "step": 5370 + }, + { + "epoch": 2.317217616022397, + "grad_norm": 0.5350244045257568, + "learning_rate": 2.696152656441868e-06, + "loss": 0.3935, + "step": 5380 + }, + { + "epoch": 2.321524711962959, + "grad_norm": 0.5422816276550293, + "learning_rate": 2.663791555330255e-06, + "loss": 0.3924, + "step": 5390 + }, + { + "epoch": 2.325831807903521, + "grad_norm": 0.5582048892974854, + "learning_rate": 2.6315959710677464e-06, + "loss": 0.397, + "step": 5400 + }, + { + "epoch": 2.330138903844083, + "grad_norm": 0.5601301789283752, + "learning_rate": 2.599566630039332e-06, + "loss": 0.3813, + "step": 5410 + }, + { + "epoch": 2.334445999784645, + "grad_norm": 0.5601345896720886, + "learning_rate": 2.567704254879274e-06, + "loss": 0.3974, + "step": 5420 + }, + { + "epoch": 2.3387530957252074, + "grad_norm": 0.614778459072113, + "learning_rate": 2.536009564454817e-06, + "loss": 0.3836, + "step": 5430 + }, + { + "epoch": 2.3430601916657694, + "grad_norm": 0.5759994983673096, + "learning_rate": 2.504483273849958e-06, + "loss": 0.3949, + "step": 5440 + }, + { + "epoch": 2.3473672876063314, + "grad_norm": 0.586625874042511, + "learning_rate": 2.473126094349331e-06, + "loss": 0.3829, + "step": 5450 + }, + { + "epoch": 2.3516743835468934, + "grad_norm": 0.5470960736274719, + "learning_rate": 2.4419387334221333e-06, + "loss": 0.3881, + "step": 5460 + }, + { + "epoch": 2.3559814794874554, + "grad_norm": 0.5486071705818176, + "learning_rate": 2.4109218947061884e-06, + "loss": 0.399, + "step": 5470 + }, + { + "epoch": 2.360288575428018, + "grad_norm": 0.5942230820655823, + "learning_rate": 2.3800762779920574e-06, + "loss": 0.3921, + "step": 5480 + }, + { + "epoch": 2.36459567136858, + "grad_norm": 0.5786502957344055, + "learning_rate": 2.3494025792072474e-06, + "loss": 0.3901, + "step": 5490 + }, + { + "epoch": 2.368902767309142, + "grad_norm": 0.6082814931869507, + "learning_rate": 2.3189014904005247e-06, + "loss": 0.391, + "step": 5500 + }, + { + "epoch": 2.373209863249704, + "grad_norm": 0.612694501876831, + "learning_rate": 2.2885736997262863e-06, + "loss": 0.3981, + "step": 5510 + }, + { + "epoch": 2.377516959190266, + "grad_norm": 0.5050374865531921, + "learning_rate": 2.2584198914290435e-06, + "loss": 0.3951, + "step": 5520 + }, + { + "epoch": 2.381824055130828, + "grad_norm": 0.5465214848518372, + "learning_rate": 2.2284407458279743e-06, + "loss": 0.4, + "step": 5530 + }, + { + "epoch": 2.3861311510713903, + "grad_norm": 0.5544529557228088, + "learning_rate": 2.1986369393015914e-06, + "loss": 0.3836, + "step": 5540 + }, + { + "epoch": 2.3904382470119523, + "grad_norm": 0.586337149143219, + "learning_rate": 2.169009144272467e-06, + "loss": 0.4139, + "step": 5550 + }, + { + "epoch": 2.3947453429525143, + "grad_norm": 0.6219981908798218, + "learning_rate": 2.1395580291920625e-06, + "loss": 0.4011, + "step": 5560 + }, + { + "epoch": 2.3990524388930763, + "grad_norm": 0.6941688060760498, + "learning_rate": 2.110284258525658e-06, + "loss": 0.405, + "step": 5570 + }, + { + "epoch": 2.4033595348336383, + "grad_norm": 0.5210332274436951, + "learning_rate": 2.081188492737345e-06, + "loss": 0.4017, + "step": 5580 + }, + { + "epoch": 2.4076666307742007, + "grad_norm": 0.5930879712104797, + "learning_rate": 2.0522713882751445e-06, + "loss": 0.3918, + "step": 5590 + }, + { + "epoch": 2.4119737267147627, + "grad_norm": 0.5910641551017761, + "learning_rate": 2.0235335975561775e-06, + "loss": 0.3996, + "step": 5600 + }, + { + "epoch": 2.4162808226553247, + "grad_norm": 0.5827698111534119, + "learning_rate": 1.9949757689519555e-06, + "loss": 0.3854, + "step": 5610 + }, + { + "epoch": 2.4205879185958867, + "grad_norm": 0.5518185496330261, + "learning_rate": 1.966598546773757e-06, + "loss": 0.4077, + "step": 5620 + }, + { + "epoch": 2.4248950145364487, + "grad_norm": 0.6005439162254333, + "learning_rate": 1.938402571258073e-06, + "loss": 0.4095, + "step": 5630 + }, + { + "epoch": 2.4292021104770107, + "grad_norm": 0.5761522054672241, + "learning_rate": 1.9103884785521887e-06, + "loss": 0.3966, + "step": 5640 + }, + { + "epoch": 2.4335092064175727, + "grad_norm": 0.5546764135360718, + "learning_rate": 1.8825569006998012e-06, + "loss": 0.395, + "step": 5650 + }, + { + "epoch": 2.437816302358135, + "grad_norm": 0.5639533996582031, + "learning_rate": 1.8549084656267846e-06, + "loss": 0.3938, + "step": 5660 + }, + { + "epoch": 2.442123398298697, + "grad_norm": 0.5662581324577332, + "learning_rate": 1.8274437971270044e-06, + "loss": 0.4004, + "step": 5670 + }, + { + "epoch": 2.446430494239259, + "grad_norm": 0.5856819748878479, + "learning_rate": 1.8001635148482621e-06, + "loss": 0.3946, + "step": 5680 + }, + { + "epoch": 2.450737590179821, + "grad_norm": 0.5766512751579285, + "learning_rate": 1.7730682342782967e-06, + "loss": 0.3931, + "step": 5690 + }, + { + "epoch": 2.455044686120383, + "grad_norm": 0.6373909711837769, + "learning_rate": 1.7461585667309045e-06, + "loss": 0.4006, + "step": 5700 + }, + { + "epoch": 2.4593517820609456, + "grad_norm": 0.5694748759269714, + "learning_rate": 1.719435119332159e-06, + "loss": 0.3989, + "step": 5710 + }, + { + "epoch": 2.4636588780015076, + "grad_norm": 0.5339934229850769, + "learning_rate": 1.6928984950066918e-06, + "loss": 0.3966, + "step": 5720 + }, + { + "epoch": 2.4679659739420696, + "grad_norm": 0.5888383388519287, + "learning_rate": 1.6665492924641113e-06, + "loss": 0.3833, + "step": 5730 + }, + { + "epoch": 2.4722730698826316, + "grad_norm": 0.5573282241821289, + "learning_rate": 1.6403881061854732e-06, + "loss": 0.4, + "step": 5740 + }, + { + "epoch": 2.4765801658231936, + "grad_norm": 0.5756634473800659, + "learning_rate": 1.6144155264098883e-06, + "loss": 0.3964, + "step": 5750 + }, + { + "epoch": 2.4808872617637556, + "grad_norm": 0.5784355401992798, + "learning_rate": 1.58863213912119e-06, + "loss": 0.3762, + "step": 5760 + }, + { + "epoch": 2.4851943577043176, + "grad_norm": 0.6090006828308105, + "learning_rate": 1.563038526034727e-06, + "loss": 0.3986, + "step": 5770 + }, + { + "epoch": 2.48950145364488, + "grad_norm": 0.5565779209136963, + "learning_rate": 1.5376352645842242e-06, + "loss": 0.3916, + "step": 5780 + }, + { + "epoch": 2.493808549585442, + "grad_norm": 0.6107103228569031, + "learning_rate": 1.5124229279087655e-06, + "loss": 0.4093, + "step": 5790 + }, + { + "epoch": 2.498115645526004, + "grad_norm": 0.5300205945968628, + "learning_rate": 1.487402084839864e-06, + "loss": 0.4047, + "step": 5800 + }, + { + "epoch": 2.502422741466566, + "grad_norm": 0.6008495688438416, + "learning_rate": 1.4625732998886178e-06, + "loss": 0.4023, + "step": 5810 + }, + { + "epoch": 2.5067298374071285, + "grad_norm": 0.5560673475265503, + "learning_rate": 1.437937133232985e-06, + "loss": 0.3968, + "step": 5820 + }, + { + "epoch": 2.5110369333476905, + "grad_norm": 0.5503118634223938, + "learning_rate": 1.413494140705136e-06, + "loss": 0.3876, + "step": 5830 + }, + { + "epoch": 2.5153440292882525, + "grad_norm": 0.5559957027435303, + "learning_rate": 1.3892448737789243e-06, + "loss": 0.392, + "step": 5840 + }, + { + "epoch": 2.5196511252288145, + "grad_norm": 0.5354902148246765, + "learning_rate": 1.365189879557426e-06, + "loss": 0.3988, + "step": 5850 + }, + { + "epoch": 2.5239582211693765, + "grad_norm": 0.577046275138855, + "learning_rate": 1.3413297007606196e-06, + "loss": 0.3948, + "step": 5860 + }, + { + "epoch": 2.5282653171099385, + "grad_norm": 0.5745800733566284, + "learning_rate": 1.3176648757131205e-06, + "loss": 0.395, + "step": 5870 + }, + { + "epoch": 2.5325724130505005, + "grad_norm": 0.5721185207366943, + "learning_rate": 1.2941959383320478e-06, + "loss": 0.3918, + "step": 5880 + }, + { + "epoch": 2.5368795089910625, + "grad_norm": 0.5935482978820801, + "learning_rate": 1.2709234181149765e-06, + "loss": 0.376, + "step": 5890 + }, + { + "epoch": 2.541186604931625, + "grad_norm": 0.5709375143051147, + "learning_rate": 1.2478478401279848e-06, + "loss": 0.3881, + "step": 5900 + }, + { + "epoch": 2.545493700872187, + "grad_norm": 0.5233684182167053, + "learning_rate": 1.2249697249938197e-06, + "loss": 0.3945, + "step": 5910 + }, + { + "epoch": 2.549800796812749, + "grad_norm": 0.5812388062477112, + "learning_rate": 1.2022895888801333e-06, + "loss": 0.3984, + "step": 5920 + }, + { + "epoch": 2.554107892753311, + "grad_norm": 0.560550332069397, + "learning_rate": 1.1798079434878584e-06, + "loss": 0.3942, + "step": 5930 + }, + { + "epoch": 2.5584149886938734, + "grad_norm": 0.6010858416557312, + "learning_rate": 1.1575252960396422e-06, + "loss": 0.3851, + "step": 5940 + }, + { + "epoch": 2.5627220846344354, + "grad_norm": 0.5857875347137451, + "learning_rate": 1.1354421492684252e-06, + "loss": 0.3993, + "step": 5950 + }, + { + "epoch": 2.5670291805749974, + "grad_norm": 0.604179859161377, + "learning_rate": 1.1135590014060772e-06, + "loss": 0.388, + "step": 5960 + }, + { + "epoch": 2.5713362765155594, + "grad_norm": 0.569106936454773, + "learning_rate": 1.0918763461721648e-06, + "loss": 0.4014, + "step": 5970 + }, + { + "epoch": 2.5756433724561214, + "grad_norm": 0.5742547512054443, + "learning_rate": 1.0703946727628234e-06, + "loss": 0.3839, + "step": 5980 + }, + { + "epoch": 2.5799504683966834, + "grad_norm": 0.5561407208442688, + "learning_rate": 1.0491144658397e-06, + "loss": 0.3853, + "step": 5990 + }, + { + "epoch": 2.5842575643372454, + "grad_norm": 0.5482295155525208, + "learning_rate": 1.0280362055190341e-06, + "loss": 0.3876, + "step": 6000 + }, + { + "epoch": 2.588564660277808, + "grad_norm": 0.5737982392311096, + "learning_rate": 1.0071603673608176e-06, + "loss": 0.4059, + "step": 6010 + }, + { + "epoch": 2.59287175621837, + "grad_norm": 0.547715961933136, + "learning_rate": 9.864874223580668e-07, + "loss": 0.3837, + "step": 6020 + }, + { + "epoch": 2.597178852158932, + "grad_norm": 0.607851505279541, + "learning_rate": 9.66017836926203e-07, + "loss": 0.3779, + "step": 6030 + }, + { + "epoch": 2.601485948099494, + "grad_norm": 0.5557613968849182, + "learning_rate": 9.457520728925151e-07, + "loss": 0.3995, + "step": 6040 + }, + { + "epoch": 2.605793044040056, + "grad_norm": 0.5470052361488342, + "learning_rate": 9.256905874857535e-07, + "loss": 0.3916, + "step": 6050 + }, + { + "epoch": 2.6101001399806183, + "grad_norm": 0.5718830227851868, + "learning_rate": 9.058338333258032e-07, + "loss": 0.3997, + "step": 6060 + }, + { + "epoch": 2.6144072359211803, + "grad_norm": 0.5838637948036194, + "learning_rate": 8.861822584134882e-07, + "loss": 0.39, + "step": 6070 + }, + { + "epoch": 2.6187143318617423, + "grad_norm": 0.5819488763809204, + "learning_rate": 8.667363061204415e-07, + "loss": 0.4028, + "step": 6080 + }, + { + "epoch": 2.6230214278023043, + "grad_norm": 0.5477743744850159, + "learning_rate": 8.474964151791232e-07, + "loss": 0.3979, + "step": 6090 + }, + { + "epoch": 2.6273285237428663, + "grad_norm": 0.6217262744903564, + "learning_rate": 8.284630196729059e-07, + "loss": 0.3993, + "step": 6100 + }, + { + "epoch": 2.6316356196834283, + "grad_norm": 0.5514227747917175, + "learning_rate": 8.096365490262925e-07, + "loss": 0.4058, + "step": 6110 + }, + { + "epoch": 2.6359427156239903, + "grad_norm": 0.645946204662323, + "learning_rate": 7.910174279952232e-07, + "loss": 0.3992, + "step": 6120 + }, + { + "epoch": 2.6402498115645527, + "grad_norm": 0.5741420984268188, + "learning_rate": 7.726060766574883e-07, + "loss": 0.3938, + "step": 6130 + }, + { + "epoch": 2.6445569075051147, + "grad_norm": 0.5910946726799011, + "learning_rate": 7.544029104032558e-07, + "loss": 0.3898, + "step": 6140 + }, + { + "epoch": 2.6488640034456767, + "grad_norm": 0.5803595185279846, + "learning_rate": 7.364083399256971e-07, + "loss": 0.388, + "step": 6150 + }, + { + "epoch": 2.6531710993862387, + "grad_norm": 0.596809446811676, + "learning_rate": 7.186227712117266e-07, + "loss": 0.388, + "step": 6160 + }, + { + "epoch": 2.6574781953268007, + "grad_norm": 0.6213387250900269, + "learning_rate": 7.010466055328313e-07, + "loss": 0.3839, + "step": 6170 + }, + { + "epoch": 2.661785291267363, + "grad_norm": 0.5913180112838745, + "learning_rate": 6.836802394360276e-07, + "loss": 0.3989, + "step": 6180 + }, + { + "epoch": 2.666092387207925, + "grad_norm": 0.6089721322059631, + "learning_rate": 6.665240647349125e-07, + "loss": 0.4039, + "step": 6190 + }, + { + "epoch": 2.670399483148487, + "grad_norm": 0.5730729103088379, + "learning_rate": 6.495784685008133e-07, + "loss": 0.3951, + "step": 6200 + }, + { + "epoch": 2.674706579089049, + "grad_norm": 0.5562758445739746, + "learning_rate": 6.32843833054072e-07, + "loss": 0.3837, + "step": 6210 + }, + { + "epoch": 2.679013675029611, + "grad_norm": 0.5627213716506958, + "learning_rate": 6.16320535955407e-07, + "loss": 0.3712, + "step": 6220 + }, + { + "epoch": 2.683320770970173, + "grad_norm": 0.559660017490387, + "learning_rate": 6.000089499973971e-07, + "loss": 0.3901, + "step": 6230 + }, + { + "epoch": 2.687627866910735, + "grad_norm": 0.6018761992454529, + "learning_rate": 5.839094431960713e-07, + "loss": 0.383, + "step": 6240 + }, + { + "epoch": 2.6919349628512976, + "grad_norm": 0.5534284710884094, + "learning_rate": 5.680223787826089e-07, + "loss": 0.3925, + "step": 6250 + }, + { + "epoch": 2.6962420587918596, + "grad_norm": 0.5682888031005859, + "learning_rate": 5.523481151951427e-07, + "loss": 0.3929, + "step": 6260 + }, + { + "epoch": 2.7005491547324216, + "grad_norm": 0.6271238923072815, + "learning_rate": 5.368870060706677e-07, + "loss": 0.3942, + "step": 6270 + }, + { + "epoch": 2.7048562506729836, + "grad_norm": 0.5881267786026001, + "learning_rate": 5.216394002370695e-07, + "loss": 0.3876, + "step": 6280 + }, + { + "epoch": 2.709163346613546, + "grad_norm": 0.6085900068283081, + "learning_rate": 5.066056417052445e-07, + "loss": 0.3958, + "step": 6290 + }, + { + "epoch": 2.713470442554108, + "grad_norm": 0.5912172198295593, + "learning_rate": 4.917860696613541e-07, + "loss": 0.3887, + "step": 6300 + }, + { + "epoch": 2.71777753849467, + "grad_norm": 0.6698789596557617, + "learning_rate": 4.771810184591541e-07, + "loss": 0.3899, + "step": 6310 + }, + { + "epoch": 2.722084634435232, + "grad_norm": 0.5682712197303772, + "learning_rate": 4.627908176124618e-07, + "loss": 0.3826, + "step": 6320 + }, + { + "epoch": 2.726391730375794, + "grad_norm": 0.5702280402183533, + "learning_rate": 4.486157917877232e-07, + "loss": 0.3908, + "step": 6330 + }, + { + "epoch": 2.730698826316356, + "grad_norm": 0.5540564060211182, + "learning_rate": 4.346562607966787e-07, + "loss": 0.3962, + "step": 6340 + }, + { + "epoch": 2.735005922256918, + "grad_norm": 0.6031074523925781, + "learning_rate": 4.209125395891589e-07, + "loss": 0.3791, + "step": 6350 + }, + { + "epoch": 2.73931301819748, + "grad_norm": 0.5727553963661194, + "learning_rate": 4.0738493824596715e-07, + "loss": 0.4023, + "step": 6360 + }, + { + "epoch": 2.7436201141380425, + "grad_norm": 0.5374717116355896, + "learning_rate": 3.940737619718937e-07, + "loss": 0.38, + "step": 6370 + }, + { + "epoch": 2.7479272100786045, + "grad_norm": 0.5720168352127075, + "learning_rate": 3.809793110888249e-07, + "loss": 0.4011, + "step": 6380 + }, + { + "epoch": 2.7522343060191665, + "grad_norm": 0.5751203894615173, + "learning_rate": 3.6810188102896605e-07, + "loss": 0.3941, + "step": 6390 + }, + { + "epoch": 2.7565414019597285, + "grad_norm": 0.5838513970375061, + "learning_rate": 3.554417623281825e-07, + "loss": 0.3834, + "step": 6400 + }, + { + "epoch": 2.760848497900291, + "grad_norm": 0.6204310059547424, + "learning_rate": 3.429992406194338e-07, + "loss": 0.3933, + "step": 6410 + }, + { + "epoch": 2.765155593840853, + "grad_norm": 0.6237754225730896, + "learning_rate": 3.3077459662634205e-07, + "loss": 0.3911, + "step": 6420 + }, + { + "epoch": 2.769462689781415, + "grad_norm": 0.561553418636322, + "learning_rate": 3.1876810615684705e-07, + "loss": 0.3847, + "step": 6430 + }, + { + "epoch": 2.773769785721977, + "grad_norm": 0.568580150604248, + "learning_rate": 3.069800400969947e-07, + "loss": 0.3967, + "step": 6440 + }, + { + "epoch": 2.778076881662539, + "grad_norm": 0.6103531122207642, + "learning_rate": 2.954106644048127e-07, + "loss": 0.3731, + "step": 6450 + }, + { + "epoch": 2.782383977603101, + "grad_norm": 0.560199499130249, + "learning_rate": 2.840602401043213e-07, + "loss": 0.3889, + "step": 6460 + }, + { + "epoch": 2.786691073543663, + "grad_norm": 0.5612174868583679, + "learning_rate": 2.7292902327963776e-07, + "loss": 0.3915, + "step": 6470 + }, + { + "epoch": 2.7909981694842254, + "grad_norm": 0.5860500335693359, + "learning_rate": 2.620172650692021e-07, + "loss": 0.4063, + "step": 6480 + }, + { + "epoch": 2.7953052654247874, + "grad_norm": 0.6044652462005615, + "learning_rate": 2.513252116601062e-07, + "loss": 0.39, + "step": 6490 + }, + { + "epoch": 2.7996123613653494, + "grad_norm": 0.5966377258300781, + "learning_rate": 2.408531042825446e-07, + "loss": 0.3965, + "step": 6500 + }, + { + "epoch": 2.8039194573059114, + "grad_norm": 0.5729289650917053, + "learning_rate": 2.3060117920437164e-07, + "loss": 0.3798, + "step": 6510 + }, + { + "epoch": 2.8082265532464734, + "grad_norm": 0.6403810977935791, + "learning_rate": 2.2056966772576626e-07, + "loss": 0.4096, + "step": 6520 + }, + { + "epoch": 2.812533649187036, + "grad_norm": 0.5852852463722229, + "learning_rate": 2.1075879617401984e-07, + "loss": 0.383, + "step": 6530 + }, + { + "epoch": 2.816840745127598, + "grad_norm": 0.6858223080635071, + "learning_rate": 2.0116878589842236e-07, + "loss": 0.3763, + "step": 6540 + }, + { + "epoch": 2.82114784106816, + "grad_norm": 0.5583459138870239, + "learning_rate": 1.917998532652765e-07, + "loss": 0.4007, + "step": 6550 + }, + { + "epoch": 2.825454937008722, + "grad_norm": 0.6212313175201416, + "learning_rate": 1.8265220965300812e-07, + "loss": 0.3946, + "step": 6560 + }, + { + "epoch": 2.829762032949284, + "grad_norm": 0.5777102112770081, + "learning_rate": 1.7372606144740567e-07, + "loss": 0.3908, + "step": 6570 + }, + { + "epoch": 2.834069128889846, + "grad_norm": 0.5885289311408997, + "learning_rate": 1.6502161003695615e-07, + "loss": 0.4051, + "step": 6580 + }, + { + "epoch": 2.838376224830408, + "grad_norm": 0.6133362054824829, + "learning_rate": 1.5653905180830432e-07, + "loss": 0.3909, + "step": 6590 + }, + { + "epoch": 2.8426833207709703, + "grad_norm": 0.5662548542022705, + "learning_rate": 1.48278578141825e-07, + "loss": 0.3689, + "step": 6600 + }, + { + "epoch": 2.8469904167115323, + "grad_norm": 0.5703479647636414, + "learning_rate": 1.4024037540730006e-07, + "loss": 0.3812, + "step": 6610 + }, + { + "epoch": 2.8512975126520943, + "grad_norm": 0.5604844689369202, + "learning_rate": 1.324246249597183e-07, + "loss": 0.3992, + "step": 6620 + }, + { + "epoch": 2.8556046085926563, + "grad_norm": 0.6033147573471069, + "learning_rate": 1.2483150313517766e-07, + "loss": 0.3937, + "step": 6630 + }, + { + "epoch": 2.8599117045332187, + "grad_norm": 0.5846080780029297, + "learning_rate": 1.1746118124691508e-07, + "loss": 0.4123, + "step": 6640 + }, + { + "epoch": 2.8642188004737807, + "grad_norm": 0.63025963306427, + "learning_rate": 1.103138255814329e-07, + "loss": 0.3998, + "step": 6650 + }, + { + "epoch": 2.8685258964143427, + "grad_norm": 0.5580465197563171, + "learning_rate": 1.0338959739475296e-07, + "loss": 0.4007, + "step": 6660 + }, + { + "epoch": 2.8728329923549047, + "grad_norm": 0.5767059326171875, + "learning_rate": 9.66886529087785e-08, + "loss": 0.4008, + "step": 6670 + }, + { + "epoch": 2.8771400882954667, + "grad_norm": 0.583044707775116, + "learning_rate": 9.021114330776348e-08, + "loss": 0.403, + "step": 6680 + }, + { + "epoch": 2.8814471842360287, + "grad_norm": 0.5440847873687744, + "learning_rate": 8.395721473490992e-08, + "loss": 0.3839, + "step": 6690 + }, + { + "epoch": 2.8857542801765907, + "grad_norm": 0.55162513256073, + "learning_rate": 7.792700828906374e-08, + "loss": 0.4017, + "step": 6700 + }, + { + "epoch": 2.8900613761171527, + "grad_norm": 0.5817933082580566, + "learning_rate": 7.212066002153518e-08, + "loss": 0.4009, + "step": 6710 + }, + { + "epoch": 2.894368472057715, + "grad_norm": 0.6080750226974487, + "learning_rate": 6.653830093302782e-08, + "loss": 0.3964, + "step": 6720 + }, + { + "epoch": 2.898675567998277, + "grad_norm": 0.5681482553482056, + "learning_rate": 6.11800569706833e-08, + "loss": 0.4003, + "step": 6730 + }, + { + "epoch": 2.902982663938839, + "grad_norm": 0.5769705176353455, + "learning_rate": 5.604604902524235e-08, + "loss": 0.4017, + "step": 6740 + }, + { + "epoch": 2.907289759879401, + "grad_norm": 0.546116828918457, + "learning_rate": 5.113639292831152e-08, + "loss": 0.3828, + "step": 6750 + }, + { + "epoch": 2.9115968558199636, + "grad_norm": 0.590798020362854, + "learning_rate": 4.645119944975296e-08, + "loss": 0.3853, + "step": 6760 + }, + { + "epoch": 2.9159039517605256, + "grad_norm": 0.5748469233512878, + "learning_rate": 4.1990574295187606e-08, + "loss": 0.4107, + "step": 6770 + }, + { + "epoch": 2.9202110477010876, + "grad_norm": 0.5733410716056824, + "learning_rate": 3.7754618103608144e-08, + "loss": 0.4052, + "step": 6780 + }, + { + "epoch": 2.9245181436416496, + "grad_norm": 0.5576743483543396, + "learning_rate": 3.374342644510531e-08, + "loss": 0.3846, + "step": 6790 + }, + { + "epoch": 2.9288252395822116, + "grad_norm": 0.596834123134613, + "learning_rate": 2.9957089818718476e-08, + "loss": 0.4029, + "step": 6800 + }, + { + "epoch": 2.9331323355227736, + "grad_norm": 0.5680873990058899, + "learning_rate": 2.639569365038841e-08, + "loss": 0.381, + "step": 6810 + }, + { + "epoch": 2.9374394314633356, + "grad_norm": 0.5597060918807983, + "learning_rate": 2.305931829102992e-08, + "loss": 0.3974, + "step": 6820 + }, + { + "epoch": 2.941746527403898, + "grad_norm": 0.5827191472053528, + "learning_rate": 1.9948039014724417e-08, + "loss": 0.3973, + "step": 6830 + }, + { + "epoch": 2.94605362334446, + "grad_norm": 0.6119829416275024, + "learning_rate": 1.706192601701462e-08, + "loss": 0.3984, + "step": 6840 + }, + { + "epoch": 2.950360719285022, + "grad_norm": 0.602497935295105, + "learning_rate": 1.4401044413324682e-08, + "loss": 0.4086, + "step": 6850 + }, + { + "epoch": 2.954667815225584, + "grad_norm": 0.5783790349960327, + "learning_rate": 1.1965454237493623e-08, + "loss": 0.3945, + "step": 6860 + }, + { + "epoch": 2.958974911166146, + "grad_norm": 0.5653091073036194, + "learning_rate": 9.755210440413055e-09, + "loss": 0.3938, + "step": 6870 + }, + { + "epoch": 2.9632820071067085, + "grad_norm": 0.5716846585273743, + "learning_rate": 7.770362888795957e-09, + "loss": 0.3935, + "step": 6880 + }, + { + "epoch": 2.9675891030472705, + "grad_norm": 0.6015262603759766, + "learning_rate": 6.0109563640442515e-09, + "loss": 0.3955, + "step": 6890 + }, + { + "epoch": 2.9718961989878325, + "grad_norm": 0.5763514041900635, + "learning_rate": 4.477030561246265e-09, + "loss": 0.4069, + "step": 6900 + }, + { + "epoch": 2.9762032949283945, + "grad_norm": 0.5644577741622925, + "learning_rate": 3.168620088271901e-09, + "loss": 0.3921, + "step": 6910 + }, + { + "epoch": 2.9805103908689565, + "grad_norm": 0.5302848219871521, + "learning_rate": 2.0857544650010332e-09, + "loss": 0.404, + "step": 6920 + }, + { + "epoch": 2.9848174868095185, + "grad_norm": 0.6025976538658142, + "learning_rate": 1.2284581226507108e-09, + "loss": 0.4037, + "step": 6930 + }, + { + "epoch": 2.9891245827500805, + "grad_norm": 0.5681896805763245, + "learning_rate": 5.967504032267091e-10, + "loss": 0.4031, + "step": 6940 + }, + { + "epoch": 2.993431678690643, + "grad_norm": 0.5708478093147278, + "learning_rate": 1.906455590883205e-10, + "loss": 0.4206, + "step": 6950 + }, + { + "epoch": 2.997738774631205, + "grad_norm": 0.5966918468475342, + "learning_rate": 1.015275262306048e-11, + "loss": 0.4014, + "step": 6960 + } + ], + "logging_steps": 10, + "max_steps": 6963, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7523782707118080.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}