{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99876492383697, "eval_steps": 500, "global_step": 5463, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005489227391244683, "grad_norm": 1.3054485321044922, "learning_rate": 9.140767824497258e-07, "loss": 1.7028, "step": 10 }, { "epoch": 0.010978454782489365, "grad_norm": 1.6326860189437866, "learning_rate": 1.8281535648994516e-06, "loss": 1.6575, "step": 20 }, { "epoch": 0.016467682173734045, "grad_norm": 0.9034647345542908, "learning_rate": 2.7422303473491773e-06, "loss": 1.3904, "step": 30 }, { "epoch": 0.02195690956497873, "grad_norm": 0.8317720293998718, "learning_rate": 3.6563071297989032e-06, "loss": 1.2568, "step": 40 }, { "epoch": 0.027446136956223412, "grad_norm": 0.8646258115768433, "learning_rate": 4.570383912248629e-06, "loss": 1.2404, "step": 50 }, { "epoch": 0.03293536434746809, "grad_norm": 0.7493156790733337, "learning_rate": 5.484460694698355e-06, "loss": 1.1597, "step": 60 }, { "epoch": 0.03842459173871278, "grad_norm": 0.7537096738815308, "learning_rate": 6.398537477148081e-06, "loss": 1.1168, "step": 70 }, { "epoch": 0.04391381912995746, "grad_norm": 0.6768060922622681, "learning_rate": 7.3126142595978065e-06, "loss": 1.0531, "step": 80 }, { "epoch": 0.04940304652120214, "grad_norm": 0.8540539145469666, "learning_rate": 8.226691042047533e-06, "loss": 1.0974, "step": 90 }, { "epoch": 0.054892273912446825, "grad_norm": 0.7654123306274414, "learning_rate": 9.140767824497258e-06, "loss": 1.0468, "step": 100 }, { "epoch": 0.06038150130369151, "grad_norm": 0.838114857673645, "learning_rate": 1.0054844606946984e-05, "loss": 1.0481, "step": 110 }, { "epoch": 0.06587072869493618, "grad_norm": 0.7839793562889099, "learning_rate": 1.096892138939671e-05, "loss": 1.0444, "step": 120 }, { "epoch": 0.07135995608618087, "grad_norm": 1.0483232736587524, "learning_rate": 1.1882998171846435e-05, "loss": 1.005, "step": 130 }, { "epoch": 0.07684918347742556, "grad_norm": 0.9476339221000671, "learning_rate": 1.2797074954296162e-05, "loss": 1.0538, "step": 140 }, { "epoch": 0.08233841086867023, "grad_norm": 0.8280003070831299, "learning_rate": 1.3711151736745886e-05, "loss": 1.0122, "step": 150 }, { "epoch": 0.08782763825991492, "grad_norm": 0.8112940788269043, "learning_rate": 1.4625228519195613e-05, "loss": 0.9613, "step": 160 }, { "epoch": 0.0933168656511596, "grad_norm": 0.9424939155578613, "learning_rate": 1.553930530164534e-05, "loss": 0.9637, "step": 170 }, { "epoch": 0.09880609304240429, "grad_norm": 0.781250536441803, "learning_rate": 1.6453382084095066e-05, "loss": 0.9984, "step": 180 }, { "epoch": 0.10429532043364896, "grad_norm": 0.9252836108207703, "learning_rate": 1.7367458866544793e-05, "loss": 0.9984, "step": 190 }, { "epoch": 0.10978454782489365, "grad_norm": 0.9257864356040955, "learning_rate": 1.8281535648994517e-05, "loss": 0.949, "step": 200 }, { "epoch": 0.11527377521613832, "grad_norm": 1.042043924331665, "learning_rate": 1.9195612431444244e-05, "loss": 1.0031, "step": 210 }, { "epoch": 0.12076300260738301, "grad_norm": 1.0521234273910522, "learning_rate": 2.0109689213893968e-05, "loss": 0.9751, "step": 220 }, { "epoch": 0.1262522299986277, "grad_norm": 0.865064263343811, "learning_rate": 2.1023765996343695e-05, "loss": 0.9564, "step": 230 }, { "epoch": 0.13174145738987236, "grad_norm": 0.8879236578941345, "learning_rate": 2.193784277879342e-05, "loss": 0.9182, "step": 240 }, { "epoch": 0.13723068478111705, "grad_norm": 0.9224317669868469, "learning_rate": 2.2851919561243146e-05, "loss": 0.9037, "step": 250 }, { "epoch": 0.14271991217236174, "grad_norm": 0.8295108675956726, "learning_rate": 2.376599634369287e-05, "loss": 0.9708, "step": 260 }, { "epoch": 0.14820913956360643, "grad_norm": 0.7987868785858154, "learning_rate": 2.4680073126142597e-05, "loss": 0.9611, "step": 270 }, { "epoch": 0.15369836695485112, "grad_norm": 0.774760901927948, "learning_rate": 2.5594149908592324e-05, "loss": 0.9872, "step": 280 }, { "epoch": 0.15918759434609578, "grad_norm": 0.7601301670074463, "learning_rate": 2.6508226691042048e-05, "loss": 0.9, "step": 290 }, { "epoch": 0.16467682173734047, "grad_norm": 0.9270791411399841, "learning_rate": 2.742230347349177e-05, "loss": 0.8798, "step": 300 }, { "epoch": 0.17016604912858516, "grad_norm": 0.873102605342865, "learning_rate": 2.8336380255941502e-05, "loss": 0.8962, "step": 310 }, { "epoch": 0.17565527651982984, "grad_norm": 0.9427269101142883, "learning_rate": 2.9250457038391226e-05, "loss": 0.886, "step": 320 }, { "epoch": 0.1811445039110745, "grad_norm": 0.8019095659255981, "learning_rate": 3.016453382084095e-05, "loss": 0.8335, "step": 330 }, { "epoch": 0.1866337313023192, "grad_norm": 0.9028713703155518, "learning_rate": 3.107861060329068e-05, "loss": 0.8946, "step": 340 }, { "epoch": 0.19212295869356388, "grad_norm": 1.0009723901748657, "learning_rate": 3.1992687385740404e-05, "loss": 0.9274, "step": 350 }, { "epoch": 0.19761218608480857, "grad_norm": 0.7785693407058716, "learning_rate": 3.290676416819013e-05, "loss": 0.8001, "step": 360 }, { "epoch": 0.20310141347605323, "grad_norm": 0.9450286030769348, "learning_rate": 3.382084095063985e-05, "loss": 0.9036, "step": 370 }, { "epoch": 0.20859064086729792, "grad_norm": 0.899732232093811, "learning_rate": 3.4734917733089586e-05, "loss": 0.8944, "step": 380 }, { "epoch": 0.2140798682585426, "grad_norm": 1.35003662109375, "learning_rate": 3.5648994515539306e-05, "loss": 0.8581, "step": 390 }, { "epoch": 0.2195690956497873, "grad_norm": 1.1555213928222656, "learning_rate": 3.656307129798903e-05, "loss": 0.8412, "step": 400 }, { "epoch": 0.225058323041032, "grad_norm": 0.8920039534568787, "learning_rate": 3.7477148080438754e-05, "loss": 0.9105, "step": 410 }, { "epoch": 0.23054755043227665, "grad_norm": 0.8022063970565796, "learning_rate": 3.839122486288849e-05, "loss": 0.9217, "step": 420 }, { "epoch": 0.23603677782352134, "grad_norm": 1.1498247385025024, "learning_rate": 3.930530164533821e-05, "loss": 0.9259, "step": 430 }, { "epoch": 0.24152600521476603, "grad_norm": 1.0198287963867188, "learning_rate": 4.0219378427787935e-05, "loss": 0.8857, "step": 440 }, { "epoch": 0.24701523260601072, "grad_norm": 0.9331903457641602, "learning_rate": 4.113345521023766e-05, "loss": 0.8754, "step": 450 }, { "epoch": 0.2525044599972554, "grad_norm": 0.9897291660308838, "learning_rate": 4.204753199268739e-05, "loss": 0.8769, "step": 460 }, { "epoch": 0.2579936873885001, "grad_norm": 1.6721230745315552, "learning_rate": 4.296160877513711e-05, "loss": 0.8992, "step": 470 }, { "epoch": 0.2634829147797447, "grad_norm": 1.1787182092666626, "learning_rate": 4.387568555758684e-05, "loss": 0.8918, "step": 480 }, { "epoch": 0.2689721421709894, "grad_norm": 1.0543595552444458, "learning_rate": 4.4789762340036564e-05, "loss": 0.8167, "step": 490 }, { "epoch": 0.2744613695622341, "grad_norm": 0.9777544140815735, "learning_rate": 4.570383912248629e-05, "loss": 0.8888, "step": 500 }, { "epoch": 0.2799505969534788, "grad_norm": 0.9173258543014526, "learning_rate": 4.661791590493602e-05, "loss": 0.8296, "step": 510 }, { "epoch": 0.2854398243447235, "grad_norm": 1.0830740928649902, "learning_rate": 4.753199268738574e-05, "loss": 0.8703, "step": 520 }, { "epoch": 0.29092905173596817, "grad_norm": 1.115646243095398, "learning_rate": 4.844606946983547e-05, "loss": 0.9081, "step": 530 }, { "epoch": 0.29641827912721286, "grad_norm": 1.219681739807129, "learning_rate": 4.936014625228519e-05, "loss": 0.9165, "step": 540 }, { "epoch": 0.30190750651845755, "grad_norm": 1.178253173828125, "learning_rate": 4.999995405604411e-05, "loss": 0.8977, "step": 550 }, { "epoch": 0.30739673390970224, "grad_norm": 1.3987079858779907, "learning_rate": 4.999913727930364e-05, "loss": 0.8527, "step": 560 }, { "epoch": 0.31288596130094687, "grad_norm": 1.095534324645996, "learning_rate": 4.999729956415998e-05, "loss": 0.8716, "step": 570 }, { "epoch": 0.31837518869219156, "grad_norm": 1.1343433856964111, "learning_rate": 4.9994440985663475e-05, "loss": 0.8402, "step": 580 }, { "epoch": 0.32386441608343625, "grad_norm": 1.177049994468689, "learning_rate": 4.9990561660555454e-05, "loss": 0.8629, "step": 590 }, { "epoch": 0.32935364347468093, "grad_norm": 1.091205358505249, "learning_rate": 4.998566174726347e-05, "loss": 0.7973, "step": 600 }, { "epoch": 0.3348428708659256, "grad_norm": 1.3899606466293335, "learning_rate": 4.997974144589481e-05, "loss": 0.8956, "step": 610 }, { "epoch": 0.3403320982571703, "grad_norm": 1.2220797538757324, "learning_rate": 4.997280099822833e-05, "loss": 0.794, "step": 620 }, { "epoch": 0.345821325648415, "grad_norm": 1.0186364650726318, "learning_rate": 4.996484068770461e-05, "loss": 0.7641, "step": 630 }, { "epoch": 0.3513105530396597, "grad_norm": 1.3602491617202759, "learning_rate": 4.9955860839414324e-05, "loss": 0.8582, "step": 640 }, { "epoch": 0.3567997804309043, "grad_norm": 1.2544053792953491, "learning_rate": 4.994586182008501e-05, "loss": 0.8087, "step": 650 }, { "epoch": 0.362289007822149, "grad_norm": 1.25338876247406, "learning_rate": 4.993484403806609e-05, "loss": 0.8814, "step": 660 }, { "epoch": 0.3677782352133937, "grad_norm": 1.6701184511184692, "learning_rate": 4.9922807943312135e-05, "loss": 0.8039, "step": 670 }, { "epoch": 0.3732674626046384, "grad_norm": 1.2474104166030884, "learning_rate": 4.990975402736457e-05, "loss": 0.8411, "step": 680 }, { "epoch": 0.3787566899958831, "grad_norm": 1.0203585624694824, "learning_rate": 4.9895682823331564e-05, "loss": 0.7838, "step": 690 }, { "epoch": 0.38424591738712777, "grad_norm": 1.2643638849258423, "learning_rate": 4.988059490586624e-05, "loss": 0.7802, "step": 700 }, { "epoch": 0.38973514477837246, "grad_norm": 1.2018098831176758, "learning_rate": 4.986449089114325e-05, "loss": 0.8049, "step": 710 }, { "epoch": 0.39522437216961714, "grad_norm": 1.5600682497024536, "learning_rate": 4.984737143683356e-05, "loss": 0.864, "step": 720 }, { "epoch": 0.40071359956086183, "grad_norm": 1.1996121406555176, "learning_rate": 4.982923724207764e-05, "loss": 0.8222, "step": 730 }, { "epoch": 0.40620282695210647, "grad_norm": 1.2239071130752563, "learning_rate": 4.9810089047456873e-05, "loss": 0.7757, "step": 740 }, { "epoch": 0.41169205434335115, "grad_norm": 1.278192162513733, "learning_rate": 4.978992763496334e-05, "loss": 0.7693, "step": 750 }, { "epoch": 0.41718128173459584, "grad_norm": 1.3768647909164429, "learning_rate": 4.976875382796786e-05, "loss": 0.7927, "step": 760 }, { "epoch": 0.42267050912584053, "grad_norm": 1.3257420063018799, "learning_rate": 4.974656849118638e-05, "loss": 0.7997, "step": 770 }, { "epoch": 0.4281597365170852, "grad_norm": 1.4355076551437378, "learning_rate": 4.972337253064466e-05, "loss": 0.7719, "step": 780 }, { "epoch": 0.4336489639083299, "grad_norm": 1.0469034910202026, "learning_rate": 4.969916689364128e-05, "loss": 0.8203, "step": 790 }, { "epoch": 0.4391381912995746, "grad_norm": 1.4641021490097046, "learning_rate": 4.9673952568708906e-05, "loss": 0.8303, "step": 800 }, { "epoch": 0.4446274186908193, "grad_norm": 1.6394554376602173, "learning_rate": 4.964773058557399e-05, "loss": 0.8693, "step": 810 }, { "epoch": 0.450116646082064, "grad_norm": 1.777869462966919, "learning_rate": 4.9620502015114675e-05, "loss": 0.7929, "step": 820 }, { "epoch": 0.4556058734733086, "grad_norm": 1.161238670349121, "learning_rate": 4.959226796931706e-05, "loss": 0.8393, "step": 830 }, { "epoch": 0.4610951008645533, "grad_norm": 1.5231930017471313, "learning_rate": 4.95630296012298e-05, "loss": 0.8195, "step": 840 }, { "epoch": 0.466584328255798, "grad_norm": 1.446094274520874, "learning_rate": 4.953278810491701e-05, "loss": 0.8157, "step": 850 }, { "epoch": 0.4720735556470427, "grad_norm": 1.702967882156372, "learning_rate": 4.950154471540951e-05, "loss": 0.7932, "step": 860 }, { "epoch": 0.47756278303828736, "grad_norm": 1.3679907321929932, "learning_rate": 4.9469300708654385e-05, "loss": 0.7741, "step": 870 }, { "epoch": 0.48305201042953205, "grad_norm": 1.1557847261428833, "learning_rate": 4.943605740146286e-05, "loss": 0.8406, "step": 880 }, { "epoch": 0.48854123782077674, "grad_norm": 1.4791802167892456, "learning_rate": 4.940181615145655e-05, "loss": 0.7731, "step": 890 }, { "epoch": 0.49403046521202143, "grad_norm": 1.3994717597961426, "learning_rate": 4.936657835701198e-05, "loss": 0.7903, "step": 900 }, { "epoch": 0.49951969260326606, "grad_norm": 1.2580246925354004, "learning_rate": 4.933034545720354e-05, "loss": 0.7601, "step": 910 }, { "epoch": 0.5050089199945108, "grad_norm": 1.4461493492126465, "learning_rate": 4.9293118931744624e-05, "loss": 0.8246, "step": 920 }, { "epoch": 0.5104981473857555, "grad_norm": 1.9255192279815674, "learning_rate": 4.925490030092729e-05, "loss": 0.7729, "step": 930 }, { "epoch": 0.5159873747770002, "grad_norm": 1.2568154335021973, "learning_rate": 4.9215691125560104e-05, "loss": 0.7711, "step": 940 }, { "epoch": 0.5214766021682449, "grad_norm": 1.2998193502426147, "learning_rate": 4.917549300690445e-05, "loss": 0.7897, "step": 950 }, { "epoch": 0.5269658295594895, "grad_norm": 1.712433099746704, "learning_rate": 4.9134307586609104e-05, "loss": 0.7356, "step": 960 }, { "epoch": 0.5324550569507341, "grad_norm": 1.4403119087219238, "learning_rate": 4.9092136546643184e-05, "loss": 0.7599, "step": 970 }, { "epoch": 0.5379442843419788, "grad_norm": 1.2811603546142578, "learning_rate": 4.9048981609227504e-05, "loss": 0.7572, "step": 980 }, { "epoch": 0.5434335117332235, "grad_norm": 1.6650887727737427, "learning_rate": 4.9004844536764185e-05, "loss": 0.7726, "step": 990 }, { "epoch": 0.5489227391244682, "grad_norm": 1.4498590230941772, "learning_rate": 4.8959727131764735e-05, "loss": 0.7772, "step": 1000 }, { "epoch": 0.5544119665157129, "grad_norm": 1.38353431224823, "learning_rate": 4.891363123677638e-05, "loss": 0.7954, "step": 1010 }, { "epoch": 0.5599011939069576, "grad_norm": 1.5972951650619507, "learning_rate": 4.886655873430687e-05, "loss": 0.759, "step": 1020 }, { "epoch": 0.5653904212982023, "grad_norm": 1.409515380859375, "learning_rate": 4.881851154674757e-05, "loss": 0.675, "step": 1030 }, { "epoch": 0.570879648689447, "grad_norm": 1.2562367916107178, "learning_rate": 4.876949163629494e-05, "loss": 0.8194, "step": 1040 }, { "epoch": 0.5763688760806917, "grad_norm": 1.520317554473877, "learning_rate": 4.871950100487043e-05, "loss": 0.7587, "step": 1050 }, { "epoch": 0.5818581034719363, "grad_norm": 1.5166853666305542, "learning_rate": 4.866854169403871e-05, "loss": 0.6909, "step": 1060 }, { "epoch": 0.587347330863181, "grad_norm": 1.4219826459884644, "learning_rate": 4.861661578492429e-05, "loss": 0.7907, "step": 1070 }, { "epoch": 0.5928365582544257, "grad_norm": 1.449629545211792, "learning_rate": 4.856372539812655e-05, "loss": 0.7512, "step": 1080 }, { "epoch": 0.5983257856456704, "grad_norm": 1.715462565422058, "learning_rate": 4.850987269363311e-05, "loss": 0.7171, "step": 1090 }, { "epoch": 0.6038150130369151, "grad_norm": 1.6240124702453613, "learning_rate": 4.845505987073161e-05, "loss": 0.763, "step": 1100 }, { "epoch": 0.6093042404281598, "grad_norm": 1.3949427604675293, "learning_rate": 4.839928916791996e-05, "loss": 0.7513, "step": 1110 }, { "epoch": 0.6147934678194045, "grad_norm": 1.491368293762207, "learning_rate": 4.834256286281482e-05, "loss": 0.6982, "step": 1120 }, { "epoch": 0.620282695210649, "grad_norm": 1.2943052053451538, "learning_rate": 4.82848832720587e-05, "loss": 0.8051, "step": 1130 }, { "epoch": 0.6257719226018937, "grad_norm": 1.7091878652572632, "learning_rate": 4.8226252751225245e-05, "loss": 0.7914, "step": 1140 }, { "epoch": 0.6312611499931384, "grad_norm": 1.2987576723098755, "learning_rate": 4.816667369472309e-05, "loss": 0.7705, "step": 1150 }, { "epoch": 0.6367503773843831, "grad_norm": 1.4213101863861084, "learning_rate": 4.810614853569807e-05, "loss": 0.7916, "step": 1160 }, { "epoch": 0.6422396047756278, "grad_norm": 1.4974167346954346, "learning_rate": 4.804467974593387e-05, "loss": 0.7628, "step": 1170 }, { "epoch": 0.6477288321668725, "grad_norm": 1.729684591293335, "learning_rate": 4.798226983575103e-05, "loss": 0.7393, "step": 1180 }, { "epoch": 0.6532180595581172, "grad_norm": 1.765308141708374, "learning_rate": 4.7918921353904464e-05, "loss": 0.7251, "step": 1190 }, { "epoch": 0.6587072869493619, "grad_norm": 1.7703893184661865, "learning_rate": 4.785463688747937e-05, "loss": 0.7329, "step": 1200 }, { "epoch": 0.6641965143406066, "grad_norm": 2.700155258178711, "learning_rate": 4.778941906178556e-05, "loss": 0.6967, "step": 1210 }, { "epoch": 0.6696857417318512, "grad_norm": 1.3553398847579956, "learning_rate": 4.772327054025027e-05, "loss": 0.7221, "step": 1220 }, { "epoch": 0.6751749691230959, "grad_norm": 1.2455166578292847, "learning_rate": 4.765619402430934e-05, "loss": 0.6925, "step": 1230 }, { "epoch": 0.6806641965143406, "grad_norm": 1.7047752141952515, "learning_rate": 4.758819225329696e-05, "loss": 0.7373, "step": 1240 }, { "epoch": 0.6861534239055853, "grad_norm": 1.5384269952774048, "learning_rate": 4.751926800433374e-05, "loss": 0.7348, "step": 1250 }, { "epoch": 0.69164265129683, "grad_norm": 1.491666316986084, "learning_rate": 4.744942409221333e-05, "loss": 0.7121, "step": 1260 }, { "epoch": 0.6971318786880747, "grad_norm": 1.4360090494155884, "learning_rate": 4.7378663369287445e-05, "loss": 0.6728, "step": 1270 }, { "epoch": 0.7026211060793194, "grad_norm": 1.3977197408676147, "learning_rate": 4.730698872534938e-05, "loss": 0.7617, "step": 1280 }, { "epoch": 0.7081103334705641, "grad_norm": 1.7353872060775757, "learning_rate": 4.723440308751601e-05, "loss": 0.6887, "step": 1290 }, { "epoch": 0.7135995608618086, "grad_norm": 1.3200151920318604, "learning_rate": 4.716090942010823e-05, "loss": 0.752, "step": 1300 }, { "epoch": 0.7190887882530533, "grad_norm": 1.333355188369751, "learning_rate": 4.708651072452993e-05, "loss": 0.7336, "step": 1310 }, { "epoch": 0.724578015644298, "grad_norm": 1.6440070867538452, "learning_rate": 4.701121003914537e-05, "loss": 0.7333, "step": 1320 }, { "epoch": 0.7300672430355427, "grad_norm": 1.848791480064392, "learning_rate": 4.693501043915514e-05, "loss": 0.7648, "step": 1330 }, { "epoch": 0.7355564704267874, "grad_norm": 1.593891978263855, "learning_rate": 4.685791503647052e-05, "loss": 0.787, "step": 1340 }, { "epoch": 0.7410456978180321, "grad_norm": 1.6957751512527466, "learning_rate": 4.6779926979586475e-05, "loss": 0.7212, "step": 1350 }, { "epoch": 0.7465349252092768, "grad_norm": 1.3588330745697021, "learning_rate": 4.6701049453453e-05, "loss": 0.7175, "step": 1360 }, { "epoch": 0.7520241526005215, "grad_norm": 1.462112307548523, "learning_rate": 4.662128567934509e-05, "loss": 0.7133, "step": 1370 }, { "epoch": 0.7575133799917662, "grad_norm": 1.5633749961853027, "learning_rate": 4.654063891473115e-05, "loss": 0.6978, "step": 1380 }, { "epoch": 0.7630026073830108, "grad_norm": 1.7605217695236206, "learning_rate": 4.645911245314e-05, "loss": 0.7019, "step": 1390 }, { "epoch": 0.7684918347742555, "grad_norm": 1.6843842267990112, "learning_rate": 4.637670962402636e-05, "loss": 0.7483, "step": 1400 }, { "epoch": 0.7739810621655002, "grad_norm": 2.015845537185669, "learning_rate": 4.629343379263487e-05, "loss": 0.7208, "step": 1410 }, { "epoch": 0.7794702895567449, "grad_norm": 1.8158447742462158, "learning_rate": 4.620928835986267e-05, "loss": 0.7733, "step": 1420 }, { "epoch": 0.7849595169479896, "grad_norm": 1.7793387174606323, "learning_rate": 4.6124276762120485e-05, "loss": 0.7111, "step": 1430 }, { "epoch": 0.7904487443392343, "grad_norm": 1.6674373149871826, "learning_rate": 4.603840247119233e-05, "loss": 0.6663, "step": 1440 }, { "epoch": 0.795937971730479, "grad_norm": 1.4028520584106445, "learning_rate": 4.595166899409368e-05, "loss": 0.7692, "step": 1450 }, { "epoch": 0.8014271991217237, "grad_norm": 1.6022142171859741, "learning_rate": 4.5864079872928265e-05, "loss": 0.7305, "step": 1460 }, { "epoch": 0.8069164265129684, "grad_norm": 1.4971508979797363, "learning_rate": 4.577563868474344e-05, "loss": 0.6875, "step": 1470 }, { "epoch": 0.8124056539042129, "grad_norm": 1.8490726947784424, "learning_rate": 4.5686349041384055e-05, "loss": 0.6849, "step": 1480 }, { "epoch": 0.8178948812954576, "grad_norm": 1.9100017547607422, "learning_rate": 4.559621458934498e-05, "loss": 0.6506, "step": 1490 }, { "epoch": 0.8233841086867023, "grad_norm": 1.6782461404800415, "learning_rate": 4.550523900962219e-05, "loss": 0.704, "step": 1500 }, { "epoch": 0.828873336077947, "grad_norm": 2.1226425170898438, "learning_rate": 4.541342601756242e-05, "loss": 0.6988, "step": 1510 }, { "epoch": 0.8343625634691917, "grad_norm": 1.658097267150879, "learning_rate": 4.532077936271144e-05, "loss": 0.705, "step": 1520 }, { "epoch": 0.8398517908604364, "grad_norm": 1.8850988149642944, "learning_rate": 4.522730282866093e-05, "loss": 0.6801, "step": 1530 }, { "epoch": 0.8453410182516811, "grad_norm": 1.5480940341949463, "learning_rate": 4.513300023289397e-05, "loss": 0.6308, "step": 1540 }, { "epoch": 0.8508302456429258, "grad_norm": 1.7652947902679443, "learning_rate": 4.503787542662912e-05, "loss": 0.6731, "step": 1550 }, { "epoch": 0.8563194730341704, "grad_norm": 1.902155876159668, "learning_rate": 4.494193229466314e-05, "loss": 0.7404, "step": 1560 }, { "epoch": 0.8618087004254151, "grad_norm": 2.1436920166015625, "learning_rate": 4.4845174755212385e-05, "loss": 0.6884, "step": 1570 }, { "epoch": 0.8672979278166598, "grad_norm": 1.6139538288116455, "learning_rate": 4.47476067597527e-05, "loss": 0.6947, "step": 1580 }, { "epoch": 0.8727871552079045, "grad_norm": 1.5919870138168335, "learning_rate": 4.464923229285816e-05, "loss": 0.6982, "step": 1590 }, { "epoch": 0.8782763825991492, "grad_norm": 1.6209038496017456, "learning_rate": 4.4550055372038225e-05, "loss": 0.7124, "step": 1600 }, { "epoch": 0.8837656099903939, "grad_norm": 1.631515383720398, "learning_rate": 4.445008004757376e-05, "loss": 0.6771, "step": 1610 }, { "epoch": 0.8892548373816386, "grad_norm": 1.4836645126342773, "learning_rate": 4.434931040235159e-05, "loss": 0.6272, "step": 1620 }, { "epoch": 0.8947440647728833, "grad_norm": 1.3640625476837158, "learning_rate": 4.4247750551697756e-05, "loss": 0.6477, "step": 1630 }, { "epoch": 0.900233292164128, "grad_norm": 1.5562537908554077, "learning_rate": 4.414540464320945e-05, "loss": 0.7128, "step": 1640 }, { "epoch": 0.9057225195553725, "grad_norm": 1.548048973083496, "learning_rate": 4.404227685658565e-05, "loss": 0.7098, "step": 1650 }, { "epoch": 0.9112117469466172, "grad_norm": 1.613368034362793, "learning_rate": 4.39383714034564e-05, "loss": 0.6926, "step": 1660 }, { "epoch": 0.9167009743378619, "grad_norm": 1.789654016494751, "learning_rate": 4.383369252721084e-05, "loss": 0.6398, "step": 1670 }, { "epoch": 0.9221902017291066, "grad_norm": 1.625928282737732, "learning_rate": 4.372824450282388e-05, "loss": 0.7087, "step": 1680 }, { "epoch": 0.9276794291203513, "grad_norm": 1.686936855316162, "learning_rate": 4.362203163668164e-05, "loss": 0.6764, "step": 1690 }, { "epoch": 0.933168656511596, "grad_norm": 1.6460559368133545, "learning_rate": 4.351505826640555e-05, "loss": 0.6969, "step": 1700 }, { "epoch": 0.9386578839028407, "grad_norm": 1.6267837285995483, "learning_rate": 4.3407328760675245e-05, "loss": 0.672, "step": 1710 }, { "epoch": 0.9441471112940854, "grad_norm": 1.5070548057556152, "learning_rate": 4.329884751905014e-05, "loss": 0.6586, "step": 1720 }, { "epoch": 0.94963633868533, "grad_norm": 1.8759193420410156, "learning_rate": 4.3189618971789747e-05, "loss": 0.6601, "step": 1730 }, { "epoch": 0.9551255660765747, "grad_norm": 1.6111549139022827, "learning_rate": 4.307964757967273e-05, "loss": 0.7042, "step": 1740 }, { "epoch": 0.9606147934678194, "grad_norm": 1.3748118877410889, "learning_rate": 4.2968937833814784e-05, "loss": 0.6573, "step": 1750 }, { "epoch": 0.9661040208590641, "grad_norm": 1.7284533977508545, "learning_rate": 4.285749425548518e-05, "loss": 0.619, "step": 1760 }, { "epoch": 0.9715932482503088, "grad_norm": 1.5528743267059326, "learning_rate": 4.274532139592211e-05, "loss": 0.6601, "step": 1770 }, { "epoch": 0.9770824756415535, "grad_norm": 1.6220190525054932, "learning_rate": 4.2632423836146885e-05, "loss": 0.6449, "step": 1780 }, { "epoch": 0.9825717030327982, "grad_norm": 2.00435471534729, "learning_rate": 4.251880618677678e-05, "loss": 0.6404, "step": 1790 }, { "epoch": 0.9880609304240429, "grad_norm": 1.8456660509109497, "learning_rate": 4.240447308783679e-05, "loss": 0.7124, "step": 1800 }, { "epoch": 0.9935501578152875, "grad_norm": 1.8724040985107422, "learning_rate": 4.2289429208570094e-05, "loss": 0.7138, "step": 1810 }, { "epoch": 0.9990393852065321, "grad_norm": 1.441105842590332, "learning_rate": 4.217367924724741e-05, "loss": 0.7439, "step": 1820 }, { "epoch": 1.0045286125977768, "grad_norm": 1.392276406288147, "learning_rate": 4.2057227930975066e-05, "loss": 0.4876, "step": 1830 }, { "epoch": 1.0100178399890216, "grad_norm": 1.4682689905166626, "learning_rate": 4.194008001550204e-05, "loss": 0.4949, "step": 1840 }, { "epoch": 1.0155070673802662, "grad_norm": 1.7317707538604736, "learning_rate": 4.1822240285025635e-05, "loss": 0.5329, "step": 1850 }, { "epoch": 1.020996294771511, "grad_norm": 1.9328278303146362, "learning_rate": 4.170371355199621e-05, "loss": 0.5068, "step": 1860 }, { "epoch": 1.0264855221627556, "grad_norm": 1.7879178524017334, "learning_rate": 4.158450465692051e-05, "loss": 0.5112, "step": 1870 }, { "epoch": 1.0319747495540004, "grad_norm": 1.6801658868789673, "learning_rate": 4.146461846816411e-05, "loss": 0.4826, "step": 1880 }, { "epoch": 1.037463976945245, "grad_norm": 1.6541537046432495, "learning_rate": 4.1344059881752534e-05, "loss": 0.4522, "step": 1890 }, { "epoch": 1.0429532043364897, "grad_norm": 2.27681303024292, "learning_rate": 4.1222833821171315e-05, "loss": 0.4726, "step": 1900 }, { "epoch": 1.0484424317277343, "grad_norm": 1.6335279941558838, "learning_rate": 4.110094523716492e-05, "loss": 0.469, "step": 1910 }, { "epoch": 1.053931659118979, "grad_norm": 1.730760931968689, "learning_rate": 4.0978399107534584e-05, "loss": 0.4554, "step": 1920 }, { "epoch": 1.0594208865102237, "grad_norm": 1.636106014251709, "learning_rate": 4.0855200436935e-05, "loss": 0.4914, "step": 1930 }, { "epoch": 1.0649101139014683, "grad_norm": 1.855231523513794, "learning_rate": 4.073135425666997e-05, "loss": 0.4609, "step": 1940 }, { "epoch": 1.070399341292713, "grad_norm": 2.0908730030059814, "learning_rate": 4.0606865624486875e-05, "loss": 0.472, "step": 1950 }, { "epoch": 1.0758885686839577, "grad_norm": 1.7960741519927979, "learning_rate": 4.048173962437019e-05, "loss": 0.5072, "step": 1960 }, { "epoch": 1.0813777960752025, "grad_norm": 1.6274662017822266, "learning_rate": 4.035598136633378e-05, "loss": 0.455, "step": 1970 }, { "epoch": 1.086867023466447, "grad_norm": 1.898768663406372, "learning_rate": 4.0229595986212304e-05, "loss": 0.5023, "step": 1980 }, { "epoch": 1.0923562508576918, "grad_norm": 1.6245406866073608, "learning_rate": 4.0102588645451396e-05, "loss": 0.4863, "step": 1990 }, { "epoch": 1.0978454782489364, "grad_norm": 1.440356731414795, "learning_rate": 3.997496453089692e-05, "loss": 0.4912, "step": 2000 }, { "epoch": 1.1033347056401812, "grad_norm": 1.9108120203018188, "learning_rate": 3.984672885458312e-05, "loss": 0.4691, "step": 2010 }, { "epoch": 1.1088239330314258, "grad_norm": 1.7355122566223145, "learning_rate": 3.971788685351978e-05, "loss": 0.4965, "step": 2020 }, { "epoch": 1.1143131604226706, "grad_norm": 1.7125989198684692, "learning_rate": 3.9588443789478366e-05, "loss": 0.468, "step": 2030 }, { "epoch": 1.1198023878139152, "grad_norm": 1.8434703350067139, "learning_rate": 3.945840494877709e-05, "loss": 0.4886, "step": 2040 }, { "epoch": 1.12529161520516, "grad_norm": 2.302004337310791, "learning_rate": 3.934086499185402e-05, "loss": 0.4932, "step": 2050 }, { "epoch": 1.1307808425964045, "grad_norm": 1.931429147720337, "learning_rate": 3.9209708826272075e-05, "loss": 0.5121, "step": 2060 }, { "epoch": 1.1362700699876491, "grad_norm": 1.889414668083191, "learning_rate": 3.907797235116677e-05, "loss": 0.5094, "step": 2070 }, { "epoch": 1.141759297378894, "grad_norm": 2.243352174758911, "learning_rate": 3.894566094651682e-05, "loss": 0.488, "step": 2080 }, { "epoch": 1.1472485247701387, "grad_norm": 1.6855474710464478, "learning_rate": 3.881278001578046e-05, "loss": 0.531, "step": 2090 }, { "epoch": 1.1527377521613833, "grad_norm": 2.328468084335327, "learning_rate": 3.8679334985674786e-05, "loss": 0.5397, "step": 2100 }, { "epoch": 1.1582269795526279, "grad_norm": 1.8057246208190918, "learning_rate": 3.854533130595408e-05, "loss": 0.4964, "step": 2110 }, { "epoch": 1.1637162069438727, "grad_norm": 1.6702812910079956, "learning_rate": 3.8410774449187315e-05, "loss": 0.5011, "step": 2120 }, { "epoch": 1.1692054343351173, "grad_norm": 1.4972355365753174, "learning_rate": 3.827566991053461e-05, "loss": 0.4922, "step": 2130 }, { "epoch": 1.174694661726362, "grad_norm": 1.739022970199585, "learning_rate": 3.814002320752287e-05, "loss": 0.4309, "step": 2140 }, { "epoch": 1.1801838891176066, "grad_norm": 1.8909087181091309, "learning_rate": 3.8003839879820377e-05, "loss": 0.4761, "step": 2150 }, { "epoch": 1.1856731165088514, "grad_norm": 1.9765682220458984, "learning_rate": 3.786712548901064e-05, "loss": 0.4895, "step": 2160 }, { "epoch": 1.191162343900096, "grad_norm": 2.1266307830810547, "learning_rate": 3.772988561836517e-05, "loss": 0.4894, "step": 2170 }, { "epoch": 1.1966515712913408, "grad_norm": 1.7856028079986572, "learning_rate": 3.759212587261559e-05, "loss": 0.4812, "step": 2180 }, { "epoch": 1.2021407986825854, "grad_norm": 1.8546531200408936, "learning_rate": 3.745385187772463e-05, "loss": 0.4928, "step": 2190 }, { "epoch": 1.2076300260738302, "grad_norm": 1.8596118688583374, "learning_rate": 3.731506928065641e-05, "loss": 0.512, "step": 2200 }, { "epoch": 1.2131192534650748, "grad_norm": 2.024635076522827, "learning_rate": 3.717578374914585e-05, "loss": 0.4715, "step": 2210 }, { "epoch": 1.2186084808563196, "grad_norm": 2.1620028018951416, "learning_rate": 3.703600097146718e-05, "loss": 0.4754, "step": 2220 }, { "epoch": 1.2240977082475641, "grad_norm": 1.9437251091003418, "learning_rate": 3.68957266562016e-05, "loss": 0.475, "step": 2230 }, { "epoch": 1.229586935638809, "grad_norm": 2.7284131050109863, "learning_rate": 3.675496653200425e-05, "loss": 0.4901, "step": 2240 }, { "epoch": 1.2350761630300535, "grad_norm": 2.2666921615600586, "learning_rate": 3.661372634737013e-05, "loss": 0.4694, "step": 2250 }, { "epoch": 1.240565390421298, "grad_norm": 1.5657079219818115, "learning_rate": 3.647201187039946e-05, "loss": 0.4809, "step": 2260 }, { "epoch": 1.246054617812543, "grad_norm": 2.3592708110809326, "learning_rate": 3.632982888856202e-05, "loss": 0.4539, "step": 2270 }, { "epoch": 1.2515438452037877, "grad_norm": 1.7647560834884644, "learning_rate": 3.6187183208460844e-05, "loss": 0.4945, "step": 2280 }, { "epoch": 1.2570330725950323, "grad_norm": 2.0273566246032715, "learning_rate": 3.604408065559508e-05, "loss": 0.4853, "step": 2290 }, { "epoch": 1.2625222999862769, "grad_norm": 2.0692555904388428, "learning_rate": 3.590052707412208e-05, "loss": 0.498, "step": 2300 }, { "epoch": 1.2680115273775217, "grad_norm": 2.35859751701355, "learning_rate": 3.575652832661872e-05, "loss": 0.5287, "step": 2310 }, { "epoch": 1.2735007547687662, "grad_norm": 1.8455514907836914, "learning_rate": 3.5612090293841994e-05, "loss": 0.5035, "step": 2320 }, { "epoch": 1.278989982160011, "grad_norm": 2.233416795730591, "learning_rate": 3.5467218874488837e-05, "loss": 0.5078, "step": 2330 }, { "epoch": 1.2844792095512556, "grad_norm": 1.7934064865112305, "learning_rate": 3.5321919984955244e-05, "loss": 0.5015, "step": 2340 }, { "epoch": 1.2899684369425004, "grad_norm": 1.753578543663025, "learning_rate": 3.517619955909463e-05, "loss": 0.4556, "step": 2350 }, { "epoch": 1.295457664333745, "grad_norm": 1.9207135438919067, "learning_rate": 3.5030063547975525e-05, "loss": 0.4417, "step": 2360 }, { "epoch": 1.3009468917249898, "grad_norm": 1.77664315700531, "learning_rate": 3.488351791963849e-05, "loss": 0.435, "step": 2370 }, { "epoch": 1.3064361191162344, "grad_norm": 1.5567264556884766, "learning_rate": 3.473656865885248e-05, "loss": 0.4872, "step": 2380 }, { "epoch": 1.3119253465074792, "grad_norm": 1.9232813119888306, "learning_rate": 3.4589221766870306e-05, "loss": 0.479, "step": 2390 }, { "epoch": 1.3174145738987237, "grad_norm": 1.6090134382247925, "learning_rate": 3.444148326118366e-05, "loss": 0.577, "step": 2400 }, { "epoch": 1.3229038012899683, "grad_norm": 1.7962336540222168, "learning_rate": 3.4293359175277314e-05, "loss": 0.4801, "step": 2410 }, { "epoch": 1.3283930286812131, "grad_norm": 2.1019630432128906, "learning_rate": 3.414485555838273e-05, "loss": 0.4884, "step": 2420 }, { "epoch": 1.333882256072458, "grad_norm": 1.8056087493896484, "learning_rate": 3.3995978475231024e-05, "loss": 0.4527, "step": 2430 }, { "epoch": 1.3393714834637025, "grad_norm": 1.7557107210159302, "learning_rate": 3.3846734005805254e-05, "loss": 0.4831, "step": 2440 }, { "epoch": 1.344860710854947, "grad_norm": 1.7773773670196533, "learning_rate": 3.369712824509217e-05, "loss": 0.4994, "step": 2450 }, { "epoch": 1.3503499382461919, "grad_norm": 1.7856857776641846, "learning_rate": 3.354716730283327e-05, "loss": 0.4761, "step": 2460 }, { "epoch": 1.3558391656374364, "grad_norm": 2.119858980178833, "learning_rate": 3.3396857303275296e-05, "loss": 0.4891, "step": 2470 }, { "epoch": 1.3613283930286812, "grad_norm": 1.9912039041519165, "learning_rate": 3.324620438492011e-05, "loss": 0.4415, "step": 2480 }, { "epoch": 1.3668176204199258, "grad_norm": 2.347066879272461, "learning_rate": 3.309521470027403e-05, "loss": 0.4733, "step": 2490 }, { "epoch": 1.3723068478111706, "grad_norm": 1.963139533996582, "learning_rate": 3.294389441559655e-05, "loss": 0.4626, "step": 2500 }, { "epoch": 1.3777960752024152, "grad_norm": 2.103672742843628, "learning_rate": 3.279224971064851e-05, "loss": 0.5168, "step": 2510 }, { "epoch": 1.38328530259366, "grad_norm": 1.741493821144104, "learning_rate": 3.2640286778439746e-05, "loss": 0.4687, "step": 2520 }, { "epoch": 1.3887745299849046, "grad_norm": 1.9451817274093628, "learning_rate": 3.248801182497615e-05, "loss": 0.454, "step": 2530 }, { "epoch": 1.3942637573761494, "grad_norm": 2.4190995693206787, "learning_rate": 3.233543106900624e-05, "loss": 0.4594, "step": 2540 }, { "epoch": 1.399752984767394, "grad_norm": 1.543632984161377, "learning_rate": 3.21825507417672e-05, "loss": 0.4408, "step": 2550 }, { "epoch": 1.4052422121586385, "grad_norm": 2.006373882293701, "learning_rate": 3.202937708673033e-05, "loss": 0.4802, "step": 2560 }, { "epoch": 1.4107314395498833, "grad_norm": 1.912208914756775, "learning_rate": 3.1875916359346214e-05, "loss": 0.4731, "step": 2570 }, { "epoch": 1.4162206669411281, "grad_norm": 1.6737933158874512, "learning_rate": 3.17221748267891e-05, "loss": 0.454, "step": 2580 }, { "epoch": 1.4217098943323727, "grad_norm": 1.9672836065292358, "learning_rate": 3.156815876770105e-05, "loss": 0.4229, "step": 2590 }, { "epoch": 1.4271991217236173, "grad_norm": 1.512810230255127, "learning_rate": 3.1413874471935496e-05, "loss": 0.4896, "step": 2600 }, { "epoch": 1.432688349114862, "grad_norm": 1.8552961349487305, "learning_rate": 3.125932824030037e-05, "loss": 0.4808, "step": 2610 }, { "epoch": 1.438177576506107, "grad_norm": 1.7483348846435547, "learning_rate": 3.110452638430081e-05, "loss": 0.4271, "step": 2620 }, { "epoch": 1.4436668038973515, "grad_norm": 1.7746537923812866, "learning_rate": 3.094947522588135e-05, "loss": 0.4618, "step": 2630 }, { "epoch": 1.449156031288596, "grad_norm": 2.1067216396331787, "learning_rate": 3.079418109716778e-05, "loss": 0.4765, "step": 2640 }, { "epoch": 1.4546452586798408, "grad_norm": 1.6052168607711792, "learning_rate": 3.063865034020857e-05, "loss": 0.4596, "step": 2650 }, { "epoch": 1.4601344860710854, "grad_norm": 1.6968189477920532, "learning_rate": 3.0482889306715813e-05, "loss": 0.4384, "step": 2660 }, { "epoch": 1.4656237134623302, "grad_norm": 1.869379997253418, "learning_rate": 3.032690435780584e-05, "loss": 0.4872, "step": 2670 }, { "epoch": 1.4711129408535748, "grad_norm": 1.8812456130981445, "learning_rate": 3.017070186373949e-05, "loss": 0.4581, "step": 2680 }, { "epoch": 1.4766021682448196, "grad_norm": 1.99275803565979, "learning_rate": 3.001428820366187e-05, "loss": 0.49, "step": 2690 }, { "epoch": 1.4820913956360642, "grad_norm": 1.6111352443695068, "learning_rate": 2.9857669765341928e-05, "loss": 0.4262, "step": 2700 }, { "epoch": 1.487580623027309, "grad_norm": 2.6115357875823975, "learning_rate": 2.9700852944911512e-05, "loss": 0.4774, "step": 2710 }, { "epoch": 1.4930698504185536, "grad_norm": 1.8144830465316772, "learning_rate": 2.9543844146604195e-05, "loss": 0.4618, "step": 2720 }, { "epoch": 1.4985590778097984, "grad_norm": 1.7375366687774658, "learning_rate": 2.938664978249372e-05, "loss": 0.4278, "step": 2730 }, { "epoch": 1.504048305201043, "grad_norm": 1.914023756980896, "learning_rate": 2.9229276272232146e-05, "loss": 0.4706, "step": 2740 }, { "epoch": 1.5095375325922875, "grad_norm": 1.7386229038238525, "learning_rate": 2.907173004278768e-05, "loss": 0.4308, "step": 2750 }, { "epoch": 1.5150267599835323, "grad_norm": 1.5574982166290283, "learning_rate": 2.8914017528182185e-05, "loss": 0.4487, "step": 2760 }, { "epoch": 1.5205159873747771, "grad_norm": 2.144409418106079, "learning_rate": 2.8756145169228432e-05, "loss": 0.4232, "step": 2770 }, { "epoch": 1.5260052147660217, "grad_norm": 2.5904343128204346, "learning_rate": 2.859811941326709e-05, "loss": 0.4603, "step": 2780 }, { "epoch": 1.5314944421572663, "grad_norm": 2.3824493885040283, "learning_rate": 2.8439946713903354e-05, "loss": 0.4649, "step": 2790 }, { "epoch": 1.536983669548511, "grad_norm": 2.1540448665618896, "learning_rate": 2.8281633530743497e-05, "loss": 0.4988, "step": 2800 }, { "epoch": 1.5424728969397559, "grad_norm": 2.121973752975464, "learning_rate": 2.8123186329130942e-05, "loss": 0.4795, "step": 2810 }, { "epoch": 1.5479621243310004, "grad_norm": 1.8560881614685059, "learning_rate": 2.7964611579882317e-05, "loss": 0.427, "step": 2820 }, { "epoch": 1.553451351722245, "grad_norm": 2.625507354736328, "learning_rate": 2.7805915759023153e-05, "loss": 0.4982, "step": 2830 }, { "epoch": 1.5589405791134898, "grad_norm": 1.8594845533370972, "learning_rate": 2.764710534752342e-05, "loss": 0.4489, "step": 2840 }, { "epoch": 1.5644298065047346, "grad_norm": 1.9746872186660767, "learning_rate": 2.748818683103285e-05, "loss": 0.4177, "step": 2850 }, { "epoch": 1.569919033895979, "grad_norm": 1.9741085767745972, "learning_rate": 2.7329166699616064e-05, "loss": 0.4816, "step": 2860 }, { "epoch": 1.5754082612872238, "grad_norm": 1.9904859066009521, "learning_rate": 2.7170051447487532e-05, "loss": 0.4392, "step": 2870 }, { "epoch": 1.5808974886784686, "grad_norm": 1.9376888275146484, "learning_rate": 2.7010847572746356e-05, "loss": 0.5002, "step": 2880 }, { "epoch": 1.5863867160697132, "grad_norm": 1.6673862934112549, "learning_rate": 2.6851561577110874e-05, "loss": 0.437, "step": 2890 }, { "epoch": 1.5918759434609577, "grad_norm": 1.8041437864303589, "learning_rate": 2.6692199965653185e-05, "loss": 0.4565, "step": 2900 }, { "epoch": 1.5973651708522025, "grad_norm": 1.6648041009902954, "learning_rate": 2.6532769246533435e-05, "loss": 0.4755, "step": 2910 }, { "epoch": 1.6028543982434473, "grad_norm": 2.290234088897705, "learning_rate": 2.6373275930734075e-05, "loss": 0.4603, "step": 2920 }, { "epoch": 1.608343625634692, "grad_norm": 2.1882123947143555, "learning_rate": 2.621372653179391e-05, "loss": 0.4551, "step": 2930 }, { "epoch": 1.6138328530259365, "grad_norm": 1.7908653020858765, "learning_rate": 2.6054127565542146e-05, "loss": 0.5062, "step": 2940 }, { "epoch": 1.6193220804171813, "grad_norm": 2.1407206058502197, "learning_rate": 2.5894485549832254e-05, "loss": 0.5046, "step": 2950 }, { "epoch": 1.624811307808426, "grad_norm": 1.8676074743270874, "learning_rate": 2.57348070042758e-05, "loss": 0.4685, "step": 2960 }, { "epoch": 1.6303005351996707, "grad_norm": 2.0238535404205322, "learning_rate": 2.5575098449976204e-05, "loss": 0.4836, "step": 2970 }, { "epoch": 1.6357897625909152, "grad_norm": 2.1416938304901123, "learning_rate": 2.541536640926238e-05, "loss": 0.4146, "step": 2980 }, { "epoch": 1.64127898998216, "grad_norm": 2.006524085998535, "learning_rate": 2.5255617405422443e-05, "loss": 0.441, "step": 2990 }, { "epoch": 1.6467682173734048, "grad_norm": 2.2434608936309814, "learning_rate": 2.5095857962437226e-05, "loss": 0.4932, "step": 3000 }, { "epoch": 1.6522574447646494, "grad_norm": 2.113938093185425, "learning_rate": 2.4936094604713918e-05, "loss": 0.4324, "step": 3010 }, { "epoch": 1.657746672155894, "grad_norm": 1.8772289752960205, "learning_rate": 2.4776333856819565e-05, "loss": 0.4655, "step": 3020 }, { "epoch": 1.6632358995471388, "grad_norm": 2.1245956420898438, "learning_rate": 2.4616582243214623e-05, "loss": 0.4631, "step": 3030 }, { "epoch": 1.6687251269383834, "grad_norm": 2.2539162635803223, "learning_rate": 2.4456846287986525e-05, "loss": 0.4492, "step": 3040 }, { "epoch": 1.674214354329628, "grad_norm": 2.4101765155792236, "learning_rate": 2.429713251458323e-05, "loss": 0.4326, "step": 3050 }, { "epoch": 1.6797035817208728, "grad_norm": 2.1554176807403564, "learning_rate": 2.4137447445546837e-05, "loss": 0.4527, "step": 3060 }, { "epoch": 1.6851928091121176, "grad_norm": 2.0779566764831543, "learning_rate": 2.397779760224713e-05, "loss": 0.4331, "step": 3070 }, { "epoch": 1.6906820365033621, "grad_norm": 2.3145909309387207, "learning_rate": 2.3818189504615367e-05, "loss": 0.4159, "step": 3080 }, { "epoch": 1.6961712638946067, "grad_norm": 2.1414687633514404, "learning_rate": 2.3658629670877938e-05, "loss": 0.4996, "step": 3090 }, { "epoch": 1.7016604912858515, "grad_norm": 1.6377606391906738, "learning_rate": 2.3499124617290187e-05, "loss": 0.4827, "step": 3100 }, { "epoch": 1.7071497186770963, "grad_norm": 2.27193546295166, "learning_rate": 2.3339680857870288e-05, "loss": 0.5358, "step": 3110 }, { "epoch": 1.7126389460683409, "grad_norm": 1.6943323612213135, "learning_rate": 2.318030490413323e-05, "loss": 0.4584, "step": 3120 }, { "epoch": 1.7181281734595855, "grad_norm": 2.1574575901031494, "learning_rate": 2.30210032648249e-05, "loss": 0.4366, "step": 3130 }, { "epoch": 1.7236174008508303, "grad_norm": 1.805962085723877, "learning_rate": 2.286178244565625e-05, "loss": 0.4633, "step": 3140 }, { "epoch": 1.729106628242075, "grad_norm": 2.4552011489868164, "learning_rate": 2.2702648949037618e-05, "loss": 0.4861, "step": 3150 }, { "epoch": 1.7345958556333196, "grad_norm": 1.9455459117889404, "learning_rate": 2.2543609273813195e-05, "loss": 0.4881, "step": 3160 }, { "epoch": 1.7400850830245642, "grad_norm": 1.8050341606140137, "learning_rate": 2.2384669914995592e-05, "loss": 0.418, "step": 3170 }, { "epoch": 1.745574310415809, "grad_norm": 2.0198020935058594, "learning_rate": 2.2225837363500636e-05, "loss": 0.472, "step": 3180 }, { "epoch": 1.7510635378070538, "grad_norm": 2.245699167251587, "learning_rate": 2.2067118105882195e-05, "loss": 0.4718, "step": 3190 }, { "epoch": 1.7565527651982984, "grad_norm": 2.3782804012298584, "learning_rate": 2.190851862406739e-05, "loss": 0.4318, "step": 3200 }, { "epoch": 1.762041992589543, "grad_norm": 1.766295075416565, "learning_rate": 2.17500453950918e-05, "loss": 0.4728, "step": 3210 }, { "epoch": 1.7675312199807878, "grad_norm": 1.883118987083435, "learning_rate": 2.159170489083498e-05, "loss": 0.4229, "step": 3220 }, { "epoch": 1.7730204473720323, "grad_norm": 2.2457189559936523, "learning_rate": 2.1433503577756137e-05, "loss": 0.3906, "step": 3230 }, { "epoch": 1.778509674763277, "grad_norm": 1.66023850440979, "learning_rate": 2.1275447916630055e-05, "loss": 0.379, "step": 3240 }, { "epoch": 1.7839989021545217, "grad_norm": 2.2401814460754395, "learning_rate": 2.1117544362283286e-05, "loss": 0.4173, "step": 3250 }, { "epoch": 1.7894881295457665, "grad_norm": 2.2202141284942627, "learning_rate": 2.0959799363330425e-05, "loss": 0.426, "step": 3260 }, { "epoch": 1.794977356937011, "grad_norm": 2.292778253555298, "learning_rate": 2.0802219361910908e-05, "loss": 0.4165, "step": 3270 }, { "epoch": 1.8004665843282557, "grad_norm": 2.025392770767212, "learning_rate": 2.0644810793425807e-05, "loss": 0.4216, "step": 3280 }, { "epoch": 1.8059558117195005, "grad_norm": 1.669911503791809, "learning_rate": 2.048758008627506e-05, "loss": 0.4745, "step": 3290 }, { "epoch": 1.8114450391107453, "grad_norm": 2.2425777912139893, "learning_rate": 2.033053366159493e-05, "loss": 0.4314, "step": 3300 }, { "epoch": 1.8169342665019899, "grad_norm": 2.065985679626465, "learning_rate": 2.0173677932995787e-05, "loss": 0.4882, "step": 3310 }, { "epoch": 1.8224234938932344, "grad_norm": 1.8231384754180908, "learning_rate": 2.0017019306300182e-05, "loss": 0.4346, "step": 3320 }, { "epoch": 1.8279127212844792, "grad_norm": 2.203216075897217, "learning_rate": 1.9860564179281217e-05, "loss": 0.515, "step": 3330 }, { "epoch": 1.833401948675724, "grad_norm": 1.8703504800796509, "learning_rate": 1.970431894140128e-05, "loss": 0.4268, "step": 3340 }, { "epoch": 1.8388911760669686, "grad_norm": 2.13779616355896, "learning_rate": 1.954828997355112e-05, "loss": 0.4324, "step": 3350 }, { "epoch": 1.8443804034582132, "grad_norm": 2.4620044231414795, "learning_rate": 1.939248364778924e-05, "loss": 0.4542, "step": 3360 }, { "epoch": 1.849869630849458, "grad_norm": 2.0297598838806152, "learning_rate": 1.923690632708169e-05, "loss": 0.4695, "step": 3370 }, { "epoch": 1.8553588582407026, "grad_norm": 2.2314670085906982, "learning_rate": 1.908156436504215e-05, "loss": 0.4433, "step": 3380 }, { "epoch": 1.8608480856319471, "grad_norm": 1.7243778705596924, "learning_rate": 1.892646410567255e-05, "loss": 0.4257, "step": 3390 }, { "epoch": 1.866337313023192, "grad_norm": 1.6004356145858765, "learning_rate": 1.877161188310392e-05, "loss": 0.3585, "step": 3400 }, { "epoch": 1.8718265404144367, "grad_norm": 2.1382360458374023, "learning_rate": 1.8617014021337732e-05, "loss": 0.4234, "step": 3410 }, { "epoch": 1.8773157678056813, "grad_norm": 1.9200503826141357, "learning_rate": 1.846267683398761e-05, "loss": 0.4546, "step": 3420 }, { "epoch": 1.882804995196926, "grad_norm": 1.9421885013580322, "learning_rate": 1.830860662402153e-05, "loss": 0.4505, "step": 3430 }, { "epoch": 1.8882942225881707, "grad_norm": 2.25044584274292, "learning_rate": 1.8154809683504403e-05, "loss": 0.4684, "step": 3440 }, { "epoch": 1.8937834499794155, "grad_norm": 1.7311737537384033, "learning_rate": 1.8001292293341087e-05, "loss": 0.4478, "step": 3450 }, { "epoch": 1.89927267737066, "grad_norm": 1.8768479824066162, "learning_rate": 1.7848060723019894e-05, "loss": 0.4323, "step": 3460 }, { "epoch": 1.9047619047619047, "grad_norm": 1.617492914199829, "learning_rate": 1.7695121230356566e-05, "loss": 0.4429, "step": 3470 }, { "epoch": 1.9102511321531495, "grad_norm": 2.0293169021606445, "learning_rate": 1.7542480061238685e-05, "loss": 0.4399, "step": 3480 }, { "epoch": 1.9157403595443943, "grad_norm": 1.7568955421447754, "learning_rate": 1.7390143449370663e-05, "loss": 0.4029, "step": 3490 }, { "epoch": 1.9212295869356388, "grad_norm": 1.6997624635696411, "learning_rate": 1.723811761601904e-05, "loss": 0.4072, "step": 3500 }, { "epoch": 1.9267188143268834, "grad_norm": 2.185622453689575, "learning_rate": 1.708640876975855e-05, "loss": 0.4502, "step": 3510 }, { "epoch": 1.9322080417181282, "grad_norm": 2.803870439529419, "learning_rate": 1.693502310621848e-05, "loss": 0.4397, "step": 3520 }, { "epoch": 1.937697269109373, "grad_norm": 2.163422107696533, "learning_rate": 1.6783966807829692e-05, "loss": 0.4562, "step": 3530 }, { "epoch": 1.9431864965006176, "grad_norm": 1.7700269222259521, "learning_rate": 1.66332460435721e-05, "loss": 0.4332, "step": 3540 }, { "epoch": 1.9486757238918622, "grad_norm": 1.9938660860061646, "learning_rate": 1.648286696872277e-05, "loss": 0.4392, "step": 3550 }, { "epoch": 1.954164951283107, "grad_norm": 1.796387791633606, "learning_rate": 1.6332835724604556e-05, "loss": 0.4079, "step": 3560 }, { "epoch": 1.9596541786743515, "grad_norm": 1.6725765466690063, "learning_rate": 1.6183158438335223e-05, "loss": 0.4156, "step": 3570 }, { "epoch": 1.9651434060655961, "grad_norm": 2.1465470790863037, "learning_rate": 1.6033841222577312e-05, "loss": 0.4514, "step": 3580 }, { "epoch": 1.970632633456841, "grad_norm": 2.0636017322540283, "learning_rate": 1.588489017528844e-05, "loss": 0.4107, "step": 3590 }, { "epoch": 1.9761218608480857, "grad_norm": 2.281461000442505, "learning_rate": 1.573631137947232e-05, "loss": 0.4247, "step": 3600 }, { "epoch": 1.9816110882393303, "grad_norm": 1.8369041681289673, "learning_rate": 1.5588110902930252e-05, "loss": 0.3993, "step": 3610 }, { "epoch": 1.9871003156305749, "grad_norm": 1.538087010383606, "learning_rate": 1.5440294798013445e-05, "loss": 0.4032, "step": 3620 }, { "epoch": 1.9925895430218197, "grad_norm": 1.7788771390914917, "learning_rate": 1.5292869101375718e-05, "loss": 0.4191, "step": 3630 }, { "epoch": 1.9980787704130645, "grad_norm": 1.663266897201538, "learning_rate": 1.514583983372707e-05, "loss": 0.4065, "step": 3640 }, { "epoch": 2.003567997804309, "grad_norm": 1.6794120073318481, "learning_rate": 1.4999212999587723e-05, "loss": 0.3012, "step": 3650 }, { "epoch": 2.0090572251955536, "grad_norm": 2.0947213172912598, "learning_rate": 1.4852994587042957e-05, "loss": 0.2699, "step": 3660 }, { "epoch": 2.0145464525867984, "grad_norm": 1.647148609161377, "learning_rate": 1.4707190567498552e-05, "loss": 0.256, "step": 3670 }, { "epoch": 2.0200356799780432, "grad_norm": 1.6485075950622559, "learning_rate": 1.4561806895436907e-05, "loss": 0.2306, "step": 3680 }, { "epoch": 2.0255249073692876, "grad_norm": 1.5765630006790161, "learning_rate": 1.4416849508173864e-05, "loss": 0.2363, "step": 3690 }, { "epoch": 2.0310141347605324, "grad_norm": 2.0813677310943604, "learning_rate": 1.4272324325616251e-05, "loss": 0.2407, "step": 3700 }, { "epoch": 2.036503362151777, "grad_norm": 2.0815207958221436, "learning_rate": 1.4128237250020115e-05, "loss": 0.2525, "step": 3710 }, { "epoch": 2.041992589543022, "grad_norm": 2.1880548000335693, "learning_rate": 1.3984594165749676e-05, "loss": 0.2371, "step": 3720 }, { "epoch": 2.0474818169342663, "grad_norm": 1.7031059265136719, "learning_rate": 1.3841400939037013e-05, "loss": 0.241, "step": 3730 }, { "epoch": 2.052971044325511, "grad_norm": 1.9227020740509033, "learning_rate": 1.3698663417742496e-05, "loss": 0.2509, "step": 3740 }, { "epoch": 2.058460271716756, "grad_norm": 1.8670146465301514, "learning_rate": 1.3556387431115969e-05, "loss": 0.265, "step": 3750 }, { "epoch": 2.0639494991080007, "grad_norm": 2.06124210357666, "learning_rate": 1.3414578789558696e-05, "loss": 0.2182, "step": 3760 }, { "epoch": 2.069438726499245, "grad_norm": 2.2382616996765137, "learning_rate": 1.3273243284386023e-05, "loss": 0.2477, "step": 3770 }, { "epoch": 2.07492795389049, "grad_norm": 2.178103446960449, "learning_rate": 1.3132386687590958e-05, "loss": 0.2379, "step": 3780 }, { "epoch": 2.0804171812817347, "grad_norm": 1.947129249572754, "learning_rate": 1.2992014751608372e-05, "loss": 0.2319, "step": 3790 }, { "epoch": 2.0859064086729795, "grad_norm": 2.0307064056396484, "learning_rate": 1.2852133209080097e-05, "loss": 0.2416, "step": 3800 }, { "epoch": 2.091395636064224, "grad_norm": 1.988174319267273, "learning_rate": 1.2712747772620801e-05, "loss": 0.2629, "step": 3810 }, { "epoch": 2.0968848634554687, "grad_norm": 3.22770094871521, "learning_rate": 1.2573864134584718e-05, "loss": 0.2069, "step": 3820 }, { "epoch": 2.1023740908467135, "grad_norm": 2.0717577934265137, "learning_rate": 1.243548796683319e-05, "loss": 0.237, "step": 3830 }, { "epoch": 2.107863318237958, "grad_norm": 2.118257522583008, "learning_rate": 1.2297624920502953e-05, "loss": 0.2531, "step": 3840 }, { "epoch": 2.1133525456292026, "grad_norm": 2.584071636199951, "learning_rate": 1.2160280625775447e-05, "loss": 0.2464, "step": 3850 }, { "epoch": 2.1188417730204474, "grad_norm": 2.8391823768615723, "learning_rate": 1.2023460691646821e-05, "loss": 0.2344, "step": 3860 }, { "epoch": 2.124331000411692, "grad_norm": 1.8388172388076782, "learning_rate": 1.1887170705698905e-05, "loss": 0.2191, "step": 3870 }, { "epoch": 2.1298202278029366, "grad_norm": 2.278942346572876, "learning_rate": 1.1751416233870999e-05, "loss": 0.2303, "step": 3880 }, { "epoch": 2.1353094551941814, "grad_norm": 2.4428744316101074, "learning_rate": 1.1616202820232567e-05, "loss": 0.2493, "step": 3890 }, { "epoch": 2.140798682585426, "grad_norm": 2.272839069366455, "learning_rate": 1.1481535986756828e-05, "loss": 0.2527, "step": 3900 }, { "epoch": 2.146287909976671, "grad_norm": 2.1877167224884033, "learning_rate": 1.134742123309525e-05, "loss": 0.2599, "step": 3910 }, { "epoch": 2.1517771373679153, "grad_norm": 1.6496747732162476, "learning_rate": 1.1213864036352939e-05, "loss": 0.2457, "step": 3920 }, { "epoch": 2.15726636475916, "grad_norm": 2.1241507530212402, "learning_rate": 1.1080869850864964e-05, "loss": 0.2532, "step": 3930 }, { "epoch": 2.162755592150405, "grad_norm": 1.5271326303482056, "learning_rate": 1.094844410797361e-05, "loss": 0.2651, "step": 3940 }, { "epoch": 2.1682448195416497, "grad_norm": 1.9697147607803345, "learning_rate": 1.0816592215806562e-05, "loss": 0.2171, "step": 3950 }, { "epoch": 2.173734046932894, "grad_norm": 1.604737401008606, "learning_rate": 1.0685319559056051e-05, "loss": 0.2579, "step": 3960 }, { "epoch": 2.179223274324139, "grad_norm": 2.053114414215088, "learning_rate": 1.0554631498758943e-05, "loss": 0.2541, "step": 3970 }, { "epoch": 2.1847125017153837, "grad_norm": 2.424609422683716, "learning_rate": 1.0424533372077803e-05, "loss": 0.2053, "step": 3980 }, { "epoch": 2.1902017291066285, "grad_norm": 2.34372615814209, "learning_rate": 1.029503049208293e-05, "loss": 0.214, "step": 3990 }, { "epoch": 2.195690956497873, "grad_norm": 2.1274008750915527, "learning_rate": 1.0166128147535352e-05, "loss": 0.2133, "step": 4000 }, { "epoch": 2.2011801838891176, "grad_norm": 1.7201030254364014, "learning_rate": 1.003783160267091e-05, "loss": 0.2233, "step": 4010 }, { "epoch": 2.2066694112803624, "grad_norm": 2.844679832458496, "learning_rate": 9.91014609698519e-06, "loss": 0.2389, "step": 4020 }, { "epoch": 2.212158638671607, "grad_norm": 1.7438113689422607, "learning_rate": 9.783076845019598e-06, "loss": 0.2297, "step": 4030 }, { "epoch": 2.2176478660628516, "grad_norm": 2.076685905456543, "learning_rate": 9.656629036148365e-06, "loss": 0.2519, "step": 4040 }, { "epoch": 2.2231370934540964, "grad_norm": 2.197861671447754, "learning_rate": 9.530807834366658e-06, "loss": 0.2416, "step": 4050 }, { "epoch": 2.228626320845341, "grad_norm": 1.6589149236679077, "learning_rate": 9.405618378079686e-06, "loss": 0.238, "step": 4060 }, { "epoch": 2.2341155482365855, "grad_norm": 2.1636011600494385, "learning_rate": 9.281065779892826e-06, "loss": 0.2165, "step": 4070 }, { "epoch": 2.2396047756278303, "grad_norm": 2.11350679397583, "learning_rate": 9.15715512640282e-06, "loss": 0.2539, "step": 4080 }, { "epoch": 2.245094003019075, "grad_norm": 2.1883132457733154, "learning_rate": 9.033891477990091e-06, "loss": 0.2392, "step": 4090 }, { "epoch": 2.25058323041032, "grad_norm": 1.9660489559173584, "learning_rate": 8.923511544874787e-06, "loss": 0.2507, "step": 4100 }, { "epoch": 2.2560724578015643, "grad_norm": 1.596596360206604, "learning_rate": 8.801491052657259e-06, "loss": 0.2205, "step": 4110 }, { "epoch": 2.261561685192809, "grad_norm": 1.9153178930282593, "learning_rate": 8.680132090462712e-06, "loss": 0.2189, "step": 4120 }, { "epoch": 2.267050912584054, "grad_norm": 2.2162649631500244, "learning_rate": 8.559439614463177e-06, "loss": 0.2573, "step": 4130 }, { "epoch": 2.2725401399752982, "grad_norm": 2.1483819484710693, "learning_rate": 8.439418553612105e-06, "loss": 0.2464, "step": 4140 }, { "epoch": 2.278029367366543, "grad_norm": 2.043180465698242, "learning_rate": 8.320073809443024e-06, "loss": 0.236, "step": 4150 }, { "epoch": 2.283518594757788, "grad_norm": 1.6839346885681152, "learning_rate": 8.201410255869458e-06, "loss": 0.2596, "step": 4160 }, { "epoch": 2.2890078221490326, "grad_norm": 2.334520101547241, "learning_rate": 8.083432738985782e-06, "loss": 0.2233, "step": 4170 }, { "epoch": 2.2944970495402774, "grad_norm": 2.2806715965270996, "learning_rate": 7.966146076869386e-06, "loss": 0.223, "step": 4180 }, { "epoch": 2.299986276931522, "grad_norm": 2.726118803024292, "learning_rate": 7.849555059383839e-06, "loss": 0.2236, "step": 4190 }, { "epoch": 2.3054755043227666, "grad_norm": 1.6786760091781616, "learning_rate": 7.733664447983349e-06, "loss": 0.2509, "step": 4200 }, { "epoch": 2.3109647317140114, "grad_norm": 1.767065405845642, "learning_rate": 7.618478975518292e-06, "loss": 0.2373, "step": 4210 }, { "epoch": 2.3164539591052558, "grad_norm": 1.5344187021255493, "learning_rate": 7.504003346041871e-06, "loss": 0.2404, "step": 4220 }, { "epoch": 2.3219431864965006, "grad_norm": 2.3337080478668213, "learning_rate": 7.390242234618075e-06, "loss": 0.1858, "step": 4230 }, { "epoch": 2.3274324138877454, "grad_norm": 2.3712496757507324, "learning_rate": 7.277200287130728e-06, "loss": 0.2595, "step": 4240 }, { "epoch": 2.33292164127899, "grad_norm": 2.5574772357940674, "learning_rate": 7.164882120093757e-06, "loss": 0.2419, "step": 4250 }, { "epoch": 2.3384108686702345, "grad_norm": 2.367032051086426, "learning_rate": 7.053292320462654e-06, "loss": 0.256, "step": 4260 }, { "epoch": 2.3439000960614793, "grad_norm": 2.3279731273651123, "learning_rate": 6.942435445447159e-06, "loss": 0.2319, "step": 4270 }, { "epoch": 2.349389323452724, "grad_norm": 2.5082924365997314, "learning_rate": 6.832316022325138e-06, "loss": 0.212, "step": 4280 }, { "epoch": 2.354878550843969, "grad_norm": 3.303452730178833, "learning_rate": 6.7229385482577065e-06, "loss": 0.2434, "step": 4290 }, { "epoch": 2.3603677782352133, "grad_norm": 2.3842883110046387, "learning_rate": 6.614307490105557e-06, "loss": 0.2644, "step": 4300 }, { "epoch": 2.365857005626458, "grad_norm": 2.0827999114990234, "learning_rate": 6.506427284246547e-06, "loss": 0.2327, "step": 4310 }, { "epoch": 2.371346233017703, "grad_norm": 1.9834315776824951, "learning_rate": 6.3993023363945165e-06, "loss": 0.2206, "step": 4320 }, { "epoch": 2.376835460408947, "grad_norm": 1.9981067180633545, "learning_rate": 6.2929370214193735e-06, "loss": 0.2736, "step": 4330 }, { "epoch": 2.382324687800192, "grad_norm": 2.4763762950897217, "learning_rate": 6.1873356831683884e-06, "loss": 0.2344, "step": 4340 }, { "epoch": 2.387813915191437, "grad_norm": 1.4734795093536377, "learning_rate": 6.082502634288873e-06, "loss": 0.2019, "step": 4350 }, { "epoch": 2.3933031425826816, "grad_norm": 2.4421563148498535, "learning_rate": 5.978442156051986e-06, "loss": 0.2289, "step": 4360 }, { "epoch": 2.3987923699739264, "grad_norm": 2.192746162414551, "learning_rate": 5.875158498177921e-06, "loss": 0.2396, "step": 4370 }, { "epoch": 2.4042815973651708, "grad_norm": 1.9783297777175903, "learning_rate": 5.772655878662339e-06, "loss": 0.2531, "step": 4380 }, { "epoch": 2.4097708247564156, "grad_norm": 2.5523509979248047, "learning_rate": 5.6709384836041184e-06, "loss": 0.2405, "step": 4390 }, { "epoch": 2.4152600521476604, "grad_norm": 1.6953893899917603, "learning_rate": 5.570010467034425e-06, "loss": 0.2279, "step": 4400 }, { "epoch": 2.4207492795389047, "grad_norm": 1.7373918294906616, "learning_rate": 5.469875950747016e-06, "loss": 0.2081, "step": 4410 }, { "epoch": 2.4262385069301495, "grad_norm": 2.789266347885132, "learning_rate": 5.370539024129928e-06, "loss": 0.2455, "step": 4420 }, { "epoch": 2.4317277343213943, "grad_norm": 1.8112378120422363, "learning_rate": 5.272003743998489e-06, "loss": 0.256, "step": 4430 }, { "epoch": 2.437216961712639, "grad_norm": 1.8407344818115234, "learning_rate": 5.1742741344296246e-06, "loss": 0.2481, "step": 4440 }, { "epoch": 2.4427061891038835, "grad_norm": 2.0232059955596924, "learning_rate": 5.077354186597541e-06, "loss": 0.2213, "step": 4450 }, { "epoch": 2.4481954164951283, "grad_norm": 2.0662572383880615, "learning_rate": 4.981247858610688e-06, "loss": 0.2064, "step": 4460 }, { "epoch": 2.453684643886373, "grad_norm": 1.9686827659606934, "learning_rate": 4.885959075350149e-06, "loss": 0.2344, "step": 4470 }, { "epoch": 2.459173871277618, "grad_norm": 2.095710515975952, "learning_rate": 4.791491728309347e-06, "loss": 0.2727, "step": 4480 }, { "epoch": 2.4646630986688622, "grad_norm": 2.7409679889678955, "learning_rate": 4.697849675435112e-06, "loss": 0.2449, "step": 4490 }, { "epoch": 2.470152326060107, "grad_norm": 1.6528655290603638, "learning_rate": 4.605036740970134e-06, "loss": 0.2228, "step": 4500 }, { "epoch": 2.475641553451352, "grad_norm": 2.210045337677002, "learning_rate": 4.513056715296773e-06, "loss": 0.2399, "step": 4510 }, { "epoch": 2.481130780842596, "grad_norm": 1.4573155641555786, "learning_rate": 4.4219133547822865e-06, "loss": 0.2133, "step": 4520 }, { "epoch": 2.486620008233841, "grad_norm": 2.013803482055664, "learning_rate": 4.331610381625395e-06, "loss": 0.2318, "step": 4530 }, { "epoch": 2.492109235625086, "grad_norm": 2.090888261795044, "learning_rate": 4.242151483704293e-06, "loss": 0.2393, "step": 4540 }, { "epoch": 2.4975984630163306, "grad_norm": 2.1223104000091553, "learning_rate": 4.153540314426033e-06, "loss": 0.2343, "step": 4550 }, { "epoch": 2.5030876904075754, "grad_norm": 2.2379000186920166, "learning_rate": 4.065780492577326e-06, "loss": 0.2608, "step": 4560 }, { "epoch": 2.5085769177988197, "grad_norm": 1.780354380607605, "learning_rate": 3.978875602176726e-06, "loss": 0.2401, "step": 4570 }, { "epoch": 2.5140661451900645, "grad_norm": 2.5559253692626953, "learning_rate": 3.892829192328337e-06, "loss": 0.2381, "step": 4580 }, { "epoch": 2.5195553725813093, "grad_norm": 1.7930986881256104, "learning_rate": 3.8076447770767796e-06, "loss": 0.2712, "step": 4590 }, { "epoch": 2.5250445999725537, "grad_norm": 2.0700008869171143, "learning_rate": 3.7233258352637553e-06, "loss": 0.2162, "step": 4600 }, { "epoch": 2.5305338273637985, "grad_norm": 2.3782896995544434, "learning_rate": 3.6398758103859067e-06, "loss": 0.208, "step": 4610 }, { "epoch": 2.5360230547550433, "grad_norm": 2.345578670501709, "learning_rate": 3.557298110454252e-06, "loss": 0.2231, "step": 4620 }, { "epoch": 2.5415122821462877, "grad_norm": 1.9046144485473633, "learning_rate": 3.475596107854981e-06, "loss": 0.2359, "step": 4630 }, { "epoch": 2.5470015095375325, "grad_norm": 2.2235357761383057, "learning_rate": 3.3947731392117237e-06, "loss": 0.2268, "step": 4640 }, { "epoch": 2.5524907369287773, "grad_norm": 2.3540749549865723, "learning_rate": 3.3148325052492713e-06, "loss": 0.2382, "step": 4650 }, { "epoch": 2.557979964320022, "grad_norm": 2.550370931625366, "learning_rate": 3.2357774706588157e-06, "loss": 0.2364, "step": 4660 }, { "epoch": 2.563469191711267, "grad_norm": 2.06571626663208, "learning_rate": 3.1576112639646023e-06, "loss": 0.2379, "step": 4670 }, { "epoch": 2.568958419102511, "grad_norm": 1.3834680318832397, "learning_rate": 3.08033707739209e-06, "loss": 0.2012, "step": 4680 }, { "epoch": 2.574447646493756, "grad_norm": 1.7131993770599365, "learning_rate": 3.0039580667375557e-06, "loss": 0.2019, "step": 4690 }, { "epoch": 2.579936873885001, "grad_norm": 2.2222018241882324, "learning_rate": 2.9284773512392475e-06, "loss": 0.2565, "step": 4700 }, { "epoch": 2.585426101276245, "grad_norm": 1.8102763891220093, "learning_rate": 2.8538980134499958e-06, "loss": 0.2254, "step": 4710 }, { "epoch": 2.59091532866749, "grad_norm": 2.4614617824554443, "learning_rate": 2.780223099111298e-06, "loss": 0.2505, "step": 4720 }, { "epoch": 2.5964045560587348, "grad_norm": 2.120408535003662, "learning_rate": 2.7074556170289674e-06, "loss": 0.1887, "step": 4730 }, { "epoch": 2.6018937834499796, "grad_norm": 2.1147496700286865, "learning_rate": 2.6355985389502293e-06, "loss": 0.1995, "step": 4740 }, { "epoch": 2.6073830108412244, "grad_norm": 1.818030595779419, "learning_rate": 2.5646547994423784e-06, "loss": 0.2527, "step": 4750 }, { "epoch": 2.6128722382324687, "grad_norm": 1.935293197631836, "learning_rate": 2.4946272957729165e-06, "loss": 0.2007, "step": 4760 }, { "epoch": 2.6183614656237135, "grad_norm": 1.837039589881897, "learning_rate": 2.4255188877912477e-06, "loss": 0.2044, "step": 4770 }, { "epoch": 2.6238506930149583, "grad_norm": 2.487926721572876, "learning_rate": 2.3573323978118705e-06, "loss": 0.2825, "step": 4780 }, { "epoch": 2.6293399204062027, "grad_norm": 1.880677342414856, "learning_rate": 2.29007061049914e-06, "loss": 0.209, "step": 4790 }, { "epoch": 2.6348291477974475, "grad_norm": 2.060526132583618, "learning_rate": 2.2237362727535043e-06, "loss": 0.2069, "step": 4800 }, { "epoch": 2.6403183751886923, "grad_norm": 1.8091875314712524, "learning_rate": 2.1583320935993605e-06, "loss": 0.2606, "step": 4810 }, { "epoch": 2.6458076025799366, "grad_norm": 2.821810483932495, "learning_rate": 2.0938607440744274e-06, "loss": 0.2235, "step": 4820 }, { "epoch": 2.6512968299711814, "grad_norm": 2.605480194091797, "learning_rate": 2.0303248571206244e-06, "loss": 0.2454, "step": 4830 }, { "epoch": 2.6567860573624262, "grad_norm": 2.1553428173065186, "learning_rate": 1.967727027476568e-06, "loss": 0.1998, "step": 4840 }, { "epoch": 2.662275284753671, "grad_norm": 2.485527753829956, "learning_rate": 1.9060698115716063e-06, "loss": 0.2377, "step": 4850 }, { "epoch": 2.667764512144916, "grad_norm": 1.8272162675857544, "learning_rate": 1.8453557274214162e-06, "loss": 0.2288, "step": 4860 }, { "epoch": 2.67325373953616, "grad_norm": 2.5798721313476562, "learning_rate": 1.7855872545251757e-06, "loss": 0.2419, "step": 4870 }, { "epoch": 2.678742966927405, "grad_norm": 2.7085471153259277, "learning_rate": 1.7267668337642761e-06, "loss": 0.222, "step": 4880 }, { "epoch": 2.68423219431865, "grad_norm": 2.2424449920654297, "learning_rate": 1.6688968673026773e-06, "loss": 0.1913, "step": 4890 }, { "epoch": 2.689721421709894, "grad_norm": 2.0495047569274902, "learning_rate": 1.6119797184887792e-06, "loss": 0.1905, "step": 4900 }, { "epoch": 2.695210649101139, "grad_norm": 2.048985242843628, "learning_rate": 1.5560177117589197e-06, "loss": 0.1978, "step": 4910 }, { "epoch": 2.7006998764923837, "grad_norm": 2.4100003242492676, "learning_rate": 1.5010131325424337e-06, "loss": 0.2575, "step": 4920 }, { "epoch": 2.7061891038836285, "grad_norm": 2.420525074005127, "learning_rate": 1.4469682271683327e-06, "loss": 0.215, "step": 4930 }, { "epoch": 2.711678331274873, "grad_norm": 1.7320780754089355, "learning_rate": 1.3938852027735594e-06, "loss": 0.2259, "step": 4940 }, { "epoch": 2.7171675586661177, "grad_norm": 2.3320376873016357, "learning_rate": 1.3417662272128484e-06, "loss": 0.2514, "step": 4950 }, { "epoch": 2.7226567860573625, "grad_norm": 2.5615234375, "learning_rate": 1.2906134289701998e-06, "loss": 0.2342, "step": 4960 }, { "epoch": 2.7281460134486073, "grad_norm": 2.3866004943847656, "learning_rate": 1.240428897071949e-06, "loss": 0.2273, "step": 4970 }, { "epoch": 2.7336352408398517, "grad_norm": 2.7888448238372803, "learning_rate": 1.191214681001454e-06, "loss": 0.216, "step": 4980 }, { "epoch": 2.7391244682310965, "grad_norm": 2.318481922149658, "learning_rate": 1.142972790615407e-06, "loss": 0.2053, "step": 4990 }, { "epoch": 2.7446136956223413, "grad_norm": 1.82982337474823, "learning_rate": 1.095705196061722e-06, "loss": 0.2265, "step": 5000 }, { "epoch": 2.7501029230135856, "grad_norm": 1.4905016422271729, "learning_rate": 1.0494138276991278e-06, "loss": 0.1784, "step": 5010 }, { "epoch": 2.7555921504048304, "grad_norm": 1.6236484050750732, "learning_rate": 1.0041005760182853e-06, "loss": 0.1953, "step": 5020 }, { "epoch": 2.761081377796075, "grad_norm": 1.9028195142745972, "learning_rate": 9.597672915646116e-07, "loss": 0.2076, "step": 5030 }, { "epoch": 2.76657060518732, "grad_norm": 2.1804606914520264, "learning_rate": 9.164157848626842e-07, "loss": 0.2155, "step": 5040 }, { "epoch": 2.772059832578565, "grad_norm": 1.961748480796814, "learning_rate": 8.740478263423197e-07, "loss": 0.2072, "step": 5050 }, { "epoch": 2.777549059969809, "grad_norm": 2.2191579341888428, "learning_rate": 8.32665146266276e-07, "loss": 0.2314, "step": 5060 }, { "epoch": 2.783038287361054, "grad_norm": 2.2354133129119873, "learning_rate": 7.922694346595511e-07, "loss": 0.2297, "step": 5070 }, { "epoch": 2.7885275147522988, "grad_norm": 1.8483319282531738, "learning_rate": 7.528623412404179e-07, "loss": 0.214, "step": 5080 }, { "epoch": 2.794016742143543, "grad_norm": 1.9461801052093506, "learning_rate": 7.144454753530067e-07, "loss": 0.2173, "step": 5090 }, { "epoch": 2.799505969534788, "grad_norm": 1.8403065204620361, "learning_rate": 6.770204059016127e-07, "loss": 0.2012, "step": 5100 }, { "epoch": 2.8049951969260327, "grad_norm": 2.654057264328003, "learning_rate": 6.405886612866036e-07, "loss": 0.224, "step": 5110 }, { "epoch": 2.810484424317277, "grad_norm": 2.0720152854919434, "learning_rate": 6.051517293420101e-07, "loss": 0.1893, "step": 5120 }, { "epoch": 2.815973651708522, "grad_norm": 1.8411768674850464, "learning_rate": 5.707110572747587e-07, "loss": 0.2351, "step": 5130 }, { "epoch": 2.8214628790997667, "grad_norm": 1.893263578414917, "learning_rate": 5.3726805160558e-07, "loss": 0.2541, "step": 5140 }, { "epoch": 2.8269521064910115, "grad_norm": 2.347729206085205, "learning_rate": 5.048240781115571e-07, "loss": 0.2351, "step": 5150 }, { "epoch": 2.8324413338822563, "grad_norm": 2.2554593086242676, "learning_rate": 4.7338046177035354e-07, "loss": 0.245, "step": 5160 }, { "epoch": 2.8379305612735006, "grad_norm": 2.649017095565796, "learning_rate": 4.429384867061015e-07, "loss": 0.2444, "step": 5170 }, { "epoch": 2.8434197886647454, "grad_norm": 2.740800142288208, "learning_rate": 4.1349939613695434e-07, "loss": 0.2354, "step": 5180 }, { "epoch": 2.8489090160559902, "grad_norm": 2.5310161113739014, "learning_rate": 3.85064392324333e-07, "loss": 0.2086, "step": 5190 }, { "epoch": 2.8543982434472346, "grad_norm": 2.4969282150268555, "learning_rate": 3.5763463652380146e-07, "loss": 0.2329, "step": 5200 }, { "epoch": 2.8598874708384794, "grad_norm": 2.0695693492889404, "learning_rate": 3.3121124893766287e-07, "loss": 0.1665, "step": 5210 }, { "epoch": 2.865376698229724, "grad_norm": 2.0718321800231934, "learning_rate": 3.057953086692017e-07, "loss": 0.2444, "step": 5220 }, { "epoch": 2.870865925620969, "grad_norm": 2.0257515907287598, "learning_rate": 2.8138785367860796e-07, "loss": 0.2303, "step": 5230 }, { "epoch": 2.876355153012214, "grad_norm": 2.0960233211517334, "learning_rate": 2.5798988074061394e-07, "loss": 0.2274, "step": 5240 }, { "epoch": 2.881844380403458, "grad_norm": 2.4282174110412598, "learning_rate": 2.3560234540375424e-07, "loss": 0.1995, "step": 5250 }, { "epoch": 2.887333607794703, "grad_norm": 1.898910641670227, "learning_rate": 2.1422616195136692e-07, "loss": 0.2002, "step": 5260 }, { "epoch": 2.8928228351859477, "grad_norm": 1.6507282257080078, "learning_rate": 1.9386220336423678e-07, "loss": 0.1811, "step": 5270 }, { "epoch": 2.898312062577192, "grad_norm": 2.447411060333252, "learning_rate": 1.7451130128495753e-07, "loss": 0.2376, "step": 5280 }, { "epoch": 2.903801289968437, "grad_norm": 1.9209644794464111, "learning_rate": 1.5617424598396712e-07, "loss": 0.236, "step": 5290 }, { "epoch": 2.9092905173596817, "grad_norm": 2.0094797611236572, "learning_rate": 1.3885178632726536e-07, "loss": 0.208, "step": 5300 }, { "epoch": 2.914779744750926, "grad_norm": 2.3810887336730957, "learning_rate": 1.225446297458327e-07, "loss": 0.2124, "step": 5310 }, { "epoch": 2.920268972142171, "grad_norm": 2.039491891860962, "learning_rate": 1.0725344220675337e-07, "loss": 0.1983, "step": 5320 }, { "epoch": 2.9257581995334156, "grad_norm": 2.2363178730010986, "learning_rate": 9.297884818599556e-08, "loss": 0.2173, "step": 5330 }, { "epoch": 2.9312474269246604, "grad_norm": 2.1318199634552, "learning_rate": 7.972143064292892e-08, "loss": 0.182, "step": 5340 }, { "epoch": 2.9367366543159052, "grad_norm": 1.9546360969543457, "learning_rate": 6.748173099650202e-08, "loss": 0.1871, "step": 5350 }, { "epoch": 2.9422258817071496, "grad_norm": 1.8476676940917969, "learning_rate": 5.626024910314609e-08, "loss": 0.1989, "step": 5360 }, { "epoch": 2.9477151090983944, "grad_norm": 2.1973392963409424, "learning_rate": 4.605744323634142e-08, "loss": 0.2186, "step": 5370 }, { "epoch": 2.953204336489639, "grad_norm": 1.935354232788086, "learning_rate": 3.687373006792394e-08, "loss": 0.2124, "step": 5380 }, { "epoch": 2.9586935638808836, "grad_norm": 2.1322507858276367, "learning_rate": 2.870948465105161e-08, "loss": 0.2145, "step": 5390 }, { "epoch": 2.9641827912721284, "grad_norm": 2.501298189163208, "learning_rate": 2.1565040404902813e-08, "loss": 0.22, "step": 5400 }, { "epoch": 2.969672018663373, "grad_norm": 1.6250516176223755, "learning_rate": 1.544068910104002e-08, "loss": 0.218, "step": 5410 }, { "epoch": 2.975161246054618, "grad_norm": 2.794093370437622, "learning_rate": 1.0336680851516512e-08, "loss": 0.257, "step": 5420 }, { "epoch": 2.9806504734458628, "grad_norm": 2.0987253189086914, "learning_rate": 6.2532240986457044e-09, "loss": 0.2193, "step": 5430 }, { "epoch": 2.986139700837107, "grad_norm": 2.217175006866455, "learning_rate": 3.1904856064940424e-09, "loss": 0.2392, "step": 5440 }, { "epoch": 2.991628928228352, "grad_norm": 2.0461556911468506, "learning_rate": 1.1485904540697867e-09, "loss": 0.2137, "step": 5450 }, { "epoch": 2.9971181556195967, "grad_norm": 1.9249966144561768, "learning_rate": 1.276220302215414e-10, "loss": 0.2132, "step": 5460 }, { "epoch": 2.99876492383697, "step": 5463, "total_flos": 3.233486247100416e+17, "train_loss": 0.5040911292388242, "train_runtime": 9149.3841, "train_samples_per_second": 4.779, "train_steps_per_second": 0.597 } ], "logging_steps": 10, "max_steps": 5463, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.233486247100416e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }