{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.492822966507177, "eval_steps": 500, "global_step": 10650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011961722488038277, "grad_norm": 1.9270328283309937, "learning_rate": 4.998999599839936e-05, "loss": 2.3527, "step": 15 }, { "epoch": 0.023923444976076555, "grad_norm": 1.8812594413757324, "learning_rate": 4.995998399359744e-05, "loss": 2.3692, "step": 30 }, { "epoch": 0.03588516746411483, "grad_norm": 1.3909887075424194, "learning_rate": 4.9929971988795524e-05, "loss": 2.3111, "step": 45 }, { "epoch": 0.04784688995215311, "grad_norm": 2.809542417526245, "learning_rate": 4.98999599839936e-05, "loss": 2.2284, "step": 60 }, { "epoch": 0.05980861244019139, "grad_norm": 3.4008336067199707, "learning_rate": 4.986994797919168e-05, "loss": 2.2569, "step": 75 }, { "epoch": 0.07177033492822966, "grad_norm": 1.2219867706298828, "learning_rate": 4.983993597438976e-05, "loss": 2.1762, "step": 90 }, { "epoch": 0.08373205741626795, "grad_norm": 1.3036127090454102, "learning_rate": 4.9809923969587836e-05, "loss": 2.1956, "step": 105 }, { "epoch": 0.09569377990430622, "grad_norm": 1.468847393989563, "learning_rate": 4.977991196478592e-05, "loss": 2.2729, "step": 120 }, { "epoch": 0.1076555023923445, "grad_norm": 1.2088780403137207, "learning_rate": 4.9749899959984e-05, "loss": 2.2164, "step": 135 }, { "epoch": 0.11961722488038277, "grad_norm": 1.197135090827942, "learning_rate": 4.9719887955182076e-05, "loss": 2.1056, "step": 150 }, { "epoch": 0.13157894736842105, "grad_norm": 1.309010624885559, "learning_rate": 4.9689875950380154e-05, "loss": 2.171, "step": 165 }, { "epoch": 0.14354066985645933, "grad_norm": 1.3516101837158203, "learning_rate": 4.965986394557823e-05, "loss": 2.1898, "step": 180 }, { "epoch": 0.15550239234449761, "grad_norm": 1.186513900756836, "learning_rate": 4.962985194077631e-05, "loss": 2.1427, "step": 195 }, { "epoch": 0.1674641148325359, "grad_norm": 1.159972906112671, "learning_rate": 4.959983993597439e-05, "loss": 2.2603, "step": 210 }, { "epoch": 0.17942583732057416, "grad_norm": 1.188928484916687, "learning_rate": 4.956982793117247e-05, "loss": 2.2281, "step": 225 }, { "epoch": 0.19138755980861244, "grad_norm": 2.18959903717041, "learning_rate": 4.953981592637055e-05, "loss": 2.2187, "step": 240 }, { "epoch": 0.20334928229665072, "grad_norm": 1.267388939857483, "learning_rate": 4.9509803921568634e-05, "loss": 2.1898, "step": 255 }, { "epoch": 0.215311004784689, "grad_norm": 1.5959223508834839, "learning_rate": 4.947979191676671e-05, "loss": 2.1488, "step": 270 }, { "epoch": 0.22727272727272727, "grad_norm": 1.081666350364685, "learning_rate": 4.944977991196479e-05, "loss": 2.2176, "step": 285 }, { "epoch": 0.23923444976076555, "grad_norm": 1.1691621541976929, "learning_rate": 4.941976790716287e-05, "loss": 2.1297, "step": 300 }, { "epoch": 0.2511961722488038, "grad_norm": 1.4069727659225464, "learning_rate": 4.9389755902360946e-05, "loss": 2.2035, "step": 315 }, { "epoch": 0.2631578947368421, "grad_norm": 1.135937213897705, "learning_rate": 4.9359743897559024e-05, "loss": 2.1925, "step": 330 }, { "epoch": 0.2751196172248804, "grad_norm": 1.0926917791366577, "learning_rate": 4.93297318927571e-05, "loss": 2.1679, "step": 345 }, { "epoch": 0.28708133971291866, "grad_norm": 1.0808637142181396, "learning_rate": 4.9299719887955186e-05, "loss": 2.2122, "step": 360 }, { "epoch": 0.29904306220095694, "grad_norm": 1.2694952487945557, "learning_rate": 4.9269707883153264e-05, "loss": 2.1643, "step": 375 }, { "epoch": 0.31100478468899523, "grad_norm": 1.1682099103927612, "learning_rate": 4.923969587835134e-05, "loss": 2.2263, "step": 390 }, { "epoch": 0.3229665071770335, "grad_norm": 1.1954610347747803, "learning_rate": 4.920968387354942e-05, "loss": 2.1555, "step": 405 }, { "epoch": 0.3349282296650718, "grad_norm": 1.0608245134353638, "learning_rate": 4.9179671868747504e-05, "loss": 2.1918, "step": 420 }, { "epoch": 0.34688995215311, "grad_norm": 1.2034133672714233, "learning_rate": 4.914965986394558e-05, "loss": 2.1101, "step": 435 }, { "epoch": 0.3588516746411483, "grad_norm": 1.0936003923416138, "learning_rate": 4.911964785914366e-05, "loss": 2.137, "step": 450 }, { "epoch": 0.3708133971291866, "grad_norm": 1.188496708869934, "learning_rate": 4.908963585434174e-05, "loss": 2.1864, "step": 465 }, { "epoch": 0.3827751196172249, "grad_norm": 1.350693941116333, "learning_rate": 4.905962384953982e-05, "loss": 2.1491, "step": 480 }, { "epoch": 0.39473684210526316, "grad_norm": 1.2483429908752441, "learning_rate": 4.90296118447379e-05, "loss": 2.1868, "step": 495 }, { "epoch": 0.40669856459330145, "grad_norm": 1.1137944459915161, "learning_rate": 4.899959983993598e-05, "loss": 2.191, "step": 510 }, { "epoch": 0.41866028708133973, "grad_norm": 1.3261072635650635, "learning_rate": 4.8969587835134056e-05, "loss": 2.1336, "step": 525 }, { "epoch": 0.430622009569378, "grad_norm": 1.6815850734710693, "learning_rate": 4.8939575830332134e-05, "loss": 2.1524, "step": 540 }, { "epoch": 0.44258373205741625, "grad_norm": 1.080824851989746, "learning_rate": 4.890956382553021e-05, "loss": 2.2056, "step": 555 }, { "epoch": 0.45454545454545453, "grad_norm": 1.2140378952026367, "learning_rate": 4.887955182072829e-05, "loss": 2.2114, "step": 570 }, { "epoch": 0.4665071770334928, "grad_norm": 1.1290125846862793, "learning_rate": 4.884953981592637e-05, "loss": 2.101, "step": 585 }, { "epoch": 0.4784688995215311, "grad_norm": 1.171129822731018, "learning_rate": 4.881952781112445e-05, "loss": 2.2186, "step": 600 }, { "epoch": 0.4904306220095694, "grad_norm": 1.99854576587677, "learning_rate": 4.878951580632253e-05, "loss": 2.154, "step": 615 }, { "epoch": 0.5023923444976076, "grad_norm": 1.1021254062652588, "learning_rate": 4.8759503801520615e-05, "loss": 2.1066, "step": 630 }, { "epoch": 0.5143540669856459, "grad_norm": 1.022976040840149, "learning_rate": 4.872949179671869e-05, "loss": 2.1642, "step": 645 }, { "epoch": 0.5263157894736842, "grad_norm": 1.110926866531372, "learning_rate": 4.869947979191677e-05, "loss": 2.1592, "step": 660 }, { "epoch": 0.5382775119617225, "grad_norm": 1.096807599067688, "learning_rate": 4.866946778711485e-05, "loss": 2.2171, "step": 675 }, { "epoch": 0.5502392344497608, "grad_norm": 1.2465318441390991, "learning_rate": 4.8639455782312926e-05, "loss": 2.1794, "step": 690 }, { "epoch": 0.562200956937799, "grad_norm": 1.6367931365966797, "learning_rate": 4.8609443777511004e-05, "loss": 2.1405, "step": 705 }, { "epoch": 0.5741626794258373, "grad_norm": 1.3877207040786743, "learning_rate": 4.857943177270909e-05, "loss": 2.1781, "step": 720 }, { "epoch": 0.5861244019138756, "grad_norm": 1.1698716878890991, "learning_rate": 4.8549419767907166e-05, "loss": 2.2076, "step": 735 }, { "epoch": 0.5980861244019139, "grad_norm": 1.1922690868377686, "learning_rate": 4.8519407763105244e-05, "loss": 2.1515, "step": 750 }, { "epoch": 0.6100478468899522, "grad_norm": 1.1112874746322632, "learning_rate": 4.848939575830332e-05, "loss": 2.0535, "step": 765 }, { "epoch": 0.6220095693779905, "grad_norm": 1.3220607042312622, "learning_rate": 4.84593837535014e-05, "loss": 2.1978, "step": 780 }, { "epoch": 0.6339712918660287, "grad_norm": 1.2560738325119019, "learning_rate": 4.8429371748699484e-05, "loss": 2.245, "step": 795 }, { "epoch": 0.645933014354067, "grad_norm": 1.1312100887298584, "learning_rate": 4.839935974389756e-05, "loss": 2.1252, "step": 810 }, { "epoch": 0.6578947368421053, "grad_norm": 1.2060538530349731, "learning_rate": 4.836934773909564e-05, "loss": 2.1268, "step": 825 }, { "epoch": 0.6698564593301436, "grad_norm": 2.0435290336608887, "learning_rate": 4.8339335734293725e-05, "loss": 2.2091, "step": 840 }, { "epoch": 0.6818181818181818, "grad_norm": 2.7680532932281494, "learning_rate": 4.83093237294918e-05, "loss": 2.0631, "step": 855 }, { "epoch": 0.69377990430622, "grad_norm": 1.1256909370422363, "learning_rate": 4.827931172468988e-05, "loss": 2.1396, "step": 870 }, { "epoch": 0.7057416267942583, "grad_norm": 1.1224644184112549, "learning_rate": 4.824929971988796e-05, "loss": 2.107, "step": 885 }, { "epoch": 0.7177033492822966, "grad_norm": 1.2712397575378418, "learning_rate": 4.8219287715086036e-05, "loss": 2.1332, "step": 900 }, { "epoch": 0.7296650717703349, "grad_norm": 1.2399568557739258, "learning_rate": 4.8189275710284114e-05, "loss": 2.1198, "step": 915 }, { "epoch": 0.7416267942583732, "grad_norm": 1.0852080583572388, "learning_rate": 4.815926370548219e-05, "loss": 2.1436, "step": 930 }, { "epoch": 0.7535885167464115, "grad_norm": 1.3282052278518677, "learning_rate": 4.812925170068027e-05, "loss": 2.1763, "step": 945 }, { "epoch": 0.7655502392344498, "grad_norm": 1.8598517179489136, "learning_rate": 4.809923969587835e-05, "loss": 2.1188, "step": 960 }, { "epoch": 0.777511961722488, "grad_norm": 1.1602433919906616, "learning_rate": 4.806922769107643e-05, "loss": 2.2234, "step": 975 }, { "epoch": 0.7894736842105263, "grad_norm": 1.3578499555587769, "learning_rate": 4.803921568627452e-05, "loss": 2.1404, "step": 990 }, { "epoch": 0.8014354066985646, "grad_norm": 1.4764407873153687, "learning_rate": 4.8009203681472595e-05, "loss": 2.1582, "step": 1005 }, { "epoch": 0.8133971291866029, "grad_norm": 1.083958387374878, "learning_rate": 4.797919167667067e-05, "loss": 2.1156, "step": 1020 }, { "epoch": 0.8253588516746412, "grad_norm": 1.2568596601486206, "learning_rate": 4.794917967186875e-05, "loss": 2.1341, "step": 1035 }, { "epoch": 0.8373205741626795, "grad_norm": 1.1657259464263916, "learning_rate": 4.791916766706683e-05, "loss": 2.1245, "step": 1050 }, { "epoch": 0.8492822966507177, "grad_norm": 2.355947256088257, "learning_rate": 4.7889155662264906e-05, "loss": 2.1975, "step": 1065 }, { "epoch": 0.861244019138756, "grad_norm": 2.6566946506500244, "learning_rate": 4.7859143657462984e-05, "loss": 2.1263, "step": 1080 }, { "epoch": 0.8732057416267942, "grad_norm": 1.2993121147155762, "learning_rate": 4.782913165266107e-05, "loss": 2.1481, "step": 1095 }, { "epoch": 0.8851674641148325, "grad_norm": 1.129744291305542, "learning_rate": 4.7799119647859146e-05, "loss": 2.1574, "step": 1110 }, { "epoch": 0.8971291866028708, "grad_norm": 1.1695717573165894, "learning_rate": 4.7769107643057224e-05, "loss": 2.0916, "step": 1125 }, { "epoch": 0.9090909090909091, "grad_norm": 1.159279465675354, "learning_rate": 4.77390956382553e-05, "loss": 2.1265, "step": 1140 }, { "epoch": 0.9210526315789473, "grad_norm": 1.2150417566299438, "learning_rate": 4.770908363345338e-05, "loss": 2.1351, "step": 1155 }, { "epoch": 0.9330143540669856, "grad_norm": 1.2673773765563965, "learning_rate": 4.7679071628651465e-05, "loss": 2.2444, "step": 1170 }, { "epoch": 0.9449760765550239, "grad_norm": 1.1746214628219604, "learning_rate": 4.764905962384954e-05, "loss": 2.1371, "step": 1185 }, { "epoch": 0.9569377990430622, "grad_norm": 1.3716073036193848, "learning_rate": 4.761904761904762e-05, "loss": 2.1414, "step": 1200 }, { "epoch": 0.9688995215311005, "grad_norm": 1.1066573858261108, "learning_rate": 4.7589035614245705e-05, "loss": 2.0949, "step": 1215 }, { "epoch": 0.9808612440191388, "grad_norm": 1.1547194719314575, "learning_rate": 4.755902360944378e-05, "loss": 2.1023, "step": 1230 }, { "epoch": 0.992822966507177, "grad_norm": 1.5456453561782837, "learning_rate": 4.752901160464186e-05, "loss": 2.1542, "step": 1245 }, { "epoch": 1.0047846889952152, "grad_norm": 1.7362697124481201, "learning_rate": 4.749899959983994e-05, "loss": 2.0444, "step": 1260 }, { "epoch": 1.0167464114832536, "grad_norm": 5.408290386199951, "learning_rate": 4.7468987595038016e-05, "loss": 1.8079, "step": 1275 }, { "epoch": 1.0287081339712918, "grad_norm": 3.33227276802063, "learning_rate": 4.7438975590236094e-05, "loss": 1.9851, "step": 1290 }, { "epoch": 1.0406698564593302, "grad_norm": 1.4184224605560303, "learning_rate": 4.740896358543417e-05, "loss": 1.8732, "step": 1305 }, { "epoch": 1.0526315789473684, "grad_norm": 1.5775929689407349, "learning_rate": 4.737895158063225e-05, "loss": 1.9714, "step": 1320 }, { "epoch": 1.0645933014354068, "grad_norm": 1.4744929075241089, "learning_rate": 4.7348939575830335e-05, "loss": 1.8901, "step": 1335 }, { "epoch": 1.076555023923445, "grad_norm": 1.5280168056488037, "learning_rate": 4.731892757102841e-05, "loss": 1.9348, "step": 1350 }, { "epoch": 1.0885167464114833, "grad_norm": 1.2531495094299316, "learning_rate": 4.72889155662265e-05, "loss": 1.83, "step": 1365 }, { "epoch": 1.1004784688995215, "grad_norm": 1.3821693658828735, "learning_rate": 4.7258903561424575e-05, "loss": 1.7183, "step": 1380 }, { "epoch": 1.11244019138756, "grad_norm": 1.3789594173431396, "learning_rate": 4.722889155662265e-05, "loss": 1.8931, "step": 1395 }, { "epoch": 1.124401913875598, "grad_norm": 1.2702490091323853, "learning_rate": 4.719887955182073e-05, "loss": 1.7617, "step": 1410 }, { "epoch": 1.1363636363636362, "grad_norm": 1.4505800008773804, "learning_rate": 4.716886754701881e-05, "loss": 1.9103, "step": 1425 }, { "epoch": 1.1483253588516746, "grad_norm": 1.612985610961914, "learning_rate": 4.7138855542216886e-05, "loss": 1.9471, "step": 1440 }, { "epoch": 1.160287081339713, "grad_norm": 1.2852972745895386, "learning_rate": 4.710884353741497e-05, "loss": 1.9249, "step": 1455 }, { "epoch": 1.1722488038277512, "grad_norm": 1.385501503944397, "learning_rate": 4.707883153261305e-05, "loss": 1.8883, "step": 1470 }, { "epoch": 1.1842105263157894, "grad_norm": 1.4401298761367798, "learning_rate": 4.704881952781113e-05, "loss": 1.94, "step": 1485 }, { "epoch": 1.1961722488038278, "grad_norm": 3.9501471519470215, "learning_rate": 4.7018807523009204e-05, "loss": 1.893, "step": 1500 }, { "epoch": 1.208133971291866, "grad_norm": 1.3335622549057007, "learning_rate": 4.698879551820728e-05, "loss": 1.7215, "step": 1515 }, { "epoch": 1.2200956937799043, "grad_norm": 1.6928309202194214, "learning_rate": 4.695878351340536e-05, "loss": 1.8889, "step": 1530 }, { "epoch": 1.2320574162679425, "grad_norm": 1.2327487468719482, "learning_rate": 4.6928771508603445e-05, "loss": 1.8503, "step": 1545 }, { "epoch": 1.244019138755981, "grad_norm": 1.3527581691741943, "learning_rate": 4.689875950380152e-05, "loss": 1.7963, "step": 1560 }, { "epoch": 1.255980861244019, "grad_norm": 1.4024996757507324, "learning_rate": 4.686874749899961e-05, "loss": 1.8679, "step": 1575 }, { "epoch": 1.2679425837320575, "grad_norm": 1.6798954010009766, "learning_rate": 4.6838735494197685e-05, "loss": 1.8944, "step": 1590 }, { "epoch": 1.2799043062200957, "grad_norm": 1.4541043043136597, "learning_rate": 4.680872348939576e-05, "loss": 1.9555, "step": 1605 }, { "epoch": 1.291866028708134, "grad_norm": 1.503612756729126, "learning_rate": 4.677871148459384e-05, "loss": 1.8223, "step": 1620 }, { "epoch": 1.3038277511961722, "grad_norm": 1.4559051990509033, "learning_rate": 4.674869947979192e-05, "loss": 1.8442, "step": 1635 }, { "epoch": 1.3157894736842106, "grad_norm": 1.3559598922729492, "learning_rate": 4.6718687474989997e-05, "loss": 1.933, "step": 1650 }, { "epoch": 1.3277511961722488, "grad_norm": 1.3937571048736572, "learning_rate": 4.6688675470188074e-05, "loss": 1.864, "step": 1665 }, { "epoch": 1.339712918660287, "grad_norm": 1.356520175933838, "learning_rate": 4.665866346538615e-05, "loss": 1.856, "step": 1680 }, { "epoch": 1.3516746411483254, "grad_norm": 1.6281076669692993, "learning_rate": 4.662865146058424e-05, "loss": 1.8623, "step": 1695 }, { "epoch": 1.3636363636363638, "grad_norm": 1.390368103981018, "learning_rate": 4.6598639455782315e-05, "loss": 1.8775, "step": 1710 }, { "epoch": 1.375598086124402, "grad_norm": 1.575172781944275, "learning_rate": 4.656862745098039e-05, "loss": 1.9558, "step": 1725 }, { "epoch": 1.38755980861244, "grad_norm": 1.6121597290039062, "learning_rate": 4.653861544617848e-05, "loss": 1.8698, "step": 1740 }, { "epoch": 1.3995215311004785, "grad_norm": 1.4013128280639648, "learning_rate": 4.6508603441376555e-05, "loss": 1.8567, "step": 1755 }, { "epoch": 1.4114832535885167, "grad_norm": 1.636841893196106, "learning_rate": 4.647859143657463e-05, "loss": 1.8708, "step": 1770 }, { "epoch": 1.423444976076555, "grad_norm": 1.6554105281829834, "learning_rate": 4.644857943177271e-05, "loss": 1.9281, "step": 1785 }, { "epoch": 1.4354066985645932, "grad_norm": 1.7569769620895386, "learning_rate": 4.641856742697079e-05, "loss": 1.8563, "step": 1800 }, { "epoch": 1.4473684210526316, "grad_norm": 1.5896693468093872, "learning_rate": 4.638855542216887e-05, "loss": 1.8764, "step": 1815 }, { "epoch": 1.4593301435406698, "grad_norm": 1.3887263536453247, "learning_rate": 4.635854341736695e-05, "loss": 1.8871, "step": 1830 }, { "epoch": 1.4712918660287082, "grad_norm": 1.6596853733062744, "learning_rate": 4.632853141256503e-05, "loss": 1.9176, "step": 1845 }, { "epoch": 1.4832535885167464, "grad_norm": 1.6174405813217163, "learning_rate": 4.629851940776311e-05, "loss": 1.8109, "step": 1860 }, { "epoch": 1.4952153110047846, "grad_norm": 1.3717613220214844, "learning_rate": 4.6268507402961185e-05, "loss": 1.867, "step": 1875 }, { "epoch": 1.507177033492823, "grad_norm": 1.4477450847625732, "learning_rate": 4.623849539815926e-05, "loss": 1.929, "step": 1890 }, { "epoch": 1.5191387559808613, "grad_norm": 1.4237533807754517, "learning_rate": 4.620848339335734e-05, "loss": 1.8444, "step": 1905 }, { "epoch": 1.5311004784688995, "grad_norm": 1.41818106174469, "learning_rate": 4.6178471388555425e-05, "loss": 1.8505, "step": 1920 }, { "epoch": 1.5430622009569377, "grad_norm": 1.5824397802352905, "learning_rate": 4.61484593837535e-05, "loss": 1.773, "step": 1935 }, { "epoch": 1.555023923444976, "grad_norm": 1.6391881704330444, "learning_rate": 4.611844737895159e-05, "loss": 1.9057, "step": 1950 }, { "epoch": 1.5669856459330145, "grad_norm": 1.5484305620193481, "learning_rate": 4.6088435374149665e-05, "loss": 1.9141, "step": 1965 }, { "epoch": 1.5789473684210527, "grad_norm": 1.4594415426254272, "learning_rate": 4.605842336934774e-05, "loss": 1.8732, "step": 1980 }, { "epoch": 1.5909090909090908, "grad_norm": 1.3924568891525269, "learning_rate": 4.602841136454582e-05, "loss": 1.9441, "step": 1995 }, { "epoch": 1.6028708133971292, "grad_norm": 1.523986577987671, "learning_rate": 4.59983993597439e-05, "loss": 1.9101, "step": 2010 }, { "epoch": 1.6148325358851676, "grad_norm": 1.369285225868225, "learning_rate": 4.596838735494198e-05, "loss": 1.8829, "step": 2025 }, { "epoch": 1.6267942583732058, "grad_norm": 1.4909306764602661, "learning_rate": 4.5938375350140055e-05, "loss": 1.9204, "step": 2040 }, { "epoch": 1.638755980861244, "grad_norm": 1.5464478731155396, "learning_rate": 4.590836334533814e-05, "loss": 1.8064, "step": 2055 }, { "epoch": 1.6507177033492821, "grad_norm": 1.5255078077316284, "learning_rate": 4.587835134053622e-05, "loss": 1.9518, "step": 2070 }, { "epoch": 1.6626794258373205, "grad_norm": 1.3710672855377197, "learning_rate": 4.5848339335734295e-05, "loss": 1.8957, "step": 2085 }, { "epoch": 1.674641148325359, "grad_norm": 1.4883019924163818, "learning_rate": 4.581832733093237e-05, "loss": 1.8884, "step": 2100 }, { "epoch": 1.686602870813397, "grad_norm": 1.383284091949463, "learning_rate": 4.578831532613046e-05, "loss": 1.8924, "step": 2115 }, { "epoch": 1.6985645933014353, "grad_norm": 1.5126210451126099, "learning_rate": 4.5758303321328535e-05, "loss": 1.9423, "step": 2130 }, { "epoch": 1.7105263157894737, "grad_norm": 1.4830104112625122, "learning_rate": 4.572829131652661e-05, "loss": 1.9377, "step": 2145 }, { "epoch": 1.722488038277512, "grad_norm": 1.578748106956482, "learning_rate": 4.569827931172469e-05, "loss": 1.8532, "step": 2160 }, { "epoch": 1.7344497607655502, "grad_norm": 3.1164207458496094, "learning_rate": 4.5668267306922776e-05, "loss": 1.9072, "step": 2175 }, { "epoch": 1.7464114832535884, "grad_norm": 1.5984658002853394, "learning_rate": 4.5638255302120853e-05, "loss": 1.9674, "step": 2190 }, { "epoch": 1.7583732057416268, "grad_norm": 1.5007200241088867, "learning_rate": 4.560824329731893e-05, "loss": 1.93, "step": 2205 }, { "epoch": 1.7703349282296652, "grad_norm": 2.623798131942749, "learning_rate": 4.557823129251701e-05, "loss": 1.9068, "step": 2220 }, { "epoch": 1.7822966507177034, "grad_norm": 2.1396572589874268, "learning_rate": 4.554821928771509e-05, "loss": 1.886, "step": 2235 }, { "epoch": 1.7942583732057416, "grad_norm": 1.5055629014968872, "learning_rate": 4.5518207282913165e-05, "loss": 1.8678, "step": 2250 }, { "epoch": 1.80622009569378, "grad_norm": 1.4418485164642334, "learning_rate": 4.548819527811124e-05, "loss": 1.984, "step": 2265 }, { "epoch": 1.8181818181818183, "grad_norm": 1.5159984827041626, "learning_rate": 4.545818327330932e-05, "loss": 1.9688, "step": 2280 }, { "epoch": 1.8301435406698565, "grad_norm": 1.299607753753662, "learning_rate": 4.5428171268507405e-05, "loss": 1.9347, "step": 2295 }, { "epoch": 1.8421052631578947, "grad_norm": 1.4144442081451416, "learning_rate": 4.539815926370549e-05, "loss": 1.8877, "step": 2310 }, { "epoch": 1.8540669856459329, "grad_norm": 1.5180310010910034, "learning_rate": 4.536814725890357e-05, "loss": 1.9392, "step": 2325 }, { "epoch": 1.8660287081339713, "grad_norm": 1.475977897644043, "learning_rate": 4.5338135254101645e-05, "loss": 1.8535, "step": 2340 }, { "epoch": 1.8779904306220097, "grad_norm": 1.4614003896713257, "learning_rate": 4.530812324929972e-05, "loss": 1.9246, "step": 2355 }, { "epoch": 1.8899521531100478, "grad_norm": 1.4736562967300415, "learning_rate": 4.52781112444978e-05, "loss": 1.9095, "step": 2370 }, { "epoch": 1.901913875598086, "grad_norm": 1.3201289176940918, "learning_rate": 4.524809923969588e-05, "loss": 1.8479, "step": 2385 }, { "epoch": 1.9138755980861244, "grad_norm": 1.4976378679275513, "learning_rate": 4.521808723489396e-05, "loss": 1.8262, "step": 2400 }, { "epoch": 1.9258373205741628, "grad_norm": 1.5323299169540405, "learning_rate": 4.5188075230092035e-05, "loss": 1.8989, "step": 2415 }, { "epoch": 1.937799043062201, "grad_norm": 2.050426483154297, "learning_rate": 4.515806322529012e-05, "loss": 1.8958, "step": 2430 }, { "epoch": 1.9497607655502391, "grad_norm": 1.822324514389038, "learning_rate": 4.51280512204882e-05, "loss": 1.99, "step": 2445 }, { "epoch": 1.9617224880382775, "grad_norm": 1.5009537935256958, "learning_rate": 4.5098039215686275e-05, "loss": 1.8561, "step": 2460 }, { "epoch": 1.973684210526316, "grad_norm": 1.3751215934753418, "learning_rate": 4.506802721088435e-05, "loss": 1.9033, "step": 2475 }, { "epoch": 1.985645933014354, "grad_norm": 1.6106884479522705, "learning_rate": 4.503801520608244e-05, "loss": 1.9555, "step": 2490 }, { "epoch": 1.9976076555023923, "grad_norm": 1.5378204584121704, "learning_rate": 4.5008003201280515e-05, "loss": 2.0009, "step": 2505 }, { "epoch": 2.0095693779904304, "grad_norm": 2.0536139011383057, "learning_rate": 4.497799119647859e-05, "loss": 1.7212, "step": 2520 }, { "epoch": 2.021531100478469, "grad_norm": 1.7498282194137573, "learning_rate": 4.494797919167667e-05, "loss": 1.5574, "step": 2535 }, { "epoch": 2.0334928229665072, "grad_norm": 1.7728687524795532, "learning_rate": 4.4917967186874756e-05, "loss": 1.4411, "step": 2550 }, { "epoch": 2.0454545454545454, "grad_norm": 1.8067642450332642, "learning_rate": 4.4887955182072834e-05, "loss": 1.5242, "step": 2565 }, { "epoch": 2.0574162679425836, "grad_norm": 1.924641489982605, "learning_rate": 4.485794317727091e-05, "loss": 1.5415, "step": 2580 }, { "epoch": 2.069377990430622, "grad_norm": 1.9768836498260498, "learning_rate": 4.482793117246899e-05, "loss": 1.6774, "step": 2595 }, { "epoch": 2.0813397129186604, "grad_norm": 1.943829894065857, "learning_rate": 4.479791916766707e-05, "loss": 1.6263, "step": 2610 }, { "epoch": 2.0933014354066986, "grad_norm": 2.1001622676849365, "learning_rate": 4.4767907162865145e-05, "loss": 1.6304, "step": 2625 }, { "epoch": 2.1052631578947367, "grad_norm": 2.0388505458831787, "learning_rate": 4.473789515806322e-05, "loss": 1.4718, "step": 2640 }, { "epoch": 2.117224880382775, "grad_norm": 1.884468913078308, "learning_rate": 4.47078831532613e-05, "loss": 1.5752, "step": 2655 }, { "epoch": 2.1291866028708135, "grad_norm": 1.9775267839431763, "learning_rate": 4.4677871148459385e-05, "loss": 1.478, "step": 2670 }, { "epoch": 2.1411483253588517, "grad_norm": 1.8365753889083862, "learning_rate": 4.464785914365747e-05, "loss": 1.5408, "step": 2685 }, { "epoch": 2.15311004784689, "grad_norm": 1.8778951168060303, "learning_rate": 4.461784713885555e-05, "loss": 1.6373, "step": 2700 }, { "epoch": 2.165071770334928, "grad_norm": 1.9629762172698975, "learning_rate": 4.4587835134053626e-05, "loss": 1.5741, "step": 2715 }, { "epoch": 2.1770334928229667, "grad_norm": 2.0409107208251953, "learning_rate": 4.4557823129251704e-05, "loss": 1.6216, "step": 2730 }, { "epoch": 2.188995215311005, "grad_norm": 2.1008028984069824, "learning_rate": 4.452781112444978e-05, "loss": 1.5515, "step": 2745 }, { "epoch": 2.200956937799043, "grad_norm": 2.2391457557678223, "learning_rate": 4.449779911964786e-05, "loss": 1.6279, "step": 2760 }, { "epoch": 2.212918660287081, "grad_norm": 2.294734239578247, "learning_rate": 4.446778711484594e-05, "loss": 1.5232, "step": 2775 }, { "epoch": 2.22488038277512, "grad_norm": 1.6631484031677246, "learning_rate": 4.443777511004402e-05, "loss": 1.5113, "step": 2790 }, { "epoch": 2.236842105263158, "grad_norm": 1.9847686290740967, "learning_rate": 4.44077631052421e-05, "loss": 1.5006, "step": 2805 }, { "epoch": 2.248803827751196, "grad_norm": 1.8953202962875366, "learning_rate": 4.437775110044018e-05, "loss": 1.5853, "step": 2820 }, { "epoch": 2.2607655502392343, "grad_norm": 1.9015896320343018, "learning_rate": 4.4347739095638255e-05, "loss": 1.6078, "step": 2835 }, { "epoch": 2.2727272727272725, "grad_norm": 1.900415301322937, "learning_rate": 4.431772709083633e-05, "loss": 1.5399, "step": 2850 }, { "epoch": 2.284688995215311, "grad_norm": 1.9138609170913696, "learning_rate": 4.428771508603442e-05, "loss": 1.589, "step": 2865 }, { "epoch": 2.2966507177033493, "grad_norm": 1.7661852836608887, "learning_rate": 4.4257703081232496e-05, "loss": 1.6258, "step": 2880 }, { "epoch": 2.3086124401913874, "grad_norm": 1.9043537378311157, "learning_rate": 4.4227691076430573e-05, "loss": 1.6243, "step": 2895 }, { "epoch": 2.320574162679426, "grad_norm": 1.8166050910949707, "learning_rate": 4.419767907162866e-05, "loss": 1.5999, "step": 2910 }, { "epoch": 2.3325358851674642, "grad_norm": 1.7325972318649292, "learning_rate": 4.4167667066826736e-05, "loss": 1.586, "step": 2925 }, { "epoch": 2.3444976076555024, "grad_norm": 1.8609052896499634, "learning_rate": 4.4137655062024814e-05, "loss": 1.5466, "step": 2940 }, { "epoch": 2.3564593301435406, "grad_norm": 3.3115549087524414, "learning_rate": 4.410764305722289e-05, "loss": 1.5816, "step": 2955 }, { "epoch": 2.3684210526315788, "grad_norm": 2.2015438079833984, "learning_rate": 4.407763105242097e-05, "loss": 1.5162, "step": 2970 }, { "epoch": 2.3803827751196174, "grad_norm": 1.7339051961898804, "learning_rate": 4.404761904761905e-05, "loss": 1.5764, "step": 2985 }, { "epoch": 2.3923444976076556, "grad_norm": 2.817207098007202, "learning_rate": 4.4017607042817125e-05, "loss": 1.5633, "step": 3000 }, { "epoch": 2.4043062200956937, "grad_norm": 2.063880681991577, "learning_rate": 4.39875950380152e-05, "loss": 1.604, "step": 3015 }, { "epoch": 2.416267942583732, "grad_norm": 1.8153194189071655, "learning_rate": 4.395758303321329e-05, "loss": 1.6417, "step": 3030 }, { "epoch": 2.4282296650717705, "grad_norm": 3.646466016769409, "learning_rate": 4.3927571028411365e-05, "loss": 1.6325, "step": 3045 }, { "epoch": 2.4401913875598087, "grad_norm": 1.9638229608535767, "learning_rate": 4.389755902360945e-05, "loss": 1.6393, "step": 3060 }, { "epoch": 2.452153110047847, "grad_norm": 2.549917697906494, "learning_rate": 4.386754701880753e-05, "loss": 1.6231, "step": 3075 }, { "epoch": 2.464114832535885, "grad_norm": 1.8698160648345947, "learning_rate": 4.3837535014005606e-05, "loss": 1.4995, "step": 3090 }, { "epoch": 2.4760765550239237, "grad_norm": 1.8844027519226074, "learning_rate": 4.3807523009203684e-05, "loss": 1.6133, "step": 3105 }, { "epoch": 2.488038277511962, "grad_norm": 2.275132417678833, "learning_rate": 4.377751100440176e-05, "loss": 1.6124, "step": 3120 }, { "epoch": 2.5, "grad_norm": 1.729272723197937, "learning_rate": 4.374749899959984e-05, "loss": 1.6766, "step": 3135 }, { "epoch": 2.511961722488038, "grad_norm": 1.9503229856491089, "learning_rate": 4.3717486994797924e-05, "loss": 1.6937, "step": 3150 }, { "epoch": 2.5239234449760763, "grad_norm": 1.8774380683898926, "learning_rate": 4.3687474989996e-05, "loss": 1.6159, "step": 3165 }, { "epoch": 2.535885167464115, "grad_norm": 2.066387176513672, "learning_rate": 4.365746298519408e-05, "loss": 1.6234, "step": 3180 }, { "epoch": 2.547846889952153, "grad_norm": 2.7428183555603027, "learning_rate": 4.362745098039216e-05, "loss": 1.5469, "step": 3195 }, { "epoch": 2.5598086124401913, "grad_norm": 1.9833886623382568, "learning_rate": 4.3597438975590235e-05, "loss": 1.5982, "step": 3210 }, { "epoch": 2.57177033492823, "grad_norm": 1.7080726623535156, "learning_rate": 4.356742697078831e-05, "loss": 1.5975, "step": 3225 }, { "epoch": 2.583732057416268, "grad_norm": 1.9213649034500122, "learning_rate": 4.35374149659864e-05, "loss": 1.5921, "step": 3240 }, { "epoch": 2.5956937799043063, "grad_norm": 2.0085928440093994, "learning_rate": 4.3507402961184476e-05, "loss": 1.5904, "step": 3255 }, { "epoch": 2.6076555023923444, "grad_norm": 1.903548002243042, "learning_rate": 4.347739095638256e-05, "loss": 1.5794, "step": 3270 }, { "epoch": 2.6196172248803826, "grad_norm": 1.8258320093154907, "learning_rate": 4.344737895158064e-05, "loss": 1.6408, "step": 3285 }, { "epoch": 2.6315789473684212, "grad_norm": 2.0597989559173584, "learning_rate": 4.3417366946778716e-05, "loss": 1.5868, "step": 3300 }, { "epoch": 2.6435406698564594, "grad_norm": 2.0705902576446533, "learning_rate": 4.3387354941976794e-05, "loss": 1.6906, "step": 3315 }, { "epoch": 2.6555023923444976, "grad_norm": 1.9880789518356323, "learning_rate": 4.335734293717487e-05, "loss": 1.5963, "step": 3330 }, { "epoch": 2.6674641148325358, "grad_norm": 2.0182063579559326, "learning_rate": 4.332733093237295e-05, "loss": 1.6478, "step": 3345 }, { "epoch": 2.679425837320574, "grad_norm": 1.9995989799499512, "learning_rate": 4.329731892757103e-05, "loss": 1.653, "step": 3360 }, { "epoch": 2.6913875598086126, "grad_norm": 2.738987922668457, "learning_rate": 4.3267306922769105e-05, "loss": 1.6505, "step": 3375 }, { "epoch": 2.7033492822966507, "grad_norm": 2.058044672012329, "learning_rate": 4.323729491796719e-05, "loss": 1.5528, "step": 3390 }, { "epoch": 2.715311004784689, "grad_norm": 2.0416853427886963, "learning_rate": 4.320728291316527e-05, "loss": 1.5553, "step": 3405 }, { "epoch": 2.7272727272727275, "grad_norm": 1.9002925157546997, "learning_rate": 4.3177270908363346e-05, "loss": 1.5736, "step": 3420 }, { "epoch": 2.7392344497607657, "grad_norm": 1.8847737312316895, "learning_rate": 4.314725890356143e-05, "loss": 1.6232, "step": 3435 }, { "epoch": 2.751196172248804, "grad_norm": 1.9627894163131714, "learning_rate": 4.311724689875951e-05, "loss": 1.6496, "step": 3450 }, { "epoch": 2.763157894736842, "grad_norm": 1.823258638381958, "learning_rate": 4.3087234893957586e-05, "loss": 1.584, "step": 3465 }, { "epoch": 2.77511961722488, "grad_norm": 3.361528158187866, "learning_rate": 4.3057222889155664e-05, "loss": 1.6163, "step": 3480 }, { "epoch": 2.787081339712919, "grad_norm": 2.01798677444458, "learning_rate": 4.302721088435374e-05, "loss": 1.4596, "step": 3495 }, { "epoch": 2.799043062200957, "grad_norm": 1.9381790161132812, "learning_rate": 4.2997198879551826e-05, "loss": 1.6621, "step": 3510 }, { "epoch": 2.811004784688995, "grad_norm": 2.0217368602752686, "learning_rate": 4.2967186874749904e-05, "loss": 1.6089, "step": 3525 }, { "epoch": 2.8229665071770333, "grad_norm": 1.7677721977233887, "learning_rate": 4.293717486994798e-05, "loss": 1.6052, "step": 3540 }, { "epoch": 2.8349282296650715, "grad_norm": 1.9464062452316284, "learning_rate": 4.290716286514606e-05, "loss": 1.6751, "step": 3555 }, { "epoch": 2.84688995215311, "grad_norm": 1.9557422399520874, "learning_rate": 4.287715086034414e-05, "loss": 1.5964, "step": 3570 }, { "epoch": 2.8588516746411483, "grad_norm": 3.1278235912323, "learning_rate": 4.2847138855542216e-05, "loss": 1.6272, "step": 3585 }, { "epoch": 2.8708133971291865, "grad_norm": 1.8671112060546875, "learning_rate": 4.2817126850740293e-05, "loss": 1.6573, "step": 3600 }, { "epoch": 2.882775119617225, "grad_norm": 1.9375852346420288, "learning_rate": 4.278711484593838e-05, "loss": 1.6407, "step": 3615 }, { "epoch": 2.8947368421052633, "grad_norm": 1.907958984375, "learning_rate": 4.275710284113646e-05, "loss": 1.6272, "step": 3630 }, { "epoch": 2.9066985645933014, "grad_norm": 2.1269607543945312, "learning_rate": 4.272709083633454e-05, "loss": 1.5664, "step": 3645 }, { "epoch": 2.9186602870813396, "grad_norm": 1.766072392463684, "learning_rate": 4.269707883153262e-05, "loss": 1.6766, "step": 3660 }, { "epoch": 2.930622009569378, "grad_norm": 2.157346248626709, "learning_rate": 4.2667066826730696e-05, "loss": 1.6374, "step": 3675 }, { "epoch": 2.9425837320574164, "grad_norm": 3.1585512161254883, "learning_rate": 4.2637054821928774e-05, "loss": 1.6082, "step": 3690 }, { "epoch": 2.9545454545454546, "grad_norm": 2.0836970806121826, "learning_rate": 4.260704281712685e-05, "loss": 1.6703, "step": 3705 }, { "epoch": 2.9665071770334928, "grad_norm": 1.729893445968628, "learning_rate": 4.257703081232493e-05, "loss": 1.6557, "step": 3720 }, { "epoch": 2.9784688995215314, "grad_norm": 3.384397268295288, "learning_rate": 4.254701880752301e-05, "loss": 1.643, "step": 3735 }, { "epoch": 2.990430622009569, "grad_norm": 1.8642953634262085, "learning_rate": 4.2517006802721085e-05, "loss": 1.6524, "step": 3750 }, { "epoch": 3.0023923444976077, "grad_norm": 1.9247709512710571, "learning_rate": 4.248699479791917e-05, "loss": 1.484, "step": 3765 }, { "epoch": 3.014354066985646, "grad_norm": 2.0377817153930664, "learning_rate": 4.245698279311725e-05, "loss": 1.2241, "step": 3780 }, { "epoch": 3.026315789473684, "grad_norm": 2.2331552505493164, "learning_rate": 4.2426970788315326e-05, "loss": 1.1948, "step": 3795 }, { "epoch": 3.0382775119617227, "grad_norm": 2.3499271869659424, "learning_rate": 4.239695878351341e-05, "loss": 1.2828, "step": 3810 }, { "epoch": 3.050239234449761, "grad_norm": 2.445600748062134, "learning_rate": 4.236694677871149e-05, "loss": 1.1715, "step": 3825 }, { "epoch": 3.062200956937799, "grad_norm": 2.801543951034546, "learning_rate": 4.2336934773909566e-05, "loss": 1.2167, "step": 3840 }, { "epoch": 3.074162679425837, "grad_norm": 2.515307664871216, "learning_rate": 4.2306922769107644e-05, "loss": 1.1451, "step": 3855 }, { "epoch": 3.0861244019138754, "grad_norm": 2.6123640537261963, "learning_rate": 4.227691076430572e-05, "loss": 1.256, "step": 3870 }, { "epoch": 3.098086124401914, "grad_norm": 2.602388381958008, "learning_rate": 4.2246898759503806e-05, "loss": 1.1867, "step": 3885 }, { "epoch": 3.110047846889952, "grad_norm": 2.552335739135742, "learning_rate": 4.2216886754701884e-05, "loss": 1.1845, "step": 3900 }, { "epoch": 3.1220095693779903, "grad_norm": 2.6270079612731934, "learning_rate": 4.218687474989996e-05, "loss": 1.2479, "step": 3915 }, { "epoch": 3.1339712918660285, "grad_norm": 2.490518808364868, "learning_rate": 4.215686274509804e-05, "loss": 1.2386, "step": 3930 }, { "epoch": 3.145933014354067, "grad_norm": 2.348869800567627, "learning_rate": 4.212685074029612e-05, "loss": 1.2285, "step": 3945 }, { "epoch": 3.1578947368421053, "grad_norm": 2.3546955585479736, "learning_rate": 4.2096838735494196e-05, "loss": 1.206, "step": 3960 }, { "epoch": 3.1698564593301435, "grad_norm": 2.4429666996002197, "learning_rate": 4.2066826730692274e-05, "loss": 1.335, "step": 3975 }, { "epoch": 3.1818181818181817, "grad_norm": 2.397874355316162, "learning_rate": 4.203681472589036e-05, "loss": 1.2252, "step": 3990 }, { "epoch": 3.1937799043062203, "grad_norm": 2.526556968688965, "learning_rate": 4.200680272108844e-05, "loss": 1.2811, "step": 4005 }, { "epoch": 3.2057416267942584, "grad_norm": 2.7083089351654053, "learning_rate": 4.197679071628652e-05, "loss": 1.3154, "step": 4020 }, { "epoch": 3.2177033492822966, "grad_norm": 2.426650285720825, "learning_rate": 4.19467787114846e-05, "loss": 1.2251, "step": 4035 }, { "epoch": 3.229665071770335, "grad_norm": 3.1592352390289307, "learning_rate": 4.1916766706682676e-05, "loss": 1.232, "step": 4050 }, { "epoch": 3.2416267942583734, "grad_norm": 2.4699387550354004, "learning_rate": 4.1886754701880754e-05, "loss": 1.3075, "step": 4065 }, { "epoch": 3.2535885167464116, "grad_norm": 2.410412311553955, "learning_rate": 4.185674269707883e-05, "loss": 1.2583, "step": 4080 }, { "epoch": 3.2655502392344498, "grad_norm": 2.3662848472595215, "learning_rate": 4.182673069227691e-05, "loss": 1.2718, "step": 4095 }, { "epoch": 3.277511961722488, "grad_norm": 2.241677761077881, "learning_rate": 4.179671868747499e-05, "loss": 1.2293, "step": 4110 }, { "epoch": 3.2894736842105265, "grad_norm": 2.289928674697876, "learning_rate": 4.176670668267307e-05, "loss": 1.2369, "step": 4125 }, { "epoch": 3.3014354066985647, "grad_norm": 2.9561991691589355, "learning_rate": 4.173669467787115e-05, "loss": 1.1936, "step": 4140 }, { "epoch": 3.313397129186603, "grad_norm": 2.6181890964508057, "learning_rate": 4.170668267306923e-05, "loss": 1.2791, "step": 4155 }, { "epoch": 3.325358851674641, "grad_norm": 2.208653688430786, "learning_rate": 4.1676670668267306e-05, "loss": 1.3175, "step": 4170 }, { "epoch": 3.3373205741626792, "grad_norm": 2.460291624069214, "learning_rate": 4.164665866346539e-05, "loss": 1.255, "step": 4185 }, { "epoch": 3.349282296650718, "grad_norm": 2.2541019916534424, "learning_rate": 4.161664665866347e-05, "loss": 1.2815, "step": 4200 }, { "epoch": 3.361244019138756, "grad_norm": 2.543994903564453, "learning_rate": 4.1586634653861546e-05, "loss": 1.2888, "step": 4215 }, { "epoch": 3.373205741626794, "grad_norm": 2.7568411827087402, "learning_rate": 4.1556622649059624e-05, "loss": 1.2894, "step": 4230 }, { "epoch": 3.3851674641148324, "grad_norm": 2.5805466175079346, "learning_rate": 4.152661064425771e-05, "loss": 1.3434, "step": 4245 }, { "epoch": 3.397129186602871, "grad_norm": 2.409097194671631, "learning_rate": 4.149659863945579e-05, "loss": 1.2903, "step": 4260 }, { "epoch": 3.409090909090909, "grad_norm": 4.126059532165527, "learning_rate": 4.1466586634653865e-05, "loss": 1.2764, "step": 4275 }, { "epoch": 3.4210526315789473, "grad_norm": 3.106367826461792, "learning_rate": 4.143657462985194e-05, "loss": 1.3184, "step": 4290 }, { "epoch": 3.4330143540669855, "grad_norm": 2.195138454437256, "learning_rate": 4.140656262505002e-05, "loss": 1.2636, "step": 4305 }, { "epoch": 3.444976076555024, "grad_norm": 2.7023708820343018, "learning_rate": 4.13765506202481e-05, "loss": 1.3316, "step": 4320 }, { "epoch": 3.4569377990430623, "grad_norm": 2.262626886367798, "learning_rate": 4.1346538615446176e-05, "loss": 1.2847, "step": 4335 }, { "epoch": 3.4688995215311005, "grad_norm": 2.5416321754455566, "learning_rate": 4.131652661064426e-05, "loss": 1.3254, "step": 4350 }, { "epoch": 3.4808612440191387, "grad_norm": 2.868903875350952, "learning_rate": 4.128651460584234e-05, "loss": 1.2778, "step": 4365 }, { "epoch": 3.492822966507177, "grad_norm": 2.347463607788086, "learning_rate": 4.125650260104042e-05, "loss": 1.34, "step": 4380 }, { "epoch": 3.5047846889952154, "grad_norm": 2.644416332244873, "learning_rate": 4.12264905962385e-05, "loss": 1.2862, "step": 4395 }, { "epoch": 3.5167464114832536, "grad_norm": 2.8803160190582275, "learning_rate": 4.119647859143658e-05, "loss": 1.3538, "step": 4410 }, { "epoch": 3.528708133971292, "grad_norm": 2.643848180770874, "learning_rate": 4.1166466586634657e-05, "loss": 1.3566, "step": 4425 }, { "epoch": 3.5406698564593304, "grad_norm": 2.555978298187256, "learning_rate": 4.1136454581832734e-05, "loss": 1.284, "step": 4440 }, { "epoch": 3.5526315789473686, "grad_norm": 2.4635751247406006, "learning_rate": 4.110644257703081e-05, "loss": 1.3229, "step": 4455 }, { "epoch": 3.5645933014354068, "grad_norm": 2.804314374923706, "learning_rate": 4.107643057222889e-05, "loss": 1.2931, "step": 4470 }, { "epoch": 3.576555023923445, "grad_norm": 2.5955514907836914, "learning_rate": 4.1046418567426975e-05, "loss": 1.3153, "step": 4485 }, { "epoch": 3.588516746411483, "grad_norm": 2.4464356899261475, "learning_rate": 4.101640656262505e-05, "loss": 1.2963, "step": 4500 }, { "epoch": 3.6004784688995217, "grad_norm": 2.8158469200134277, "learning_rate": 4.098639455782313e-05, "loss": 1.333, "step": 4515 }, { "epoch": 3.61244019138756, "grad_norm": 2.324192523956299, "learning_rate": 4.095638255302121e-05, "loss": 1.3438, "step": 4530 }, { "epoch": 3.624401913875598, "grad_norm": 2.5822291374206543, "learning_rate": 4.0926370548219286e-05, "loss": 1.381, "step": 4545 }, { "epoch": 3.6363636363636362, "grad_norm": 2.3783419132232666, "learning_rate": 4.089635854341737e-05, "loss": 1.321, "step": 4560 }, { "epoch": 3.6483253588516744, "grad_norm": 2.453040361404419, "learning_rate": 4.086634653861545e-05, "loss": 1.35, "step": 4575 }, { "epoch": 3.660287081339713, "grad_norm": 2.694587230682373, "learning_rate": 4.0836334533813526e-05, "loss": 1.3342, "step": 4590 }, { "epoch": 3.672248803827751, "grad_norm": 2.4545223712921143, "learning_rate": 4.080632252901161e-05, "loss": 1.4238, "step": 4605 }, { "epoch": 3.6842105263157894, "grad_norm": 2.5401089191436768, "learning_rate": 4.077631052420969e-05, "loss": 1.3699, "step": 4620 }, { "epoch": 3.696172248803828, "grad_norm": 2.4257302284240723, "learning_rate": 4.074629851940777e-05, "loss": 1.3569, "step": 4635 }, { "epoch": 3.708133971291866, "grad_norm": 2.7543747425079346, "learning_rate": 4.0716286514605845e-05, "loss": 1.2967, "step": 4650 }, { "epoch": 3.7200956937799043, "grad_norm": 2.4614686965942383, "learning_rate": 4.068627450980392e-05, "loss": 1.2982, "step": 4665 }, { "epoch": 3.7320574162679425, "grad_norm": 3.7613461017608643, "learning_rate": 4.0656262505002e-05, "loss": 1.3812, "step": 4680 }, { "epoch": 3.7440191387559807, "grad_norm": 2.60383939743042, "learning_rate": 4.062625050020008e-05, "loss": 1.3526, "step": 4695 }, { "epoch": 3.7559808612440193, "grad_norm": 2.3789987564086914, "learning_rate": 4.0596238495398156e-05, "loss": 1.3502, "step": 4710 }, { "epoch": 3.7679425837320575, "grad_norm": 2.6684768199920654, "learning_rate": 4.056622649059624e-05, "loss": 1.4723, "step": 4725 }, { "epoch": 3.7799043062200957, "grad_norm": 2.480144500732422, "learning_rate": 4.053621448579432e-05, "loss": 1.3716, "step": 4740 }, { "epoch": 3.791866028708134, "grad_norm": 2.429513454437256, "learning_rate": 4.05062024809924e-05, "loss": 1.2895, "step": 4755 }, { "epoch": 3.803827751196172, "grad_norm": 2.4947898387908936, "learning_rate": 4.047619047619048e-05, "loss": 1.4147, "step": 4770 }, { "epoch": 3.8157894736842106, "grad_norm": 2.351773500442505, "learning_rate": 4.044617847138856e-05, "loss": 1.3712, "step": 4785 }, { "epoch": 3.827751196172249, "grad_norm": 2.4937288761138916, "learning_rate": 4.041616646658664e-05, "loss": 1.3342, "step": 4800 }, { "epoch": 3.839712918660287, "grad_norm": 3.4912281036376953, "learning_rate": 4.0386154461784715e-05, "loss": 1.3403, "step": 4815 }, { "epoch": 3.8516746411483256, "grad_norm": 2.2786455154418945, "learning_rate": 4.035614245698279e-05, "loss": 1.335, "step": 4830 }, { "epoch": 3.8636363636363638, "grad_norm": 2.7752015590667725, "learning_rate": 4.032613045218088e-05, "loss": 1.3739, "step": 4845 }, { "epoch": 3.875598086124402, "grad_norm": 2.510052442550659, "learning_rate": 4.0296118447378955e-05, "loss": 1.3793, "step": 4860 }, { "epoch": 3.88755980861244, "grad_norm": 4.657649517059326, "learning_rate": 4.026610644257703e-05, "loss": 1.3914, "step": 4875 }, { "epoch": 3.8995215311004783, "grad_norm": 2.437033176422119, "learning_rate": 4.023609443777511e-05, "loss": 1.3793, "step": 4890 }, { "epoch": 3.911483253588517, "grad_norm": 2.7319986820220947, "learning_rate": 4.020608243297319e-05, "loss": 1.437, "step": 4905 }, { "epoch": 3.923444976076555, "grad_norm": 2.553680896759033, "learning_rate": 4.0176070428171266e-05, "loss": 1.3613, "step": 4920 }, { "epoch": 3.9354066985645932, "grad_norm": 2.379471778869629, "learning_rate": 4.014605842336935e-05, "loss": 1.3638, "step": 4935 }, { "epoch": 3.9473684210526314, "grad_norm": 2.8651113510131836, "learning_rate": 4.011604641856743e-05, "loss": 1.3265, "step": 4950 }, { "epoch": 3.9593301435406696, "grad_norm": 2.366116762161255, "learning_rate": 4.0086034413765513e-05, "loss": 1.2701, "step": 4965 }, { "epoch": 3.971291866028708, "grad_norm": 2.60257625579834, "learning_rate": 4.005602240896359e-05, "loss": 1.305, "step": 4980 }, { "epoch": 3.9832535885167464, "grad_norm": 2.544235944747925, "learning_rate": 4.002601040416167e-05, "loss": 1.3632, "step": 4995 }, { "epoch": 3.9952153110047846, "grad_norm": 2.541198253631592, "learning_rate": 3.999599839935975e-05, "loss": 1.4154, "step": 5010 }, { "epoch": 4.007177033492823, "grad_norm": 3.7236313819885254, "learning_rate": 3.9965986394557825e-05, "loss": 1.1803, "step": 5025 }, { "epoch": 4.019138755980861, "grad_norm": 3.206791877746582, "learning_rate": 3.99359743897559e-05, "loss": 0.9466, "step": 5040 }, { "epoch": 4.0311004784688995, "grad_norm": 2.9792520999908447, "learning_rate": 3.990596238495398e-05, "loss": 0.8937, "step": 5055 }, { "epoch": 4.043062200956938, "grad_norm": 3.3796586990356445, "learning_rate": 3.987595038015206e-05, "loss": 0.9352, "step": 5070 }, { "epoch": 4.055023923444976, "grad_norm": 2.383775472640991, "learning_rate": 3.984593837535014e-05, "loss": 0.8506, "step": 5085 }, { "epoch": 4.0669856459330145, "grad_norm": 2.6192071437835693, "learning_rate": 3.981592637054822e-05, "loss": 0.8886, "step": 5100 }, { "epoch": 4.078947368421052, "grad_norm": 3.329030990600586, "learning_rate": 3.97859143657463e-05, "loss": 0.9639, "step": 5115 }, { "epoch": 4.090909090909091, "grad_norm": 3.970484733581543, "learning_rate": 3.975590236094438e-05, "loss": 0.9112, "step": 5130 }, { "epoch": 4.1028708133971294, "grad_norm": 3.082409381866455, "learning_rate": 3.972589035614246e-05, "loss": 0.8825, "step": 5145 }, { "epoch": 4.114832535885167, "grad_norm": 2.9433696269989014, "learning_rate": 3.969587835134054e-05, "loss": 0.9384, "step": 5160 }, { "epoch": 4.126794258373206, "grad_norm": 3.1707279682159424, "learning_rate": 3.966586634653862e-05, "loss": 0.9025, "step": 5175 }, { "epoch": 4.138755980861244, "grad_norm": 3.336472988128662, "learning_rate": 3.9635854341736695e-05, "loss": 0.9228, "step": 5190 }, { "epoch": 4.150717703349282, "grad_norm": 3.4995670318603516, "learning_rate": 3.960584233693477e-05, "loss": 0.9847, "step": 5205 }, { "epoch": 4.162679425837321, "grad_norm": 3.3354713916778564, "learning_rate": 3.957583033213286e-05, "loss": 0.9717, "step": 5220 }, { "epoch": 4.1746411483253585, "grad_norm": 3.2553207874298096, "learning_rate": 3.9545818327330935e-05, "loss": 0.9973, "step": 5235 }, { "epoch": 4.186602870813397, "grad_norm": 3.007181406021118, "learning_rate": 3.951580632252901e-05, "loss": 0.919, "step": 5250 }, { "epoch": 4.198564593301436, "grad_norm": 2.7252211570739746, "learning_rate": 3.948579431772709e-05, "loss": 0.8914, "step": 5265 }, { "epoch": 4.2105263157894735, "grad_norm": 3.078258514404297, "learning_rate": 3.945578231292517e-05, "loss": 1.0353, "step": 5280 }, { "epoch": 4.222488038277512, "grad_norm": 3.0154271125793457, "learning_rate": 3.942577030812325e-05, "loss": 0.9594, "step": 5295 }, { "epoch": 4.23444976076555, "grad_norm": 3.7115094661712646, "learning_rate": 3.939575830332133e-05, "loss": 0.9248, "step": 5310 }, { "epoch": 4.246411483253588, "grad_norm": 3.135359048843384, "learning_rate": 3.936574629851941e-05, "loss": 0.9918, "step": 5325 }, { "epoch": 4.258373205741627, "grad_norm": 2.8541269302368164, "learning_rate": 3.9335734293717494e-05, "loss": 0.974, "step": 5340 }, { "epoch": 4.270334928229665, "grad_norm": 3.1880204677581787, "learning_rate": 3.930572228891557e-05, "loss": 1.0267, "step": 5355 }, { "epoch": 4.282296650717703, "grad_norm": 4.082556247711182, "learning_rate": 3.927571028411365e-05, "loss": 0.9764, "step": 5370 }, { "epoch": 4.294258373205742, "grad_norm": 3.121758460998535, "learning_rate": 3.924569827931173e-05, "loss": 1.0353, "step": 5385 }, { "epoch": 4.30622009569378, "grad_norm": 3.3821141719818115, "learning_rate": 3.9215686274509805e-05, "loss": 1.0219, "step": 5400 }, { "epoch": 4.318181818181818, "grad_norm": 3.336914300918579, "learning_rate": 3.918567426970788e-05, "loss": 1.0427, "step": 5415 }, { "epoch": 4.330143540669856, "grad_norm": 3.1878132820129395, "learning_rate": 3.915566226490596e-05, "loss": 1.0125, "step": 5430 }, { "epoch": 4.342105263157895, "grad_norm": 3.5293705463409424, "learning_rate": 3.912565026010404e-05, "loss": 0.9655, "step": 5445 }, { "epoch": 4.354066985645933, "grad_norm": 2.9817090034484863, "learning_rate": 3.909563825530212e-05, "loss": 0.9854, "step": 5460 }, { "epoch": 4.366028708133971, "grad_norm": 3.0998663902282715, "learning_rate": 3.90656262505002e-05, "loss": 0.951, "step": 5475 }, { "epoch": 4.37799043062201, "grad_norm": 3.541856050491333, "learning_rate": 3.903561424569828e-05, "loss": 1.0302, "step": 5490 }, { "epoch": 4.389952153110048, "grad_norm": 3.180595636367798, "learning_rate": 3.9005602240896364e-05, "loss": 0.9434, "step": 5505 }, { "epoch": 4.401913875598086, "grad_norm": 3.341787099838257, "learning_rate": 3.897559023609444e-05, "loss": 1.0062, "step": 5520 }, { "epoch": 4.413875598086125, "grad_norm": 3.4445912837982178, "learning_rate": 3.894557823129252e-05, "loss": 0.9558, "step": 5535 }, { "epoch": 4.425837320574162, "grad_norm": 2.839120388031006, "learning_rate": 3.89155662264906e-05, "loss": 1.0152, "step": 5550 }, { "epoch": 4.437799043062201, "grad_norm": 3.482067108154297, "learning_rate": 3.8885554221688675e-05, "loss": 1.0234, "step": 5565 }, { "epoch": 4.44976076555024, "grad_norm": 2.869065761566162, "learning_rate": 3.885554221688676e-05, "loss": 1.0045, "step": 5580 }, { "epoch": 4.461722488038277, "grad_norm": 3.366964101791382, "learning_rate": 3.882553021208484e-05, "loss": 1.0086, "step": 5595 }, { "epoch": 4.473684210526316, "grad_norm": 3.8538451194763184, "learning_rate": 3.8795518207282915e-05, "loss": 1.0727, "step": 5610 }, { "epoch": 4.485645933014354, "grad_norm": 3.1612632274627686, "learning_rate": 3.876550620248099e-05, "loss": 1.1, "step": 5625 }, { "epoch": 4.497607655502392, "grad_norm": 3.4518115520477295, "learning_rate": 3.873549419767907e-05, "loss": 0.9788, "step": 5640 }, { "epoch": 4.509569377990431, "grad_norm": 2.8597676753997803, "learning_rate": 3.870548219287715e-05, "loss": 1.0111, "step": 5655 }, { "epoch": 4.521531100478469, "grad_norm": 3.2637124061584473, "learning_rate": 3.8675470188075233e-05, "loss": 0.9647, "step": 5670 }, { "epoch": 4.533492822966507, "grad_norm": 3.176473379135132, "learning_rate": 3.864545818327331e-05, "loss": 1.0303, "step": 5685 }, { "epoch": 4.545454545454545, "grad_norm": 3.1555211544036865, "learning_rate": 3.8615446178471396e-05, "loss": 0.9983, "step": 5700 }, { "epoch": 4.557416267942584, "grad_norm": 3.690917730331421, "learning_rate": 3.8585434173669474e-05, "loss": 1.0843, "step": 5715 }, { "epoch": 4.569377990430622, "grad_norm": 3.4356346130371094, "learning_rate": 3.855542216886755e-05, "loss": 1.0957, "step": 5730 }, { "epoch": 4.58133971291866, "grad_norm": 3.0207927227020264, "learning_rate": 3.852541016406563e-05, "loss": 0.9877, "step": 5745 }, { "epoch": 4.5933014354066986, "grad_norm": 3.256007194519043, "learning_rate": 3.849539815926371e-05, "loss": 0.9934, "step": 5760 }, { "epoch": 4.605263157894737, "grad_norm": 4.417782783508301, "learning_rate": 3.8465386154461785e-05, "loss": 1.0612, "step": 5775 }, { "epoch": 4.617224880382775, "grad_norm": 2.802917242050171, "learning_rate": 3.843537414965986e-05, "loss": 1.0714, "step": 5790 }, { "epoch": 4.6291866028708135, "grad_norm": 2.9113950729370117, "learning_rate": 3.840536214485794e-05, "loss": 1.0637, "step": 5805 }, { "epoch": 4.641148325358852, "grad_norm": 3.0320019721984863, "learning_rate": 3.8375350140056026e-05, "loss": 1.0407, "step": 5820 }, { "epoch": 4.65311004784689, "grad_norm": 2.9705982208251953, "learning_rate": 3.83453381352541e-05, "loss": 1.118, "step": 5835 }, { "epoch": 4.6650717703349285, "grad_norm": 3.1082069873809814, "learning_rate": 3.831532613045218e-05, "loss": 1.1102, "step": 5850 }, { "epoch": 4.677033492822966, "grad_norm": 3.2098066806793213, "learning_rate": 3.828531412565026e-05, "loss": 1.1063, "step": 5865 }, { "epoch": 4.688995215311005, "grad_norm": 3.18621826171875, "learning_rate": 3.8255302120848344e-05, "loss": 1.0772, "step": 5880 }, { "epoch": 4.7009569377990434, "grad_norm": 3.3197460174560547, "learning_rate": 3.822529011604642e-05, "loss": 1.0054, "step": 5895 }, { "epoch": 4.712918660287081, "grad_norm": 2.8657805919647217, "learning_rate": 3.81952781112445e-05, "loss": 1.073, "step": 5910 }, { "epoch": 4.72488038277512, "grad_norm": 2.897557497024536, "learning_rate": 3.816526610644258e-05, "loss": 1.0991, "step": 5925 }, { "epoch": 4.7368421052631575, "grad_norm": 2.881815195083618, "learning_rate": 3.813525410164066e-05, "loss": 1.1037, "step": 5940 }, { "epoch": 4.748803827751196, "grad_norm": 3.131378412246704, "learning_rate": 3.810524209683874e-05, "loss": 1.149, "step": 5955 }, { "epoch": 4.760765550239235, "grad_norm": 3.3418426513671875, "learning_rate": 3.807523009203682e-05, "loss": 1.0799, "step": 5970 }, { "epoch": 4.7727272727272725, "grad_norm": 2.759793519973755, "learning_rate": 3.8045218087234895e-05, "loss": 1.1026, "step": 5985 }, { "epoch": 4.784688995215311, "grad_norm": 3.082688808441162, "learning_rate": 3.801520608243297e-05, "loss": 1.0911, "step": 6000 }, { "epoch": 4.796650717703349, "grad_norm": 3.788597583770752, "learning_rate": 3.798519407763105e-05, "loss": 1.1133, "step": 6015 }, { "epoch": 4.8086124401913874, "grad_norm": 3.0609753131866455, "learning_rate": 3.795518207282913e-05, "loss": 1.0023, "step": 6030 }, { "epoch": 4.820574162679426, "grad_norm": 3.5260090827941895, "learning_rate": 3.7925170068027214e-05, "loss": 1.105, "step": 6045 }, { "epoch": 4.832535885167464, "grad_norm": 3.1473610401153564, "learning_rate": 3.789515806322529e-05, "loss": 1.1896, "step": 6060 }, { "epoch": 4.844497607655502, "grad_norm": 3.2314066886901855, "learning_rate": 3.7865146058423376e-05, "loss": 1.1403, "step": 6075 }, { "epoch": 4.856459330143541, "grad_norm": 3.1266963481903076, "learning_rate": 3.7835134053621454e-05, "loss": 1.123, "step": 6090 }, { "epoch": 4.868421052631579, "grad_norm": 3.1995601654052734, "learning_rate": 3.780512204881953e-05, "loss": 1.1833, "step": 6105 }, { "epoch": 4.880382775119617, "grad_norm": 3.251296043395996, "learning_rate": 3.777511004401761e-05, "loss": 1.1502, "step": 6120 }, { "epoch": 4.892344497607656, "grad_norm": 3.1420419216156006, "learning_rate": 3.774509803921569e-05, "loss": 1.1207, "step": 6135 }, { "epoch": 4.904306220095694, "grad_norm": 2.992222785949707, "learning_rate": 3.7715086034413765e-05, "loss": 1.1347, "step": 6150 }, { "epoch": 4.916267942583732, "grad_norm": 3.03808856010437, "learning_rate": 3.768507402961184e-05, "loss": 1.131, "step": 6165 }, { "epoch": 4.92822966507177, "grad_norm": 3.9193668365478516, "learning_rate": 3.765506202480993e-05, "loss": 1.0749, "step": 6180 }, { "epoch": 4.940191387559809, "grad_norm": 3.3145644664764404, "learning_rate": 3.7625050020008006e-05, "loss": 1.0406, "step": 6195 }, { "epoch": 4.952153110047847, "grad_norm": 3.134812116622925, "learning_rate": 3.7595038015206084e-05, "loss": 1.1243, "step": 6210 }, { "epoch": 4.964114832535885, "grad_norm": 3.403087854385376, "learning_rate": 3.756502601040416e-05, "loss": 1.0429, "step": 6225 }, { "epoch": 4.976076555023924, "grad_norm": 3.0964858531951904, "learning_rate": 3.753501400560224e-05, "loss": 1.112, "step": 6240 }, { "epoch": 4.988038277511961, "grad_norm": 4.416729927062988, "learning_rate": 3.7505002000800324e-05, "loss": 1.1144, "step": 6255 }, { "epoch": 5.0, "grad_norm": 4.442926406860352, "learning_rate": 3.74749899959984e-05, "loss": 1.0938, "step": 6270 }, { "epoch": 5.011961722488039, "grad_norm": 3.0728983879089355, "learning_rate": 3.744497799119648e-05, "loss": 0.6816, "step": 6285 }, { "epoch": 5.023923444976076, "grad_norm": 3.4252402782440186, "learning_rate": 3.7414965986394564e-05, "loss": 0.7263, "step": 6300 }, { "epoch": 5.035885167464115, "grad_norm": 4.501566410064697, "learning_rate": 3.738495398159264e-05, "loss": 0.6744, "step": 6315 }, { "epoch": 5.047846889952153, "grad_norm": 3.8966481685638428, "learning_rate": 3.735494197679072e-05, "loss": 0.645, "step": 6330 }, { "epoch": 5.059808612440191, "grad_norm": 3.794740915298462, "learning_rate": 3.73249299719888e-05, "loss": 0.6894, "step": 6345 }, { "epoch": 5.07177033492823, "grad_norm": 3.1294026374816895, "learning_rate": 3.7294917967186876e-05, "loss": 0.6101, "step": 6360 }, { "epoch": 5.083732057416268, "grad_norm": 3.1900405883789062, "learning_rate": 3.7264905962384953e-05, "loss": 0.6731, "step": 6375 }, { "epoch": 5.095693779904306, "grad_norm": 3.9348907470703125, "learning_rate": 3.723489395758303e-05, "loss": 0.7257, "step": 6390 }, { "epoch": 5.107655502392345, "grad_norm": 3.5655553340911865, "learning_rate": 3.720488195278111e-05, "loss": 0.6219, "step": 6405 }, { "epoch": 5.119617224880383, "grad_norm": 3.678565740585327, "learning_rate": 3.7174869947979194e-05, "loss": 0.6896, "step": 6420 }, { "epoch": 5.131578947368421, "grad_norm": 3.041287422180176, "learning_rate": 3.714485794317727e-05, "loss": 0.7084, "step": 6435 }, { "epoch": 5.143540669856459, "grad_norm": 3.382601737976074, "learning_rate": 3.7114845938375356e-05, "loss": 0.6298, "step": 6450 }, { "epoch": 5.155502392344498, "grad_norm": 3.4510035514831543, "learning_rate": 3.7084833933573434e-05, "loss": 0.6882, "step": 6465 }, { "epoch": 5.167464114832536, "grad_norm": 4.204371929168701, "learning_rate": 3.705482192877151e-05, "loss": 0.7478, "step": 6480 }, { "epoch": 5.179425837320574, "grad_norm": 3.669754981994629, "learning_rate": 3.702480992396959e-05, "loss": 0.7159, "step": 6495 }, { "epoch": 5.1913875598086126, "grad_norm": 3.454606056213379, "learning_rate": 3.699479791916767e-05, "loss": 0.7049, "step": 6510 }, { "epoch": 5.203349282296651, "grad_norm": 3.548112154006958, "learning_rate": 3.6964785914365746e-05, "loss": 0.7279, "step": 6525 }, { "epoch": 5.215311004784689, "grad_norm": 4.184609413146973, "learning_rate": 3.693477390956382e-05, "loss": 0.7747, "step": 6540 }, { "epoch": 5.2272727272727275, "grad_norm": 3.418808937072754, "learning_rate": 3.690476190476191e-05, "loss": 0.7833, "step": 6555 }, { "epoch": 5.239234449760765, "grad_norm": 3.444638729095459, "learning_rate": 3.6874749899959986e-05, "loss": 0.81, "step": 6570 }, { "epoch": 5.251196172248804, "grad_norm": 3.960958242416382, "learning_rate": 3.6844737895158064e-05, "loss": 0.6915, "step": 6585 }, { "epoch": 5.2631578947368425, "grad_norm": 3.772879123687744, "learning_rate": 3.681472589035614e-05, "loss": 0.7157, "step": 6600 }, { "epoch": 5.27511961722488, "grad_norm": 4.02428674697876, "learning_rate": 3.6784713885554226e-05, "loss": 0.7383, "step": 6615 }, { "epoch": 5.287081339712919, "grad_norm": 3.4093050956726074, "learning_rate": 3.6754701880752304e-05, "loss": 0.7163, "step": 6630 }, { "epoch": 5.2990430622009566, "grad_norm": 3.6924562454223633, "learning_rate": 3.672468987595038e-05, "loss": 0.7022, "step": 6645 }, { "epoch": 5.311004784688995, "grad_norm": 3.356632947921753, "learning_rate": 3.669467787114846e-05, "loss": 0.737, "step": 6660 }, { "epoch": 5.322966507177034, "grad_norm": 3.501210927963257, "learning_rate": 3.6664665866346544e-05, "loss": 0.7474, "step": 6675 }, { "epoch": 5.3349282296650715, "grad_norm": 3.852551221847534, "learning_rate": 3.663465386154462e-05, "loss": 0.779, "step": 6690 }, { "epoch": 5.34688995215311, "grad_norm": 3.4461312294006348, "learning_rate": 3.66046418567427e-05, "loss": 0.6816, "step": 6705 }, { "epoch": 5.358851674641148, "grad_norm": 2.9088375568389893, "learning_rate": 3.657462985194078e-05, "loss": 0.7619, "step": 6720 }, { "epoch": 5.3708133971291865, "grad_norm": 3.4227547645568848, "learning_rate": 3.6544617847138856e-05, "loss": 0.7646, "step": 6735 }, { "epoch": 5.382775119617225, "grad_norm": 4.553009986877441, "learning_rate": 3.6514605842336934e-05, "loss": 0.7907, "step": 6750 }, { "epoch": 5.394736842105263, "grad_norm": 3.965406656265259, "learning_rate": 3.648459383753501e-05, "loss": 0.7901, "step": 6765 }, { "epoch": 5.4066985645933014, "grad_norm": 3.7064077854156494, "learning_rate": 3.645458183273309e-05, "loss": 0.758, "step": 6780 }, { "epoch": 5.41866028708134, "grad_norm": 3.4479455947875977, "learning_rate": 3.6424569827931174e-05, "loss": 0.7439, "step": 6795 }, { "epoch": 5.430622009569378, "grad_norm": 3.9599294662475586, "learning_rate": 3.639455782312925e-05, "loss": 0.8257, "step": 6810 }, { "epoch": 5.442583732057416, "grad_norm": 3.7063801288604736, "learning_rate": 3.6364545818327336e-05, "loss": 0.7717, "step": 6825 }, { "epoch": 5.454545454545454, "grad_norm": 4.6955060958862305, "learning_rate": 3.6334533813525414e-05, "loss": 0.7575, "step": 6840 }, { "epoch": 5.466507177033493, "grad_norm": 3.915292501449585, "learning_rate": 3.630452180872349e-05, "loss": 0.7989, "step": 6855 }, { "epoch": 5.478468899521531, "grad_norm": 3.974541664123535, "learning_rate": 3.627450980392157e-05, "loss": 0.8685, "step": 6870 }, { "epoch": 5.490430622009569, "grad_norm": 3.9493520259857178, "learning_rate": 3.624449779911965e-05, "loss": 0.8111, "step": 6885 }, { "epoch": 5.502392344497608, "grad_norm": 3.7138257026672363, "learning_rate": 3.6214485794317726e-05, "loss": 0.8086, "step": 6900 }, { "epoch": 5.514354066985646, "grad_norm": 3.838562250137329, "learning_rate": 3.618447378951581e-05, "loss": 0.8, "step": 6915 }, { "epoch": 5.526315789473684, "grad_norm": 3.5369865894317627, "learning_rate": 3.615446178471389e-05, "loss": 0.7449, "step": 6930 }, { "epoch": 5.538277511961723, "grad_norm": 3.607936382293701, "learning_rate": 3.6124449779911966e-05, "loss": 0.7974, "step": 6945 }, { "epoch": 5.55023923444976, "grad_norm": 4.021537780761719, "learning_rate": 3.6094437775110044e-05, "loss": 0.6972, "step": 6960 }, { "epoch": 5.562200956937799, "grad_norm": 4.086754322052002, "learning_rate": 3.606442577030812e-05, "loss": 0.8349, "step": 6975 }, { "epoch": 5.574162679425838, "grad_norm": 3.385819673538208, "learning_rate": 3.6034413765506206e-05, "loss": 0.8016, "step": 6990 }, { "epoch": 5.586124401913875, "grad_norm": 3.3851637840270996, "learning_rate": 3.6004401760704284e-05, "loss": 0.8013, "step": 7005 }, { "epoch": 5.598086124401914, "grad_norm": 3.6127657890319824, "learning_rate": 3.597438975590236e-05, "loss": 0.889, "step": 7020 }, { "epoch": 5.610047846889952, "grad_norm": 3.7455716133117676, "learning_rate": 3.594437775110045e-05, "loss": 0.8244, "step": 7035 }, { "epoch": 5.62200956937799, "grad_norm": 3.5797011852264404, "learning_rate": 3.5914365746298525e-05, "loss": 0.8794, "step": 7050 }, { "epoch": 5.633971291866029, "grad_norm": 3.6951963901519775, "learning_rate": 3.58843537414966e-05, "loss": 0.8377, "step": 7065 }, { "epoch": 5.645933014354067, "grad_norm": 4.805546283721924, "learning_rate": 3.585434173669468e-05, "loss": 0.7658, "step": 7080 }, { "epoch": 5.657894736842105, "grad_norm": 3.3476104736328125, "learning_rate": 3.582432973189276e-05, "loss": 0.8535, "step": 7095 }, { "epoch": 5.669856459330144, "grad_norm": 3.7429189682006836, "learning_rate": 3.5794317727090836e-05, "loss": 0.7698, "step": 7110 }, { "epoch": 5.681818181818182, "grad_norm": 3.6189913749694824, "learning_rate": 3.5764305722288914e-05, "loss": 0.8843, "step": 7125 }, { "epoch": 5.69377990430622, "grad_norm": 3.614164113998413, "learning_rate": 3.573429371748699e-05, "loss": 0.7855, "step": 7140 }, { "epoch": 5.705741626794258, "grad_norm": 3.9962081909179688, "learning_rate": 3.5704281712685076e-05, "loss": 0.8501, "step": 7155 }, { "epoch": 5.717703349282297, "grad_norm": 3.6668338775634766, "learning_rate": 3.5674269707883154e-05, "loss": 0.7866, "step": 7170 }, { "epoch": 5.729665071770335, "grad_norm": 3.9314942359924316, "learning_rate": 3.564425770308123e-05, "loss": 0.8003, "step": 7185 }, { "epoch": 5.741626794258373, "grad_norm": 4.32262659072876, "learning_rate": 3.5614245698279317e-05, "loss": 0.8137, "step": 7200 }, { "epoch": 5.753588516746412, "grad_norm": 5.040790557861328, "learning_rate": 3.5584233693477394e-05, "loss": 0.8354, "step": 7215 }, { "epoch": 5.76555023923445, "grad_norm": 3.7755401134490967, "learning_rate": 3.555422168867547e-05, "loss": 0.8574, "step": 7230 }, { "epoch": 5.777511961722488, "grad_norm": 3.8143343925476074, "learning_rate": 3.552420968387355e-05, "loss": 0.8091, "step": 7245 }, { "epoch": 5.7894736842105265, "grad_norm": 3.4861605167388916, "learning_rate": 3.549419767907163e-05, "loss": 0.8304, "step": 7260 }, { "epoch": 5.801435406698564, "grad_norm": 3.5389742851257324, "learning_rate": 3.546418567426971e-05, "loss": 0.8676, "step": 7275 }, { "epoch": 5.813397129186603, "grad_norm": 3.465071439743042, "learning_rate": 3.543417366946779e-05, "loss": 0.8296, "step": 7290 }, { "epoch": 5.8253588516746415, "grad_norm": 3.9034931659698486, "learning_rate": 3.540416166466587e-05, "loss": 0.8398, "step": 7305 }, { "epoch": 5.837320574162679, "grad_norm": 3.817934989929199, "learning_rate": 3.5374149659863946e-05, "loss": 0.8602, "step": 7320 }, { "epoch": 5.849282296650718, "grad_norm": 4.706762790679932, "learning_rate": 3.5344137655062024e-05, "loss": 0.8684, "step": 7335 }, { "epoch": 5.861244019138756, "grad_norm": 3.3008809089660645, "learning_rate": 3.53141256502601e-05, "loss": 0.8182, "step": 7350 }, { "epoch": 5.873205741626794, "grad_norm": 3.5898377895355225, "learning_rate": 3.5284113645458186e-05, "loss": 0.8512, "step": 7365 }, { "epoch": 5.885167464114833, "grad_norm": 3.8670029640197754, "learning_rate": 3.5254101640656264e-05, "loss": 0.8412, "step": 7380 }, { "epoch": 5.8971291866028706, "grad_norm": 3.6071064472198486, "learning_rate": 3.522408963585435e-05, "loss": 0.8578, "step": 7395 }, { "epoch": 5.909090909090909, "grad_norm": 4.674183368682861, "learning_rate": 3.519407763105243e-05, "loss": 0.8554, "step": 7410 }, { "epoch": 5.921052631578947, "grad_norm": 3.45503306388855, "learning_rate": 3.5164065626250505e-05, "loss": 0.9224, "step": 7425 }, { "epoch": 5.9330143540669855, "grad_norm": 3.4863317012786865, "learning_rate": 3.513405362144858e-05, "loss": 0.8177, "step": 7440 }, { "epoch": 5.944976076555024, "grad_norm": 3.9804773330688477, "learning_rate": 3.510404161664666e-05, "loss": 0.8379, "step": 7455 }, { "epoch": 5.956937799043062, "grad_norm": 3.6782078742980957, "learning_rate": 3.507402961184474e-05, "loss": 0.8634, "step": 7470 }, { "epoch": 5.9688995215311005, "grad_norm": 3.7234580516815186, "learning_rate": 3.5044017607042816e-05, "loss": 0.9142, "step": 7485 }, { "epoch": 5.980861244019139, "grad_norm": 3.6034648418426514, "learning_rate": 3.5014005602240894e-05, "loss": 0.8777, "step": 7500 }, { "epoch": 5.992822966507177, "grad_norm": 3.407047748565674, "learning_rate": 3.498399359743898e-05, "loss": 0.8191, "step": 7515 }, { "epoch": 6.0047846889952154, "grad_norm": 4.2239508628845215, "learning_rate": 3.4953981592637056e-05, "loss": 0.7896, "step": 7530 }, { "epoch": 6.016746411483253, "grad_norm": 2.516592502593994, "learning_rate": 3.4923969587835134e-05, "loss": 0.5012, "step": 7545 }, { "epoch": 6.028708133971292, "grad_norm": 3.366042375564575, "learning_rate": 3.489395758303321e-05, "loss": 0.4626, "step": 7560 }, { "epoch": 6.04066985645933, "grad_norm": 4.176771640777588, "learning_rate": 3.48639455782313e-05, "loss": 0.4813, "step": 7575 }, { "epoch": 6.052631578947368, "grad_norm": 3.807236671447754, "learning_rate": 3.4833933573429375e-05, "loss": 0.4928, "step": 7590 }, { "epoch": 6.064593301435407, "grad_norm": 3.5176925659179688, "learning_rate": 3.480392156862745e-05, "loss": 0.4474, "step": 7605 }, { "epoch": 6.076555023923445, "grad_norm": 3.860903739929199, "learning_rate": 3.477390956382553e-05, "loss": 0.5181, "step": 7620 }, { "epoch": 6.088516746411483, "grad_norm": 3.883094072341919, "learning_rate": 3.4743897559023615e-05, "loss": 0.497, "step": 7635 }, { "epoch": 6.100478468899522, "grad_norm": 3.299124240875244, "learning_rate": 3.471388555422169e-05, "loss": 0.5023, "step": 7650 }, { "epoch": 6.1124401913875595, "grad_norm": 3.780906915664673, "learning_rate": 3.468387354941977e-05, "loss": 0.4938, "step": 7665 }, { "epoch": 6.124401913875598, "grad_norm": 3.906473159790039, "learning_rate": 3.465386154461785e-05, "loss": 0.52, "step": 7680 }, { "epoch": 6.136363636363637, "grad_norm": 3.7031853199005127, "learning_rate": 3.4623849539815926e-05, "loss": 0.4922, "step": 7695 }, { "epoch": 6.148325358851674, "grad_norm": 4.119719505310059, "learning_rate": 3.4593837535014004e-05, "loss": 0.4726, "step": 7710 }, { "epoch": 6.160287081339713, "grad_norm": 3.637122869491577, "learning_rate": 3.456382553021208e-05, "loss": 0.4522, "step": 7725 }, { "epoch": 6.172248803827751, "grad_norm": 3.6455516815185547, "learning_rate": 3.453381352541017e-05, "loss": 0.497, "step": 7740 }, { "epoch": 6.184210526315789, "grad_norm": 3.90136981010437, "learning_rate": 3.4503801520608245e-05, "loss": 0.5286, "step": 7755 }, { "epoch": 6.196172248803828, "grad_norm": 3.776540994644165, "learning_rate": 3.447378951580633e-05, "loss": 0.5407, "step": 7770 }, { "epoch": 6.208133971291866, "grad_norm": 4.160264015197754, "learning_rate": 3.444377751100441e-05, "loss": 0.4874, "step": 7785 }, { "epoch": 6.220095693779904, "grad_norm": 3.5366413593292236, "learning_rate": 3.4413765506202485e-05, "loss": 0.4708, "step": 7800 }, { "epoch": 6.232057416267943, "grad_norm": 3.604766368865967, "learning_rate": 3.438375350140056e-05, "loss": 0.5326, "step": 7815 }, { "epoch": 6.244019138755981, "grad_norm": 3.5916519165039062, "learning_rate": 3.435374149659864e-05, "loss": 0.5411, "step": 7830 }, { "epoch": 6.255980861244019, "grad_norm": 3.626094102859497, "learning_rate": 3.432372949179672e-05, "loss": 0.5142, "step": 7845 }, { "epoch": 6.267942583732057, "grad_norm": 4.346883296966553, "learning_rate": 3.4293717486994796e-05, "loss": 0.5135, "step": 7860 }, { "epoch": 6.279904306220096, "grad_norm": 4.123327732086182, "learning_rate": 3.426370548219288e-05, "loss": 0.5403, "step": 7875 }, { "epoch": 6.291866028708134, "grad_norm": 4.1574482917785645, "learning_rate": 3.423369347739096e-05, "loss": 0.5373, "step": 7890 }, { "epoch": 6.303827751196172, "grad_norm": 3.9462273120880127, "learning_rate": 3.4203681472589037e-05, "loss": 0.5223, "step": 7905 }, { "epoch": 6.315789473684211, "grad_norm": 4.356924533843994, "learning_rate": 3.4173669467787114e-05, "loss": 0.5857, "step": 7920 }, { "epoch": 6.327751196172249, "grad_norm": 3.8217930793762207, "learning_rate": 3.41436574629852e-05, "loss": 0.5272, "step": 7935 }, { "epoch": 6.339712918660287, "grad_norm": 3.689328908920288, "learning_rate": 3.411364545818328e-05, "loss": 0.5162, "step": 7950 }, { "epoch": 6.351674641148326, "grad_norm": 3.6850223541259766, "learning_rate": 3.4083633453381355e-05, "loss": 0.582, "step": 7965 }, { "epoch": 6.363636363636363, "grad_norm": 4.063047885894775, "learning_rate": 3.405362144857943e-05, "loss": 0.5642, "step": 7980 }, { "epoch": 6.375598086124402, "grad_norm": 3.6065573692321777, "learning_rate": 3.402360944377751e-05, "loss": 0.5225, "step": 7995 }, { "epoch": 6.3875598086124405, "grad_norm": 4.188450336456299, "learning_rate": 3.3993597438975595e-05, "loss": 0.5911, "step": 8010 }, { "epoch": 6.399521531100478, "grad_norm": 3.9791886806488037, "learning_rate": 3.396358543417367e-05, "loss": 0.5178, "step": 8025 }, { "epoch": 6.411483253588517, "grad_norm": 4.381253719329834, "learning_rate": 3.393357342937175e-05, "loss": 0.5344, "step": 8040 }, { "epoch": 6.423444976076555, "grad_norm": 3.810927152633667, "learning_rate": 3.390356142456983e-05, "loss": 0.4915, "step": 8055 }, { "epoch": 6.435406698564593, "grad_norm": 4.254152774810791, "learning_rate": 3.3873549419767907e-05, "loss": 0.601, "step": 8070 }, { "epoch": 6.447368421052632, "grad_norm": 4.086537837982178, "learning_rate": 3.3843537414965984e-05, "loss": 0.5944, "step": 8085 }, { "epoch": 6.45933014354067, "grad_norm": 4.881983280181885, "learning_rate": 3.381352541016406e-05, "loss": 0.5789, "step": 8100 }, { "epoch": 6.471291866028708, "grad_norm": 4.15606689453125, "learning_rate": 3.378351340536215e-05, "loss": 0.5397, "step": 8115 }, { "epoch": 6.483253588516747, "grad_norm": 3.6769986152648926, "learning_rate": 3.3753501400560225e-05, "loss": 0.5449, "step": 8130 }, { "epoch": 6.4952153110047846, "grad_norm": 3.846041440963745, "learning_rate": 3.372348939575831e-05, "loss": 0.5555, "step": 8145 }, { "epoch": 6.507177033492823, "grad_norm": 4.353069305419922, "learning_rate": 3.369347739095639e-05, "loss": 0.608, "step": 8160 }, { "epoch": 6.519138755980861, "grad_norm": 4.087284564971924, "learning_rate": 3.3663465386154465e-05, "loss": 0.5741, "step": 8175 }, { "epoch": 6.5311004784688995, "grad_norm": 4.356995582580566, "learning_rate": 3.363345338135254e-05, "loss": 0.6432, "step": 8190 }, { "epoch": 6.543062200956938, "grad_norm": 3.855937957763672, "learning_rate": 3.360344137655062e-05, "loss": 0.5783, "step": 8205 }, { "epoch": 6.555023923444976, "grad_norm": 3.820133686065674, "learning_rate": 3.35734293717487e-05, "loss": 0.5814, "step": 8220 }, { "epoch": 6.5669856459330145, "grad_norm": 4.873568058013916, "learning_rate": 3.3543417366946776e-05, "loss": 0.6264, "step": 8235 }, { "epoch": 6.578947368421053, "grad_norm": 3.8670310974121094, "learning_rate": 3.351340536214486e-05, "loss": 0.6271, "step": 8250 }, { "epoch": 6.590909090909091, "grad_norm": 4.838265895843506, "learning_rate": 3.348339335734294e-05, "loss": 0.6346, "step": 8265 }, { "epoch": 6.6028708133971294, "grad_norm": 4.0044403076171875, "learning_rate": 3.345338135254102e-05, "loss": 0.5724, "step": 8280 }, { "epoch": 6.614832535885167, "grad_norm": 3.866497039794922, "learning_rate": 3.3423369347739095e-05, "loss": 0.6172, "step": 8295 }, { "epoch": 6.626794258373206, "grad_norm": 4.213998317718506, "learning_rate": 3.339335734293718e-05, "loss": 0.6246, "step": 8310 }, { "epoch": 6.638755980861244, "grad_norm": 4.162674427032471, "learning_rate": 3.336334533813526e-05, "loss": 0.6301, "step": 8325 }, { "epoch": 6.650717703349282, "grad_norm": 4.032559394836426, "learning_rate": 3.3333333333333335e-05, "loss": 0.6557, "step": 8340 }, { "epoch": 6.662679425837321, "grad_norm": 4.416426658630371, "learning_rate": 3.330332132853141e-05, "loss": 0.6592, "step": 8355 }, { "epoch": 6.6746411483253585, "grad_norm": 4.758429527282715, "learning_rate": 3.32733093237295e-05, "loss": 0.6753, "step": 8370 }, { "epoch": 6.686602870813397, "grad_norm": 4.513240337371826, "learning_rate": 3.3243297318927575e-05, "loss": 0.5713, "step": 8385 }, { "epoch": 6.698564593301436, "grad_norm": 4.007817268371582, "learning_rate": 3.321328531412565e-05, "loss": 0.596, "step": 8400 }, { "epoch": 6.7105263157894735, "grad_norm": 4.17065954208374, "learning_rate": 3.318327330932373e-05, "loss": 0.5975, "step": 8415 }, { "epoch": 6.722488038277512, "grad_norm": 3.68249773979187, "learning_rate": 3.315326130452181e-05, "loss": 0.563, "step": 8430 }, { "epoch": 6.73444976076555, "grad_norm": 4.292535781860352, "learning_rate": 3.312324929971989e-05, "loss": 0.6413, "step": 8445 }, { "epoch": 6.746411483253588, "grad_norm": 4.380221843719482, "learning_rate": 3.3093237294917965e-05, "loss": 0.5637, "step": 8460 }, { "epoch": 6.758373205741627, "grad_norm": 3.799266815185547, "learning_rate": 3.306322529011604e-05, "loss": 0.6653, "step": 8475 }, { "epoch": 6.770334928229665, "grad_norm": 4.119513034820557, "learning_rate": 3.303321328531413e-05, "loss": 0.6436, "step": 8490 }, { "epoch": 6.782296650717703, "grad_norm": 4.17624044418335, "learning_rate": 3.3003201280512205e-05, "loss": 0.6778, "step": 8505 }, { "epoch": 6.794258373205742, "grad_norm": 4.3085761070251465, "learning_rate": 3.297318927571029e-05, "loss": 0.6298, "step": 8520 }, { "epoch": 6.80622009569378, "grad_norm": 3.8202457427978516, "learning_rate": 3.294317727090837e-05, "loss": 0.5924, "step": 8535 }, { "epoch": 6.818181818181818, "grad_norm": 4.103767395019531, "learning_rate": 3.2913165266106445e-05, "loss": 0.5925, "step": 8550 }, { "epoch": 6.830143540669856, "grad_norm": 4.139376640319824, "learning_rate": 3.288315326130452e-05, "loss": 0.6656, "step": 8565 }, { "epoch": 6.842105263157895, "grad_norm": 4.039120674133301, "learning_rate": 3.28531412565026e-05, "loss": 0.6807, "step": 8580 }, { "epoch": 6.854066985645933, "grad_norm": 4.153085708618164, "learning_rate": 3.282312925170068e-05, "loss": 0.6194, "step": 8595 }, { "epoch": 6.866028708133971, "grad_norm": 4.125678539276123, "learning_rate": 3.279311724689876e-05, "loss": 0.6333, "step": 8610 }, { "epoch": 6.87799043062201, "grad_norm": 4.25078821182251, "learning_rate": 3.276310524209684e-05, "loss": 0.6895, "step": 8625 }, { "epoch": 6.889952153110048, "grad_norm": 3.782094955444336, "learning_rate": 3.273309323729492e-05, "loss": 0.6789, "step": 8640 }, { "epoch": 6.901913875598086, "grad_norm": 4.739928245544434, "learning_rate": 3.2703081232493e-05, "loss": 0.6427, "step": 8655 }, { "epoch": 6.913875598086125, "grad_norm": 4.479592800140381, "learning_rate": 3.2673069227691075e-05, "loss": 0.6309, "step": 8670 }, { "epoch": 6.925837320574162, "grad_norm": 4.018124580383301, "learning_rate": 3.264305722288916e-05, "loss": 0.6828, "step": 8685 }, { "epoch": 6.937799043062201, "grad_norm": 3.8505430221557617, "learning_rate": 3.261304521808724e-05, "loss": 0.6361, "step": 8700 }, { "epoch": 6.94976076555024, "grad_norm": 3.596605062484741, "learning_rate": 3.2583033213285315e-05, "loss": 0.6297, "step": 8715 }, { "epoch": 6.961722488038277, "grad_norm": 4.318160533905029, "learning_rate": 3.25530212084834e-05, "loss": 0.6403, "step": 8730 }, { "epoch": 6.973684210526316, "grad_norm": 4.0697431564331055, "learning_rate": 3.252300920368148e-05, "loss": 0.6154, "step": 8745 }, { "epoch": 6.985645933014354, "grad_norm": 4.358625411987305, "learning_rate": 3.2492997198879555e-05, "loss": 0.6782, "step": 8760 }, { "epoch": 6.997607655502392, "grad_norm": 4.264054298400879, "learning_rate": 3.246298519407763e-05, "loss": 0.6498, "step": 8775 }, { "epoch": 7.009569377990431, "grad_norm": 3.4622254371643066, "learning_rate": 3.243297318927571e-05, "loss": 0.4423, "step": 8790 }, { "epoch": 7.021531100478469, "grad_norm": 4.359318733215332, "learning_rate": 3.240296118447379e-05, "loss": 0.2942, "step": 8805 }, { "epoch": 7.033492822966507, "grad_norm": 4.986384391784668, "learning_rate": 3.237294917967187e-05, "loss": 0.3187, "step": 8820 }, { "epoch": 7.045454545454546, "grad_norm": 4.1987104415893555, "learning_rate": 3.2342937174869945e-05, "loss": 0.3306, "step": 8835 }, { "epoch": 7.057416267942584, "grad_norm": 4.6675028800964355, "learning_rate": 3.231292517006803e-05, "loss": 0.349, "step": 8850 }, { "epoch": 7.069377990430622, "grad_norm": 4.357269763946533, "learning_rate": 3.228291316526611e-05, "loss": 0.3153, "step": 8865 }, { "epoch": 7.08133971291866, "grad_norm": 3.168750762939453, "learning_rate": 3.225290116046419e-05, "loss": 0.3223, "step": 8880 }, { "epoch": 7.0933014354066986, "grad_norm": 4.13469934463501, "learning_rate": 3.222288915566227e-05, "loss": 0.3289, "step": 8895 }, { "epoch": 7.105263157894737, "grad_norm": 3.306483507156372, "learning_rate": 3.219287715086035e-05, "loss": 0.3357, "step": 8910 }, { "epoch": 7.117224880382775, "grad_norm": 3.830190896987915, "learning_rate": 3.2162865146058425e-05, "loss": 0.3425, "step": 8925 }, { "epoch": 7.1291866028708135, "grad_norm": 3.848161220550537, "learning_rate": 3.21328531412565e-05, "loss": 0.3412, "step": 8940 }, { "epoch": 7.141148325358851, "grad_norm": 4.058409214019775, "learning_rate": 3.210284113645458e-05, "loss": 0.3333, "step": 8955 }, { "epoch": 7.15311004784689, "grad_norm": 3.780856132507324, "learning_rate": 3.2072829131652666e-05, "loss": 0.3205, "step": 8970 }, { "epoch": 7.1650717703349285, "grad_norm": 3.9334750175476074, "learning_rate": 3.2042817126850744e-05, "loss": 0.3546, "step": 8985 }, { "epoch": 7.177033492822966, "grad_norm": 4.092038631439209, "learning_rate": 3.201280512204882e-05, "loss": 0.3295, "step": 9000 }, { "epoch": 7.188995215311005, "grad_norm": 4.35646390914917, "learning_rate": 3.19827931172469e-05, "loss": 0.3593, "step": 9015 }, { "epoch": 7.2009569377990434, "grad_norm": 3.8881773948669434, "learning_rate": 3.195278111244498e-05, "loss": 0.3541, "step": 9030 }, { "epoch": 7.212918660287081, "grad_norm": 4.399089336395264, "learning_rate": 3.1922769107643055e-05, "loss": 0.3271, "step": 9045 }, { "epoch": 7.22488038277512, "grad_norm": 4.376395225524902, "learning_rate": 3.189275710284114e-05, "loss": 0.4131, "step": 9060 }, { "epoch": 7.2368421052631575, "grad_norm": 4.1286468505859375, "learning_rate": 3.186274509803922e-05, "loss": 0.3824, "step": 9075 }, { "epoch": 7.248803827751196, "grad_norm": 4.728172302246094, "learning_rate": 3.18327330932373e-05, "loss": 0.3706, "step": 9090 }, { "epoch": 7.260765550239235, "grad_norm": 3.76225209236145, "learning_rate": 3.180272108843538e-05, "loss": 0.3568, "step": 9105 }, { "epoch": 7.2727272727272725, "grad_norm": 3.939035415649414, "learning_rate": 3.177270908363346e-05, "loss": 0.4092, "step": 9120 }, { "epoch": 7.284688995215311, "grad_norm": 4.537744045257568, "learning_rate": 3.1742697078831536e-05, "loss": 0.3606, "step": 9135 }, { "epoch": 7.296650717703349, "grad_norm": 4.309103965759277, "learning_rate": 3.1712685074029613e-05, "loss": 0.406, "step": 9150 }, { "epoch": 7.3086124401913874, "grad_norm": 4.298764228820801, "learning_rate": 3.168267306922769e-05, "loss": 0.373, "step": 9165 }, { "epoch": 7.320574162679426, "grad_norm": 4.205005645751953, "learning_rate": 3.165266106442577e-05, "loss": 0.3567, "step": 9180 }, { "epoch": 7.332535885167464, "grad_norm": 4.051873207092285, "learning_rate": 3.162264905962385e-05, "loss": 0.377, "step": 9195 }, { "epoch": 7.344497607655502, "grad_norm": 4.320316314697266, "learning_rate": 3.159263705482193e-05, "loss": 0.4071, "step": 9210 }, { "epoch": 7.356459330143541, "grad_norm": 4.617473125457764, "learning_rate": 3.156262505002001e-05, "loss": 0.4048, "step": 9225 }, { "epoch": 7.368421052631579, "grad_norm": 4.013522148132324, "learning_rate": 3.153261304521809e-05, "loss": 0.3792, "step": 9240 }, { "epoch": 7.380382775119617, "grad_norm": 4.339334487915039, "learning_rate": 3.150260104041617e-05, "loss": 0.4172, "step": 9255 }, { "epoch": 7.392344497607655, "grad_norm": 4.555285453796387, "learning_rate": 3.147258903561425e-05, "loss": 0.397, "step": 9270 }, { "epoch": 7.404306220095694, "grad_norm": 3.832693576812744, "learning_rate": 3.144257703081233e-05, "loss": 0.3784, "step": 9285 }, { "epoch": 7.416267942583732, "grad_norm": 4.14719295501709, "learning_rate": 3.1412565026010406e-05, "loss": 0.3979, "step": 9300 }, { "epoch": 7.42822966507177, "grad_norm": 3.914750337600708, "learning_rate": 3.138255302120848e-05, "loss": 0.3848, "step": 9315 }, { "epoch": 7.440191387559809, "grad_norm": 4.9536967277526855, "learning_rate": 3.135254101640656e-05, "loss": 0.4144, "step": 9330 }, { "epoch": 7.452153110047847, "grad_norm": 4.35673713684082, "learning_rate": 3.1322529011604646e-05, "loss": 0.4446, "step": 9345 }, { "epoch": 7.464114832535885, "grad_norm": 4.106342315673828, "learning_rate": 3.1292517006802724e-05, "loss": 0.4056, "step": 9360 }, { "epoch": 7.476076555023924, "grad_norm": 4.211533546447754, "learning_rate": 3.12625050020008e-05, "loss": 0.4072, "step": 9375 }, { "epoch": 7.488038277511961, "grad_norm": 3.965963840484619, "learning_rate": 3.123249299719888e-05, "loss": 0.4329, "step": 9390 }, { "epoch": 7.5, "grad_norm": 4.13434362411499, "learning_rate": 3.120248099239696e-05, "loss": 0.4161, "step": 9405 }, { "epoch": 7.511961722488039, "grad_norm": 6.448205947875977, "learning_rate": 3.1172468987595035e-05, "loss": 0.3927, "step": 9420 }, { "epoch": 7.523923444976076, "grad_norm": 4.125397682189941, "learning_rate": 3.114245698279312e-05, "loss": 0.4021, "step": 9435 }, { "epoch": 7.535885167464115, "grad_norm": 4.477077007293701, "learning_rate": 3.11124449779912e-05, "loss": 0.4195, "step": 9450 }, { "epoch": 7.547846889952153, "grad_norm": 3.9981772899627686, "learning_rate": 3.108243297318928e-05, "loss": 0.4473, "step": 9465 }, { "epoch": 7.559808612440191, "grad_norm": 4.3731689453125, "learning_rate": 3.105242096838736e-05, "loss": 0.4264, "step": 9480 }, { "epoch": 7.57177033492823, "grad_norm": 4.046823501586914, "learning_rate": 3.102240896358544e-05, "loss": 0.4151, "step": 9495 }, { "epoch": 7.583732057416268, "grad_norm": 4.526839733123779, "learning_rate": 3.0992396958783516e-05, "loss": 0.4426, "step": 9510 }, { "epoch": 7.595693779904306, "grad_norm": 4.215605735778809, "learning_rate": 3.0962384953981594e-05, "loss": 0.4376, "step": 9525 }, { "epoch": 7.607655502392344, "grad_norm": 4.018391132354736, "learning_rate": 3.093237294917967e-05, "loss": 0.4385, "step": 9540 }, { "epoch": 7.619617224880383, "grad_norm": 5.19038200378418, "learning_rate": 3.090236094437775e-05, "loss": 0.4379, "step": 9555 }, { "epoch": 7.631578947368421, "grad_norm": 4.6209611892700195, "learning_rate": 3.087234893957583e-05, "loss": 0.4445, "step": 9570 }, { "epoch": 7.643540669856459, "grad_norm": 4.700253486633301, "learning_rate": 3.084233693477391e-05, "loss": 0.4309, "step": 9585 }, { "epoch": 7.655502392344498, "grad_norm": 4.6337761878967285, "learning_rate": 3.081232492997199e-05, "loss": 0.4256, "step": 9600 }, { "epoch": 7.667464114832536, "grad_norm": 4.5144734382629395, "learning_rate": 3.078231292517007e-05, "loss": 0.4685, "step": 9615 }, { "epoch": 7.679425837320574, "grad_norm": 4.41657829284668, "learning_rate": 3.075230092036815e-05, "loss": 0.4455, "step": 9630 }, { "epoch": 7.6913875598086126, "grad_norm": 4.547213554382324, "learning_rate": 3.072228891556623e-05, "loss": 0.4935, "step": 9645 }, { "epoch": 7.703349282296651, "grad_norm": 4.367729187011719, "learning_rate": 3.069227691076431e-05, "loss": 0.4636, "step": 9660 }, { "epoch": 7.715311004784689, "grad_norm": 4.459219932556152, "learning_rate": 3.0662264905962386e-05, "loss": 0.4668, "step": 9675 }, { "epoch": 7.7272727272727275, "grad_norm": 4.355218887329102, "learning_rate": 3.0632252901160464e-05, "loss": 0.4296, "step": 9690 }, { "epoch": 7.739234449760765, "grad_norm": 3.960000514984131, "learning_rate": 3.060224089635855e-05, "loss": 0.4429, "step": 9705 }, { "epoch": 7.751196172248804, "grad_norm": 4.526662349700928, "learning_rate": 3.0572228891556626e-05, "loss": 0.4751, "step": 9720 }, { "epoch": 7.7631578947368425, "grad_norm": 4.3358259201049805, "learning_rate": 3.0542216886754704e-05, "loss": 0.4885, "step": 9735 }, { "epoch": 7.77511961722488, "grad_norm": 4.190465927124023, "learning_rate": 3.0512204881952782e-05, "loss": 0.4633, "step": 9750 }, { "epoch": 7.787081339712919, "grad_norm": 4.320166110992432, "learning_rate": 3.0482192877150863e-05, "loss": 0.4926, "step": 9765 }, { "epoch": 7.7990430622009566, "grad_norm": 3.990604877471924, "learning_rate": 3.045218087234894e-05, "loss": 0.4516, "step": 9780 }, { "epoch": 7.811004784688995, "grad_norm": 5.037746906280518, "learning_rate": 3.042216886754702e-05, "loss": 0.4121, "step": 9795 }, { "epoch": 7.822966507177034, "grad_norm": 5.006950855255127, "learning_rate": 3.0392156862745097e-05, "loss": 0.4643, "step": 9810 }, { "epoch": 7.8349282296650715, "grad_norm": 4.678879261016846, "learning_rate": 3.036214485794318e-05, "loss": 0.4733, "step": 9825 }, { "epoch": 7.84688995215311, "grad_norm": 4.293395042419434, "learning_rate": 3.033213285314126e-05, "loss": 0.4866, "step": 9840 }, { "epoch": 7.858851674641148, "grad_norm": 4.712632656097412, "learning_rate": 3.0302120848339337e-05, "loss": 0.4878, "step": 9855 }, { "epoch": 7.8708133971291865, "grad_norm": 4.51541805267334, "learning_rate": 3.0272108843537418e-05, "loss": 0.4721, "step": 9870 }, { "epoch": 7.882775119617225, "grad_norm": 4.705857753753662, "learning_rate": 3.0242096838735496e-05, "loss": 0.4849, "step": 9885 }, { "epoch": 7.894736842105263, "grad_norm": 4.610105037689209, "learning_rate": 3.0212084833933574e-05, "loss": 0.4974, "step": 9900 }, { "epoch": 7.9066985645933014, "grad_norm": 4.228977680206299, "learning_rate": 3.018207282913165e-05, "loss": 0.468, "step": 9915 }, { "epoch": 7.91866028708134, "grad_norm": 4.514330863952637, "learning_rate": 3.015206082432973e-05, "loss": 0.4857, "step": 9930 }, { "epoch": 7.930622009569378, "grad_norm": 4.639202117919922, "learning_rate": 3.0122048819527814e-05, "loss": 0.3874, "step": 9945 }, { "epoch": 7.942583732057416, "grad_norm": 4.870967864990234, "learning_rate": 3.0092036814725892e-05, "loss": 0.4849, "step": 9960 }, { "epoch": 7.954545454545455, "grad_norm": 4.402018070220947, "learning_rate": 3.0062024809923973e-05, "loss": 0.492, "step": 9975 }, { "epoch": 7.966507177033493, "grad_norm": 4.405611991882324, "learning_rate": 3.003201280512205e-05, "loss": 0.4874, "step": 9990 }, { "epoch": 7.978468899521531, "grad_norm": 4.78075647354126, "learning_rate": 3.000200080032013e-05, "loss": 0.5089, "step": 10005 }, { "epoch": 7.990430622009569, "grad_norm": 4.583403587341309, "learning_rate": 2.9971988795518207e-05, "loss": 0.4791, "step": 10020 }, { "epoch": 8.002392344497608, "grad_norm": 3.6340909004211426, "learning_rate": 2.9941976790716285e-05, "loss": 0.4022, "step": 10035 }, { "epoch": 8.014354066985646, "grad_norm": 3.58935809135437, "learning_rate": 2.9911964785914366e-05, "loss": 0.2033, "step": 10050 }, { "epoch": 8.026315789473685, "grad_norm": 4.309442520141602, "learning_rate": 2.988195278111245e-05, "loss": 0.209, "step": 10065 }, { "epoch": 8.038277511961722, "grad_norm": 3.540694236755371, "learning_rate": 2.985194077631053e-05, "loss": 0.2269, "step": 10080 }, { "epoch": 8.05023923444976, "grad_norm": 4.051588535308838, "learning_rate": 2.9821928771508606e-05, "loss": 0.225, "step": 10095 }, { "epoch": 8.062200956937799, "grad_norm": 3.8642947673797607, "learning_rate": 2.9791916766706684e-05, "loss": 0.2408, "step": 10110 }, { "epoch": 8.074162679425838, "grad_norm": 4.4070539474487305, "learning_rate": 2.9761904761904762e-05, "loss": 0.2131, "step": 10125 }, { "epoch": 8.086124401913876, "grad_norm": 3.5634195804595947, "learning_rate": 2.9731892757102843e-05, "loss": 0.2253, "step": 10140 }, { "epoch": 8.098086124401913, "grad_norm": 4.4950737953186035, "learning_rate": 2.970188075230092e-05, "loss": 0.2438, "step": 10155 }, { "epoch": 8.110047846889952, "grad_norm": 4.489715576171875, "learning_rate": 2.9671868747499e-05, "loss": 0.2151, "step": 10170 }, { "epoch": 8.12200956937799, "grad_norm": 4.503179550170898, "learning_rate": 2.9641856742697083e-05, "loss": 0.2375, "step": 10185 }, { "epoch": 8.133971291866029, "grad_norm": 4.019615173339844, "learning_rate": 2.961184473789516e-05, "loss": 0.253, "step": 10200 }, { "epoch": 8.145933014354068, "grad_norm": 3.398512601852417, "learning_rate": 2.958183273309324e-05, "loss": 0.2437, "step": 10215 }, { "epoch": 8.157894736842104, "grad_norm": 2.8724753856658936, "learning_rate": 2.9551820728291317e-05, "loss": 0.2236, "step": 10230 }, { "epoch": 8.169856459330143, "grad_norm": 3.7883143424987793, "learning_rate": 2.9521808723489398e-05, "loss": 0.2164, "step": 10245 }, { "epoch": 8.181818181818182, "grad_norm": 4.483898639678955, "learning_rate": 2.9491796718687476e-05, "loss": 0.231, "step": 10260 }, { "epoch": 8.19377990430622, "grad_norm": 4.909805774688721, "learning_rate": 2.9461784713885554e-05, "loss": 0.2511, "step": 10275 }, { "epoch": 8.205741626794259, "grad_norm": 4.415759563446045, "learning_rate": 2.9431772709083632e-05, "loss": 0.2259, "step": 10290 }, { "epoch": 8.217703349282298, "grad_norm": 3.9223194122314453, "learning_rate": 2.9401760704281716e-05, "loss": 0.2479, "step": 10305 }, { "epoch": 8.229665071770334, "grad_norm": 3.4528160095214844, "learning_rate": 2.9371748699479794e-05, "loss": 0.2275, "step": 10320 }, { "epoch": 8.241626794258373, "grad_norm": 4.239967346191406, "learning_rate": 2.9341736694677872e-05, "loss": 0.2316, "step": 10335 }, { "epoch": 8.253588516746412, "grad_norm": 4.16427755355835, "learning_rate": 2.9311724689875953e-05, "loss": 0.2818, "step": 10350 }, { "epoch": 8.26555023923445, "grad_norm": 4.7562994956970215, "learning_rate": 2.928171268507403e-05, "loss": 0.2658, "step": 10365 }, { "epoch": 8.277511961722489, "grad_norm": 4.450767517089844, "learning_rate": 2.925170068027211e-05, "loss": 0.2792, "step": 10380 }, { "epoch": 8.289473684210526, "grad_norm": 4.766055583953857, "learning_rate": 2.9221688675470187e-05, "loss": 0.2926, "step": 10395 }, { "epoch": 8.301435406698564, "grad_norm": 4.053709030151367, "learning_rate": 2.9191676670668268e-05, "loss": 0.2418, "step": 10410 }, { "epoch": 8.313397129186603, "grad_norm": 4.844228267669678, "learning_rate": 2.916166466586635e-05, "loss": 0.2426, "step": 10425 }, { "epoch": 8.325358851674642, "grad_norm": 3.6860673427581787, "learning_rate": 2.913165266106443e-05, "loss": 0.2542, "step": 10440 }, { "epoch": 8.33732057416268, "grad_norm": 3.938351631164551, "learning_rate": 2.910164065626251e-05, "loss": 0.2769, "step": 10455 }, { "epoch": 8.349282296650717, "grad_norm": 4.569359302520752, "learning_rate": 2.9071628651460586e-05, "loss": 0.2456, "step": 10470 }, { "epoch": 8.361244019138756, "grad_norm": 3.8243377208709717, "learning_rate": 2.9041616646658664e-05, "loss": 0.2666, "step": 10485 }, { "epoch": 8.373205741626794, "grad_norm": 4.553408145904541, "learning_rate": 2.9011604641856742e-05, "loss": 0.2891, "step": 10500 }, { "epoch": 8.385167464114833, "grad_norm": 4.640753746032715, "learning_rate": 2.8981592637054823e-05, "loss": 0.2912, "step": 10515 }, { "epoch": 8.397129186602871, "grad_norm": 4.968740940093994, "learning_rate": 2.89515806322529e-05, "loss": 0.2761, "step": 10530 }, { "epoch": 8.409090909090908, "grad_norm": 4.833539962768555, "learning_rate": 2.8921568627450986e-05, "loss": 0.2915, "step": 10545 }, { "epoch": 8.421052631578947, "grad_norm": 4.913358211517334, "learning_rate": 2.8891556622649064e-05, "loss": 0.2703, "step": 10560 }, { "epoch": 8.433014354066986, "grad_norm": 3.7276763916015625, "learning_rate": 2.886154461784714e-05, "loss": 0.2705, "step": 10575 }, { "epoch": 8.444976076555024, "grad_norm": 4.225296974182129, "learning_rate": 2.883153261304522e-05, "loss": 0.2944, "step": 10590 }, { "epoch": 8.456937799043063, "grad_norm": 4.071160793304443, "learning_rate": 2.8801520608243297e-05, "loss": 0.3017, "step": 10605 }, { "epoch": 8.4688995215311, "grad_norm": 4.818964958190918, "learning_rate": 2.877150860344138e-05, "loss": 0.3057, "step": 10620 }, { "epoch": 8.480861244019138, "grad_norm": 4.391495704650879, "learning_rate": 2.8741496598639456e-05, "loss": 0.2854, "step": 10635 }, { "epoch": 8.492822966507177, "grad_norm": 4.263548374176025, "learning_rate": 2.8711484593837534e-05, "loss": 0.2604, "step": 10650 } ], "logging_steps": 15, "max_steps": 25000, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 15, "total_flos": 7.844568831858917e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }