diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4991 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.492822966507177, + "eval_steps": 500, + "global_step": 10650, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011961722488038277, + "grad_norm": 1.9270328283309937, + "learning_rate": 4.998999599839936e-05, + "loss": 2.3527, + "step": 15 + }, + { + "epoch": 0.023923444976076555, + "grad_norm": 1.8812594413757324, + "learning_rate": 4.995998399359744e-05, + "loss": 2.3692, + "step": 30 + }, + { + "epoch": 0.03588516746411483, + "grad_norm": 1.3909887075424194, + "learning_rate": 4.9929971988795524e-05, + "loss": 2.3111, + "step": 45 + }, + { + "epoch": 0.04784688995215311, + "grad_norm": 2.809542417526245, + "learning_rate": 4.98999599839936e-05, + "loss": 2.2284, + "step": 60 + }, + { + "epoch": 0.05980861244019139, + "grad_norm": 3.4008336067199707, + "learning_rate": 4.986994797919168e-05, + "loss": 2.2569, + "step": 75 + }, + { + "epoch": 0.07177033492822966, + "grad_norm": 1.2219867706298828, + "learning_rate": 4.983993597438976e-05, + "loss": 2.1762, + "step": 90 + }, + { + "epoch": 0.08373205741626795, + "grad_norm": 1.3036127090454102, + "learning_rate": 4.9809923969587836e-05, + "loss": 2.1956, + "step": 105 + }, + { + "epoch": 0.09569377990430622, + "grad_norm": 1.468847393989563, + "learning_rate": 4.977991196478592e-05, + "loss": 2.2729, + "step": 120 + }, + { + "epoch": 0.1076555023923445, + "grad_norm": 1.2088780403137207, + "learning_rate": 4.9749899959984e-05, + "loss": 2.2164, + "step": 135 + }, + { + "epoch": 0.11961722488038277, + "grad_norm": 1.197135090827942, + "learning_rate": 4.9719887955182076e-05, + "loss": 2.1056, + "step": 150 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 1.309010624885559, + "learning_rate": 4.9689875950380154e-05, + "loss": 2.171, + "step": 165 + }, + { + "epoch": 0.14354066985645933, + "grad_norm": 1.3516101837158203, + "learning_rate": 4.965986394557823e-05, + "loss": 2.1898, + "step": 180 + }, + { + "epoch": 0.15550239234449761, + "grad_norm": 1.186513900756836, + "learning_rate": 4.962985194077631e-05, + "loss": 2.1427, + "step": 195 + }, + { + "epoch": 0.1674641148325359, + "grad_norm": 1.159972906112671, + "learning_rate": 4.959983993597439e-05, + "loss": 2.2603, + "step": 210 + }, + { + "epoch": 0.17942583732057416, + "grad_norm": 1.188928484916687, + "learning_rate": 4.956982793117247e-05, + "loss": 2.2281, + "step": 225 + }, + { + "epoch": 0.19138755980861244, + "grad_norm": 2.18959903717041, + "learning_rate": 4.953981592637055e-05, + "loss": 2.2187, + "step": 240 + }, + { + "epoch": 0.20334928229665072, + "grad_norm": 1.267388939857483, + "learning_rate": 4.9509803921568634e-05, + "loss": 2.1898, + "step": 255 + }, + { + "epoch": 0.215311004784689, + "grad_norm": 1.5959223508834839, + "learning_rate": 4.947979191676671e-05, + "loss": 2.1488, + "step": 270 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 1.081666350364685, + "learning_rate": 4.944977991196479e-05, + "loss": 2.2176, + "step": 285 + }, + { + "epoch": 0.23923444976076555, + "grad_norm": 1.1691621541976929, + "learning_rate": 4.941976790716287e-05, + "loss": 2.1297, + "step": 300 + }, + { + "epoch": 0.2511961722488038, + "grad_norm": 1.4069727659225464, + "learning_rate": 4.9389755902360946e-05, + "loss": 2.2035, + "step": 315 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 1.135937213897705, + "learning_rate": 4.9359743897559024e-05, + "loss": 2.1925, + "step": 330 + }, + { + "epoch": 0.2751196172248804, + "grad_norm": 1.0926917791366577, + "learning_rate": 4.93297318927571e-05, + "loss": 2.1679, + "step": 345 + }, + { + "epoch": 0.28708133971291866, + "grad_norm": 1.0808637142181396, + "learning_rate": 4.9299719887955186e-05, + "loss": 2.2122, + "step": 360 + }, + { + "epoch": 0.29904306220095694, + "grad_norm": 1.2694952487945557, + "learning_rate": 4.9269707883153264e-05, + "loss": 2.1643, + "step": 375 + }, + { + "epoch": 0.31100478468899523, + "grad_norm": 1.1682099103927612, + "learning_rate": 4.923969587835134e-05, + "loss": 2.2263, + "step": 390 + }, + { + "epoch": 0.3229665071770335, + "grad_norm": 1.1954610347747803, + "learning_rate": 4.920968387354942e-05, + "loss": 2.1555, + "step": 405 + }, + { + "epoch": 0.3349282296650718, + "grad_norm": 1.0608245134353638, + "learning_rate": 4.9179671868747504e-05, + "loss": 2.1918, + "step": 420 + }, + { + "epoch": 0.34688995215311, + "grad_norm": 1.2034133672714233, + "learning_rate": 4.914965986394558e-05, + "loss": 2.1101, + "step": 435 + }, + { + "epoch": 0.3588516746411483, + "grad_norm": 1.0936003923416138, + "learning_rate": 4.911964785914366e-05, + "loss": 2.137, + "step": 450 + }, + { + "epoch": 0.3708133971291866, + "grad_norm": 1.188496708869934, + "learning_rate": 4.908963585434174e-05, + "loss": 2.1864, + "step": 465 + }, + { + "epoch": 0.3827751196172249, + "grad_norm": 1.350693941116333, + "learning_rate": 4.905962384953982e-05, + "loss": 2.1491, + "step": 480 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 1.2483429908752441, + "learning_rate": 4.90296118447379e-05, + "loss": 2.1868, + "step": 495 + }, + { + "epoch": 0.40669856459330145, + "grad_norm": 1.1137944459915161, + "learning_rate": 4.899959983993598e-05, + "loss": 2.191, + "step": 510 + }, + { + "epoch": 0.41866028708133973, + "grad_norm": 1.3261072635650635, + "learning_rate": 4.8969587835134056e-05, + "loss": 2.1336, + "step": 525 + }, + { + "epoch": 0.430622009569378, + "grad_norm": 1.6815850734710693, + "learning_rate": 4.8939575830332134e-05, + "loss": 2.1524, + "step": 540 + }, + { + "epoch": 0.44258373205741625, + "grad_norm": 1.080824851989746, + "learning_rate": 4.890956382553021e-05, + "loss": 2.2056, + "step": 555 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 1.2140378952026367, + "learning_rate": 4.887955182072829e-05, + "loss": 2.2114, + "step": 570 + }, + { + "epoch": 0.4665071770334928, + "grad_norm": 1.1290125846862793, + "learning_rate": 4.884953981592637e-05, + "loss": 2.101, + "step": 585 + }, + { + "epoch": 0.4784688995215311, + "grad_norm": 1.171129822731018, + "learning_rate": 4.881952781112445e-05, + "loss": 2.2186, + "step": 600 + }, + { + "epoch": 0.4904306220095694, + "grad_norm": 1.99854576587677, + "learning_rate": 4.878951580632253e-05, + "loss": 2.154, + "step": 615 + }, + { + "epoch": 0.5023923444976076, + "grad_norm": 1.1021254062652588, + "learning_rate": 4.8759503801520615e-05, + "loss": 2.1066, + "step": 630 + }, + { + "epoch": 0.5143540669856459, + "grad_norm": 1.022976040840149, + "learning_rate": 4.872949179671869e-05, + "loss": 2.1642, + "step": 645 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.110926866531372, + "learning_rate": 4.869947979191677e-05, + "loss": 2.1592, + "step": 660 + }, + { + "epoch": 0.5382775119617225, + "grad_norm": 1.096807599067688, + "learning_rate": 4.866946778711485e-05, + "loss": 2.2171, + "step": 675 + }, + { + "epoch": 0.5502392344497608, + "grad_norm": 1.2465318441390991, + "learning_rate": 4.8639455782312926e-05, + "loss": 2.1794, + "step": 690 + }, + { + "epoch": 0.562200956937799, + "grad_norm": 1.6367931365966797, + "learning_rate": 4.8609443777511004e-05, + "loss": 2.1405, + "step": 705 + }, + { + "epoch": 0.5741626794258373, + "grad_norm": 1.3877207040786743, + "learning_rate": 4.857943177270909e-05, + "loss": 2.1781, + "step": 720 + }, + { + "epoch": 0.5861244019138756, + "grad_norm": 1.1698716878890991, + "learning_rate": 4.8549419767907166e-05, + "loss": 2.2076, + "step": 735 + }, + { + "epoch": 0.5980861244019139, + "grad_norm": 1.1922690868377686, + "learning_rate": 4.8519407763105244e-05, + "loss": 2.1515, + "step": 750 + }, + { + "epoch": 0.6100478468899522, + "grad_norm": 1.1112874746322632, + "learning_rate": 4.848939575830332e-05, + "loss": 2.0535, + "step": 765 + }, + { + "epoch": 0.6220095693779905, + "grad_norm": 1.3220607042312622, + "learning_rate": 4.84593837535014e-05, + "loss": 2.1978, + "step": 780 + }, + { + "epoch": 0.6339712918660287, + "grad_norm": 1.2560738325119019, + "learning_rate": 4.8429371748699484e-05, + "loss": 2.245, + "step": 795 + }, + { + "epoch": 0.645933014354067, + "grad_norm": 1.1312100887298584, + "learning_rate": 4.839935974389756e-05, + "loss": 2.1252, + "step": 810 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 1.2060538530349731, + "learning_rate": 4.836934773909564e-05, + "loss": 2.1268, + "step": 825 + }, + { + "epoch": 0.6698564593301436, + "grad_norm": 2.0435290336608887, + "learning_rate": 4.8339335734293725e-05, + "loss": 2.2091, + "step": 840 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 2.7680532932281494, + "learning_rate": 4.83093237294918e-05, + "loss": 2.0631, + "step": 855 + }, + { + "epoch": 0.69377990430622, + "grad_norm": 1.1256909370422363, + "learning_rate": 4.827931172468988e-05, + "loss": 2.1396, + "step": 870 + }, + { + "epoch": 0.7057416267942583, + "grad_norm": 1.1224644184112549, + "learning_rate": 4.824929971988796e-05, + "loss": 2.107, + "step": 885 + }, + { + "epoch": 0.7177033492822966, + "grad_norm": 1.2712397575378418, + "learning_rate": 4.8219287715086036e-05, + "loss": 2.1332, + "step": 900 + }, + { + "epoch": 0.7296650717703349, + "grad_norm": 1.2399568557739258, + "learning_rate": 4.8189275710284114e-05, + "loss": 2.1198, + "step": 915 + }, + { + "epoch": 0.7416267942583732, + "grad_norm": 1.0852080583572388, + "learning_rate": 4.815926370548219e-05, + "loss": 2.1436, + "step": 930 + }, + { + "epoch": 0.7535885167464115, + "grad_norm": 1.3282052278518677, + "learning_rate": 4.812925170068027e-05, + "loss": 2.1763, + "step": 945 + }, + { + "epoch": 0.7655502392344498, + "grad_norm": 1.8598517179489136, + "learning_rate": 4.809923969587835e-05, + "loss": 2.1188, + "step": 960 + }, + { + "epoch": 0.777511961722488, + "grad_norm": 1.1602433919906616, + "learning_rate": 4.806922769107643e-05, + "loss": 2.2234, + "step": 975 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 1.3578499555587769, + "learning_rate": 4.803921568627452e-05, + "loss": 2.1404, + "step": 990 + }, + { + "epoch": 0.8014354066985646, + "grad_norm": 1.4764407873153687, + "learning_rate": 4.8009203681472595e-05, + "loss": 2.1582, + "step": 1005 + }, + { + "epoch": 0.8133971291866029, + "grad_norm": 1.083958387374878, + "learning_rate": 4.797919167667067e-05, + "loss": 2.1156, + "step": 1020 + }, + { + "epoch": 0.8253588516746412, + "grad_norm": 1.2568596601486206, + "learning_rate": 4.794917967186875e-05, + "loss": 2.1341, + "step": 1035 + }, + { + "epoch": 0.8373205741626795, + "grad_norm": 1.1657259464263916, + "learning_rate": 4.791916766706683e-05, + "loss": 2.1245, + "step": 1050 + }, + { + "epoch": 0.8492822966507177, + "grad_norm": 2.355947256088257, + "learning_rate": 4.7889155662264906e-05, + "loss": 2.1975, + "step": 1065 + }, + { + "epoch": 0.861244019138756, + "grad_norm": 2.6566946506500244, + "learning_rate": 4.7859143657462984e-05, + "loss": 2.1263, + "step": 1080 + }, + { + "epoch": 0.8732057416267942, + "grad_norm": 1.2993121147155762, + "learning_rate": 4.782913165266107e-05, + "loss": 2.1481, + "step": 1095 + }, + { + "epoch": 0.8851674641148325, + "grad_norm": 1.129744291305542, + "learning_rate": 4.7799119647859146e-05, + "loss": 2.1574, + "step": 1110 + }, + { + "epoch": 0.8971291866028708, + "grad_norm": 1.1695717573165894, + "learning_rate": 4.7769107643057224e-05, + "loss": 2.0916, + "step": 1125 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.159279465675354, + "learning_rate": 4.77390956382553e-05, + "loss": 2.1265, + "step": 1140 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 1.2150417566299438, + "learning_rate": 4.770908363345338e-05, + "loss": 2.1351, + "step": 1155 + }, + { + "epoch": 0.9330143540669856, + "grad_norm": 1.2673773765563965, + "learning_rate": 4.7679071628651465e-05, + "loss": 2.2444, + "step": 1170 + }, + { + "epoch": 0.9449760765550239, + "grad_norm": 1.1746214628219604, + "learning_rate": 4.764905962384954e-05, + "loss": 2.1371, + "step": 1185 + }, + { + "epoch": 0.9569377990430622, + "grad_norm": 1.3716073036193848, + "learning_rate": 4.761904761904762e-05, + "loss": 2.1414, + "step": 1200 + }, + { + "epoch": 0.9688995215311005, + "grad_norm": 1.1066573858261108, + "learning_rate": 4.7589035614245705e-05, + "loss": 2.0949, + "step": 1215 + }, + { + "epoch": 0.9808612440191388, + "grad_norm": 1.1547194719314575, + "learning_rate": 4.755902360944378e-05, + "loss": 2.1023, + "step": 1230 + }, + { + "epoch": 0.992822966507177, + "grad_norm": 1.5456453561782837, + "learning_rate": 4.752901160464186e-05, + "loss": 2.1542, + "step": 1245 + }, + { + "epoch": 1.0047846889952152, + "grad_norm": 1.7362697124481201, + "learning_rate": 4.749899959983994e-05, + "loss": 2.0444, + "step": 1260 + }, + { + "epoch": 1.0167464114832536, + "grad_norm": 5.408290386199951, + "learning_rate": 4.7468987595038016e-05, + "loss": 1.8079, + "step": 1275 + }, + { + "epoch": 1.0287081339712918, + "grad_norm": 3.33227276802063, + "learning_rate": 4.7438975590236094e-05, + "loss": 1.9851, + "step": 1290 + }, + { + "epoch": 1.0406698564593302, + "grad_norm": 1.4184224605560303, + "learning_rate": 4.740896358543417e-05, + "loss": 1.8732, + "step": 1305 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.5775929689407349, + "learning_rate": 4.737895158063225e-05, + "loss": 1.9714, + "step": 1320 + }, + { + "epoch": 1.0645933014354068, + "grad_norm": 1.4744929075241089, + "learning_rate": 4.7348939575830335e-05, + "loss": 1.8901, + "step": 1335 + }, + { + "epoch": 1.076555023923445, + "grad_norm": 1.5280168056488037, + "learning_rate": 4.731892757102841e-05, + "loss": 1.9348, + "step": 1350 + }, + { + "epoch": 1.0885167464114833, + "grad_norm": 1.2531495094299316, + "learning_rate": 4.72889155662265e-05, + "loss": 1.83, + "step": 1365 + }, + { + "epoch": 1.1004784688995215, + "grad_norm": 1.3821693658828735, + "learning_rate": 4.7258903561424575e-05, + "loss": 1.7183, + "step": 1380 + }, + { + "epoch": 1.11244019138756, + "grad_norm": 1.3789594173431396, + "learning_rate": 4.722889155662265e-05, + "loss": 1.8931, + "step": 1395 + }, + { + "epoch": 1.124401913875598, + "grad_norm": 1.2702490091323853, + "learning_rate": 4.719887955182073e-05, + "loss": 1.7617, + "step": 1410 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 1.4505800008773804, + "learning_rate": 4.716886754701881e-05, + "loss": 1.9103, + "step": 1425 + }, + { + "epoch": 1.1483253588516746, + "grad_norm": 1.612985610961914, + "learning_rate": 4.7138855542216886e-05, + "loss": 1.9471, + "step": 1440 + }, + { + "epoch": 1.160287081339713, + "grad_norm": 1.2852972745895386, + "learning_rate": 4.710884353741497e-05, + "loss": 1.9249, + "step": 1455 + }, + { + "epoch": 1.1722488038277512, + "grad_norm": 1.385501503944397, + "learning_rate": 4.707883153261305e-05, + "loss": 1.8883, + "step": 1470 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 1.4401298761367798, + "learning_rate": 4.704881952781113e-05, + "loss": 1.94, + "step": 1485 + }, + { + "epoch": 1.1961722488038278, + "grad_norm": 3.9501471519470215, + "learning_rate": 4.7018807523009204e-05, + "loss": 1.893, + "step": 1500 + }, + { + "epoch": 1.208133971291866, + "grad_norm": 1.3335622549057007, + "learning_rate": 4.698879551820728e-05, + "loss": 1.7215, + "step": 1515 + }, + { + "epoch": 1.2200956937799043, + "grad_norm": 1.6928309202194214, + "learning_rate": 4.695878351340536e-05, + "loss": 1.8889, + "step": 1530 + }, + { + "epoch": 1.2320574162679425, + "grad_norm": 1.2327487468719482, + "learning_rate": 4.6928771508603445e-05, + "loss": 1.8503, + "step": 1545 + }, + { + "epoch": 1.244019138755981, + "grad_norm": 1.3527581691741943, + "learning_rate": 4.689875950380152e-05, + "loss": 1.7963, + "step": 1560 + }, + { + "epoch": 1.255980861244019, + "grad_norm": 1.4024996757507324, + "learning_rate": 4.686874749899961e-05, + "loss": 1.8679, + "step": 1575 + }, + { + "epoch": 1.2679425837320575, + "grad_norm": 1.6798954010009766, + "learning_rate": 4.6838735494197685e-05, + "loss": 1.8944, + "step": 1590 + }, + { + "epoch": 1.2799043062200957, + "grad_norm": 1.4541043043136597, + "learning_rate": 4.680872348939576e-05, + "loss": 1.9555, + "step": 1605 + }, + { + "epoch": 1.291866028708134, + "grad_norm": 1.503612756729126, + "learning_rate": 4.677871148459384e-05, + "loss": 1.8223, + "step": 1620 + }, + { + "epoch": 1.3038277511961722, + "grad_norm": 1.4559051990509033, + "learning_rate": 4.674869947979192e-05, + "loss": 1.8442, + "step": 1635 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 1.3559598922729492, + "learning_rate": 4.6718687474989997e-05, + "loss": 1.933, + "step": 1650 + }, + { + "epoch": 1.3277511961722488, + "grad_norm": 1.3937571048736572, + "learning_rate": 4.6688675470188074e-05, + "loss": 1.864, + "step": 1665 + }, + { + "epoch": 1.339712918660287, + "grad_norm": 1.356520175933838, + "learning_rate": 4.665866346538615e-05, + "loss": 1.856, + "step": 1680 + }, + { + "epoch": 1.3516746411483254, + "grad_norm": 1.6281076669692993, + "learning_rate": 4.662865146058424e-05, + "loss": 1.8623, + "step": 1695 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.390368103981018, + "learning_rate": 4.6598639455782315e-05, + "loss": 1.8775, + "step": 1710 + }, + { + "epoch": 1.375598086124402, + "grad_norm": 1.575172781944275, + "learning_rate": 4.656862745098039e-05, + "loss": 1.9558, + "step": 1725 + }, + { + "epoch": 1.38755980861244, + "grad_norm": 1.6121597290039062, + "learning_rate": 4.653861544617848e-05, + "loss": 1.8698, + "step": 1740 + }, + { + "epoch": 1.3995215311004785, + "grad_norm": 1.4013128280639648, + "learning_rate": 4.6508603441376555e-05, + "loss": 1.8567, + "step": 1755 + }, + { + "epoch": 1.4114832535885167, + "grad_norm": 1.636841893196106, + "learning_rate": 4.647859143657463e-05, + "loss": 1.8708, + "step": 1770 + }, + { + "epoch": 1.423444976076555, + "grad_norm": 1.6554105281829834, + "learning_rate": 4.644857943177271e-05, + "loss": 1.9281, + "step": 1785 + }, + { + "epoch": 1.4354066985645932, + "grad_norm": 1.7569769620895386, + "learning_rate": 4.641856742697079e-05, + "loss": 1.8563, + "step": 1800 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 1.5896693468093872, + "learning_rate": 4.638855542216887e-05, + "loss": 1.8764, + "step": 1815 + }, + { + "epoch": 1.4593301435406698, + "grad_norm": 1.3887263536453247, + "learning_rate": 4.635854341736695e-05, + "loss": 1.8871, + "step": 1830 + }, + { + "epoch": 1.4712918660287082, + "grad_norm": 1.6596853733062744, + "learning_rate": 4.632853141256503e-05, + "loss": 1.9176, + "step": 1845 + }, + { + "epoch": 1.4832535885167464, + "grad_norm": 1.6174405813217163, + "learning_rate": 4.629851940776311e-05, + "loss": 1.8109, + "step": 1860 + }, + { + "epoch": 1.4952153110047846, + "grad_norm": 1.3717613220214844, + "learning_rate": 4.6268507402961185e-05, + "loss": 1.867, + "step": 1875 + }, + { + "epoch": 1.507177033492823, + "grad_norm": 1.4477450847625732, + "learning_rate": 4.623849539815926e-05, + "loss": 1.929, + "step": 1890 + }, + { + "epoch": 1.5191387559808613, + "grad_norm": 1.4237533807754517, + "learning_rate": 4.620848339335734e-05, + "loss": 1.8444, + "step": 1905 + }, + { + "epoch": 1.5311004784688995, + "grad_norm": 1.41818106174469, + "learning_rate": 4.6178471388555425e-05, + "loss": 1.8505, + "step": 1920 + }, + { + "epoch": 1.5430622009569377, + "grad_norm": 1.5824397802352905, + "learning_rate": 4.61484593837535e-05, + "loss": 1.773, + "step": 1935 + }, + { + "epoch": 1.555023923444976, + "grad_norm": 1.6391881704330444, + "learning_rate": 4.611844737895159e-05, + "loss": 1.9057, + "step": 1950 + }, + { + "epoch": 1.5669856459330145, + "grad_norm": 1.5484305620193481, + "learning_rate": 4.6088435374149665e-05, + "loss": 1.9141, + "step": 1965 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 1.4594415426254272, + "learning_rate": 4.605842336934774e-05, + "loss": 1.8732, + "step": 1980 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 1.3924568891525269, + "learning_rate": 4.602841136454582e-05, + "loss": 1.9441, + "step": 1995 + }, + { + "epoch": 1.6028708133971292, + "grad_norm": 1.523986577987671, + "learning_rate": 4.59983993597439e-05, + "loss": 1.9101, + "step": 2010 + }, + { + "epoch": 1.6148325358851676, + "grad_norm": 1.369285225868225, + "learning_rate": 4.596838735494198e-05, + "loss": 1.8829, + "step": 2025 + }, + { + "epoch": 1.6267942583732058, + "grad_norm": 1.4909306764602661, + "learning_rate": 4.5938375350140055e-05, + "loss": 1.9204, + "step": 2040 + }, + { + "epoch": 1.638755980861244, + "grad_norm": 1.5464478731155396, + "learning_rate": 4.590836334533814e-05, + "loss": 1.8064, + "step": 2055 + }, + { + "epoch": 1.6507177033492821, + "grad_norm": 1.5255078077316284, + "learning_rate": 4.587835134053622e-05, + "loss": 1.9518, + "step": 2070 + }, + { + "epoch": 1.6626794258373205, + "grad_norm": 1.3710672855377197, + "learning_rate": 4.5848339335734295e-05, + "loss": 1.8957, + "step": 2085 + }, + { + "epoch": 1.674641148325359, + "grad_norm": 1.4883019924163818, + "learning_rate": 4.581832733093237e-05, + "loss": 1.8884, + "step": 2100 + }, + { + "epoch": 1.686602870813397, + "grad_norm": 1.383284091949463, + "learning_rate": 4.578831532613046e-05, + "loss": 1.8924, + "step": 2115 + }, + { + "epoch": 1.6985645933014353, + "grad_norm": 1.5126210451126099, + "learning_rate": 4.5758303321328535e-05, + "loss": 1.9423, + "step": 2130 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 1.4830104112625122, + "learning_rate": 4.572829131652661e-05, + "loss": 1.9377, + "step": 2145 + }, + { + "epoch": 1.722488038277512, + "grad_norm": 1.578748106956482, + "learning_rate": 4.569827931172469e-05, + "loss": 1.8532, + "step": 2160 + }, + { + "epoch": 1.7344497607655502, + "grad_norm": 3.1164207458496094, + "learning_rate": 4.5668267306922776e-05, + "loss": 1.9072, + "step": 2175 + }, + { + "epoch": 1.7464114832535884, + "grad_norm": 1.5984658002853394, + "learning_rate": 4.5638255302120853e-05, + "loss": 1.9674, + "step": 2190 + }, + { + "epoch": 1.7583732057416268, + "grad_norm": 1.5007200241088867, + "learning_rate": 4.560824329731893e-05, + "loss": 1.93, + "step": 2205 + }, + { + "epoch": 1.7703349282296652, + "grad_norm": 2.623798131942749, + "learning_rate": 4.557823129251701e-05, + "loss": 1.9068, + "step": 2220 + }, + { + "epoch": 1.7822966507177034, + "grad_norm": 2.1396572589874268, + "learning_rate": 4.554821928771509e-05, + "loss": 1.886, + "step": 2235 + }, + { + "epoch": 1.7942583732057416, + "grad_norm": 1.5055629014968872, + "learning_rate": 4.5518207282913165e-05, + "loss": 1.8678, + "step": 2250 + }, + { + "epoch": 1.80622009569378, + "grad_norm": 1.4418485164642334, + "learning_rate": 4.548819527811124e-05, + "loss": 1.984, + "step": 2265 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.5159984827041626, + "learning_rate": 4.545818327330932e-05, + "loss": 1.9688, + "step": 2280 + }, + { + "epoch": 1.8301435406698565, + "grad_norm": 1.299607753753662, + "learning_rate": 4.5428171268507405e-05, + "loss": 1.9347, + "step": 2295 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 1.4144442081451416, + "learning_rate": 4.539815926370549e-05, + "loss": 1.8877, + "step": 2310 + }, + { + "epoch": 1.8540669856459329, + "grad_norm": 1.5180310010910034, + "learning_rate": 4.536814725890357e-05, + "loss": 1.9392, + "step": 2325 + }, + { + "epoch": 1.8660287081339713, + "grad_norm": 1.475977897644043, + "learning_rate": 4.5338135254101645e-05, + "loss": 1.8535, + "step": 2340 + }, + { + "epoch": 1.8779904306220097, + "grad_norm": 1.4614003896713257, + "learning_rate": 4.530812324929972e-05, + "loss": 1.9246, + "step": 2355 + }, + { + "epoch": 1.8899521531100478, + "grad_norm": 1.4736562967300415, + "learning_rate": 4.52781112444978e-05, + "loss": 1.9095, + "step": 2370 + }, + { + "epoch": 1.901913875598086, + "grad_norm": 1.3201289176940918, + "learning_rate": 4.524809923969588e-05, + "loss": 1.8479, + "step": 2385 + }, + { + "epoch": 1.9138755980861244, + "grad_norm": 1.4976378679275513, + "learning_rate": 4.521808723489396e-05, + "loss": 1.8262, + "step": 2400 + }, + { + "epoch": 1.9258373205741628, + "grad_norm": 1.5323299169540405, + "learning_rate": 4.5188075230092035e-05, + "loss": 1.8989, + "step": 2415 + }, + { + "epoch": 1.937799043062201, + "grad_norm": 2.050426483154297, + "learning_rate": 4.515806322529012e-05, + "loss": 1.8958, + "step": 2430 + }, + { + "epoch": 1.9497607655502391, + "grad_norm": 1.822324514389038, + "learning_rate": 4.51280512204882e-05, + "loss": 1.99, + "step": 2445 + }, + { + "epoch": 1.9617224880382775, + "grad_norm": 1.5009537935256958, + "learning_rate": 4.5098039215686275e-05, + "loss": 1.8561, + "step": 2460 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 1.3751215934753418, + "learning_rate": 4.506802721088435e-05, + "loss": 1.9033, + "step": 2475 + }, + { + "epoch": 1.985645933014354, + "grad_norm": 1.6106884479522705, + "learning_rate": 4.503801520608244e-05, + "loss": 1.9555, + "step": 2490 + }, + { + "epoch": 1.9976076555023923, + "grad_norm": 1.5378204584121704, + "learning_rate": 4.5008003201280515e-05, + "loss": 2.0009, + "step": 2505 + }, + { + "epoch": 2.0095693779904304, + "grad_norm": 2.0536139011383057, + "learning_rate": 4.497799119647859e-05, + "loss": 1.7212, + "step": 2520 + }, + { + "epoch": 2.021531100478469, + "grad_norm": 1.7498282194137573, + "learning_rate": 4.494797919167667e-05, + "loss": 1.5574, + "step": 2535 + }, + { + "epoch": 2.0334928229665072, + "grad_norm": 1.7728687524795532, + "learning_rate": 4.4917967186874756e-05, + "loss": 1.4411, + "step": 2550 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 1.8067642450332642, + "learning_rate": 4.4887955182072834e-05, + "loss": 1.5242, + "step": 2565 + }, + { + "epoch": 2.0574162679425836, + "grad_norm": 1.924641489982605, + "learning_rate": 4.485794317727091e-05, + "loss": 1.5415, + "step": 2580 + }, + { + "epoch": 2.069377990430622, + "grad_norm": 1.9768836498260498, + "learning_rate": 4.482793117246899e-05, + "loss": 1.6774, + "step": 2595 + }, + { + "epoch": 2.0813397129186604, + "grad_norm": 1.943829894065857, + "learning_rate": 4.479791916766707e-05, + "loss": 1.6263, + "step": 2610 + }, + { + "epoch": 2.0933014354066986, + "grad_norm": 2.1001622676849365, + "learning_rate": 4.4767907162865145e-05, + "loss": 1.6304, + "step": 2625 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 2.0388505458831787, + "learning_rate": 4.473789515806322e-05, + "loss": 1.4718, + "step": 2640 + }, + { + "epoch": 2.117224880382775, + "grad_norm": 1.884468913078308, + "learning_rate": 4.47078831532613e-05, + "loss": 1.5752, + "step": 2655 + }, + { + "epoch": 2.1291866028708135, + "grad_norm": 1.9775267839431763, + "learning_rate": 4.4677871148459385e-05, + "loss": 1.478, + "step": 2670 + }, + { + "epoch": 2.1411483253588517, + "grad_norm": 1.8365753889083862, + "learning_rate": 4.464785914365747e-05, + "loss": 1.5408, + "step": 2685 + }, + { + "epoch": 2.15311004784689, + "grad_norm": 1.8778951168060303, + "learning_rate": 4.461784713885555e-05, + "loss": 1.6373, + "step": 2700 + }, + { + "epoch": 2.165071770334928, + "grad_norm": 1.9629762172698975, + "learning_rate": 4.4587835134053626e-05, + "loss": 1.5741, + "step": 2715 + }, + { + "epoch": 2.1770334928229667, + "grad_norm": 2.0409107208251953, + "learning_rate": 4.4557823129251704e-05, + "loss": 1.6216, + "step": 2730 + }, + { + "epoch": 2.188995215311005, + "grad_norm": 2.1008028984069824, + "learning_rate": 4.452781112444978e-05, + "loss": 1.5515, + "step": 2745 + }, + { + "epoch": 2.200956937799043, + "grad_norm": 2.2391457557678223, + "learning_rate": 4.449779911964786e-05, + "loss": 1.6279, + "step": 2760 + }, + { + "epoch": 2.212918660287081, + "grad_norm": 2.294734239578247, + "learning_rate": 4.446778711484594e-05, + "loss": 1.5232, + "step": 2775 + }, + { + "epoch": 2.22488038277512, + "grad_norm": 1.6631484031677246, + "learning_rate": 4.443777511004402e-05, + "loss": 1.5113, + "step": 2790 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 1.9847686290740967, + "learning_rate": 4.44077631052421e-05, + "loss": 1.5006, + "step": 2805 + }, + { + "epoch": 2.248803827751196, + "grad_norm": 1.8953202962875366, + "learning_rate": 4.437775110044018e-05, + "loss": 1.5853, + "step": 2820 + }, + { + "epoch": 2.2607655502392343, + "grad_norm": 1.9015896320343018, + "learning_rate": 4.4347739095638255e-05, + "loss": 1.6078, + "step": 2835 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 1.900415301322937, + "learning_rate": 4.431772709083633e-05, + "loss": 1.5399, + "step": 2850 + }, + { + "epoch": 2.284688995215311, + "grad_norm": 1.9138609170913696, + "learning_rate": 4.428771508603442e-05, + "loss": 1.589, + "step": 2865 + }, + { + "epoch": 2.2966507177033493, + "grad_norm": 1.7661852836608887, + "learning_rate": 4.4257703081232496e-05, + "loss": 1.6258, + "step": 2880 + }, + { + "epoch": 2.3086124401913874, + "grad_norm": 1.9043537378311157, + "learning_rate": 4.4227691076430573e-05, + "loss": 1.6243, + "step": 2895 + }, + { + "epoch": 2.320574162679426, + "grad_norm": 1.8166050910949707, + "learning_rate": 4.419767907162866e-05, + "loss": 1.5999, + "step": 2910 + }, + { + "epoch": 2.3325358851674642, + "grad_norm": 1.7325972318649292, + "learning_rate": 4.4167667066826736e-05, + "loss": 1.586, + "step": 2925 + }, + { + "epoch": 2.3444976076555024, + "grad_norm": 1.8609052896499634, + "learning_rate": 4.4137655062024814e-05, + "loss": 1.5466, + "step": 2940 + }, + { + "epoch": 2.3564593301435406, + "grad_norm": 3.3115549087524414, + "learning_rate": 4.410764305722289e-05, + "loss": 1.5816, + "step": 2955 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 2.2015438079833984, + "learning_rate": 4.407763105242097e-05, + "loss": 1.5162, + "step": 2970 + }, + { + "epoch": 2.3803827751196174, + "grad_norm": 1.7339051961898804, + "learning_rate": 4.404761904761905e-05, + "loss": 1.5764, + "step": 2985 + }, + { + "epoch": 2.3923444976076556, + "grad_norm": 2.817207098007202, + "learning_rate": 4.4017607042817125e-05, + "loss": 1.5633, + "step": 3000 + }, + { + "epoch": 2.4043062200956937, + "grad_norm": 2.063880681991577, + "learning_rate": 4.39875950380152e-05, + "loss": 1.604, + "step": 3015 + }, + { + "epoch": 2.416267942583732, + "grad_norm": 1.8153194189071655, + "learning_rate": 4.395758303321329e-05, + "loss": 1.6417, + "step": 3030 + }, + { + "epoch": 2.4282296650717705, + "grad_norm": 3.646466016769409, + "learning_rate": 4.3927571028411365e-05, + "loss": 1.6325, + "step": 3045 + }, + { + "epoch": 2.4401913875598087, + "grad_norm": 1.9638229608535767, + "learning_rate": 4.389755902360945e-05, + "loss": 1.6393, + "step": 3060 + }, + { + "epoch": 2.452153110047847, + "grad_norm": 2.549917697906494, + "learning_rate": 4.386754701880753e-05, + "loss": 1.6231, + "step": 3075 + }, + { + "epoch": 2.464114832535885, + "grad_norm": 1.8698160648345947, + "learning_rate": 4.3837535014005606e-05, + "loss": 1.4995, + "step": 3090 + }, + { + "epoch": 2.4760765550239237, + "grad_norm": 1.8844027519226074, + "learning_rate": 4.3807523009203684e-05, + "loss": 1.6133, + "step": 3105 + }, + { + "epoch": 2.488038277511962, + "grad_norm": 2.275132417678833, + "learning_rate": 4.377751100440176e-05, + "loss": 1.6124, + "step": 3120 + }, + { + "epoch": 2.5, + "grad_norm": 1.729272723197937, + "learning_rate": 4.374749899959984e-05, + "loss": 1.6766, + "step": 3135 + }, + { + "epoch": 2.511961722488038, + "grad_norm": 1.9503229856491089, + "learning_rate": 4.3717486994797924e-05, + "loss": 1.6937, + "step": 3150 + }, + { + "epoch": 2.5239234449760763, + "grad_norm": 1.8774380683898926, + "learning_rate": 4.3687474989996e-05, + "loss": 1.6159, + "step": 3165 + }, + { + "epoch": 2.535885167464115, + "grad_norm": 2.066387176513672, + "learning_rate": 4.365746298519408e-05, + "loss": 1.6234, + "step": 3180 + }, + { + "epoch": 2.547846889952153, + "grad_norm": 2.7428183555603027, + "learning_rate": 4.362745098039216e-05, + "loss": 1.5469, + "step": 3195 + }, + { + "epoch": 2.5598086124401913, + "grad_norm": 1.9833886623382568, + "learning_rate": 4.3597438975590235e-05, + "loss": 1.5982, + "step": 3210 + }, + { + "epoch": 2.57177033492823, + "grad_norm": 1.7080726623535156, + "learning_rate": 4.356742697078831e-05, + "loss": 1.5975, + "step": 3225 + }, + { + "epoch": 2.583732057416268, + "grad_norm": 1.9213649034500122, + "learning_rate": 4.35374149659864e-05, + "loss": 1.5921, + "step": 3240 + }, + { + "epoch": 2.5956937799043063, + "grad_norm": 2.0085928440093994, + "learning_rate": 4.3507402961184476e-05, + "loss": 1.5904, + "step": 3255 + }, + { + "epoch": 2.6076555023923444, + "grad_norm": 1.903548002243042, + "learning_rate": 4.347739095638256e-05, + "loss": 1.5794, + "step": 3270 + }, + { + "epoch": 2.6196172248803826, + "grad_norm": 1.8258320093154907, + "learning_rate": 4.344737895158064e-05, + "loss": 1.6408, + "step": 3285 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 2.0597989559173584, + "learning_rate": 4.3417366946778716e-05, + "loss": 1.5868, + "step": 3300 + }, + { + "epoch": 2.6435406698564594, + "grad_norm": 2.0705902576446533, + "learning_rate": 4.3387354941976794e-05, + "loss": 1.6906, + "step": 3315 + }, + { + "epoch": 2.6555023923444976, + "grad_norm": 1.9880789518356323, + "learning_rate": 4.335734293717487e-05, + "loss": 1.5963, + "step": 3330 + }, + { + "epoch": 2.6674641148325358, + "grad_norm": 2.0182063579559326, + "learning_rate": 4.332733093237295e-05, + "loss": 1.6478, + "step": 3345 + }, + { + "epoch": 2.679425837320574, + "grad_norm": 1.9995989799499512, + "learning_rate": 4.329731892757103e-05, + "loss": 1.653, + "step": 3360 + }, + { + "epoch": 2.6913875598086126, + "grad_norm": 2.738987922668457, + "learning_rate": 4.3267306922769105e-05, + "loss": 1.6505, + "step": 3375 + }, + { + "epoch": 2.7033492822966507, + "grad_norm": 2.058044672012329, + "learning_rate": 4.323729491796719e-05, + "loss": 1.5528, + "step": 3390 + }, + { + "epoch": 2.715311004784689, + "grad_norm": 2.0416853427886963, + "learning_rate": 4.320728291316527e-05, + "loss": 1.5553, + "step": 3405 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 1.9002925157546997, + "learning_rate": 4.3177270908363346e-05, + "loss": 1.5736, + "step": 3420 + }, + { + "epoch": 2.7392344497607657, + "grad_norm": 1.8847737312316895, + "learning_rate": 4.314725890356143e-05, + "loss": 1.6232, + "step": 3435 + }, + { + "epoch": 2.751196172248804, + "grad_norm": 1.9627894163131714, + "learning_rate": 4.311724689875951e-05, + "loss": 1.6496, + "step": 3450 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 1.823258638381958, + "learning_rate": 4.3087234893957586e-05, + "loss": 1.584, + "step": 3465 + }, + { + "epoch": 2.77511961722488, + "grad_norm": 3.361528158187866, + "learning_rate": 4.3057222889155664e-05, + "loss": 1.6163, + "step": 3480 + }, + { + "epoch": 2.787081339712919, + "grad_norm": 2.01798677444458, + "learning_rate": 4.302721088435374e-05, + "loss": 1.4596, + "step": 3495 + }, + { + "epoch": 2.799043062200957, + "grad_norm": 1.9381790161132812, + "learning_rate": 4.2997198879551826e-05, + "loss": 1.6621, + "step": 3510 + }, + { + "epoch": 2.811004784688995, + "grad_norm": 2.0217368602752686, + "learning_rate": 4.2967186874749904e-05, + "loss": 1.6089, + "step": 3525 + }, + { + "epoch": 2.8229665071770333, + "grad_norm": 1.7677721977233887, + "learning_rate": 4.293717486994798e-05, + "loss": 1.6052, + "step": 3540 + }, + { + "epoch": 2.8349282296650715, + "grad_norm": 1.9464062452316284, + "learning_rate": 4.290716286514606e-05, + "loss": 1.6751, + "step": 3555 + }, + { + "epoch": 2.84688995215311, + "grad_norm": 1.9557422399520874, + "learning_rate": 4.287715086034414e-05, + "loss": 1.5964, + "step": 3570 + }, + { + "epoch": 2.8588516746411483, + "grad_norm": 3.1278235912323, + "learning_rate": 4.2847138855542216e-05, + "loss": 1.6272, + "step": 3585 + }, + { + "epoch": 2.8708133971291865, + "grad_norm": 1.8671112060546875, + "learning_rate": 4.2817126850740293e-05, + "loss": 1.6573, + "step": 3600 + }, + { + "epoch": 2.882775119617225, + "grad_norm": 1.9375852346420288, + "learning_rate": 4.278711484593838e-05, + "loss": 1.6407, + "step": 3615 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 1.907958984375, + "learning_rate": 4.275710284113646e-05, + "loss": 1.6272, + "step": 3630 + }, + { + "epoch": 2.9066985645933014, + "grad_norm": 2.1269607543945312, + "learning_rate": 4.272709083633454e-05, + "loss": 1.5664, + "step": 3645 + }, + { + "epoch": 2.9186602870813396, + "grad_norm": 1.766072392463684, + "learning_rate": 4.269707883153262e-05, + "loss": 1.6766, + "step": 3660 + }, + { + "epoch": 2.930622009569378, + "grad_norm": 2.157346248626709, + "learning_rate": 4.2667066826730696e-05, + "loss": 1.6374, + "step": 3675 + }, + { + "epoch": 2.9425837320574164, + "grad_norm": 3.1585512161254883, + "learning_rate": 4.2637054821928774e-05, + "loss": 1.6082, + "step": 3690 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 2.0836970806121826, + "learning_rate": 4.260704281712685e-05, + "loss": 1.6703, + "step": 3705 + }, + { + "epoch": 2.9665071770334928, + "grad_norm": 1.729893445968628, + "learning_rate": 4.257703081232493e-05, + "loss": 1.6557, + "step": 3720 + }, + { + "epoch": 2.9784688995215314, + "grad_norm": 3.384397268295288, + "learning_rate": 4.254701880752301e-05, + "loss": 1.643, + "step": 3735 + }, + { + "epoch": 2.990430622009569, + "grad_norm": 1.8642953634262085, + "learning_rate": 4.2517006802721085e-05, + "loss": 1.6524, + "step": 3750 + }, + { + "epoch": 3.0023923444976077, + "grad_norm": 1.9247709512710571, + "learning_rate": 4.248699479791917e-05, + "loss": 1.484, + "step": 3765 + }, + { + "epoch": 3.014354066985646, + "grad_norm": 2.0377817153930664, + "learning_rate": 4.245698279311725e-05, + "loss": 1.2241, + "step": 3780 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 2.2331552505493164, + "learning_rate": 4.2426970788315326e-05, + "loss": 1.1948, + "step": 3795 + }, + { + "epoch": 3.0382775119617227, + "grad_norm": 2.3499271869659424, + "learning_rate": 4.239695878351341e-05, + "loss": 1.2828, + "step": 3810 + }, + { + "epoch": 3.050239234449761, + "grad_norm": 2.445600748062134, + "learning_rate": 4.236694677871149e-05, + "loss": 1.1715, + "step": 3825 + }, + { + "epoch": 3.062200956937799, + "grad_norm": 2.801543951034546, + "learning_rate": 4.2336934773909566e-05, + "loss": 1.2167, + "step": 3840 + }, + { + "epoch": 3.074162679425837, + "grad_norm": 2.515307664871216, + "learning_rate": 4.2306922769107644e-05, + "loss": 1.1451, + "step": 3855 + }, + { + "epoch": 3.0861244019138754, + "grad_norm": 2.6123640537261963, + "learning_rate": 4.227691076430572e-05, + "loss": 1.256, + "step": 3870 + }, + { + "epoch": 3.098086124401914, + "grad_norm": 2.602388381958008, + "learning_rate": 4.2246898759503806e-05, + "loss": 1.1867, + "step": 3885 + }, + { + "epoch": 3.110047846889952, + "grad_norm": 2.552335739135742, + "learning_rate": 4.2216886754701884e-05, + "loss": 1.1845, + "step": 3900 + }, + { + "epoch": 3.1220095693779903, + "grad_norm": 2.6270079612731934, + "learning_rate": 4.218687474989996e-05, + "loss": 1.2479, + "step": 3915 + }, + { + "epoch": 3.1339712918660285, + "grad_norm": 2.490518808364868, + "learning_rate": 4.215686274509804e-05, + "loss": 1.2386, + "step": 3930 + }, + { + "epoch": 3.145933014354067, + "grad_norm": 2.348869800567627, + "learning_rate": 4.212685074029612e-05, + "loss": 1.2285, + "step": 3945 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 2.3546955585479736, + "learning_rate": 4.2096838735494196e-05, + "loss": 1.206, + "step": 3960 + }, + { + "epoch": 3.1698564593301435, + "grad_norm": 2.4429666996002197, + "learning_rate": 4.2066826730692274e-05, + "loss": 1.335, + "step": 3975 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 2.397874355316162, + "learning_rate": 4.203681472589036e-05, + "loss": 1.2252, + "step": 3990 + }, + { + "epoch": 3.1937799043062203, + "grad_norm": 2.526556968688965, + "learning_rate": 4.200680272108844e-05, + "loss": 1.2811, + "step": 4005 + }, + { + "epoch": 3.2057416267942584, + "grad_norm": 2.7083089351654053, + "learning_rate": 4.197679071628652e-05, + "loss": 1.3154, + "step": 4020 + }, + { + "epoch": 3.2177033492822966, + "grad_norm": 2.426650285720825, + "learning_rate": 4.19467787114846e-05, + "loss": 1.2251, + "step": 4035 + }, + { + "epoch": 3.229665071770335, + "grad_norm": 3.1592352390289307, + "learning_rate": 4.1916766706682676e-05, + "loss": 1.232, + "step": 4050 + }, + { + "epoch": 3.2416267942583734, + "grad_norm": 2.4699387550354004, + "learning_rate": 4.1886754701880754e-05, + "loss": 1.3075, + "step": 4065 + }, + { + "epoch": 3.2535885167464116, + "grad_norm": 2.410412311553955, + "learning_rate": 4.185674269707883e-05, + "loss": 1.2583, + "step": 4080 + }, + { + "epoch": 3.2655502392344498, + "grad_norm": 2.3662848472595215, + "learning_rate": 4.182673069227691e-05, + "loss": 1.2718, + "step": 4095 + }, + { + "epoch": 3.277511961722488, + "grad_norm": 2.241677761077881, + "learning_rate": 4.179671868747499e-05, + "loss": 1.2293, + "step": 4110 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 2.289928674697876, + "learning_rate": 4.176670668267307e-05, + "loss": 1.2369, + "step": 4125 + }, + { + "epoch": 3.3014354066985647, + "grad_norm": 2.9561991691589355, + "learning_rate": 4.173669467787115e-05, + "loss": 1.1936, + "step": 4140 + }, + { + "epoch": 3.313397129186603, + "grad_norm": 2.6181890964508057, + "learning_rate": 4.170668267306923e-05, + "loss": 1.2791, + "step": 4155 + }, + { + "epoch": 3.325358851674641, + "grad_norm": 2.208653688430786, + "learning_rate": 4.1676670668267306e-05, + "loss": 1.3175, + "step": 4170 + }, + { + "epoch": 3.3373205741626792, + "grad_norm": 2.460291624069214, + "learning_rate": 4.164665866346539e-05, + "loss": 1.255, + "step": 4185 + }, + { + "epoch": 3.349282296650718, + "grad_norm": 2.2541019916534424, + "learning_rate": 4.161664665866347e-05, + "loss": 1.2815, + "step": 4200 + }, + { + "epoch": 3.361244019138756, + "grad_norm": 2.543994903564453, + "learning_rate": 4.1586634653861546e-05, + "loss": 1.2888, + "step": 4215 + }, + { + "epoch": 3.373205741626794, + "grad_norm": 2.7568411827087402, + "learning_rate": 4.1556622649059624e-05, + "loss": 1.2894, + "step": 4230 + }, + { + "epoch": 3.3851674641148324, + "grad_norm": 2.5805466175079346, + "learning_rate": 4.152661064425771e-05, + "loss": 1.3434, + "step": 4245 + }, + { + "epoch": 3.397129186602871, + "grad_norm": 2.409097194671631, + "learning_rate": 4.149659863945579e-05, + "loss": 1.2903, + "step": 4260 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 4.126059532165527, + "learning_rate": 4.1466586634653865e-05, + "loss": 1.2764, + "step": 4275 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 3.106367826461792, + "learning_rate": 4.143657462985194e-05, + "loss": 1.3184, + "step": 4290 + }, + { + "epoch": 3.4330143540669855, + "grad_norm": 2.195138454437256, + "learning_rate": 4.140656262505002e-05, + "loss": 1.2636, + "step": 4305 + }, + { + "epoch": 3.444976076555024, + "grad_norm": 2.7023708820343018, + "learning_rate": 4.13765506202481e-05, + "loss": 1.3316, + "step": 4320 + }, + { + "epoch": 3.4569377990430623, + "grad_norm": 2.262626886367798, + "learning_rate": 4.1346538615446176e-05, + "loss": 1.2847, + "step": 4335 + }, + { + "epoch": 3.4688995215311005, + "grad_norm": 2.5416321754455566, + "learning_rate": 4.131652661064426e-05, + "loss": 1.3254, + "step": 4350 + }, + { + "epoch": 3.4808612440191387, + "grad_norm": 2.868903875350952, + "learning_rate": 4.128651460584234e-05, + "loss": 1.2778, + "step": 4365 + }, + { + "epoch": 3.492822966507177, + "grad_norm": 2.347463607788086, + "learning_rate": 4.125650260104042e-05, + "loss": 1.34, + "step": 4380 + }, + { + "epoch": 3.5047846889952154, + "grad_norm": 2.644416332244873, + "learning_rate": 4.12264905962385e-05, + "loss": 1.2862, + "step": 4395 + }, + { + "epoch": 3.5167464114832536, + "grad_norm": 2.8803160190582275, + "learning_rate": 4.119647859143658e-05, + "loss": 1.3538, + "step": 4410 + }, + { + "epoch": 3.528708133971292, + "grad_norm": 2.643848180770874, + "learning_rate": 4.1166466586634657e-05, + "loss": 1.3566, + "step": 4425 + }, + { + "epoch": 3.5406698564593304, + "grad_norm": 2.555978298187256, + "learning_rate": 4.1136454581832734e-05, + "loss": 1.284, + "step": 4440 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 2.4635751247406006, + "learning_rate": 4.110644257703081e-05, + "loss": 1.3229, + "step": 4455 + }, + { + "epoch": 3.5645933014354068, + "grad_norm": 2.804314374923706, + "learning_rate": 4.107643057222889e-05, + "loss": 1.2931, + "step": 4470 + }, + { + "epoch": 3.576555023923445, + "grad_norm": 2.5955514907836914, + "learning_rate": 4.1046418567426975e-05, + "loss": 1.3153, + "step": 4485 + }, + { + "epoch": 3.588516746411483, + "grad_norm": 2.4464356899261475, + "learning_rate": 4.101640656262505e-05, + "loss": 1.2963, + "step": 4500 + }, + { + "epoch": 3.6004784688995217, + "grad_norm": 2.8158469200134277, + "learning_rate": 4.098639455782313e-05, + "loss": 1.333, + "step": 4515 + }, + { + "epoch": 3.61244019138756, + "grad_norm": 2.324192523956299, + "learning_rate": 4.095638255302121e-05, + "loss": 1.3438, + "step": 4530 + }, + { + "epoch": 3.624401913875598, + "grad_norm": 2.5822291374206543, + "learning_rate": 4.0926370548219286e-05, + "loss": 1.381, + "step": 4545 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 2.3783419132232666, + "learning_rate": 4.089635854341737e-05, + "loss": 1.321, + "step": 4560 + }, + { + "epoch": 3.6483253588516744, + "grad_norm": 2.453040361404419, + "learning_rate": 4.086634653861545e-05, + "loss": 1.35, + "step": 4575 + }, + { + "epoch": 3.660287081339713, + "grad_norm": 2.694587230682373, + "learning_rate": 4.0836334533813526e-05, + "loss": 1.3342, + "step": 4590 + }, + { + "epoch": 3.672248803827751, + "grad_norm": 2.4545223712921143, + "learning_rate": 4.080632252901161e-05, + "loss": 1.4238, + "step": 4605 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 2.5401089191436768, + "learning_rate": 4.077631052420969e-05, + "loss": 1.3699, + "step": 4620 + }, + { + "epoch": 3.696172248803828, + "grad_norm": 2.4257302284240723, + "learning_rate": 4.074629851940777e-05, + "loss": 1.3569, + "step": 4635 + }, + { + "epoch": 3.708133971291866, + "grad_norm": 2.7543747425079346, + "learning_rate": 4.0716286514605845e-05, + "loss": 1.2967, + "step": 4650 + }, + { + "epoch": 3.7200956937799043, + "grad_norm": 2.4614686965942383, + "learning_rate": 4.068627450980392e-05, + "loss": 1.2982, + "step": 4665 + }, + { + "epoch": 3.7320574162679425, + "grad_norm": 3.7613461017608643, + "learning_rate": 4.0656262505002e-05, + "loss": 1.3812, + "step": 4680 + }, + { + "epoch": 3.7440191387559807, + "grad_norm": 2.60383939743042, + "learning_rate": 4.062625050020008e-05, + "loss": 1.3526, + "step": 4695 + }, + { + "epoch": 3.7559808612440193, + "grad_norm": 2.3789987564086914, + "learning_rate": 4.0596238495398156e-05, + "loss": 1.3502, + "step": 4710 + }, + { + "epoch": 3.7679425837320575, + "grad_norm": 2.6684768199920654, + "learning_rate": 4.056622649059624e-05, + "loss": 1.4723, + "step": 4725 + }, + { + "epoch": 3.7799043062200957, + "grad_norm": 2.480144500732422, + "learning_rate": 4.053621448579432e-05, + "loss": 1.3716, + "step": 4740 + }, + { + "epoch": 3.791866028708134, + "grad_norm": 2.429513454437256, + "learning_rate": 4.05062024809924e-05, + "loss": 1.2895, + "step": 4755 + }, + { + "epoch": 3.803827751196172, + "grad_norm": 2.4947898387908936, + "learning_rate": 4.047619047619048e-05, + "loss": 1.4147, + "step": 4770 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 2.351773500442505, + "learning_rate": 4.044617847138856e-05, + "loss": 1.3712, + "step": 4785 + }, + { + "epoch": 3.827751196172249, + "grad_norm": 2.4937288761138916, + "learning_rate": 4.041616646658664e-05, + "loss": 1.3342, + "step": 4800 + }, + { + "epoch": 3.839712918660287, + "grad_norm": 3.4912281036376953, + "learning_rate": 4.0386154461784715e-05, + "loss": 1.3403, + "step": 4815 + }, + { + "epoch": 3.8516746411483256, + "grad_norm": 2.2786455154418945, + "learning_rate": 4.035614245698279e-05, + "loss": 1.335, + "step": 4830 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 2.7752015590667725, + "learning_rate": 4.032613045218088e-05, + "loss": 1.3739, + "step": 4845 + }, + { + "epoch": 3.875598086124402, + "grad_norm": 2.510052442550659, + "learning_rate": 4.0296118447378955e-05, + "loss": 1.3793, + "step": 4860 + }, + { + "epoch": 3.88755980861244, + "grad_norm": 4.657649517059326, + "learning_rate": 4.026610644257703e-05, + "loss": 1.3914, + "step": 4875 + }, + { + "epoch": 3.8995215311004783, + "grad_norm": 2.437033176422119, + "learning_rate": 4.023609443777511e-05, + "loss": 1.3793, + "step": 4890 + }, + { + "epoch": 3.911483253588517, + "grad_norm": 2.7319986820220947, + "learning_rate": 4.020608243297319e-05, + "loss": 1.437, + "step": 4905 + }, + { + "epoch": 3.923444976076555, + "grad_norm": 2.553680896759033, + "learning_rate": 4.0176070428171266e-05, + "loss": 1.3613, + "step": 4920 + }, + { + "epoch": 3.9354066985645932, + "grad_norm": 2.379471778869629, + "learning_rate": 4.014605842336935e-05, + "loss": 1.3638, + "step": 4935 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 2.8651113510131836, + "learning_rate": 4.011604641856743e-05, + "loss": 1.3265, + "step": 4950 + }, + { + "epoch": 3.9593301435406696, + "grad_norm": 2.366116762161255, + "learning_rate": 4.0086034413765513e-05, + "loss": 1.2701, + "step": 4965 + }, + { + "epoch": 3.971291866028708, + "grad_norm": 2.60257625579834, + "learning_rate": 4.005602240896359e-05, + "loss": 1.305, + "step": 4980 + }, + { + "epoch": 3.9832535885167464, + "grad_norm": 2.544235944747925, + "learning_rate": 4.002601040416167e-05, + "loss": 1.3632, + "step": 4995 + }, + { + "epoch": 3.9952153110047846, + "grad_norm": 2.541198253631592, + "learning_rate": 3.999599839935975e-05, + "loss": 1.4154, + "step": 5010 + }, + { + "epoch": 4.007177033492823, + "grad_norm": 3.7236313819885254, + "learning_rate": 3.9965986394557825e-05, + "loss": 1.1803, + "step": 5025 + }, + { + "epoch": 4.019138755980861, + "grad_norm": 3.206791877746582, + "learning_rate": 3.99359743897559e-05, + "loss": 0.9466, + "step": 5040 + }, + { + "epoch": 4.0311004784688995, + "grad_norm": 2.9792520999908447, + "learning_rate": 3.990596238495398e-05, + "loss": 0.8937, + "step": 5055 + }, + { + "epoch": 4.043062200956938, + "grad_norm": 3.3796586990356445, + "learning_rate": 3.987595038015206e-05, + "loss": 0.9352, + "step": 5070 + }, + { + "epoch": 4.055023923444976, + "grad_norm": 2.383775472640991, + "learning_rate": 3.984593837535014e-05, + "loss": 0.8506, + "step": 5085 + }, + { + "epoch": 4.0669856459330145, + "grad_norm": 2.6192071437835693, + "learning_rate": 3.981592637054822e-05, + "loss": 0.8886, + "step": 5100 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 3.329030990600586, + "learning_rate": 3.97859143657463e-05, + "loss": 0.9639, + "step": 5115 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 3.970484733581543, + "learning_rate": 3.975590236094438e-05, + "loss": 0.9112, + "step": 5130 + }, + { + "epoch": 4.1028708133971294, + "grad_norm": 3.082409381866455, + "learning_rate": 3.972589035614246e-05, + "loss": 0.8825, + "step": 5145 + }, + { + "epoch": 4.114832535885167, + "grad_norm": 2.9433696269989014, + "learning_rate": 3.969587835134054e-05, + "loss": 0.9384, + "step": 5160 + }, + { + "epoch": 4.126794258373206, + "grad_norm": 3.1707279682159424, + "learning_rate": 3.966586634653862e-05, + "loss": 0.9025, + "step": 5175 + }, + { + "epoch": 4.138755980861244, + "grad_norm": 3.336472988128662, + "learning_rate": 3.9635854341736695e-05, + "loss": 0.9228, + "step": 5190 + }, + { + "epoch": 4.150717703349282, + "grad_norm": 3.4995670318603516, + "learning_rate": 3.960584233693477e-05, + "loss": 0.9847, + "step": 5205 + }, + { + "epoch": 4.162679425837321, + "grad_norm": 3.3354713916778564, + "learning_rate": 3.957583033213286e-05, + "loss": 0.9717, + "step": 5220 + }, + { + "epoch": 4.1746411483253585, + "grad_norm": 3.2553207874298096, + "learning_rate": 3.9545818327330935e-05, + "loss": 0.9973, + "step": 5235 + }, + { + "epoch": 4.186602870813397, + "grad_norm": 3.007181406021118, + "learning_rate": 3.951580632252901e-05, + "loss": 0.919, + "step": 5250 + }, + { + "epoch": 4.198564593301436, + "grad_norm": 2.7252211570739746, + "learning_rate": 3.948579431772709e-05, + "loss": 0.8914, + "step": 5265 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 3.078258514404297, + "learning_rate": 3.945578231292517e-05, + "loss": 1.0353, + "step": 5280 + }, + { + "epoch": 4.222488038277512, + "grad_norm": 3.0154271125793457, + "learning_rate": 3.942577030812325e-05, + "loss": 0.9594, + "step": 5295 + }, + { + "epoch": 4.23444976076555, + "grad_norm": 3.7115094661712646, + "learning_rate": 3.939575830332133e-05, + "loss": 0.9248, + "step": 5310 + }, + { + "epoch": 4.246411483253588, + "grad_norm": 3.135359048843384, + "learning_rate": 3.936574629851941e-05, + "loss": 0.9918, + "step": 5325 + }, + { + "epoch": 4.258373205741627, + "grad_norm": 2.8541269302368164, + "learning_rate": 3.9335734293717494e-05, + "loss": 0.974, + "step": 5340 + }, + { + "epoch": 4.270334928229665, + "grad_norm": 3.1880204677581787, + "learning_rate": 3.930572228891557e-05, + "loss": 1.0267, + "step": 5355 + }, + { + "epoch": 4.282296650717703, + "grad_norm": 4.082556247711182, + "learning_rate": 3.927571028411365e-05, + "loss": 0.9764, + "step": 5370 + }, + { + "epoch": 4.294258373205742, + "grad_norm": 3.121758460998535, + "learning_rate": 3.924569827931173e-05, + "loss": 1.0353, + "step": 5385 + }, + { + "epoch": 4.30622009569378, + "grad_norm": 3.3821141719818115, + "learning_rate": 3.9215686274509805e-05, + "loss": 1.0219, + "step": 5400 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 3.336914300918579, + "learning_rate": 3.918567426970788e-05, + "loss": 1.0427, + "step": 5415 + }, + { + "epoch": 4.330143540669856, + "grad_norm": 3.1878132820129395, + "learning_rate": 3.915566226490596e-05, + "loss": 1.0125, + "step": 5430 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 3.5293705463409424, + "learning_rate": 3.912565026010404e-05, + "loss": 0.9655, + "step": 5445 + }, + { + "epoch": 4.354066985645933, + "grad_norm": 2.9817090034484863, + "learning_rate": 3.909563825530212e-05, + "loss": 0.9854, + "step": 5460 + }, + { + "epoch": 4.366028708133971, + "grad_norm": 3.0998663902282715, + "learning_rate": 3.90656262505002e-05, + "loss": 0.951, + "step": 5475 + }, + { + "epoch": 4.37799043062201, + "grad_norm": 3.541856050491333, + "learning_rate": 3.903561424569828e-05, + "loss": 1.0302, + "step": 5490 + }, + { + "epoch": 4.389952153110048, + "grad_norm": 3.180595636367798, + "learning_rate": 3.9005602240896364e-05, + "loss": 0.9434, + "step": 5505 + }, + { + "epoch": 4.401913875598086, + "grad_norm": 3.341787099838257, + "learning_rate": 3.897559023609444e-05, + "loss": 1.0062, + "step": 5520 + }, + { + "epoch": 4.413875598086125, + "grad_norm": 3.4445912837982178, + "learning_rate": 3.894557823129252e-05, + "loss": 0.9558, + "step": 5535 + }, + { + "epoch": 4.425837320574162, + "grad_norm": 2.839120388031006, + "learning_rate": 3.89155662264906e-05, + "loss": 1.0152, + "step": 5550 + }, + { + "epoch": 4.437799043062201, + "grad_norm": 3.482067108154297, + "learning_rate": 3.8885554221688675e-05, + "loss": 1.0234, + "step": 5565 + }, + { + "epoch": 4.44976076555024, + "grad_norm": 2.869065761566162, + "learning_rate": 3.885554221688676e-05, + "loss": 1.0045, + "step": 5580 + }, + { + "epoch": 4.461722488038277, + "grad_norm": 3.366964101791382, + "learning_rate": 3.882553021208484e-05, + "loss": 1.0086, + "step": 5595 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 3.8538451194763184, + "learning_rate": 3.8795518207282915e-05, + "loss": 1.0727, + "step": 5610 + }, + { + "epoch": 4.485645933014354, + "grad_norm": 3.1612632274627686, + "learning_rate": 3.876550620248099e-05, + "loss": 1.1, + "step": 5625 + }, + { + "epoch": 4.497607655502392, + "grad_norm": 3.4518115520477295, + "learning_rate": 3.873549419767907e-05, + "loss": 0.9788, + "step": 5640 + }, + { + "epoch": 4.509569377990431, + "grad_norm": 2.8597676753997803, + "learning_rate": 3.870548219287715e-05, + "loss": 1.0111, + "step": 5655 + }, + { + "epoch": 4.521531100478469, + "grad_norm": 3.2637124061584473, + "learning_rate": 3.8675470188075233e-05, + "loss": 0.9647, + "step": 5670 + }, + { + "epoch": 4.533492822966507, + "grad_norm": 3.176473379135132, + "learning_rate": 3.864545818327331e-05, + "loss": 1.0303, + "step": 5685 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 3.1555211544036865, + "learning_rate": 3.8615446178471396e-05, + "loss": 0.9983, + "step": 5700 + }, + { + "epoch": 4.557416267942584, + "grad_norm": 3.690917730331421, + "learning_rate": 3.8585434173669474e-05, + "loss": 1.0843, + "step": 5715 + }, + { + "epoch": 4.569377990430622, + "grad_norm": 3.4356346130371094, + "learning_rate": 3.855542216886755e-05, + "loss": 1.0957, + "step": 5730 + }, + { + "epoch": 4.58133971291866, + "grad_norm": 3.0207927227020264, + "learning_rate": 3.852541016406563e-05, + "loss": 0.9877, + "step": 5745 + }, + { + "epoch": 4.5933014354066986, + "grad_norm": 3.256007194519043, + "learning_rate": 3.849539815926371e-05, + "loss": 0.9934, + "step": 5760 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 4.417782783508301, + "learning_rate": 3.8465386154461785e-05, + "loss": 1.0612, + "step": 5775 + }, + { + "epoch": 4.617224880382775, + "grad_norm": 2.802917242050171, + "learning_rate": 3.843537414965986e-05, + "loss": 1.0714, + "step": 5790 + }, + { + "epoch": 4.6291866028708135, + "grad_norm": 2.9113950729370117, + "learning_rate": 3.840536214485794e-05, + "loss": 1.0637, + "step": 5805 + }, + { + "epoch": 4.641148325358852, + "grad_norm": 3.0320019721984863, + "learning_rate": 3.8375350140056026e-05, + "loss": 1.0407, + "step": 5820 + }, + { + "epoch": 4.65311004784689, + "grad_norm": 2.9705982208251953, + "learning_rate": 3.83453381352541e-05, + "loss": 1.118, + "step": 5835 + }, + { + "epoch": 4.6650717703349285, + "grad_norm": 3.1082069873809814, + "learning_rate": 3.831532613045218e-05, + "loss": 1.1102, + "step": 5850 + }, + { + "epoch": 4.677033492822966, + "grad_norm": 3.2098066806793213, + "learning_rate": 3.828531412565026e-05, + "loss": 1.1063, + "step": 5865 + }, + { + "epoch": 4.688995215311005, + "grad_norm": 3.18621826171875, + "learning_rate": 3.8255302120848344e-05, + "loss": 1.0772, + "step": 5880 + }, + { + "epoch": 4.7009569377990434, + "grad_norm": 3.3197460174560547, + "learning_rate": 3.822529011604642e-05, + "loss": 1.0054, + "step": 5895 + }, + { + "epoch": 4.712918660287081, + "grad_norm": 2.8657805919647217, + "learning_rate": 3.81952781112445e-05, + "loss": 1.073, + "step": 5910 + }, + { + "epoch": 4.72488038277512, + "grad_norm": 2.897557497024536, + "learning_rate": 3.816526610644258e-05, + "loss": 1.0991, + "step": 5925 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 2.881815195083618, + "learning_rate": 3.813525410164066e-05, + "loss": 1.1037, + "step": 5940 + }, + { + "epoch": 4.748803827751196, + "grad_norm": 3.131378412246704, + "learning_rate": 3.810524209683874e-05, + "loss": 1.149, + "step": 5955 + }, + { + "epoch": 4.760765550239235, + "grad_norm": 3.3418426513671875, + "learning_rate": 3.807523009203682e-05, + "loss": 1.0799, + "step": 5970 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 2.759793519973755, + "learning_rate": 3.8045218087234895e-05, + "loss": 1.1026, + "step": 5985 + }, + { + "epoch": 4.784688995215311, + "grad_norm": 3.082688808441162, + "learning_rate": 3.801520608243297e-05, + "loss": 1.0911, + "step": 6000 + }, + { + "epoch": 4.796650717703349, + "grad_norm": 3.788597583770752, + "learning_rate": 3.798519407763105e-05, + "loss": 1.1133, + "step": 6015 + }, + { + "epoch": 4.8086124401913874, + "grad_norm": 3.0609753131866455, + "learning_rate": 3.795518207282913e-05, + "loss": 1.0023, + "step": 6030 + }, + { + "epoch": 4.820574162679426, + "grad_norm": 3.5260090827941895, + "learning_rate": 3.7925170068027214e-05, + "loss": 1.105, + "step": 6045 + }, + { + "epoch": 4.832535885167464, + "grad_norm": 3.1473610401153564, + "learning_rate": 3.789515806322529e-05, + "loss": 1.1896, + "step": 6060 + }, + { + "epoch": 4.844497607655502, + "grad_norm": 3.2314066886901855, + "learning_rate": 3.7865146058423376e-05, + "loss": 1.1403, + "step": 6075 + }, + { + "epoch": 4.856459330143541, + "grad_norm": 3.1266963481903076, + "learning_rate": 3.7835134053621454e-05, + "loss": 1.123, + "step": 6090 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 3.1995601654052734, + "learning_rate": 3.780512204881953e-05, + "loss": 1.1833, + "step": 6105 + }, + { + "epoch": 4.880382775119617, + "grad_norm": 3.251296043395996, + "learning_rate": 3.777511004401761e-05, + "loss": 1.1502, + "step": 6120 + }, + { + "epoch": 4.892344497607656, + "grad_norm": 3.1420419216156006, + "learning_rate": 3.774509803921569e-05, + "loss": 1.1207, + "step": 6135 + }, + { + "epoch": 4.904306220095694, + "grad_norm": 2.992222785949707, + "learning_rate": 3.7715086034413765e-05, + "loss": 1.1347, + "step": 6150 + }, + { + "epoch": 4.916267942583732, + "grad_norm": 3.03808856010437, + "learning_rate": 3.768507402961184e-05, + "loss": 1.131, + "step": 6165 + }, + { + "epoch": 4.92822966507177, + "grad_norm": 3.9193668365478516, + "learning_rate": 3.765506202480993e-05, + "loss": 1.0749, + "step": 6180 + }, + { + "epoch": 4.940191387559809, + "grad_norm": 3.3145644664764404, + "learning_rate": 3.7625050020008006e-05, + "loss": 1.0406, + "step": 6195 + }, + { + "epoch": 4.952153110047847, + "grad_norm": 3.134812116622925, + "learning_rate": 3.7595038015206084e-05, + "loss": 1.1243, + "step": 6210 + }, + { + "epoch": 4.964114832535885, + "grad_norm": 3.403087854385376, + "learning_rate": 3.756502601040416e-05, + "loss": 1.0429, + "step": 6225 + }, + { + "epoch": 4.976076555023924, + "grad_norm": 3.0964858531951904, + "learning_rate": 3.753501400560224e-05, + "loss": 1.112, + "step": 6240 + }, + { + "epoch": 4.988038277511961, + "grad_norm": 4.416729927062988, + "learning_rate": 3.7505002000800324e-05, + "loss": 1.1144, + "step": 6255 + }, + { + "epoch": 5.0, + "grad_norm": 4.442926406860352, + "learning_rate": 3.74749899959984e-05, + "loss": 1.0938, + "step": 6270 + }, + { + "epoch": 5.011961722488039, + "grad_norm": 3.0728983879089355, + "learning_rate": 3.744497799119648e-05, + "loss": 0.6816, + "step": 6285 + }, + { + "epoch": 5.023923444976076, + "grad_norm": 3.4252402782440186, + "learning_rate": 3.7414965986394564e-05, + "loss": 0.7263, + "step": 6300 + }, + { + "epoch": 5.035885167464115, + "grad_norm": 4.501566410064697, + "learning_rate": 3.738495398159264e-05, + "loss": 0.6744, + "step": 6315 + }, + { + "epoch": 5.047846889952153, + "grad_norm": 3.8966481685638428, + "learning_rate": 3.735494197679072e-05, + "loss": 0.645, + "step": 6330 + }, + { + "epoch": 5.059808612440191, + "grad_norm": 3.794740915298462, + "learning_rate": 3.73249299719888e-05, + "loss": 0.6894, + "step": 6345 + }, + { + "epoch": 5.07177033492823, + "grad_norm": 3.1294026374816895, + "learning_rate": 3.7294917967186876e-05, + "loss": 0.6101, + "step": 6360 + }, + { + "epoch": 5.083732057416268, + "grad_norm": 3.1900405883789062, + "learning_rate": 3.7264905962384953e-05, + "loss": 0.6731, + "step": 6375 + }, + { + "epoch": 5.095693779904306, + "grad_norm": 3.9348907470703125, + "learning_rate": 3.723489395758303e-05, + "loss": 0.7257, + "step": 6390 + }, + { + "epoch": 5.107655502392345, + "grad_norm": 3.5655553340911865, + "learning_rate": 3.720488195278111e-05, + "loss": 0.6219, + "step": 6405 + }, + { + "epoch": 5.119617224880383, + "grad_norm": 3.678565740585327, + "learning_rate": 3.7174869947979194e-05, + "loss": 0.6896, + "step": 6420 + }, + { + "epoch": 5.131578947368421, + "grad_norm": 3.041287422180176, + "learning_rate": 3.714485794317727e-05, + "loss": 0.7084, + "step": 6435 + }, + { + "epoch": 5.143540669856459, + "grad_norm": 3.382601737976074, + "learning_rate": 3.7114845938375356e-05, + "loss": 0.6298, + "step": 6450 + }, + { + "epoch": 5.155502392344498, + "grad_norm": 3.4510035514831543, + "learning_rate": 3.7084833933573434e-05, + "loss": 0.6882, + "step": 6465 + }, + { + "epoch": 5.167464114832536, + "grad_norm": 4.204371929168701, + "learning_rate": 3.705482192877151e-05, + "loss": 0.7478, + "step": 6480 + }, + { + "epoch": 5.179425837320574, + "grad_norm": 3.669754981994629, + "learning_rate": 3.702480992396959e-05, + "loss": 0.7159, + "step": 6495 + }, + { + "epoch": 5.1913875598086126, + "grad_norm": 3.454606056213379, + "learning_rate": 3.699479791916767e-05, + "loss": 0.7049, + "step": 6510 + }, + { + "epoch": 5.203349282296651, + "grad_norm": 3.548112154006958, + "learning_rate": 3.6964785914365746e-05, + "loss": 0.7279, + "step": 6525 + }, + { + "epoch": 5.215311004784689, + "grad_norm": 4.184609413146973, + "learning_rate": 3.693477390956382e-05, + "loss": 0.7747, + "step": 6540 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 3.418808937072754, + "learning_rate": 3.690476190476191e-05, + "loss": 0.7833, + "step": 6555 + }, + { + "epoch": 5.239234449760765, + "grad_norm": 3.444638729095459, + "learning_rate": 3.6874749899959986e-05, + "loss": 0.81, + "step": 6570 + }, + { + "epoch": 5.251196172248804, + "grad_norm": 3.960958242416382, + "learning_rate": 3.6844737895158064e-05, + "loss": 0.6915, + "step": 6585 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 3.772879123687744, + "learning_rate": 3.681472589035614e-05, + "loss": 0.7157, + "step": 6600 + }, + { + "epoch": 5.27511961722488, + "grad_norm": 4.02428674697876, + "learning_rate": 3.6784713885554226e-05, + "loss": 0.7383, + "step": 6615 + }, + { + "epoch": 5.287081339712919, + "grad_norm": 3.4093050956726074, + "learning_rate": 3.6754701880752304e-05, + "loss": 0.7163, + "step": 6630 + }, + { + "epoch": 5.2990430622009566, + "grad_norm": 3.6924562454223633, + "learning_rate": 3.672468987595038e-05, + "loss": 0.7022, + "step": 6645 + }, + { + "epoch": 5.311004784688995, + "grad_norm": 3.356632947921753, + "learning_rate": 3.669467787114846e-05, + "loss": 0.737, + "step": 6660 + }, + { + "epoch": 5.322966507177034, + "grad_norm": 3.501210927963257, + "learning_rate": 3.6664665866346544e-05, + "loss": 0.7474, + "step": 6675 + }, + { + "epoch": 5.3349282296650715, + "grad_norm": 3.852551221847534, + "learning_rate": 3.663465386154462e-05, + "loss": 0.779, + "step": 6690 + }, + { + "epoch": 5.34688995215311, + "grad_norm": 3.4461312294006348, + "learning_rate": 3.66046418567427e-05, + "loss": 0.6816, + "step": 6705 + }, + { + "epoch": 5.358851674641148, + "grad_norm": 2.9088375568389893, + "learning_rate": 3.657462985194078e-05, + "loss": 0.7619, + "step": 6720 + }, + { + "epoch": 5.3708133971291865, + "grad_norm": 3.4227547645568848, + "learning_rate": 3.6544617847138856e-05, + "loss": 0.7646, + "step": 6735 + }, + { + "epoch": 5.382775119617225, + "grad_norm": 4.553009986877441, + "learning_rate": 3.6514605842336934e-05, + "loss": 0.7907, + "step": 6750 + }, + { + "epoch": 5.394736842105263, + "grad_norm": 3.965406656265259, + "learning_rate": 3.648459383753501e-05, + "loss": 0.7901, + "step": 6765 + }, + { + "epoch": 5.4066985645933014, + "grad_norm": 3.7064077854156494, + "learning_rate": 3.645458183273309e-05, + "loss": 0.758, + "step": 6780 + }, + { + "epoch": 5.41866028708134, + "grad_norm": 3.4479455947875977, + "learning_rate": 3.6424569827931174e-05, + "loss": 0.7439, + "step": 6795 + }, + { + "epoch": 5.430622009569378, + "grad_norm": 3.9599294662475586, + "learning_rate": 3.639455782312925e-05, + "loss": 0.8257, + "step": 6810 + }, + { + "epoch": 5.442583732057416, + "grad_norm": 3.7063801288604736, + "learning_rate": 3.6364545818327336e-05, + "loss": 0.7717, + "step": 6825 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 4.6955060958862305, + "learning_rate": 3.6334533813525414e-05, + "loss": 0.7575, + "step": 6840 + }, + { + "epoch": 5.466507177033493, + "grad_norm": 3.915292501449585, + "learning_rate": 3.630452180872349e-05, + "loss": 0.7989, + "step": 6855 + }, + { + "epoch": 5.478468899521531, + "grad_norm": 3.974541664123535, + "learning_rate": 3.627450980392157e-05, + "loss": 0.8685, + "step": 6870 + }, + { + "epoch": 5.490430622009569, + "grad_norm": 3.9493520259857178, + "learning_rate": 3.624449779911965e-05, + "loss": 0.8111, + "step": 6885 + }, + { + "epoch": 5.502392344497608, + "grad_norm": 3.7138257026672363, + "learning_rate": 3.6214485794317726e-05, + "loss": 0.8086, + "step": 6900 + }, + { + "epoch": 5.514354066985646, + "grad_norm": 3.838562250137329, + "learning_rate": 3.618447378951581e-05, + "loss": 0.8, + "step": 6915 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 3.5369865894317627, + "learning_rate": 3.615446178471389e-05, + "loss": 0.7449, + "step": 6930 + }, + { + "epoch": 5.538277511961723, + "grad_norm": 3.607936382293701, + "learning_rate": 3.6124449779911966e-05, + "loss": 0.7974, + "step": 6945 + }, + { + "epoch": 5.55023923444976, + "grad_norm": 4.021537780761719, + "learning_rate": 3.6094437775110044e-05, + "loss": 0.6972, + "step": 6960 + }, + { + "epoch": 5.562200956937799, + "grad_norm": 4.086754322052002, + "learning_rate": 3.606442577030812e-05, + "loss": 0.8349, + "step": 6975 + }, + { + "epoch": 5.574162679425838, + "grad_norm": 3.385819673538208, + "learning_rate": 3.6034413765506206e-05, + "loss": 0.8016, + "step": 6990 + }, + { + "epoch": 5.586124401913875, + "grad_norm": 3.3851637840270996, + "learning_rate": 3.6004401760704284e-05, + "loss": 0.8013, + "step": 7005 + }, + { + "epoch": 5.598086124401914, + "grad_norm": 3.6127657890319824, + "learning_rate": 3.597438975590236e-05, + "loss": 0.889, + "step": 7020 + }, + { + "epoch": 5.610047846889952, + "grad_norm": 3.7455716133117676, + "learning_rate": 3.594437775110045e-05, + "loss": 0.8244, + "step": 7035 + }, + { + "epoch": 5.62200956937799, + "grad_norm": 3.5797011852264404, + "learning_rate": 3.5914365746298525e-05, + "loss": 0.8794, + "step": 7050 + }, + { + "epoch": 5.633971291866029, + "grad_norm": 3.6951963901519775, + "learning_rate": 3.58843537414966e-05, + "loss": 0.8377, + "step": 7065 + }, + { + "epoch": 5.645933014354067, + "grad_norm": 4.805546283721924, + "learning_rate": 3.585434173669468e-05, + "loss": 0.7658, + "step": 7080 + }, + { + "epoch": 5.657894736842105, + "grad_norm": 3.3476104736328125, + "learning_rate": 3.582432973189276e-05, + "loss": 0.8535, + "step": 7095 + }, + { + "epoch": 5.669856459330144, + "grad_norm": 3.7429189682006836, + "learning_rate": 3.5794317727090836e-05, + "loss": 0.7698, + "step": 7110 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 3.6189913749694824, + "learning_rate": 3.5764305722288914e-05, + "loss": 0.8843, + "step": 7125 + }, + { + "epoch": 5.69377990430622, + "grad_norm": 3.614164113998413, + "learning_rate": 3.573429371748699e-05, + "loss": 0.7855, + "step": 7140 + }, + { + "epoch": 5.705741626794258, + "grad_norm": 3.9962081909179688, + "learning_rate": 3.5704281712685076e-05, + "loss": 0.8501, + "step": 7155 + }, + { + "epoch": 5.717703349282297, + "grad_norm": 3.6668338775634766, + "learning_rate": 3.5674269707883154e-05, + "loss": 0.7866, + "step": 7170 + }, + { + "epoch": 5.729665071770335, + "grad_norm": 3.9314942359924316, + "learning_rate": 3.564425770308123e-05, + "loss": 0.8003, + "step": 7185 + }, + { + "epoch": 5.741626794258373, + "grad_norm": 4.32262659072876, + "learning_rate": 3.5614245698279317e-05, + "loss": 0.8137, + "step": 7200 + }, + { + "epoch": 5.753588516746412, + "grad_norm": 5.040790557861328, + "learning_rate": 3.5584233693477394e-05, + "loss": 0.8354, + "step": 7215 + }, + { + "epoch": 5.76555023923445, + "grad_norm": 3.7755401134490967, + "learning_rate": 3.555422168867547e-05, + "loss": 0.8574, + "step": 7230 + }, + { + "epoch": 5.777511961722488, + "grad_norm": 3.8143343925476074, + "learning_rate": 3.552420968387355e-05, + "loss": 0.8091, + "step": 7245 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 3.4861605167388916, + "learning_rate": 3.549419767907163e-05, + "loss": 0.8304, + "step": 7260 + }, + { + "epoch": 5.801435406698564, + "grad_norm": 3.5389742851257324, + "learning_rate": 3.546418567426971e-05, + "loss": 0.8676, + "step": 7275 + }, + { + "epoch": 5.813397129186603, + "grad_norm": 3.465071439743042, + "learning_rate": 3.543417366946779e-05, + "loss": 0.8296, + "step": 7290 + }, + { + "epoch": 5.8253588516746415, + "grad_norm": 3.9034931659698486, + "learning_rate": 3.540416166466587e-05, + "loss": 0.8398, + "step": 7305 + }, + { + "epoch": 5.837320574162679, + "grad_norm": 3.817934989929199, + "learning_rate": 3.5374149659863946e-05, + "loss": 0.8602, + "step": 7320 + }, + { + "epoch": 5.849282296650718, + "grad_norm": 4.706762790679932, + "learning_rate": 3.5344137655062024e-05, + "loss": 0.8684, + "step": 7335 + }, + { + "epoch": 5.861244019138756, + "grad_norm": 3.3008809089660645, + "learning_rate": 3.53141256502601e-05, + "loss": 0.8182, + "step": 7350 + }, + { + "epoch": 5.873205741626794, + "grad_norm": 3.5898377895355225, + "learning_rate": 3.5284113645458186e-05, + "loss": 0.8512, + "step": 7365 + }, + { + "epoch": 5.885167464114833, + "grad_norm": 3.8670029640197754, + "learning_rate": 3.5254101640656264e-05, + "loss": 0.8412, + "step": 7380 + }, + { + "epoch": 5.8971291866028706, + "grad_norm": 3.6071064472198486, + "learning_rate": 3.522408963585435e-05, + "loss": 0.8578, + "step": 7395 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 4.674183368682861, + "learning_rate": 3.519407763105243e-05, + "loss": 0.8554, + "step": 7410 + }, + { + "epoch": 5.921052631578947, + "grad_norm": 3.45503306388855, + "learning_rate": 3.5164065626250505e-05, + "loss": 0.9224, + "step": 7425 + }, + { + "epoch": 5.9330143540669855, + "grad_norm": 3.4863317012786865, + "learning_rate": 3.513405362144858e-05, + "loss": 0.8177, + "step": 7440 + }, + { + "epoch": 5.944976076555024, + "grad_norm": 3.9804773330688477, + "learning_rate": 3.510404161664666e-05, + "loss": 0.8379, + "step": 7455 + }, + { + "epoch": 5.956937799043062, + "grad_norm": 3.6782078742980957, + "learning_rate": 3.507402961184474e-05, + "loss": 0.8634, + "step": 7470 + }, + { + "epoch": 5.9688995215311005, + "grad_norm": 3.7234580516815186, + "learning_rate": 3.5044017607042816e-05, + "loss": 0.9142, + "step": 7485 + }, + { + "epoch": 5.980861244019139, + "grad_norm": 3.6034648418426514, + "learning_rate": 3.5014005602240894e-05, + "loss": 0.8777, + "step": 7500 + }, + { + "epoch": 5.992822966507177, + "grad_norm": 3.407047748565674, + "learning_rate": 3.498399359743898e-05, + "loss": 0.8191, + "step": 7515 + }, + { + "epoch": 6.0047846889952154, + "grad_norm": 4.2239508628845215, + "learning_rate": 3.4953981592637056e-05, + "loss": 0.7896, + "step": 7530 + }, + { + "epoch": 6.016746411483253, + "grad_norm": 2.516592502593994, + "learning_rate": 3.4923969587835134e-05, + "loss": 0.5012, + "step": 7545 + }, + { + "epoch": 6.028708133971292, + "grad_norm": 3.366042375564575, + "learning_rate": 3.489395758303321e-05, + "loss": 0.4626, + "step": 7560 + }, + { + "epoch": 6.04066985645933, + "grad_norm": 4.176771640777588, + "learning_rate": 3.48639455782313e-05, + "loss": 0.4813, + "step": 7575 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 3.807236671447754, + "learning_rate": 3.4833933573429375e-05, + "loss": 0.4928, + "step": 7590 + }, + { + "epoch": 6.064593301435407, + "grad_norm": 3.5176925659179688, + "learning_rate": 3.480392156862745e-05, + "loss": 0.4474, + "step": 7605 + }, + { + "epoch": 6.076555023923445, + "grad_norm": 3.860903739929199, + "learning_rate": 3.477390956382553e-05, + "loss": 0.5181, + "step": 7620 + }, + { + "epoch": 6.088516746411483, + "grad_norm": 3.883094072341919, + "learning_rate": 3.4743897559023615e-05, + "loss": 0.497, + "step": 7635 + }, + { + "epoch": 6.100478468899522, + "grad_norm": 3.299124240875244, + "learning_rate": 3.471388555422169e-05, + "loss": 0.5023, + "step": 7650 + }, + { + "epoch": 6.1124401913875595, + "grad_norm": 3.780906915664673, + "learning_rate": 3.468387354941977e-05, + "loss": 0.4938, + "step": 7665 + }, + { + "epoch": 6.124401913875598, + "grad_norm": 3.906473159790039, + "learning_rate": 3.465386154461785e-05, + "loss": 0.52, + "step": 7680 + }, + { + "epoch": 6.136363636363637, + "grad_norm": 3.7031853199005127, + "learning_rate": 3.4623849539815926e-05, + "loss": 0.4922, + "step": 7695 + }, + { + "epoch": 6.148325358851674, + "grad_norm": 4.119719505310059, + "learning_rate": 3.4593837535014004e-05, + "loss": 0.4726, + "step": 7710 + }, + { + "epoch": 6.160287081339713, + "grad_norm": 3.637122869491577, + "learning_rate": 3.456382553021208e-05, + "loss": 0.4522, + "step": 7725 + }, + { + "epoch": 6.172248803827751, + "grad_norm": 3.6455516815185547, + "learning_rate": 3.453381352541017e-05, + "loss": 0.497, + "step": 7740 + }, + { + "epoch": 6.184210526315789, + "grad_norm": 3.90136981010437, + "learning_rate": 3.4503801520608245e-05, + "loss": 0.5286, + "step": 7755 + }, + { + "epoch": 6.196172248803828, + "grad_norm": 3.776540994644165, + "learning_rate": 3.447378951580633e-05, + "loss": 0.5407, + "step": 7770 + }, + { + "epoch": 6.208133971291866, + "grad_norm": 4.160264015197754, + "learning_rate": 3.444377751100441e-05, + "loss": 0.4874, + "step": 7785 + }, + { + "epoch": 6.220095693779904, + "grad_norm": 3.5366413593292236, + "learning_rate": 3.4413765506202485e-05, + "loss": 0.4708, + "step": 7800 + }, + { + "epoch": 6.232057416267943, + "grad_norm": 3.604766368865967, + "learning_rate": 3.438375350140056e-05, + "loss": 0.5326, + "step": 7815 + }, + { + "epoch": 6.244019138755981, + "grad_norm": 3.5916519165039062, + "learning_rate": 3.435374149659864e-05, + "loss": 0.5411, + "step": 7830 + }, + { + "epoch": 6.255980861244019, + "grad_norm": 3.626094102859497, + "learning_rate": 3.432372949179672e-05, + "loss": 0.5142, + "step": 7845 + }, + { + "epoch": 6.267942583732057, + "grad_norm": 4.346883296966553, + "learning_rate": 3.4293717486994796e-05, + "loss": 0.5135, + "step": 7860 + }, + { + "epoch": 6.279904306220096, + "grad_norm": 4.123327732086182, + "learning_rate": 3.426370548219288e-05, + "loss": 0.5403, + "step": 7875 + }, + { + "epoch": 6.291866028708134, + "grad_norm": 4.1574482917785645, + "learning_rate": 3.423369347739096e-05, + "loss": 0.5373, + "step": 7890 + }, + { + "epoch": 6.303827751196172, + "grad_norm": 3.9462273120880127, + "learning_rate": 3.4203681472589037e-05, + "loss": 0.5223, + "step": 7905 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 4.356924533843994, + "learning_rate": 3.4173669467787114e-05, + "loss": 0.5857, + "step": 7920 + }, + { + "epoch": 6.327751196172249, + "grad_norm": 3.8217930793762207, + "learning_rate": 3.41436574629852e-05, + "loss": 0.5272, + "step": 7935 + }, + { + "epoch": 6.339712918660287, + "grad_norm": 3.689328908920288, + "learning_rate": 3.411364545818328e-05, + "loss": 0.5162, + "step": 7950 + }, + { + "epoch": 6.351674641148326, + "grad_norm": 3.6850223541259766, + "learning_rate": 3.4083633453381355e-05, + "loss": 0.582, + "step": 7965 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 4.063047885894775, + "learning_rate": 3.405362144857943e-05, + "loss": 0.5642, + "step": 7980 + }, + { + "epoch": 6.375598086124402, + "grad_norm": 3.6065573692321777, + "learning_rate": 3.402360944377751e-05, + "loss": 0.5225, + "step": 7995 + }, + { + "epoch": 6.3875598086124405, + "grad_norm": 4.188450336456299, + "learning_rate": 3.3993597438975595e-05, + "loss": 0.5911, + "step": 8010 + }, + { + "epoch": 6.399521531100478, + "grad_norm": 3.9791886806488037, + "learning_rate": 3.396358543417367e-05, + "loss": 0.5178, + "step": 8025 + }, + { + "epoch": 6.411483253588517, + "grad_norm": 4.381253719329834, + "learning_rate": 3.393357342937175e-05, + "loss": 0.5344, + "step": 8040 + }, + { + "epoch": 6.423444976076555, + "grad_norm": 3.810927152633667, + "learning_rate": 3.390356142456983e-05, + "loss": 0.4915, + "step": 8055 + }, + { + "epoch": 6.435406698564593, + "grad_norm": 4.254152774810791, + "learning_rate": 3.3873549419767907e-05, + "loss": 0.601, + "step": 8070 + }, + { + "epoch": 6.447368421052632, + "grad_norm": 4.086537837982178, + "learning_rate": 3.3843537414965984e-05, + "loss": 0.5944, + "step": 8085 + }, + { + "epoch": 6.45933014354067, + "grad_norm": 4.881983280181885, + "learning_rate": 3.381352541016406e-05, + "loss": 0.5789, + "step": 8100 + }, + { + "epoch": 6.471291866028708, + "grad_norm": 4.15606689453125, + "learning_rate": 3.378351340536215e-05, + "loss": 0.5397, + "step": 8115 + }, + { + "epoch": 6.483253588516747, + "grad_norm": 3.6769986152648926, + "learning_rate": 3.3753501400560225e-05, + "loss": 0.5449, + "step": 8130 + }, + { + "epoch": 6.4952153110047846, + "grad_norm": 3.846041440963745, + "learning_rate": 3.372348939575831e-05, + "loss": 0.5555, + "step": 8145 + }, + { + "epoch": 6.507177033492823, + "grad_norm": 4.353069305419922, + "learning_rate": 3.369347739095639e-05, + "loss": 0.608, + "step": 8160 + }, + { + "epoch": 6.519138755980861, + "grad_norm": 4.087284564971924, + "learning_rate": 3.3663465386154465e-05, + "loss": 0.5741, + "step": 8175 + }, + { + "epoch": 6.5311004784688995, + "grad_norm": 4.356995582580566, + "learning_rate": 3.363345338135254e-05, + "loss": 0.6432, + "step": 8190 + }, + { + "epoch": 6.543062200956938, + "grad_norm": 3.855937957763672, + "learning_rate": 3.360344137655062e-05, + "loss": 0.5783, + "step": 8205 + }, + { + "epoch": 6.555023923444976, + "grad_norm": 3.820133686065674, + "learning_rate": 3.35734293717487e-05, + "loss": 0.5814, + "step": 8220 + }, + { + "epoch": 6.5669856459330145, + "grad_norm": 4.873568058013916, + "learning_rate": 3.3543417366946776e-05, + "loss": 0.6264, + "step": 8235 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 3.8670310974121094, + "learning_rate": 3.351340536214486e-05, + "loss": 0.6271, + "step": 8250 + }, + { + "epoch": 6.590909090909091, + "grad_norm": 4.838265895843506, + "learning_rate": 3.348339335734294e-05, + "loss": 0.6346, + "step": 8265 + }, + { + "epoch": 6.6028708133971294, + "grad_norm": 4.0044403076171875, + "learning_rate": 3.345338135254102e-05, + "loss": 0.5724, + "step": 8280 + }, + { + "epoch": 6.614832535885167, + "grad_norm": 3.866497039794922, + "learning_rate": 3.3423369347739095e-05, + "loss": 0.6172, + "step": 8295 + }, + { + "epoch": 6.626794258373206, + "grad_norm": 4.213998317718506, + "learning_rate": 3.339335734293718e-05, + "loss": 0.6246, + "step": 8310 + }, + { + "epoch": 6.638755980861244, + "grad_norm": 4.162674427032471, + "learning_rate": 3.336334533813526e-05, + "loss": 0.6301, + "step": 8325 + }, + { + "epoch": 6.650717703349282, + "grad_norm": 4.032559394836426, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.6557, + "step": 8340 + }, + { + "epoch": 6.662679425837321, + "grad_norm": 4.416426658630371, + "learning_rate": 3.330332132853141e-05, + "loss": 0.6592, + "step": 8355 + }, + { + "epoch": 6.6746411483253585, + "grad_norm": 4.758429527282715, + "learning_rate": 3.32733093237295e-05, + "loss": 0.6753, + "step": 8370 + }, + { + "epoch": 6.686602870813397, + "grad_norm": 4.513240337371826, + "learning_rate": 3.3243297318927575e-05, + "loss": 0.5713, + "step": 8385 + }, + { + "epoch": 6.698564593301436, + "grad_norm": 4.007817268371582, + "learning_rate": 3.321328531412565e-05, + "loss": 0.596, + "step": 8400 + }, + { + "epoch": 6.7105263157894735, + "grad_norm": 4.17065954208374, + "learning_rate": 3.318327330932373e-05, + "loss": 0.5975, + "step": 8415 + }, + { + "epoch": 6.722488038277512, + "grad_norm": 3.68249773979187, + "learning_rate": 3.315326130452181e-05, + "loss": 0.563, + "step": 8430 + }, + { + "epoch": 6.73444976076555, + "grad_norm": 4.292535781860352, + "learning_rate": 3.312324929971989e-05, + "loss": 0.6413, + "step": 8445 + }, + { + "epoch": 6.746411483253588, + "grad_norm": 4.380221843719482, + "learning_rate": 3.3093237294917965e-05, + "loss": 0.5637, + "step": 8460 + }, + { + "epoch": 6.758373205741627, + "grad_norm": 3.799266815185547, + "learning_rate": 3.306322529011604e-05, + "loss": 0.6653, + "step": 8475 + }, + { + "epoch": 6.770334928229665, + "grad_norm": 4.119513034820557, + "learning_rate": 3.303321328531413e-05, + "loss": 0.6436, + "step": 8490 + }, + { + "epoch": 6.782296650717703, + "grad_norm": 4.17624044418335, + "learning_rate": 3.3003201280512205e-05, + "loss": 0.6778, + "step": 8505 + }, + { + "epoch": 6.794258373205742, + "grad_norm": 4.3085761070251465, + "learning_rate": 3.297318927571029e-05, + "loss": 0.6298, + "step": 8520 + }, + { + "epoch": 6.80622009569378, + "grad_norm": 3.8202457427978516, + "learning_rate": 3.294317727090837e-05, + "loss": 0.5924, + "step": 8535 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 4.103767395019531, + "learning_rate": 3.2913165266106445e-05, + "loss": 0.5925, + "step": 8550 + }, + { + "epoch": 6.830143540669856, + "grad_norm": 4.139376640319824, + "learning_rate": 3.288315326130452e-05, + "loss": 0.6656, + "step": 8565 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 4.039120674133301, + "learning_rate": 3.28531412565026e-05, + "loss": 0.6807, + "step": 8580 + }, + { + "epoch": 6.854066985645933, + "grad_norm": 4.153085708618164, + "learning_rate": 3.282312925170068e-05, + "loss": 0.6194, + "step": 8595 + }, + { + "epoch": 6.866028708133971, + "grad_norm": 4.125678539276123, + "learning_rate": 3.279311724689876e-05, + "loss": 0.6333, + "step": 8610 + }, + { + "epoch": 6.87799043062201, + "grad_norm": 4.25078821182251, + "learning_rate": 3.276310524209684e-05, + "loss": 0.6895, + "step": 8625 + }, + { + "epoch": 6.889952153110048, + "grad_norm": 3.782094955444336, + "learning_rate": 3.273309323729492e-05, + "loss": 0.6789, + "step": 8640 + }, + { + "epoch": 6.901913875598086, + "grad_norm": 4.739928245544434, + "learning_rate": 3.2703081232493e-05, + "loss": 0.6427, + "step": 8655 + }, + { + "epoch": 6.913875598086125, + "grad_norm": 4.479592800140381, + "learning_rate": 3.2673069227691075e-05, + "loss": 0.6309, + "step": 8670 + }, + { + "epoch": 6.925837320574162, + "grad_norm": 4.018124580383301, + "learning_rate": 3.264305722288916e-05, + "loss": 0.6828, + "step": 8685 + }, + { + "epoch": 6.937799043062201, + "grad_norm": 3.8505430221557617, + "learning_rate": 3.261304521808724e-05, + "loss": 0.6361, + "step": 8700 + }, + { + "epoch": 6.94976076555024, + "grad_norm": 3.596605062484741, + "learning_rate": 3.2583033213285315e-05, + "loss": 0.6297, + "step": 8715 + }, + { + "epoch": 6.961722488038277, + "grad_norm": 4.318160533905029, + "learning_rate": 3.25530212084834e-05, + "loss": 0.6403, + "step": 8730 + }, + { + "epoch": 6.973684210526316, + "grad_norm": 4.0697431564331055, + "learning_rate": 3.252300920368148e-05, + "loss": 0.6154, + "step": 8745 + }, + { + "epoch": 6.985645933014354, + "grad_norm": 4.358625411987305, + "learning_rate": 3.2492997198879555e-05, + "loss": 0.6782, + "step": 8760 + }, + { + "epoch": 6.997607655502392, + "grad_norm": 4.264054298400879, + "learning_rate": 3.246298519407763e-05, + "loss": 0.6498, + "step": 8775 + }, + { + "epoch": 7.009569377990431, + "grad_norm": 3.4622254371643066, + "learning_rate": 3.243297318927571e-05, + "loss": 0.4423, + "step": 8790 + }, + { + "epoch": 7.021531100478469, + "grad_norm": 4.359318733215332, + "learning_rate": 3.240296118447379e-05, + "loss": 0.2942, + "step": 8805 + }, + { + "epoch": 7.033492822966507, + "grad_norm": 4.986384391784668, + "learning_rate": 3.237294917967187e-05, + "loss": 0.3187, + "step": 8820 + }, + { + "epoch": 7.045454545454546, + "grad_norm": 4.1987104415893555, + "learning_rate": 3.2342937174869945e-05, + "loss": 0.3306, + "step": 8835 + }, + { + "epoch": 7.057416267942584, + "grad_norm": 4.6675028800964355, + "learning_rate": 3.231292517006803e-05, + "loss": 0.349, + "step": 8850 + }, + { + "epoch": 7.069377990430622, + "grad_norm": 4.357269763946533, + "learning_rate": 3.228291316526611e-05, + "loss": 0.3153, + "step": 8865 + }, + { + "epoch": 7.08133971291866, + "grad_norm": 3.168750762939453, + "learning_rate": 3.225290116046419e-05, + "loss": 0.3223, + "step": 8880 + }, + { + "epoch": 7.0933014354066986, + "grad_norm": 4.13469934463501, + "learning_rate": 3.222288915566227e-05, + "loss": 0.3289, + "step": 8895 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 3.306483507156372, + "learning_rate": 3.219287715086035e-05, + "loss": 0.3357, + "step": 8910 + }, + { + "epoch": 7.117224880382775, + "grad_norm": 3.830190896987915, + "learning_rate": 3.2162865146058425e-05, + "loss": 0.3425, + "step": 8925 + }, + { + "epoch": 7.1291866028708135, + "grad_norm": 3.848161220550537, + "learning_rate": 3.21328531412565e-05, + "loss": 0.3412, + "step": 8940 + }, + { + "epoch": 7.141148325358851, + "grad_norm": 4.058409214019775, + "learning_rate": 3.210284113645458e-05, + "loss": 0.3333, + "step": 8955 + }, + { + "epoch": 7.15311004784689, + "grad_norm": 3.780856132507324, + "learning_rate": 3.2072829131652666e-05, + "loss": 0.3205, + "step": 8970 + }, + { + "epoch": 7.1650717703349285, + "grad_norm": 3.9334750175476074, + "learning_rate": 3.2042817126850744e-05, + "loss": 0.3546, + "step": 8985 + }, + { + "epoch": 7.177033492822966, + "grad_norm": 4.092038631439209, + "learning_rate": 3.201280512204882e-05, + "loss": 0.3295, + "step": 9000 + }, + { + "epoch": 7.188995215311005, + "grad_norm": 4.35646390914917, + "learning_rate": 3.19827931172469e-05, + "loss": 0.3593, + "step": 9015 + }, + { + "epoch": 7.2009569377990434, + "grad_norm": 3.8881773948669434, + "learning_rate": 3.195278111244498e-05, + "loss": 0.3541, + "step": 9030 + }, + { + "epoch": 7.212918660287081, + "grad_norm": 4.399089336395264, + "learning_rate": 3.1922769107643055e-05, + "loss": 0.3271, + "step": 9045 + }, + { + "epoch": 7.22488038277512, + "grad_norm": 4.376395225524902, + "learning_rate": 3.189275710284114e-05, + "loss": 0.4131, + "step": 9060 + }, + { + "epoch": 7.2368421052631575, + "grad_norm": 4.1286468505859375, + "learning_rate": 3.186274509803922e-05, + "loss": 0.3824, + "step": 9075 + }, + { + "epoch": 7.248803827751196, + "grad_norm": 4.728172302246094, + "learning_rate": 3.18327330932373e-05, + "loss": 0.3706, + "step": 9090 + }, + { + "epoch": 7.260765550239235, + "grad_norm": 3.76225209236145, + "learning_rate": 3.180272108843538e-05, + "loss": 0.3568, + "step": 9105 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 3.939035415649414, + "learning_rate": 3.177270908363346e-05, + "loss": 0.4092, + "step": 9120 + }, + { + "epoch": 7.284688995215311, + "grad_norm": 4.537744045257568, + "learning_rate": 3.1742697078831536e-05, + "loss": 0.3606, + "step": 9135 + }, + { + "epoch": 7.296650717703349, + "grad_norm": 4.309103965759277, + "learning_rate": 3.1712685074029613e-05, + "loss": 0.406, + "step": 9150 + }, + { + "epoch": 7.3086124401913874, + "grad_norm": 4.298764228820801, + "learning_rate": 3.168267306922769e-05, + "loss": 0.373, + "step": 9165 + }, + { + "epoch": 7.320574162679426, + "grad_norm": 4.205005645751953, + "learning_rate": 3.165266106442577e-05, + "loss": 0.3567, + "step": 9180 + }, + { + "epoch": 7.332535885167464, + "grad_norm": 4.051873207092285, + "learning_rate": 3.162264905962385e-05, + "loss": 0.377, + "step": 9195 + }, + { + "epoch": 7.344497607655502, + "grad_norm": 4.320316314697266, + "learning_rate": 3.159263705482193e-05, + "loss": 0.4071, + "step": 9210 + }, + { + "epoch": 7.356459330143541, + "grad_norm": 4.617473125457764, + "learning_rate": 3.156262505002001e-05, + "loss": 0.4048, + "step": 9225 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 4.013522148132324, + "learning_rate": 3.153261304521809e-05, + "loss": 0.3792, + "step": 9240 + }, + { + "epoch": 7.380382775119617, + "grad_norm": 4.339334487915039, + "learning_rate": 3.150260104041617e-05, + "loss": 0.4172, + "step": 9255 + }, + { + "epoch": 7.392344497607655, + "grad_norm": 4.555285453796387, + "learning_rate": 3.147258903561425e-05, + "loss": 0.397, + "step": 9270 + }, + { + "epoch": 7.404306220095694, + "grad_norm": 3.832693576812744, + "learning_rate": 3.144257703081233e-05, + "loss": 0.3784, + "step": 9285 + }, + { + "epoch": 7.416267942583732, + "grad_norm": 4.14719295501709, + "learning_rate": 3.1412565026010406e-05, + "loss": 0.3979, + "step": 9300 + }, + { + "epoch": 7.42822966507177, + "grad_norm": 3.914750337600708, + "learning_rate": 3.138255302120848e-05, + "loss": 0.3848, + "step": 9315 + }, + { + "epoch": 7.440191387559809, + "grad_norm": 4.9536967277526855, + "learning_rate": 3.135254101640656e-05, + "loss": 0.4144, + "step": 9330 + }, + { + "epoch": 7.452153110047847, + "grad_norm": 4.35673713684082, + "learning_rate": 3.1322529011604646e-05, + "loss": 0.4446, + "step": 9345 + }, + { + "epoch": 7.464114832535885, + "grad_norm": 4.106342315673828, + "learning_rate": 3.1292517006802724e-05, + "loss": 0.4056, + "step": 9360 + }, + { + "epoch": 7.476076555023924, + "grad_norm": 4.211533546447754, + "learning_rate": 3.12625050020008e-05, + "loss": 0.4072, + "step": 9375 + }, + { + "epoch": 7.488038277511961, + "grad_norm": 3.965963840484619, + "learning_rate": 3.123249299719888e-05, + "loss": 0.4329, + "step": 9390 + }, + { + "epoch": 7.5, + "grad_norm": 4.13434362411499, + "learning_rate": 3.120248099239696e-05, + "loss": 0.4161, + "step": 9405 + }, + { + "epoch": 7.511961722488039, + "grad_norm": 6.448205947875977, + "learning_rate": 3.1172468987595035e-05, + "loss": 0.3927, + "step": 9420 + }, + { + "epoch": 7.523923444976076, + "grad_norm": 4.125397682189941, + "learning_rate": 3.114245698279312e-05, + "loss": 0.4021, + "step": 9435 + }, + { + "epoch": 7.535885167464115, + "grad_norm": 4.477077007293701, + "learning_rate": 3.11124449779912e-05, + "loss": 0.4195, + "step": 9450 + }, + { + "epoch": 7.547846889952153, + "grad_norm": 3.9981772899627686, + "learning_rate": 3.108243297318928e-05, + "loss": 0.4473, + "step": 9465 + }, + { + "epoch": 7.559808612440191, + "grad_norm": 4.3731689453125, + "learning_rate": 3.105242096838736e-05, + "loss": 0.4264, + "step": 9480 + }, + { + "epoch": 7.57177033492823, + "grad_norm": 4.046823501586914, + "learning_rate": 3.102240896358544e-05, + "loss": 0.4151, + "step": 9495 + }, + { + "epoch": 7.583732057416268, + "grad_norm": 4.526839733123779, + "learning_rate": 3.0992396958783516e-05, + "loss": 0.4426, + "step": 9510 + }, + { + "epoch": 7.595693779904306, + "grad_norm": 4.215605735778809, + "learning_rate": 3.0962384953981594e-05, + "loss": 0.4376, + "step": 9525 + }, + { + "epoch": 7.607655502392344, + "grad_norm": 4.018391132354736, + "learning_rate": 3.093237294917967e-05, + "loss": 0.4385, + "step": 9540 + }, + { + "epoch": 7.619617224880383, + "grad_norm": 5.19038200378418, + "learning_rate": 3.090236094437775e-05, + "loss": 0.4379, + "step": 9555 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 4.6209611892700195, + "learning_rate": 3.087234893957583e-05, + "loss": 0.4445, + "step": 9570 + }, + { + "epoch": 7.643540669856459, + "grad_norm": 4.700253486633301, + "learning_rate": 3.084233693477391e-05, + "loss": 0.4309, + "step": 9585 + }, + { + "epoch": 7.655502392344498, + "grad_norm": 4.6337761878967285, + "learning_rate": 3.081232492997199e-05, + "loss": 0.4256, + "step": 9600 + }, + { + "epoch": 7.667464114832536, + "grad_norm": 4.5144734382629395, + "learning_rate": 3.078231292517007e-05, + "loss": 0.4685, + "step": 9615 + }, + { + "epoch": 7.679425837320574, + "grad_norm": 4.41657829284668, + "learning_rate": 3.075230092036815e-05, + "loss": 0.4455, + "step": 9630 + }, + { + "epoch": 7.6913875598086126, + "grad_norm": 4.547213554382324, + "learning_rate": 3.072228891556623e-05, + "loss": 0.4935, + "step": 9645 + }, + { + "epoch": 7.703349282296651, + "grad_norm": 4.367729187011719, + "learning_rate": 3.069227691076431e-05, + "loss": 0.4636, + "step": 9660 + }, + { + "epoch": 7.715311004784689, + "grad_norm": 4.459219932556152, + "learning_rate": 3.0662264905962386e-05, + "loss": 0.4668, + "step": 9675 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 4.355218887329102, + "learning_rate": 3.0632252901160464e-05, + "loss": 0.4296, + "step": 9690 + }, + { + "epoch": 7.739234449760765, + "grad_norm": 3.960000514984131, + "learning_rate": 3.060224089635855e-05, + "loss": 0.4429, + "step": 9705 + }, + { + "epoch": 7.751196172248804, + "grad_norm": 4.526662349700928, + "learning_rate": 3.0572228891556626e-05, + "loss": 0.4751, + "step": 9720 + }, + { + "epoch": 7.7631578947368425, + "grad_norm": 4.3358259201049805, + "learning_rate": 3.0542216886754704e-05, + "loss": 0.4885, + "step": 9735 + }, + { + "epoch": 7.77511961722488, + "grad_norm": 4.190465927124023, + "learning_rate": 3.0512204881952782e-05, + "loss": 0.4633, + "step": 9750 + }, + { + "epoch": 7.787081339712919, + "grad_norm": 4.320166110992432, + "learning_rate": 3.0482192877150863e-05, + "loss": 0.4926, + "step": 9765 + }, + { + "epoch": 7.7990430622009566, + "grad_norm": 3.990604877471924, + "learning_rate": 3.045218087234894e-05, + "loss": 0.4516, + "step": 9780 + }, + { + "epoch": 7.811004784688995, + "grad_norm": 5.037746906280518, + "learning_rate": 3.042216886754702e-05, + "loss": 0.4121, + "step": 9795 + }, + { + "epoch": 7.822966507177034, + "grad_norm": 5.006950855255127, + "learning_rate": 3.0392156862745097e-05, + "loss": 0.4643, + "step": 9810 + }, + { + "epoch": 7.8349282296650715, + "grad_norm": 4.678879261016846, + "learning_rate": 3.036214485794318e-05, + "loss": 0.4733, + "step": 9825 + }, + { + "epoch": 7.84688995215311, + "grad_norm": 4.293395042419434, + "learning_rate": 3.033213285314126e-05, + "loss": 0.4866, + "step": 9840 + }, + { + "epoch": 7.858851674641148, + "grad_norm": 4.712632656097412, + "learning_rate": 3.0302120848339337e-05, + "loss": 0.4878, + "step": 9855 + }, + { + "epoch": 7.8708133971291865, + "grad_norm": 4.51541805267334, + "learning_rate": 3.0272108843537418e-05, + "loss": 0.4721, + "step": 9870 + }, + { + "epoch": 7.882775119617225, + "grad_norm": 4.705857753753662, + "learning_rate": 3.0242096838735496e-05, + "loss": 0.4849, + "step": 9885 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 4.610105037689209, + "learning_rate": 3.0212084833933574e-05, + "loss": 0.4974, + "step": 9900 + }, + { + "epoch": 7.9066985645933014, + "grad_norm": 4.228977680206299, + "learning_rate": 3.018207282913165e-05, + "loss": 0.468, + "step": 9915 + }, + { + "epoch": 7.91866028708134, + "grad_norm": 4.514330863952637, + "learning_rate": 3.015206082432973e-05, + "loss": 0.4857, + "step": 9930 + }, + { + "epoch": 7.930622009569378, + "grad_norm": 4.639202117919922, + "learning_rate": 3.0122048819527814e-05, + "loss": 0.3874, + "step": 9945 + }, + { + "epoch": 7.942583732057416, + "grad_norm": 4.870967864990234, + "learning_rate": 3.0092036814725892e-05, + "loss": 0.4849, + "step": 9960 + }, + { + "epoch": 7.954545454545455, + "grad_norm": 4.402018070220947, + "learning_rate": 3.0062024809923973e-05, + "loss": 0.492, + "step": 9975 + }, + { + "epoch": 7.966507177033493, + "grad_norm": 4.405611991882324, + "learning_rate": 3.003201280512205e-05, + "loss": 0.4874, + "step": 9990 + }, + { + "epoch": 7.978468899521531, + "grad_norm": 4.78075647354126, + "learning_rate": 3.000200080032013e-05, + "loss": 0.5089, + "step": 10005 + }, + { + "epoch": 7.990430622009569, + "grad_norm": 4.583403587341309, + "learning_rate": 2.9971988795518207e-05, + "loss": 0.4791, + "step": 10020 + }, + { + "epoch": 8.002392344497608, + "grad_norm": 3.6340909004211426, + "learning_rate": 2.9941976790716285e-05, + "loss": 0.4022, + "step": 10035 + }, + { + "epoch": 8.014354066985646, + "grad_norm": 3.58935809135437, + "learning_rate": 2.9911964785914366e-05, + "loss": 0.2033, + "step": 10050 + }, + { + "epoch": 8.026315789473685, + "grad_norm": 4.309442520141602, + "learning_rate": 2.988195278111245e-05, + "loss": 0.209, + "step": 10065 + }, + { + "epoch": 8.038277511961722, + "grad_norm": 3.540694236755371, + "learning_rate": 2.985194077631053e-05, + "loss": 0.2269, + "step": 10080 + }, + { + "epoch": 8.05023923444976, + "grad_norm": 4.051588535308838, + "learning_rate": 2.9821928771508606e-05, + "loss": 0.225, + "step": 10095 + }, + { + "epoch": 8.062200956937799, + "grad_norm": 3.8642947673797607, + "learning_rate": 2.9791916766706684e-05, + "loss": 0.2408, + "step": 10110 + }, + { + "epoch": 8.074162679425838, + "grad_norm": 4.4070539474487305, + "learning_rate": 2.9761904761904762e-05, + "loss": 0.2131, + "step": 10125 + }, + { + "epoch": 8.086124401913876, + "grad_norm": 3.5634195804595947, + "learning_rate": 2.9731892757102843e-05, + "loss": 0.2253, + "step": 10140 + }, + { + "epoch": 8.098086124401913, + "grad_norm": 4.4950737953186035, + "learning_rate": 2.970188075230092e-05, + "loss": 0.2438, + "step": 10155 + }, + { + "epoch": 8.110047846889952, + "grad_norm": 4.489715576171875, + "learning_rate": 2.9671868747499e-05, + "loss": 0.2151, + "step": 10170 + }, + { + "epoch": 8.12200956937799, + "grad_norm": 4.503179550170898, + "learning_rate": 2.9641856742697083e-05, + "loss": 0.2375, + "step": 10185 + }, + { + "epoch": 8.133971291866029, + "grad_norm": 4.019615173339844, + "learning_rate": 2.961184473789516e-05, + "loss": 0.253, + "step": 10200 + }, + { + "epoch": 8.145933014354068, + "grad_norm": 3.398512601852417, + "learning_rate": 2.958183273309324e-05, + "loss": 0.2437, + "step": 10215 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 2.8724753856658936, + "learning_rate": 2.9551820728291317e-05, + "loss": 0.2236, + "step": 10230 + }, + { + "epoch": 8.169856459330143, + "grad_norm": 3.7883143424987793, + "learning_rate": 2.9521808723489398e-05, + "loss": 0.2164, + "step": 10245 + }, + { + "epoch": 8.181818181818182, + "grad_norm": 4.483898639678955, + "learning_rate": 2.9491796718687476e-05, + "loss": 0.231, + "step": 10260 + }, + { + "epoch": 8.19377990430622, + "grad_norm": 4.909805774688721, + "learning_rate": 2.9461784713885554e-05, + "loss": 0.2511, + "step": 10275 + }, + { + "epoch": 8.205741626794259, + "grad_norm": 4.415759563446045, + "learning_rate": 2.9431772709083632e-05, + "loss": 0.2259, + "step": 10290 + }, + { + "epoch": 8.217703349282298, + "grad_norm": 3.9223194122314453, + "learning_rate": 2.9401760704281716e-05, + "loss": 0.2479, + "step": 10305 + }, + { + "epoch": 8.229665071770334, + "grad_norm": 3.4528160095214844, + "learning_rate": 2.9371748699479794e-05, + "loss": 0.2275, + "step": 10320 + }, + { + "epoch": 8.241626794258373, + "grad_norm": 4.239967346191406, + "learning_rate": 2.9341736694677872e-05, + "loss": 0.2316, + "step": 10335 + }, + { + "epoch": 8.253588516746412, + "grad_norm": 4.16427755355835, + "learning_rate": 2.9311724689875953e-05, + "loss": 0.2818, + "step": 10350 + }, + { + "epoch": 8.26555023923445, + "grad_norm": 4.7562994956970215, + "learning_rate": 2.928171268507403e-05, + "loss": 0.2658, + "step": 10365 + }, + { + "epoch": 8.277511961722489, + "grad_norm": 4.450767517089844, + "learning_rate": 2.925170068027211e-05, + "loss": 0.2792, + "step": 10380 + }, + { + "epoch": 8.289473684210526, + "grad_norm": 4.766055583953857, + "learning_rate": 2.9221688675470187e-05, + "loss": 0.2926, + "step": 10395 + }, + { + "epoch": 8.301435406698564, + "grad_norm": 4.053709030151367, + "learning_rate": 2.9191676670668268e-05, + "loss": 0.2418, + "step": 10410 + }, + { + "epoch": 8.313397129186603, + "grad_norm": 4.844228267669678, + "learning_rate": 2.916166466586635e-05, + "loss": 0.2426, + "step": 10425 + }, + { + "epoch": 8.325358851674642, + "grad_norm": 3.6860673427581787, + "learning_rate": 2.913165266106443e-05, + "loss": 0.2542, + "step": 10440 + }, + { + "epoch": 8.33732057416268, + "grad_norm": 3.938351631164551, + "learning_rate": 2.910164065626251e-05, + "loss": 0.2769, + "step": 10455 + }, + { + "epoch": 8.349282296650717, + "grad_norm": 4.569359302520752, + "learning_rate": 2.9071628651460586e-05, + "loss": 0.2456, + "step": 10470 + }, + { + "epoch": 8.361244019138756, + "grad_norm": 3.8243377208709717, + "learning_rate": 2.9041616646658664e-05, + "loss": 0.2666, + "step": 10485 + }, + { + "epoch": 8.373205741626794, + "grad_norm": 4.553408145904541, + "learning_rate": 2.9011604641856742e-05, + "loss": 0.2891, + "step": 10500 + }, + { + "epoch": 8.385167464114833, + "grad_norm": 4.640753746032715, + "learning_rate": 2.8981592637054823e-05, + "loss": 0.2912, + "step": 10515 + }, + { + "epoch": 8.397129186602871, + "grad_norm": 4.968740940093994, + "learning_rate": 2.89515806322529e-05, + "loss": 0.2761, + "step": 10530 + }, + { + "epoch": 8.409090909090908, + "grad_norm": 4.833539962768555, + "learning_rate": 2.8921568627450986e-05, + "loss": 0.2915, + "step": 10545 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 4.913358211517334, + "learning_rate": 2.8891556622649064e-05, + "loss": 0.2703, + "step": 10560 + }, + { + "epoch": 8.433014354066986, + "grad_norm": 3.7276763916015625, + "learning_rate": 2.886154461784714e-05, + "loss": 0.2705, + "step": 10575 + }, + { + "epoch": 8.444976076555024, + "grad_norm": 4.225296974182129, + "learning_rate": 2.883153261304522e-05, + "loss": 0.2944, + "step": 10590 + }, + { + "epoch": 8.456937799043063, + "grad_norm": 4.071160793304443, + "learning_rate": 2.8801520608243297e-05, + "loss": 0.3017, + "step": 10605 + }, + { + "epoch": 8.4688995215311, + "grad_norm": 4.818964958190918, + "learning_rate": 2.877150860344138e-05, + "loss": 0.3057, + "step": 10620 + }, + { + "epoch": 8.480861244019138, + "grad_norm": 4.391495704650879, + "learning_rate": 2.8741496598639456e-05, + "loss": 0.2854, + "step": 10635 + }, + { + "epoch": 8.492822966507177, + "grad_norm": 4.263548374176025, + "learning_rate": 2.8711484593837534e-05, + "loss": 0.2604, + "step": 10650 + } + ], + "logging_steps": 15, + "max_steps": 25000, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 15, + "total_flos": 7.844568831858917e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}