diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.857035364936042, + "eval_steps": 500, + "global_step": 39300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 98.3414306640625, + "learning_rate": 1.4082317531978931e-05, + "loss": 1.674, + "step": 50 + }, + { + "epoch": 0.03, + "grad_norm": 21.889772415161133, + "learning_rate": 1.4064635063957864e-05, + "loss": 1.7321, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 78.81497955322266, + "learning_rate": 1.4046952595936794e-05, + "loss": 1.3246, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 117.79057312011719, + "learning_rate": 1.4029270127915727e-05, + "loss": 1.8399, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 89.93197631835938, + "learning_rate": 1.4011587659894659e-05, + "loss": 1.7021, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 5.327052116394043, + "learning_rate": 1.399390519187359e-05, + "loss": 1.3229, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 104.67691802978516, + "learning_rate": 1.397622272385252e-05, + "loss": 1.0449, + "step": 350 + }, + { + "epoch": 0.1, + "grad_norm": 62.50383377075195, + "learning_rate": 1.3958540255831453e-05, + "loss": 1.2135, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 30.1390380859375, + "learning_rate": 1.3940857787810384e-05, + "loss": 1.1312, + "step": 450 + }, + { + "epoch": 0.13, + "grad_norm": 172.32058715820312, + "learning_rate": 1.3923175319789316e-05, + "loss": 1.1339, + "step": 500 + }, + { + "epoch": 0.14, + "grad_norm": 149.6029052734375, + "learning_rate": 1.3905492851768248e-05, + "loss": 0.9226, + "step": 550 + }, + { + "epoch": 0.15, + "grad_norm": 104.08654022216797, + "learning_rate": 1.3887810383747179e-05, + "loss": 0.9141, + "step": 600 + }, + { + "epoch": 0.16, + "grad_norm": 28.90251350402832, + "learning_rate": 1.387012791572611e-05, + "loss": 0.7194, + "step": 650 + }, + { + "epoch": 0.18, + "grad_norm": 78.85499572753906, + "learning_rate": 1.3852445447705042e-05, + "loss": 1.051, + "step": 700 + }, + { + "epoch": 0.19, + "grad_norm": 59.84476089477539, + "learning_rate": 1.3834762979683973e-05, + "loss": 0.8815, + "step": 750 + }, + { + "epoch": 0.2, + "grad_norm": 47.683658599853516, + "learning_rate": 1.3817080511662905e-05, + "loss": 1.1052, + "step": 800 + }, + { + "epoch": 0.21, + "grad_norm": 73.24783325195312, + "learning_rate": 1.3799398043641836e-05, + "loss": 0.6957, + "step": 850 + }, + { + "epoch": 0.23, + "grad_norm": 121.98059844970703, + "learning_rate": 1.3781715575620768e-05, + "loss": 1.1512, + "step": 900 + }, + { + "epoch": 0.24, + "grad_norm": 115.57231140136719, + "learning_rate": 1.3764033107599699e-05, + "loss": 0.8512, + "step": 950 + }, + { + "epoch": 0.25, + "grad_norm": 40.25959014892578, + "learning_rate": 1.374635063957863e-05, + "loss": 0.873, + "step": 1000 + }, + { + "epoch": 0.26, + "grad_norm": 10.869709014892578, + "learning_rate": 1.3728668171557562e-05, + "loss": 0.7834, + "step": 1050 + }, + { + "epoch": 0.28, + "grad_norm": 128.24893188476562, + "learning_rate": 1.3710985703536495e-05, + "loss": 0.8042, + "step": 1100 + }, + { + "epoch": 0.29, + "grad_norm": 60.73322677612305, + "learning_rate": 1.3693303235515425e-05, + "loss": 1.0092, + "step": 1150 + }, + { + "epoch": 0.3, + "grad_norm": 19.39624786376953, + "learning_rate": 1.3675620767494358e-05, + "loss": 0.662, + "step": 1200 + }, + { + "epoch": 0.31, + "grad_norm": 0.13774849474430084, + "learning_rate": 1.3657938299473288e-05, + "loss": 0.98, + "step": 1250 + }, + { + "epoch": 0.33, + "grad_norm": 79.46333312988281, + "learning_rate": 1.3640255831452219e-05, + "loss": 0.7967, + "step": 1300 + }, + { + "epoch": 0.34, + "grad_norm": 13.158239364624023, + "learning_rate": 1.3622573363431151e-05, + "loss": 1.0218, + "step": 1350 + }, + { + "epoch": 0.35, + "grad_norm": 17.267330169677734, + "learning_rate": 1.3604890895410084e-05, + "loss": 0.8711, + "step": 1400 + }, + { + "epoch": 0.36, + "grad_norm": 174.72537231445312, + "learning_rate": 1.3587208427389015e-05, + "loss": 0.8711, + "step": 1450 + }, + { + "epoch": 0.38, + "grad_norm": 77.13172149658203, + "learning_rate": 1.3569525959367947e-05, + "loss": 1.0233, + "step": 1500 + }, + { + "epoch": 0.39, + "grad_norm": 48.417015075683594, + "learning_rate": 1.3551843491346878e-05, + "loss": 0.7682, + "step": 1550 + }, + { + "epoch": 0.4, + "grad_norm": 6.1959638595581055, + "learning_rate": 1.3534161023325808e-05, + "loss": 0.8792, + "step": 1600 + }, + { + "epoch": 0.41, + "grad_norm": 49.98043441772461, + "learning_rate": 1.351647855530474e-05, + "loss": 0.9868, + "step": 1650 + }, + { + "epoch": 0.43, + "grad_norm": 45.13309860229492, + "learning_rate": 1.3498796087283673e-05, + "loss": 0.5272, + "step": 1700 + }, + { + "epoch": 0.44, + "grad_norm": 8.423553466796875, + "learning_rate": 1.3481113619262604e-05, + "loss": 1.1983, + "step": 1750 + }, + { + "epoch": 0.45, + "grad_norm": 17.5786190032959, + "learning_rate": 1.3463431151241536e-05, + "loss": 0.7065, + "step": 1800 + }, + { + "epoch": 0.46, + "grad_norm": 5.939927577972412, + "learning_rate": 1.3445748683220467e-05, + "loss": 0.6674, + "step": 1850 + }, + { + "epoch": 0.48, + "grad_norm": 23.781694412231445, + "learning_rate": 1.3428066215199398e-05, + "loss": 0.7267, + "step": 1900 + }, + { + "epoch": 0.49, + "grad_norm": 0.4960607886314392, + "learning_rate": 1.341038374717833e-05, + "loss": 1.0549, + "step": 1950 + }, + { + "epoch": 0.5, + "grad_norm": 83.99737548828125, + "learning_rate": 1.3392701279157262e-05, + "loss": 0.786, + "step": 2000 + }, + { + "epoch": 0.51, + "grad_norm": 20.65607261657715, + "learning_rate": 1.3375018811136193e-05, + "loss": 0.9709, + "step": 2050 + }, + { + "epoch": 0.53, + "grad_norm": 1.0673532485961914, + "learning_rate": 1.3357336343115126e-05, + "loss": 0.8208, + "step": 2100 + }, + { + "epoch": 0.54, + "grad_norm": 10.350920677185059, + "learning_rate": 1.3339653875094056e-05, + "loss": 1.1503, + "step": 2150 + }, + { + "epoch": 0.55, + "grad_norm": 0.7176612019538879, + "learning_rate": 1.3321971407072987e-05, + "loss": 0.5841, + "step": 2200 + }, + { + "epoch": 0.56, + "grad_norm": 16.532655715942383, + "learning_rate": 1.330428893905192e-05, + "loss": 1.1618, + "step": 2250 + }, + { + "epoch": 0.58, + "grad_norm": 0.24398092925548553, + "learning_rate": 1.3286606471030852e-05, + "loss": 0.6052, + "step": 2300 + }, + { + "epoch": 0.59, + "grad_norm": 45.761695861816406, + "learning_rate": 1.3268924003009782e-05, + "loss": 1.0618, + "step": 2350 + }, + { + "epoch": 0.6, + "grad_norm": 0.3656911849975586, + "learning_rate": 1.3251241534988713e-05, + "loss": 0.8395, + "step": 2400 + }, + { + "epoch": 0.61, + "grad_norm": 56.36614227294922, + "learning_rate": 1.3233559066967646e-05, + "loss": 0.6547, + "step": 2450 + }, + { + "epoch": 0.63, + "grad_norm": 50.591705322265625, + "learning_rate": 1.3215876598946576e-05, + "loss": 0.9528, + "step": 2500 + }, + { + "epoch": 0.64, + "grad_norm": 11.290885925292969, + "learning_rate": 1.3198194130925507e-05, + "loss": 0.6811, + "step": 2550 + }, + { + "epoch": 0.65, + "grad_norm": 0.10668418556451797, + "learning_rate": 1.3180511662904441e-05, + "loss": 0.7421, + "step": 2600 + }, + { + "epoch": 0.66, + "grad_norm": 1.0529690980911255, + "learning_rate": 1.3162829194883372e-05, + "loss": 0.7665, + "step": 2650 + }, + { + "epoch": 0.68, + "grad_norm": 35.5570068359375, + "learning_rate": 1.3145146726862302e-05, + "loss": 0.6587, + "step": 2700 + }, + { + "epoch": 0.69, + "grad_norm": 47.973697662353516, + "learning_rate": 1.3127464258841235e-05, + "loss": 0.8273, + "step": 2750 + }, + { + "epoch": 0.7, + "grad_norm": 42.45454788208008, + "learning_rate": 1.3109781790820166e-05, + "loss": 0.8512, + "step": 2800 + }, + { + "epoch": 0.71, + "grad_norm": 52.255821228027344, + "learning_rate": 1.3092099322799096e-05, + "loss": 0.5748, + "step": 2850 + }, + { + "epoch": 0.73, + "grad_norm": 55.622413635253906, + "learning_rate": 1.307441685477803e-05, + "loss": 0.6585, + "step": 2900 + }, + { + "epoch": 0.74, + "grad_norm": 6.804417610168457, + "learning_rate": 1.3056734386756961e-05, + "loss": 0.9276, + "step": 2950 + }, + { + "epoch": 0.75, + "grad_norm": 8.9085054397583, + "learning_rate": 1.3039051918735892e-05, + "loss": 0.9573, + "step": 3000 + }, + { + "epoch": 0.76, + "grad_norm": 3.399890422821045, + "learning_rate": 1.3021369450714824e-05, + "loss": 0.815, + "step": 3050 + }, + { + "epoch": 0.78, + "grad_norm": 9.621098518371582, + "learning_rate": 1.3003686982693755e-05, + "loss": 0.6272, + "step": 3100 + }, + { + "epoch": 0.79, + "grad_norm": 34.52663803100586, + "learning_rate": 1.2986004514672686e-05, + "loss": 0.7548, + "step": 3150 + }, + { + "epoch": 0.8, + "grad_norm": 38.8935661315918, + "learning_rate": 1.296832204665162e-05, + "loss": 1.0272, + "step": 3200 + }, + { + "epoch": 0.82, + "grad_norm": 53.31705093383789, + "learning_rate": 1.295063957863055e-05, + "loss": 0.8594, + "step": 3250 + }, + { + "epoch": 0.83, + "grad_norm": 24.726455688476562, + "learning_rate": 1.2932957110609481e-05, + "loss": 0.7025, + "step": 3300 + }, + { + "epoch": 0.84, + "grad_norm": 35.29804992675781, + "learning_rate": 1.2915274642588413e-05, + "loss": 0.8359, + "step": 3350 + }, + { + "epoch": 0.85, + "grad_norm": 15.382336616516113, + "learning_rate": 1.2897592174567344e-05, + "loss": 0.7358, + "step": 3400 + }, + { + "epoch": 0.87, + "grad_norm": 2.9050614833831787, + "learning_rate": 1.2879909706546275e-05, + "loss": 0.8021, + "step": 3450 + }, + { + "epoch": 0.88, + "grad_norm": 44.734962463378906, + "learning_rate": 1.2862227238525209e-05, + "loss": 0.844, + "step": 3500 + }, + { + "epoch": 0.89, + "grad_norm": 14.811912536621094, + "learning_rate": 1.284454477050414e-05, + "loss": 0.7822, + "step": 3550 + }, + { + "epoch": 0.9, + "grad_norm": 44.70045471191406, + "learning_rate": 1.282686230248307e-05, + "loss": 1.0654, + "step": 3600 + }, + { + "epoch": 0.92, + "grad_norm": 48.43465805053711, + "learning_rate": 1.2809179834462003e-05, + "loss": 0.6354, + "step": 3650 + }, + { + "epoch": 0.93, + "grad_norm": 47.798423767089844, + "learning_rate": 1.2791497366440933e-05, + "loss": 0.8125, + "step": 3700 + }, + { + "epoch": 0.94, + "grad_norm": 42.33122634887695, + "learning_rate": 1.2773814898419864e-05, + "loss": 1.1325, + "step": 3750 + }, + { + "epoch": 0.95, + "grad_norm": 0.14906466007232666, + "learning_rate": 1.2756132430398797e-05, + "loss": 0.5325, + "step": 3800 + }, + { + "epoch": 0.97, + "grad_norm": 10.49329662322998, + "learning_rate": 1.2738449962377729e-05, + "loss": 0.7013, + "step": 3850 + }, + { + "epoch": 0.98, + "grad_norm": 21.828550338745117, + "learning_rate": 1.272076749435666e-05, + "loss": 0.5134, + "step": 3900 + }, + { + "epoch": 0.99, + "grad_norm": 1.0481252670288086, + "learning_rate": 1.270308502633559e-05, + "loss": 1.4255, + "step": 3950 + }, + { + "epoch": 1.0, + "grad_norm": 1.075194001197815, + "learning_rate": 1.2685402558314523e-05, + "loss": 0.7727, + "step": 4000 + }, + { + "epoch": 1.02, + "grad_norm": 17.64851188659668, + "learning_rate": 1.2667720090293453e-05, + "loss": 0.4984, + "step": 4050 + }, + { + "epoch": 1.03, + "grad_norm": 49.92161178588867, + "learning_rate": 1.2650037622272386e-05, + "loss": 0.9065, + "step": 4100 + }, + { + "epoch": 1.04, + "grad_norm": 11.019123077392578, + "learning_rate": 1.2632355154251318e-05, + "loss": 0.8184, + "step": 4150 + }, + { + "epoch": 1.05, + "grad_norm": 12.537881851196289, + "learning_rate": 1.2614672686230249e-05, + "loss": 0.6989, + "step": 4200 + }, + { + "epoch": 1.07, + "grad_norm": 0.5771467089653015, + "learning_rate": 1.259699021820918e-05, + "loss": 0.7282, + "step": 4250 + }, + { + "epoch": 1.08, + "grad_norm": 60.68583297729492, + "learning_rate": 1.2579307750188112e-05, + "loss": 0.695, + "step": 4300 + }, + { + "epoch": 1.09, + "grad_norm": 0.7341581583023071, + "learning_rate": 1.2561625282167043e-05, + "loss": 0.7021, + "step": 4350 + }, + { + "epoch": 1.1, + "grad_norm": 0.020291157066822052, + "learning_rate": 1.2543942814145975e-05, + "loss": 0.8563, + "step": 4400 + }, + { + "epoch": 1.12, + "grad_norm": 1.3924442529678345, + "learning_rate": 1.2526260346124907e-05, + "loss": 0.7378, + "step": 4450 + }, + { + "epoch": 1.13, + "grad_norm": 31.691173553466797, + "learning_rate": 1.2508577878103838e-05, + "loss": 0.5887, + "step": 4500 + }, + { + "epoch": 1.14, + "grad_norm": 1.1823307275772095, + "learning_rate": 1.2490895410082769e-05, + "loss": 0.8132, + "step": 4550 + }, + { + "epoch": 1.15, + "grad_norm": 0.08653511106967926, + "learning_rate": 1.2473212942061701e-05, + "loss": 0.8374, + "step": 4600 + }, + { + "epoch": 1.17, + "grad_norm": 2.169903039932251, + "learning_rate": 1.2455530474040632e-05, + "loss": 0.588, + "step": 4650 + }, + { + "epoch": 1.18, + "grad_norm": 56.76768112182617, + "learning_rate": 1.2437848006019564e-05, + "loss": 0.7869, + "step": 4700 + }, + { + "epoch": 1.19, + "grad_norm": 0.05390803515911102, + "learning_rate": 1.2420165537998497e-05, + "loss": 0.6243, + "step": 4750 + }, + { + "epoch": 1.2, + "grad_norm": 5.537655830383301, + "learning_rate": 1.2402483069977427e-05, + "loss": 0.737, + "step": 4800 + }, + { + "epoch": 1.22, + "grad_norm": 69.44229125976562, + "learning_rate": 1.2384800601956358e-05, + "loss": 1.0479, + "step": 4850 + }, + { + "epoch": 1.23, + "grad_norm": 45.22208023071289, + "learning_rate": 1.236711813393529e-05, + "loss": 0.8327, + "step": 4900 + }, + { + "epoch": 1.24, + "grad_norm": 22.553054809570312, + "learning_rate": 1.2349435665914221e-05, + "loss": 0.6587, + "step": 4950 + }, + { + "epoch": 1.25, + "grad_norm": 2.1869142055511475, + "learning_rate": 1.2331753197893154e-05, + "loss": 0.5913, + "step": 5000 + }, + { + "epoch": 1.27, + "grad_norm": 2.483933210372925, + "learning_rate": 1.2314070729872086e-05, + "loss": 0.8163, + "step": 5050 + }, + { + "epoch": 1.28, + "grad_norm": 18.768310546875, + "learning_rate": 1.2296388261851017e-05, + "loss": 0.6273, + "step": 5100 + }, + { + "epoch": 1.29, + "grad_norm": 56.0864372253418, + "learning_rate": 1.2278705793829947e-05, + "loss": 0.8787, + "step": 5150 + }, + { + "epoch": 1.3, + "grad_norm": 51.98051834106445, + "learning_rate": 1.226102332580888e-05, + "loss": 0.4302, + "step": 5200 + }, + { + "epoch": 1.32, + "grad_norm": 17.60165023803711, + "learning_rate": 1.224334085778781e-05, + "loss": 0.7238, + "step": 5250 + }, + { + "epoch": 1.33, + "grad_norm": 48.4942626953125, + "learning_rate": 1.2225658389766743e-05, + "loss": 0.8018, + "step": 5300 + }, + { + "epoch": 1.34, + "grad_norm": 14.206453323364258, + "learning_rate": 1.2207975921745674e-05, + "loss": 0.5428, + "step": 5350 + }, + { + "epoch": 1.35, + "grad_norm": 65.64610290527344, + "learning_rate": 1.2190293453724606e-05, + "loss": 0.7923, + "step": 5400 + }, + { + "epoch": 1.37, + "grad_norm": 9.786343574523926, + "learning_rate": 1.2172610985703537e-05, + "loss": 0.7779, + "step": 5450 + }, + { + "epoch": 1.38, + "grad_norm": 3.1632120609283447, + "learning_rate": 1.2154928517682467e-05, + "loss": 0.6474, + "step": 5500 + }, + { + "epoch": 1.39, + "grad_norm": 15.631272315979004, + "learning_rate": 1.21372460496614e-05, + "loss": 0.6736, + "step": 5550 + }, + { + "epoch": 1.4, + "grad_norm": 0.025490593165159225, + "learning_rate": 1.2119563581640332e-05, + "loss": 0.6371, + "step": 5600 + }, + { + "epoch": 1.42, + "grad_norm": 84.42486572265625, + "learning_rate": 1.2101881113619263e-05, + "loss": 0.9348, + "step": 5650 + }, + { + "epoch": 1.43, + "grad_norm": 0.32389989495277405, + "learning_rate": 1.2084198645598195e-05, + "loss": 0.8304, + "step": 5700 + }, + { + "epoch": 1.44, + "grad_norm": 49.16242599487305, + "learning_rate": 1.2066516177577126e-05, + "loss": 0.6624, + "step": 5750 + }, + { + "epoch": 1.45, + "grad_norm": 119.3700942993164, + "learning_rate": 1.2048833709556057e-05, + "loss": 1.1135, + "step": 5800 + }, + { + "epoch": 1.47, + "grad_norm": 0.15834768116474152, + "learning_rate": 1.2031151241534989e-05, + "loss": 0.6358, + "step": 5850 + }, + { + "epoch": 1.48, + "grad_norm": 54.722652435302734, + "learning_rate": 1.2013468773513922e-05, + "loss": 0.5639, + "step": 5900 + }, + { + "epoch": 1.49, + "grad_norm": 0.0872531533241272, + "learning_rate": 1.1995786305492852e-05, + "loss": 0.7912, + "step": 5950 + }, + { + "epoch": 1.5, + "grad_norm": 2.5009591579437256, + "learning_rate": 1.1978103837471785e-05, + "loss": 0.6478, + "step": 6000 + }, + { + "epoch": 1.52, + "grad_norm": 1.5101827383041382, + "learning_rate": 1.1960421369450715e-05, + "loss": 0.8577, + "step": 6050 + }, + { + "epoch": 1.53, + "grad_norm": 3.4737539291381836, + "learning_rate": 1.1942738901429646e-05, + "loss": 0.9474, + "step": 6100 + }, + { + "epoch": 1.54, + "grad_norm": 92.57341003417969, + "learning_rate": 1.1925056433408578e-05, + "loss": 0.8665, + "step": 6150 + }, + { + "epoch": 1.56, + "grad_norm": 38.56670379638672, + "learning_rate": 1.1907373965387509e-05, + "loss": 0.7833, + "step": 6200 + }, + { + "epoch": 1.57, + "grad_norm": 29.12518310546875, + "learning_rate": 1.1889691497366442e-05, + "loss": 0.7454, + "step": 6250 + }, + { + "epoch": 1.58, + "grad_norm": 69.91959381103516, + "learning_rate": 1.1872009029345374e-05, + "loss": 0.7843, + "step": 6300 + }, + { + "epoch": 1.59, + "grad_norm": 56.20566177368164, + "learning_rate": 1.1854326561324305e-05, + "loss": 0.841, + "step": 6350 + }, + { + "epoch": 1.61, + "grad_norm": 66.2998275756836, + "learning_rate": 1.1836644093303235e-05, + "loss": 0.723, + "step": 6400 + }, + { + "epoch": 1.62, + "grad_norm": 1.9407018423080444, + "learning_rate": 1.1818961625282168e-05, + "loss": 0.7235, + "step": 6450 + }, + { + "epoch": 1.63, + "grad_norm": 61.69858932495117, + "learning_rate": 1.1801279157261098e-05, + "loss": 0.8241, + "step": 6500 + }, + { + "epoch": 1.64, + "grad_norm": 8.412137985229492, + "learning_rate": 1.178359668924003e-05, + "loss": 0.564, + "step": 6550 + }, + { + "epoch": 1.66, + "grad_norm": 9.307317733764648, + "learning_rate": 1.1765914221218962e-05, + "loss": 0.8438, + "step": 6600 + }, + { + "epoch": 1.67, + "grad_norm": 41.45466995239258, + "learning_rate": 1.1748231753197894e-05, + "loss": 0.7763, + "step": 6650 + }, + { + "epoch": 1.68, + "grad_norm": 2.8245513439178467, + "learning_rate": 1.1730549285176825e-05, + "loss": 0.7476, + "step": 6700 + }, + { + "epoch": 1.69, + "grad_norm": 76.77831268310547, + "learning_rate": 1.1712866817155757e-05, + "loss": 0.9578, + "step": 6750 + }, + { + "epoch": 1.71, + "grad_norm": 0.004409218207001686, + "learning_rate": 1.1695184349134688e-05, + "loss": 0.8765, + "step": 6800 + }, + { + "epoch": 1.72, + "grad_norm": 46.58176803588867, + "learning_rate": 1.167750188111362e-05, + "loss": 0.5402, + "step": 6850 + }, + { + "epoch": 1.73, + "grad_norm": 5.006879806518555, + "learning_rate": 1.165981941309255e-05, + "loss": 0.4722, + "step": 6900 + }, + { + "epoch": 1.74, + "grad_norm": 2.194460153579712, + "learning_rate": 1.1642136945071483e-05, + "loss": 0.858, + "step": 6950 + }, + { + "epoch": 1.76, + "grad_norm": 0.012106262147426605, + "learning_rate": 1.1624454477050414e-05, + "loss": 0.6607, + "step": 7000 + }, + { + "epoch": 1.77, + "grad_norm": 6.08723258972168, + "learning_rate": 1.1606772009029345e-05, + "loss": 0.866, + "step": 7050 + }, + { + "epoch": 1.78, + "grad_norm": 51.338478088378906, + "learning_rate": 1.1589089541008277e-05, + "loss": 0.7508, + "step": 7100 + }, + { + "epoch": 1.79, + "grad_norm": 18.472858428955078, + "learning_rate": 1.157140707298721e-05, + "loss": 0.8686, + "step": 7150 + }, + { + "epoch": 1.81, + "grad_norm": 4.837900638580322, + "learning_rate": 1.155372460496614e-05, + "loss": 0.5302, + "step": 7200 + }, + { + "epoch": 1.82, + "grad_norm": 41.74524688720703, + "learning_rate": 1.1536042136945072e-05, + "loss": 0.7681, + "step": 7250 + }, + { + "epoch": 1.83, + "grad_norm": 30.557188034057617, + "learning_rate": 1.1518359668924003e-05, + "loss": 0.9107, + "step": 7300 + }, + { + "epoch": 1.84, + "grad_norm": 14.001880645751953, + "learning_rate": 1.1500677200902934e-05, + "loss": 0.5387, + "step": 7350 + }, + { + "epoch": 1.86, + "grad_norm": 0.1815216839313507, + "learning_rate": 1.1482994732881866e-05, + "loss": 0.8152, + "step": 7400 + }, + { + "epoch": 1.87, + "grad_norm": 36.915061950683594, + "learning_rate": 1.1465312264860799e-05, + "loss": 0.6313, + "step": 7450 + }, + { + "epoch": 1.88, + "grad_norm": 0.20334406197071075, + "learning_rate": 1.144762979683973e-05, + "loss": 0.8265, + "step": 7500 + }, + { + "epoch": 1.89, + "grad_norm": 0.0018741831881925464, + "learning_rate": 1.1429947328818662e-05, + "loss": 0.7202, + "step": 7550 + }, + { + "epoch": 1.91, + "grad_norm": 0.000707630708348006, + "learning_rate": 1.1412264860797592e-05, + "loss": 0.6488, + "step": 7600 + }, + { + "epoch": 1.92, + "grad_norm": 0.4616662561893463, + "learning_rate": 1.1394582392776523e-05, + "loss": 0.9402, + "step": 7650 + }, + { + "epoch": 1.93, + "grad_norm": 43.170814514160156, + "learning_rate": 1.1376899924755456e-05, + "loss": 0.763, + "step": 7700 + }, + { + "epoch": 1.94, + "grad_norm": 3.035790205001831, + "learning_rate": 1.1359217456734388e-05, + "loss": 0.5681, + "step": 7750 + }, + { + "epoch": 1.96, + "grad_norm": 45.11912536621094, + "learning_rate": 1.1341534988713319e-05, + "loss": 0.9795, + "step": 7800 + }, + { + "epoch": 1.97, + "grad_norm": 2.006427049636841, + "learning_rate": 1.1323852520692251e-05, + "loss": 0.4772, + "step": 7850 + }, + { + "epoch": 1.98, + "grad_norm": 69.13399505615234, + "learning_rate": 1.1306170052671182e-05, + "loss": 0.8649, + "step": 7900 + }, + { + "epoch": 1.99, + "grad_norm": 43.80717468261719, + "learning_rate": 1.1288487584650112e-05, + "loss": 0.6051, + "step": 7950 + }, + { + "epoch": 2.01, + "grad_norm": 1.3676908016204834, + "learning_rate": 1.1270805116629045e-05, + "loss": 0.4737, + "step": 8000 + }, + { + "epoch": 2.02, + "grad_norm": 18.533445358276367, + "learning_rate": 1.1253122648607977e-05, + "loss": 0.4353, + "step": 8050 + }, + { + "epoch": 2.03, + "grad_norm": 0.649580717086792, + "learning_rate": 1.1235440180586908e-05, + "loss": 0.9283, + "step": 8100 + }, + { + "epoch": 2.04, + "grad_norm": 37.0181999206543, + "learning_rate": 1.1217757712565839e-05, + "loss": 0.8631, + "step": 8150 + }, + { + "epoch": 2.06, + "grad_norm": 1.1191781759262085, + "learning_rate": 1.1200075244544771e-05, + "loss": 0.7166, + "step": 8200 + }, + { + "epoch": 2.07, + "grad_norm": 46.35097885131836, + "learning_rate": 1.1182392776523702e-05, + "loss": 0.6263, + "step": 8250 + }, + { + "epoch": 2.08, + "grad_norm": 9.393693923950195, + "learning_rate": 1.1164710308502632e-05, + "loss": 0.7146, + "step": 8300 + }, + { + "epoch": 2.09, + "grad_norm": 49.04343032836914, + "learning_rate": 1.1147027840481567e-05, + "loss": 0.5924, + "step": 8350 + }, + { + "epoch": 2.11, + "grad_norm": 2.917092800140381, + "learning_rate": 1.1129345372460497e-05, + "loss": 0.815, + "step": 8400 + }, + { + "epoch": 2.12, + "grad_norm": 6.2741618156433105, + "learning_rate": 1.1111662904439428e-05, + "loss": 0.8852, + "step": 8450 + }, + { + "epoch": 2.13, + "grad_norm": 0.026425007730722427, + "learning_rate": 1.109398043641836e-05, + "loss": 0.609, + "step": 8500 + }, + { + "epoch": 2.14, + "grad_norm": 8.229249954223633, + "learning_rate": 1.1076297968397291e-05, + "loss": 0.5546, + "step": 8550 + }, + { + "epoch": 2.16, + "grad_norm": 3.257112979888916, + "learning_rate": 1.1058615500376222e-05, + "loss": 0.6084, + "step": 8600 + }, + { + "epoch": 2.17, + "grad_norm": 44.147640228271484, + "learning_rate": 1.1040933032355156e-05, + "loss": 0.4687, + "step": 8650 + }, + { + "epoch": 2.18, + "grad_norm": 93.26548767089844, + "learning_rate": 1.1023250564334087e-05, + "loss": 0.6323, + "step": 8700 + }, + { + "epoch": 2.19, + "grad_norm": 83.17293548583984, + "learning_rate": 1.1005568096313017e-05, + "loss": 0.8759, + "step": 8750 + }, + { + "epoch": 2.21, + "grad_norm": 51.27419662475586, + "learning_rate": 1.098788562829195e-05, + "loss": 0.69, + "step": 8800 + }, + { + "epoch": 2.22, + "grad_norm": 0.0010558576323091984, + "learning_rate": 1.097020316027088e-05, + "loss": 0.5279, + "step": 8850 + }, + { + "epoch": 2.23, + "grad_norm": 73.43231201171875, + "learning_rate": 1.0952520692249811e-05, + "loss": 0.9285, + "step": 8900 + }, + { + "epoch": 2.24, + "grad_norm": 6.488553047180176, + "learning_rate": 1.0934838224228745e-05, + "loss": 0.6137, + "step": 8950 + }, + { + "epoch": 2.26, + "grad_norm": 53.465972900390625, + "learning_rate": 1.0917155756207676e-05, + "loss": 0.4718, + "step": 9000 + }, + { + "epoch": 2.27, + "grad_norm": 1.405421495437622, + "learning_rate": 1.0899473288186607e-05, + "loss": 0.7248, + "step": 9050 + }, + { + "epoch": 2.28, + "grad_norm": 58.552490234375, + "learning_rate": 1.0881790820165539e-05, + "loss": 0.6312, + "step": 9100 + }, + { + "epoch": 2.29, + "grad_norm": 85.75029754638672, + "learning_rate": 1.086410835214447e-05, + "loss": 1.1383, + "step": 9150 + }, + { + "epoch": 2.31, + "grad_norm": 1.4940392971038818, + "learning_rate": 1.08464258841234e-05, + "loss": 0.6104, + "step": 9200 + }, + { + "epoch": 2.32, + "grad_norm": 1.2434502840042114, + "learning_rate": 1.0828743416102334e-05, + "loss": 0.5124, + "step": 9250 + }, + { + "epoch": 2.33, + "grad_norm": 0.002772190608084202, + "learning_rate": 1.0811060948081265e-05, + "loss": 0.8389, + "step": 9300 + }, + { + "epoch": 2.35, + "grad_norm": 27.42812156677246, + "learning_rate": 1.0793378480060196e-05, + "loss": 0.6571, + "step": 9350 + }, + { + "epoch": 2.36, + "grad_norm": 70.63783264160156, + "learning_rate": 1.0775696012039128e-05, + "loss": 0.5234, + "step": 9400 + }, + { + "epoch": 2.37, + "grad_norm": 0.873970627784729, + "learning_rate": 1.0758013544018059e-05, + "loss": 0.7862, + "step": 9450 + }, + { + "epoch": 2.38, + "grad_norm": 0.0001105390620068647, + "learning_rate": 1.074033107599699e-05, + "loss": 0.9885, + "step": 9500 + }, + { + "epoch": 2.4, + "grad_norm": 2.0316097736358643, + "learning_rate": 1.0722648607975922e-05, + "loss": 0.6648, + "step": 9550 + }, + { + "epoch": 2.41, + "grad_norm": 33.791568756103516, + "learning_rate": 1.0704966139954854e-05, + "loss": 0.6746, + "step": 9600 + }, + { + "epoch": 2.42, + "grad_norm": 112.26337432861328, + "learning_rate": 1.0687283671933785e-05, + "loss": 0.787, + "step": 9650 + }, + { + "epoch": 2.43, + "grad_norm": 53.35863494873047, + "learning_rate": 1.0669601203912716e-05, + "loss": 0.5922, + "step": 9700 + }, + { + "epoch": 2.45, + "grad_norm": 0.0027942871674895287, + "learning_rate": 1.0651918735891648e-05, + "loss": 0.6236, + "step": 9750 + }, + { + "epoch": 2.46, + "grad_norm": 0.00036070370697416365, + "learning_rate": 1.0634236267870579e-05, + "loss": 0.6559, + "step": 9800 + }, + { + "epoch": 2.47, + "grad_norm": 2.5188686847686768, + "learning_rate": 1.0616553799849511e-05, + "loss": 1.002, + "step": 9850 + }, + { + "epoch": 2.48, + "grad_norm": 42.79086685180664, + "learning_rate": 1.0598871331828444e-05, + "loss": 1.001, + "step": 9900 + }, + { + "epoch": 2.5, + "grad_norm": 0.06492776423692703, + "learning_rate": 1.0581188863807374e-05, + "loss": 0.9975, + "step": 9950 + }, + { + "epoch": 2.51, + "grad_norm": 12.079846382141113, + "learning_rate": 1.0563506395786305e-05, + "loss": 0.6417, + "step": 10000 + }, + { + "epoch": 2.52, + "grad_norm": 98.72542572021484, + "learning_rate": 1.0545823927765237e-05, + "loss": 0.9242, + "step": 10050 + }, + { + "epoch": 2.53, + "grad_norm": 0.15632659196853638, + "learning_rate": 1.0528141459744168e-05, + "loss": 0.4118, + "step": 10100 + }, + { + "epoch": 2.55, + "grad_norm": 3.5314505100250244, + "learning_rate": 1.05104589917231e-05, + "loss": 0.6486, + "step": 10150 + }, + { + "epoch": 2.56, + "grad_norm": 0.06171553581953049, + "learning_rate": 1.0492776523702033e-05, + "loss": 0.7782, + "step": 10200 + }, + { + "epoch": 2.57, + "grad_norm": 69.53456115722656, + "learning_rate": 1.0475094055680964e-05, + "loss": 0.5421, + "step": 10250 + }, + { + "epoch": 2.58, + "grad_norm": 27.149484634399414, + "learning_rate": 1.0457411587659894e-05, + "loss": 0.7476, + "step": 10300 + }, + { + "epoch": 2.6, + "grad_norm": 3.7423877716064453, + "learning_rate": 1.0439729119638827e-05, + "loss": 0.7429, + "step": 10350 + }, + { + "epoch": 2.61, + "grad_norm": 0.6006436944007874, + "learning_rate": 1.0422046651617757e-05, + "loss": 0.4376, + "step": 10400 + }, + { + "epoch": 2.62, + "grad_norm": 0.2609996497631073, + "learning_rate": 1.040436418359669e-05, + "loss": 0.8938, + "step": 10450 + }, + { + "epoch": 2.63, + "grad_norm": 73.91007232666016, + "learning_rate": 1.0386681715575622e-05, + "loss": 0.7273, + "step": 10500 + }, + { + "epoch": 2.65, + "grad_norm": 0.010080622509121895, + "learning_rate": 1.0368999247554553e-05, + "loss": 0.7709, + "step": 10550 + }, + { + "epoch": 2.66, + "grad_norm": 5.206912994384766, + "learning_rate": 1.0351316779533484e-05, + "loss": 0.696, + "step": 10600 + }, + { + "epoch": 2.67, + "grad_norm": 94.36717987060547, + "learning_rate": 1.0333634311512416e-05, + "loss": 0.6964, + "step": 10650 + }, + { + "epoch": 2.68, + "grad_norm": 0.6438612341880798, + "learning_rate": 1.0315951843491347e-05, + "loss": 0.6461, + "step": 10700 + }, + { + "epoch": 2.7, + "grad_norm": 0.02532346546649933, + "learning_rate": 1.029826937547028e-05, + "loss": 0.8581, + "step": 10750 + }, + { + "epoch": 2.71, + "grad_norm": 1.5096291303634644, + "learning_rate": 1.0280586907449212e-05, + "loss": 0.4629, + "step": 10800 + }, + { + "epoch": 2.72, + "grad_norm": 81.77324676513672, + "learning_rate": 1.0262904439428142e-05, + "loss": 0.8681, + "step": 10850 + }, + { + "epoch": 2.73, + "grad_norm": 1.1398659944534302, + "learning_rate": 1.0245221971407073e-05, + "loss": 0.5162, + "step": 10900 + }, + { + "epoch": 2.75, + "grad_norm": 0.4226570725440979, + "learning_rate": 1.0227539503386005e-05, + "loss": 0.4572, + "step": 10950 + }, + { + "epoch": 2.76, + "grad_norm": 0.02047480270266533, + "learning_rate": 1.0209857035364936e-05, + "loss": 0.8946, + "step": 11000 + }, + { + "epoch": 2.77, + "grad_norm": 124.79954528808594, + "learning_rate": 1.0192174567343868e-05, + "loss": 0.8325, + "step": 11050 + }, + { + "epoch": 2.78, + "grad_norm": 7.112376624718308e-05, + "learning_rate": 1.01744920993228e-05, + "loss": 0.5664, + "step": 11100 + }, + { + "epoch": 2.8, + "grad_norm": 78.66365051269531, + "learning_rate": 1.0156809631301732e-05, + "loss": 0.9426, + "step": 11150 + }, + { + "epoch": 2.81, + "grad_norm": 9.567934466758743e-05, + "learning_rate": 1.0139127163280662e-05, + "loss": 0.4818, + "step": 11200 + }, + { + "epoch": 2.82, + "grad_norm": 0.003907013684511185, + "learning_rate": 1.0121444695259593e-05, + "loss": 0.743, + "step": 11250 + }, + { + "epoch": 2.83, + "grad_norm": 84.53366088867188, + "learning_rate": 1.0103762227238525e-05, + "loss": 0.8544, + "step": 11300 + }, + { + "epoch": 2.85, + "grad_norm": 3.4674291610717773, + "learning_rate": 1.0086079759217458e-05, + "loss": 0.5553, + "step": 11350 + }, + { + "epoch": 2.86, + "grad_norm": 125.62838745117188, + "learning_rate": 1.0068397291196388e-05, + "loss": 0.6168, + "step": 11400 + }, + { + "epoch": 2.87, + "grad_norm": 99.19140625, + "learning_rate": 1.005071482317532e-05, + "loss": 1.1238, + "step": 11450 + }, + { + "epoch": 2.88, + "grad_norm": 38.717559814453125, + "learning_rate": 1.0033032355154252e-05, + "loss": 1.0667, + "step": 11500 + }, + { + "epoch": 2.9, + "grad_norm": 28.915889739990234, + "learning_rate": 1.0015349887133182e-05, + "loss": 0.5045, + "step": 11550 + }, + { + "epoch": 2.91, + "grad_norm": 48.31145477294922, + "learning_rate": 9.997667419112115e-06, + "loss": 0.754, + "step": 11600 + }, + { + "epoch": 2.92, + "grad_norm": 0.06709738075733185, + "learning_rate": 9.979984951091047e-06, + "loss": 0.6229, + "step": 11650 + }, + { + "epoch": 2.93, + "grad_norm": 1.2689626216888428, + "learning_rate": 9.962302483069978e-06, + "loss": 0.7818, + "step": 11700 + }, + { + "epoch": 2.95, + "grad_norm": 35.311134338378906, + "learning_rate": 9.94462001504891e-06, + "loss": 1.0477, + "step": 11750 + }, + { + "epoch": 2.96, + "grad_norm": 88.91561889648438, + "learning_rate": 9.92693754702784e-06, + "loss": 0.6488, + "step": 11800 + }, + { + "epoch": 2.97, + "grad_norm": 70.55093383789062, + "learning_rate": 9.909255079006772e-06, + "loss": 0.5951, + "step": 11850 + }, + { + "epoch": 2.98, + "grad_norm": 89.51988983154297, + "learning_rate": 9.891572610985704e-06, + "loss": 0.6867, + "step": 11900 + }, + { + "epoch": 3.0, + "grad_norm": 0.40069764852523804, + "learning_rate": 9.873890142964636e-06, + "loss": 0.7094, + "step": 11950 + }, + { + "epoch": 3.01, + "grad_norm": 2.006258964538574, + "learning_rate": 9.856207674943567e-06, + "loss": 0.5428, + "step": 12000 + }, + { + "epoch": 3.02, + "grad_norm": 51.34798049926758, + "learning_rate": 9.8385252069225e-06, + "loss": 0.573, + "step": 12050 + }, + { + "epoch": 3.03, + "grad_norm": 95.47881317138672, + "learning_rate": 9.82084273890143e-06, + "loss": 0.4226, + "step": 12100 + }, + { + "epoch": 3.05, + "grad_norm": 0.07185523957014084, + "learning_rate": 9.80316027088036e-06, + "loss": 0.6424, + "step": 12150 + }, + { + "epoch": 3.06, + "grad_norm": 109.8128662109375, + "learning_rate": 9.785477802859293e-06, + "loss": 0.5279, + "step": 12200 + }, + { + "epoch": 3.07, + "grad_norm": 44.56191635131836, + "learning_rate": 9.767795334838224e-06, + "loss": 0.3463, + "step": 12250 + }, + { + "epoch": 3.09, + "grad_norm": 0.45552492141723633, + "learning_rate": 9.750112866817156e-06, + "loss": 0.6696, + "step": 12300 + }, + { + "epoch": 3.1, + "grad_norm": 0.0008902169647626579, + "learning_rate": 9.732430398796089e-06, + "loss": 0.3845, + "step": 12350 + }, + { + "epoch": 3.11, + "grad_norm": 134.49839782714844, + "learning_rate": 9.71474793077502e-06, + "loss": 0.8803, + "step": 12400 + }, + { + "epoch": 3.12, + "grad_norm": 0.21923835575580597, + "learning_rate": 9.69706546275395e-06, + "loss": 0.741, + "step": 12450 + }, + { + "epoch": 3.14, + "grad_norm": 0.2331884801387787, + "learning_rate": 9.679382994732883e-06, + "loss": 0.7015, + "step": 12500 + }, + { + "epoch": 3.15, + "grad_norm": 0.4663000702857971, + "learning_rate": 9.661700526711813e-06, + "loss": 0.7605, + "step": 12550 + }, + { + "epoch": 3.16, + "grad_norm": 59.55733871459961, + "learning_rate": 9.644018058690746e-06, + "loss": 0.5855, + "step": 12600 + }, + { + "epoch": 3.17, + "grad_norm": 0.8377301096916199, + "learning_rate": 9.626335590669676e-06, + "loss": 0.4117, + "step": 12650 + }, + { + "epoch": 3.19, + "grad_norm": 64.69242095947266, + "learning_rate": 9.608653122648609e-06, + "loss": 0.5216, + "step": 12700 + }, + { + "epoch": 3.2, + "grad_norm": 0.8485704660415649, + "learning_rate": 9.59097065462754e-06, + "loss": 0.6882, + "step": 12750 + }, + { + "epoch": 3.21, + "grad_norm": 143.98147583007812, + "learning_rate": 9.57328818660647e-06, + "loss": 0.6463, + "step": 12800 + }, + { + "epoch": 3.22, + "grad_norm": 132.84567260742188, + "learning_rate": 9.555605718585403e-06, + "loss": 0.7474, + "step": 12850 + }, + { + "epoch": 3.24, + "grad_norm": 8.179304122924805, + "learning_rate": 9.537923250564335e-06, + "loss": 0.375, + "step": 12900 + }, + { + "epoch": 3.25, + "grad_norm": 10.138591766357422, + "learning_rate": 9.520240782543266e-06, + "loss": 0.7204, + "step": 12950 + }, + { + "epoch": 3.26, + "grad_norm": 0.00011070028267567977, + "learning_rate": 9.502558314522198e-06, + "loss": 0.3631, + "step": 13000 + }, + { + "epoch": 3.27, + "grad_norm": 1.0425533056259155, + "learning_rate": 9.484875846501129e-06, + "loss": 0.6752, + "step": 13050 + }, + { + "epoch": 3.29, + "grad_norm": 19.544971466064453, + "learning_rate": 9.46719337848006e-06, + "loss": 0.4082, + "step": 13100 + }, + { + "epoch": 3.3, + "grad_norm": 3.29071121996094e-06, + "learning_rate": 9.449510910458992e-06, + "loss": 0.752, + "step": 13150 + }, + { + "epoch": 3.31, + "grad_norm": 1.4096872806549072, + "learning_rate": 9.431828442437924e-06, + "loss": 0.739, + "step": 13200 + }, + { + "epoch": 3.32, + "grad_norm": 0.1742667555809021, + "learning_rate": 9.414145974416855e-06, + "loss": 0.5783, + "step": 13250 + }, + { + "epoch": 3.34, + "grad_norm": 0.8604665398597717, + "learning_rate": 9.396463506395787e-06, + "loss": 0.8603, + "step": 13300 + }, + { + "epoch": 3.35, + "grad_norm": 6.3410016082343645e-06, + "learning_rate": 9.378781038374718e-06, + "loss": 0.4481, + "step": 13350 + }, + { + "epoch": 3.36, + "grad_norm": 157.0394744873047, + "learning_rate": 9.361098570353649e-06, + "loss": 0.6242, + "step": 13400 + }, + { + "epoch": 3.37, + "grad_norm": 0.00026235656696371734, + "learning_rate": 9.343416102332581e-06, + "loss": 0.7734, + "step": 13450 + }, + { + "epoch": 3.39, + "grad_norm": 0.48436620831489563, + "learning_rate": 9.325733634311513e-06, + "loss": 0.4109, + "step": 13500 + }, + { + "epoch": 3.4, + "grad_norm": 136.50823974609375, + "learning_rate": 9.308051166290444e-06, + "loss": 0.6214, + "step": 13550 + }, + { + "epoch": 3.41, + "grad_norm": 0.14412285387516022, + "learning_rate": 9.290368698269377e-06, + "loss": 0.2606, + "step": 13600 + }, + { + "epoch": 3.42, + "grad_norm": 11.025894165039062, + "learning_rate": 9.272686230248307e-06, + "loss": 0.7337, + "step": 13650 + }, + { + "epoch": 3.44, + "grad_norm": 121.1470718383789, + "learning_rate": 9.255003762227238e-06, + "loss": 0.7108, + "step": 13700 + }, + { + "epoch": 3.45, + "grad_norm": 0.08408990502357483, + "learning_rate": 9.23732129420617e-06, + "loss": 0.4979, + "step": 13750 + }, + { + "epoch": 3.46, + "grad_norm": 0.05547923222184181, + "learning_rate": 9.219638826185103e-06, + "loss": 0.299, + "step": 13800 + }, + { + "epoch": 3.47, + "grad_norm": 131.8295135498047, + "learning_rate": 9.201956358164033e-06, + "loss": 1.0513, + "step": 13850 + }, + { + "epoch": 3.49, + "grad_norm": 40.073734283447266, + "learning_rate": 9.184273890142966e-06, + "loss": 0.4599, + "step": 13900 + }, + { + "epoch": 3.5, + "grad_norm": 18.33232879638672, + "learning_rate": 9.166591422121897e-06, + "loss": 0.6182, + "step": 13950 + }, + { + "epoch": 3.51, + "grad_norm": 0.02969328872859478, + "learning_rate": 9.148908954100827e-06, + "loss": 0.4793, + "step": 14000 + }, + { + "epoch": 3.52, + "grad_norm": 0.36942940950393677, + "learning_rate": 9.13122648607976e-06, + "loss": 0.3778, + "step": 14050 + }, + { + "epoch": 3.54, + "grad_norm": 0.076649971306324, + "learning_rate": 9.113544018058692e-06, + "loss": 0.6148, + "step": 14100 + }, + { + "epoch": 3.55, + "grad_norm": 282.6568298339844, + "learning_rate": 9.095861550037623e-06, + "loss": 0.6784, + "step": 14150 + }, + { + "epoch": 3.56, + "grad_norm": 0.14636385440826416, + "learning_rate": 9.078179082016553e-06, + "loss": 0.9237, + "step": 14200 + }, + { + "epoch": 3.57, + "grad_norm": 0.014414280652999878, + "learning_rate": 9.060496613995486e-06, + "loss": 0.7111, + "step": 14250 + }, + { + "epoch": 3.59, + "grad_norm": 0.10564962774515152, + "learning_rate": 9.042814145974417e-06, + "loss": 0.4485, + "step": 14300 + }, + { + "epoch": 3.6, + "grad_norm": 0.10087831318378448, + "learning_rate": 9.025131677953347e-06, + "loss": 0.7537, + "step": 14350 + }, + { + "epoch": 3.61, + "grad_norm": 75.64422607421875, + "learning_rate": 9.007449209932281e-06, + "loss": 0.4629, + "step": 14400 + }, + { + "epoch": 3.62, + "grad_norm": 87.81208801269531, + "learning_rate": 8.989766741911212e-06, + "loss": 0.5313, + "step": 14450 + }, + { + "epoch": 3.64, + "grad_norm": 0.0018619262846186757, + "learning_rate": 8.972084273890143e-06, + "loss": 0.7642, + "step": 14500 + }, + { + "epoch": 3.65, + "grad_norm": 110.11195373535156, + "learning_rate": 8.954401805869075e-06, + "loss": 0.6499, + "step": 14550 + }, + { + "epoch": 3.66, + "grad_norm": 0.008621015585958958, + "learning_rate": 8.936719337848006e-06, + "loss": 0.3583, + "step": 14600 + }, + { + "epoch": 3.67, + "grad_norm": 0.022055380046367645, + "learning_rate": 8.919036869826937e-06, + "loss": 0.5497, + "step": 14650 + }, + { + "epoch": 3.69, + "grad_norm": 67.4389419555664, + "learning_rate": 8.90135440180587e-06, + "loss": 0.5981, + "step": 14700 + }, + { + "epoch": 3.7, + "grad_norm": 0.000478647300042212, + "learning_rate": 8.883671933784801e-06, + "loss": 0.259, + "step": 14750 + }, + { + "epoch": 3.71, + "grad_norm": 1.5297553539276123, + "learning_rate": 8.865989465763732e-06, + "loss": 0.6259, + "step": 14800 + }, + { + "epoch": 3.72, + "grad_norm": 36.321128845214844, + "learning_rate": 8.848306997742664e-06, + "loss": 0.6844, + "step": 14850 + }, + { + "epoch": 3.74, + "grad_norm": 175.9180450439453, + "learning_rate": 8.830624529721595e-06, + "loss": 0.5772, + "step": 14900 + }, + { + "epoch": 3.75, + "grad_norm": 178.33462524414062, + "learning_rate": 8.812942061700526e-06, + "loss": 0.5891, + "step": 14950 + }, + { + "epoch": 3.76, + "grad_norm": 0.00013845643843524158, + "learning_rate": 8.79525959367946e-06, + "loss": 0.464, + "step": 15000 + }, + { + "epoch": 3.77, + "grad_norm": 127.49348449707031, + "learning_rate": 8.77757712565839e-06, + "loss": 0.5844, + "step": 15050 + }, + { + "epoch": 3.79, + "grad_norm": 1.6402578353881836, + "learning_rate": 8.759894657637321e-06, + "loss": 0.7526, + "step": 15100 + }, + { + "epoch": 3.8, + "grad_norm": 0.008880015462636948, + "learning_rate": 8.742212189616254e-06, + "loss": 0.9234, + "step": 15150 + }, + { + "epoch": 3.81, + "grad_norm": 0.4811843931674957, + "learning_rate": 8.724529721595184e-06, + "loss": 0.848, + "step": 15200 + }, + { + "epoch": 3.82, + "grad_norm": 0.0008742750505916774, + "learning_rate": 8.706847253574115e-06, + "loss": 0.4136, + "step": 15250 + }, + { + "epoch": 3.84, + "grad_norm": 45.28816604614258, + "learning_rate": 8.68916478555305e-06, + "loss": 0.6978, + "step": 15300 + }, + { + "epoch": 3.85, + "grad_norm": 0.014465034939348698, + "learning_rate": 8.67148231753198e-06, + "loss": 0.6124, + "step": 15350 + }, + { + "epoch": 3.86, + "grad_norm": 0.01468442752957344, + "learning_rate": 8.65379984951091e-06, + "loss": 0.925, + "step": 15400 + }, + { + "epoch": 3.88, + "grad_norm": 1.076714283954061e-06, + "learning_rate": 8.636117381489843e-06, + "loss": 0.5271, + "step": 15450 + }, + { + "epoch": 3.89, + "grad_norm": 4.781663847097661e-07, + "learning_rate": 8.618434913468774e-06, + "loss": 0.4686, + "step": 15500 + }, + { + "epoch": 3.9, + "grad_norm": 1.0695022344589233, + "learning_rate": 8.600752445447704e-06, + "loss": 1.076, + "step": 15550 + }, + { + "epoch": 3.91, + "grad_norm": 0.3064178228378296, + "learning_rate": 8.583069977426637e-06, + "loss": 0.4409, + "step": 15600 + }, + { + "epoch": 3.93, + "grad_norm": 95.81256103515625, + "learning_rate": 8.56538750940557e-06, + "loss": 0.628, + "step": 15650 + }, + { + "epoch": 3.94, + "grad_norm": 0.011423008516430855, + "learning_rate": 8.5477050413845e-06, + "loss": 0.4738, + "step": 15700 + }, + { + "epoch": 3.95, + "grad_norm": 68.7823257446289, + "learning_rate": 8.53002257336343e-06, + "loss": 0.5614, + "step": 15750 + }, + { + "epoch": 3.96, + "grad_norm": 0.0003278045041952282, + "learning_rate": 8.512340105342363e-06, + "loss": 0.451, + "step": 15800 + }, + { + "epoch": 3.98, + "grad_norm": 5.685105293196102e-07, + "learning_rate": 8.494657637321294e-06, + "loss": 0.6919, + "step": 15850 + }, + { + "epoch": 3.99, + "grad_norm": 0.006908051203936338, + "learning_rate": 8.476975169300226e-06, + "loss": 0.7209, + "step": 15900 + }, + { + "epoch": 4.0, + "grad_norm": 0.14153322577476501, + "learning_rate": 8.459292701279158e-06, + "loss": 0.8544, + "step": 15950 + }, + { + "epoch": 4.01, + "grad_norm": 0.01233228575438261, + "learning_rate": 8.44161023325809e-06, + "loss": 0.1127, + "step": 16000 + }, + { + "epoch": 4.03, + "grad_norm": 0.02049972675740719, + "learning_rate": 8.42392776523702e-06, + "loss": 0.2392, + "step": 16050 + }, + { + "epoch": 4.04, + "grad_norm": 0.6001113653182983, + "learning_rate": 8.406245297215952e-06, + "loss": 0.2408, + "step": 16100 + }, + { + "epoch": 4.05, + "grad_norm": 0.7253586649894714, + "learning_rate": 8.388562829194883e-06, + "loss": 0.679, + "step": 16150 + }, + { + "epoch": 4.06, + "grad_norm": 0.20070885121822357, + "learning_rate": 8.370880361173815e-06, + "loss": 0.5534, + "step": 16200 + }, + { + "epoch": 4.08, + "grad_norm": 0.004428381100296974, + "learning_rate": 8.353197893152748e-06, + "loss": 0.3753, + "step": 16250 + }, + { + "epoch": 4.09, + "grad_norm": 0.1646382063627243, + "learning_rate": 8.335515425131678e-06, + "loss": 0.7136, + "step": 16300 + }, + { + "epoch": 4.1, + "grad_norm": 4.304123401641846, + "learning_rate": 8.31783295711061e-06, + "loss": 0.6533, + "step": 16350 + }, + { + "epoch": 4.11, + "grad_norm": 0.0014060864923521876, + "learning_rate": 8.300150489089542e-06, + "loss": 0.2931, + "step": 16400 + }, + { + "epoch": 4.13, + "grad_norm": 10.760331153869629, + "learning_rate": 8.282468021068472e-06, + "loss": 0.2996, + "step": 16450 + }, + { + "epoch": 4.14, + "grad_norm": 151.8526611328125, + "learning_rate": 8.264785553047405e-06, + "loss": 0.2342, + "step": 16500 + }, + { + "epoch": 4.15, + "grad_norm": 0.2262250781059265, + "learning_rate": 8.247103085026337e-06, + "loss": 0.2152, + "step": 16550 + }, + { + "epoch": 4.16, + "grad_norm": 0.028175359591841698, + "learning_rate": 8.229420617005268e-06, + "loss": 0.2108, + "step": 16600 + }, + { + "epoch": 4.18, + "grad_norm": 1.2244036197662354, + "learning_rate": 8.211738148984198e-06, + "loss": 0.4471, + "step": 16650 + }, + { + "epoch": 4.19, + "grad_norm": 0.12875045835971832, + "learning_rate": 8.194055680963131e-06, + "loss": 0.5662, + "step": 16700 + }, + { + "epoch": 4.2, + "grad_norm": 3.702627420425415, + "learning_rate": 8.176373212942062e-06, + "loss": 0.3945, + "step": 16750 + }, + { + "epoch": 4.21, + "grad_norm": 50.61404800415039, + "learning_rate": 8.158690744920994e-06, + "loss": 0.2347, + "step": 16800 + }, + { + "epoch": 4.23, + "grad_norm": 0.736967146396637, + "learning_rate": 8.141008276899926e-06, + "loss": 0.2615, + "step": 16850 + }, + { + "epoch": 4.24, + "grad_norm": 0.00011446132702985778, + "learning_rate": 8.123325808878857e-06, + "loss": 0.5149, + "step": 16900 + }, + { + "epoch": 4.25, + "grad_norm": 0.0010398293379694223, + "learning_rate": 8.105643340857788e-06, + "loss": 0.2957, + "step": 16950 + }, + { + "epoch": 4.26, + "grad_norm": 0.26418277621269226, + "learning_rate": 8.08796087283672e-06, + "loss": 0.2704, + "step": 17000 + }, + { + "epoch": 4.28, + "grad_norm": 0.8061837553977966, + "learning_rate": 8.070278404815651e-06, + "loss": 0.367, + "step": 17050 + }, + { + "epoch": 4.29, + "grad_norm": 0.010115943849086761, + "learning_rate": 8.052595936794583e-06, + "loss": 0.3768, + "step": 17100 + }, + { + "epoch": 4.3, + "grad_norm": 27.51811981201172, + "learning_rate": 8.034913468773514e-06, + "loss": 0.3892, + "step": 17150 + }, + { + "epoch": 4.31, + "grad_norm": 0.000684226572047919, + "learning_rate": 8.017231000752446e-06, + "loss": 0.1805, + "step": 17200 + }, + { + "epoch": 4.33, + "grad_norm": 0.08357678353786469, + "learning_rate": 7.999548532731377e-06, + "loss": 0.2773, + "step": 17250 + }, + { + "epoch": 4.34, + "grad_norm": 292.503662109375, + "learning_rate": 7.981866064710308e-06, + "loss": 0.6283, + "step": 17300 + }, + { + "epoch": 4.35, + "grad_norm": 0.1264430582523346, + "learning_rate": 7.96418359668924e-06, + "loss": 0.4072, + "step": 17350 + }, + { + "epoch": 4.36, + "grad_norm": 1.3433716958388686e-05, + "learning_rate": 7.946501128668173e-06, + "loss": 0.8405, + "step": 17400 + }, + { + "epoch": 4.38, + "grad_norm": 27.759994506835938, + "learning_rate": 7.928818660647103e-06, + "loss": 0.4456, + "step": 17450 + }, + { + "epoch": 4.39, + "grad_norm": 369.9099426269531, + "learning_rate": 7.911136192626036e-06, + "loss": 0.3382, + "step": 17500 + }, + { + "epoch": 4.4, + "grad_norm": 6.0055251121521, + "learning_rate": 7.893453724604966e-06, + "loss": 0.329, + "step": 17550 + }, + { + "epoch": 4.41, + "grad_norm": 0.17973710596561432, + "learning_rate": 7.875771256583897e-06, + "loss": 0.6193, + "step": 17600 + }, + { + "epoch": 4.43, + "grad_norm": 0.03942597284913063, + "learning_rate": 7.85808878856283e-06, + "loss": 0.3823, + "step": 17650 + }, + { + "epoch": 4.44, + "grad_norm": 0.0010533991735428572, + "learning_rate": 7.840406320541762e-06, + "loss": 0.6641, + "step": 17700 + }, + { + "epoch": 4.45, + "grad_norm": 3.6850650531050633e-07, + "learning_rate": 7.822723852520693e-06, + "loss": 0.4148, + "step": 17750 + }, + { + "epoch": 4.46, + "grad_norm": 5.283959399093874e-05, + "learning_rate": 7.805041384499625e-06, + "loss": 0.799, + "step": 17800 + }, + { + "epoch": 4.48, + "grad_norm": 0.01196613721549511, + "learning_rate": 7.787358916478556e-06, + "loss": 0.5424, + "step": 17850 + }, + { + "epoch": 4.49, + "grad_norm": 211.05799865722656, + "learning_rate": 7.769676448457486e-06, + "loss": 0.2341, + "step": 17900 + }, + { + "epoch": 4.5, + "grad_norm": 8.655371743770957e-07, + "learning_rate": 7.751993980436419e-06, + "loss": 0.5349, + "step": 17950 + }, + { + "epoch": 4.51, + "grad_norm": 1.5644945408621602e-11, + "learning_rate": 7.734311512415351e-06, + "loss": 0.0804, + "step": 18000 + }, + { + "epoch": 4.53, + "grad_norm": 0.00036508633638732135, + "learning_rate": 7.716629044394282e-06, + "loss": 0.3295, + "step": 18050 + }, + { + "epoch": 4.54, + "grad_norm": 1.3209816270357e-13, + "learning_rate": 7.698946576373214e-06, + "loss": 0.3606, + "step": 18100 + }, + { + "epoch": 4.55, + "grad_norm": 314.8194885253906, + "learning_rate": 7.681264108352145e-06, + "loss": 0.3064, + "step": 18150 + }, + { + "epoch": 4.56, + "grad_norm": 1.250010797093637e-07, + "learning_rate": 7.663581640331076e-06, + "loss": 0.2967, + "step": 18200 + }, + { + "epoch": 4.58, + "grad_norm": 1.0573174953460693, + "learning_rate": 7.645899172310008e-06, + "loss": 0.4857, + "step": 18250 + }, + { + "epoch": 4.59, + "grad_norm": 204.4314727783203, + "learning_rate": 7.628216704288939e-06, + "loss": 0.358, + "step": 18300 + }, + { + "epoch": 4.6, + "grad_norm": 0.02004345878958702, + "learning_rate": 7.610534236267871e-06, + "loss": 0.753, + "step": 18350 + }, + { + "epoch": 4.61, + "grad_norm": 302.43280029296875, + "learning_rate": 7.592851768246803e-06, + "loss": 0.3723, + "step": 18400 + }, + { + "epoch": 4.63, + "grad_norm": 0.0004978284705430269, + "learning_rate": 7.575169300225734e-06, + "loss": 0.4034, + "step": 18450 + }, + { + "epoch": 4.64, + "grad_norm": 93.66849517822266, + "learning_rate": 7.557486832204665e-06, + "loss": 0.4249, + "step": 18500 + }, + { + "epoch": 4.65, + "grad_norm": 0.001678618835285306, + "learning_rate": 7.5398043641835965e-06, + "loss": 0.7051, + "step": 18550 + }, + { + "epoch": 4.67, + "grad_norm": 0.37766626477241516, + "learning_rate": 7.522121896162528e-06, + "loss": 0.2375, + "step": 18600 + }, + { + "epoch": 4.68, + "grad_norm": 268.7151184082031, + "learning_rate": 7.50443942814146e-06, + "loss": 0.9723, + "step": 18650 + }, + { + "epoch": 4.69, + "grad_norm": 10.93520450592041, + "learning_rate": 7.486756960120392e-06, + "loss": 0.4446, + "step": 18700 + }, + { + "epoch": 4.7, + "grad_norm": 0.0002736333408392966, + "learning_rate": 7.4690744920993235e-06, + "loss": 0.5695, + "step": 18750 + }, + { + "epoch": 4.72, + "grad_norm": 0.006334410980343819, + "learning_rate": 7.451392024078254e-06, + "loss": 0.4816, + "step": 18800 + }, + { + "epoch": 4.73, + "grad_norm": 2.021748046754368e-10, + "learning_rate": 7.433709556057186e-06, + "loss": 0.6205, + "step": 18850 + }, + { + "epoch": 4.74, + "grad_norm": 77.61640930175781, + "learning_rate": 7.416027088036117e-06, + "loss": 0.1743, + "step": 18900 + }, + { + "epoch": 4.75, + "grad_norm": 0.24281173944473267, + "learning_rate": 7.39834462001505e-06, + "loss": 0.3958, + "step": 18950 + }, + { + "epoch": 4.77, + "grad_norm": 0.0005730040138587356, + "learning_rate": 7.380662151993981e-06, + "loss": 0.2709, + "step": 19000 + }, + { + "epoch": 4.78, + "grad_norm": 25.074310302734375, + "learning_rate": 7.362979683972912e-06, + "loss": 0.3811, + "step": 19050 + }, + { + "epoch": 4.79, + "grad_norm": 0.0002688245731405914, + "learning_rate": 7.3452972159518435e-06, + "loss": 0.2937, + "step": 19100 + }, + { + "epoch": 4.8, + "grad_norm": 6.246182601898909e-05, + "learning_rate": 7.327614747930775e-06, + "loss": 0.2862, + "step": 19150 + }, + { + "epoch": 4.82, + "grad_norm": 0.000318751554004848, + "learning_rate": 7.309932279909706e-06, + "loss": 0.132, + "step": 19200 + }, + { + "epoch": 4.83, + "grad_norm": 285.48297119140625, + "learning_rate": 7.292249811888639e-06, + "loss": 0.3126, + "step": 19250 + }, + { + "epoch": 4.84, + "grad_norm": 214.23065185546875, + "learning_rate": 7.2745673438675705e-06, + "loss": 0.6408, + "step": 19300 + }, + { + "epoch": 4.85, + "grad_norm": 305.9626159667969, + "learning_rate": 7.256884875846501e-06, + "loss": 0.4605, + "step": 19350 + }, + { + "epoch": 4.87, + "grad_norm": 4.2915186782011006e-07, + "learning_rate": 7.239202407825433e-06, + "loss": 0.2248, + "step": 19400 + }, + { + "epoch": 4.88, + "grad_norm": 265.24072265625, + "learning_rate": 7.221519939804364e-06, + "loss": 0.6776, + "step": 19450 + }, + { + "epoch": 4.89, + "grad_norm": 250.4654083251953, + "learning_rate": 7.203837471783295e-06, + "loss": 0.3709, + "step": 19500 + }, + { + "epoch": 4.9, + "grad_norm": 0.0005780484061688185, + "learning_rate": 7.186155003762228e-06, + "loss": 0.3521, + "step": 19550 + }, + { + "epoch": 4.92, + "grad_norm": 8.780172348022461, + "learning_rate": 7.168472535741159e-06, + "loss": 0.3998, + "step": 19600 + }, + { + "epoch": 4.93, + "grad_norm": 7.643636703491211, + "learning_rate": 7.1507900677200905e-06, + "loss": 0.5537, + "step": 19650 + }, + { + "epoch": 4.94, + "grad_norm": 0.0002484459837432951, + "learning_rate": 7.133107599699022e-06, + "loss": 0.4808, + "step": 19700 + }, + { + "epoch": 4.95, + "grad_norm": 0.2631732225418091, + "learning_rate": 7.115425131677953e-06, + "loss": 0.6781, + "step": 19750 + }, + { + "epoch": 4.97, + "grad_norm": 0.0346391536295414, + "learning_rate": 7.097742663656884e-06, + "loss": 0.1673, + "step": 19800 + }, + { + "epoch": 4.98, + "grad_norm": 0.006426098290830851, + "learning_rate": 7.0800601956358176e-06, + "loss": 0.2001, + "step": 19850 + }, + { + "epoch": 4.99, + "grad_norm": 0.070701465010643, + "learning_rate": 7.062377727614748e-06, + "loss": 0.6119, + "step": 19900 + }, + { + "epoch": 5.0, + "grad_norm": 9.641678479965776e-05, + "learning_rate": 7.04469525959368e-06, + "loss": 0.1432, + "step": 19950 + }, + { + "epoch": 5.02, + "grad_norm": 80.3648681640625, + "learning_rate": 7.027012791572611e-06, + "loss": 0.2837, + "step": 20000 + }, + { + "epoch": 5.03, + "grad_norm": 7.515856123063713e-05, + "learning_rate": 7.009330323551543e-06, + "loss": 0.0325, + "step": 20050 + }, + { + "epoch": 5.04, + "grad_norm": 9.76786541286856e-05, + "learning_rate": 6.9916478555304745e-06, + "loss": 0.28, + "step": 20100 + }, + { + "epoch": 5.05, + "grad_norm": 0.058834467083215714, + "learning_rate": 6.973965387509406e-06, + "loss": 0.119, + "step": 20150 + }, + { + "epoch": 5.07, + "grad_norm": 3.0734496116638184, + "learning_rate": 6.9562829194883376e-06, + "loss": 0.1121, + "step": 20200 + }, + { + "epoch": 5.08, + "grad_norm": 173.53060913085938, + "learning_rate": 6.938600451467269e-06, + "loss": 0.4994, + "step": 20250 + }, + { + "epoch": 5.09, + "grad_norm": 1.482841071265284e-06, + "learning_rate": 6.920917983446201e-06, + "loss": 0.4273, + "step": 20300 + }, + { + "epoch": 5.1, + "grad_norm": 0.06339254975318909, + "learning_rate": 6.903235515425132e-06, + "loss": 0.0653, + "step": 20350 + }, + { + "epoch": 5.12, + "grad_norm": 29.73435401916504, + "learning_rate": 6.885553047404064e-06, + "loss": 0.0064, + "step": 20400 + }, + { + "epoch": 5.13, + "grad_norm": 0.0535583458840847, + "learning_rate": 6.8678705793829944e-06, + "loss": 0.1328, + "step": 20450 + }, + { + "epoch": 5.14, + "grad_norm": 0.016700129956007004, + "learning_rate": 6.850188111361927e-06, + "loss": 0.3879, + "step": 20500 + }, + { + "epoch": 5.15, + "grad_norm": 3.702952017192729e-05, + "learning_rate": 6.832505643340858e-06, + "loss": 0.1604, + "step": 20550 + }, + { + "epoch": 5.17, + "grad_norm": 0.03472837060689926, + "learning_rate": 6.814823175319789e-06, + "loss": 0.2436, + "step": 20600 + }, + { + "epoch": 5.18, + "grad_norm": 3.1909748940961435e-05, + "learning_rate": 6.7971407072987215e-06, + "loss": 0.1352, + "step": 20650 + }, + { + "epoch": 5.19, + "grad_norm": 0.3979862630367279, + "learning_rate": 6.779458239277653e-06, + "loss": 0.1159, + "step": 20700 + }, + { + "epoch": 5.2, + "grad_norm": 0.0028309274930506945, + "learning_rate": 6.761775771256584e-06, + "loss": 0.2612, + "step": 20750 + }, + { + "epoch": 5.22, + "grad_norm": 0.7586016654968262, + "learning_rate": 6.744093303235516e-06, + "loss": 0.4589, + "step": 20800 + }, + { + "epoch": 5.23, + "grad_norm": 0.0062132058665156364, + "learning_rate": 6.726410835214448e-06, + "loss": 0.0843, + "step": 20850 + }, + { + "epoch": 5.24, + "grad_norm": 0.01292335707694292, + "learning_rate": 6.708728367193378e-06, + "loss": 0.092, + "step": 20900 + }, + { + "epoch": 5.25, + "grad_norm": 0.0012096440186724067, + "learning_rate": 6.691045899172311e-06, + "loss": 0.1515, + "step": 20950 + }, + { + "epoch": 5.27, + "grad_norm": 0.003023844677954912, + "learning_rate": 6.673363431151242e-06, + "loss": 0.3177, + "step": 21000 + }, + { + "epoch": 5.28, + "grad_norm": 106.2956771850586, + "learning_rate": 6.655680963130173e-06, + "loss": 0.0315, + "step": 21050 + }, + { + "epoch": 5.29, + "grad_norm": 0.0011365425307303667, + "learning_rate": 6.637998495109105e-06, + "loss": 0.0159, + "step": 21100 + }, + { + "epoch": 5.3, + "grad_norm": 39.502681732177734, + "learning_rate": 6.620316027088036e-06, + "loss": 0.3804, + "step": 21150 + }, + { + "epoch": 5.32, + "grad_norm": 0.017230931669473648, + "learning_rate": 6.602633559066968e-06, + "loss": 0.0453, + "step": 21200 + }, + { + "epoch": 5.33, + "grad_norm": 6.043082976248115e-06, + "learning_rate": 6.584951091045899e-06, + "loss": 0.3094, + "step": 21250 + }, + { + "epoch": 5.34, + "grad_norm": 6.83969769710302e-10, + "learning_rate": 6.567268623024831e-06, + "loss": 0.3382, + "step": 21300 + }, + { + "epoch": 5.35, + "grad_norm": 1.2151496714234156e-13, + "learning_rate": 6.549586155003762e-06, + "loss": 0.0469, + "step": 21350 + }, + { + "epoch": 5.37, + "grad_norm": 129.63966369628906, + "learning_rate": 6.531903686982694e-06, + "loss": 0.1542, + "step": 21400 + }, + { + "epoch": 5.38, + "grad_norm": 8.008062764019996e-07, + "learning_rate": 6.514221218961625e-06, + "loss": 0.1121, + "step": 21450 + }, + { + "epoch": 5.39, + "grad_norm": 195.3101043701172, + "learning_rate": 6.496538750940557e-06, + "loss": 0.134, + "step": 21500 + }, + { + "epoch": 5.41, + "grad_norm": 0.44227921962738037, + "learning_rate": 6.4788562829194885e-06, + "loss": 0.1614, + "step": 21550 + }, + { + "epoch": 5.42, + "grad_norm": 389.9450988769531, + "learning_rate": 6.46117381489842e-06, + "loss": 0.2223, + "step": 21600 + }, + { + "epoch": 5.43, + "grad_norm": 2.417748987681989e-07, + "learning_rate": 6.443491346877352e-06, + "loss": 0.2297, + "step": 21650 + }, + { + "epoch": 5.44, + "grad_norm": 0.0011466313153505325, + "learning_rate": 6.425808878856283e-06, + "loss": 0.0367, + "step": 21700 + }, + { + "epoch": 5.46, + "grad_norm": 0.4562750458717346, + "learning_rate": 6.408126410835215e-06, + "loss": 0.3361, + "step": 21750 + }, + { + "epoch": 5.47, + "grad_norm": 3.822188591584563e-05, + "learning_rate": 6.390443942814146e-06, + "loss": 0.0979, + "step": 21800 + }, + { + "epoch": 5.48, + "grad_norm": 100.44294738769531, + "learning_rate": 6.372761474793078e-06, + "loss": 0.0573, + "step": 21850 + }, + { + "epoch": 5.49, + "grad_norm": 1.8141976397600956e-05, + "learning_rate": 6.355079006772009e-06, + "loss": 0.6044, + "step": 21900 + }, + { + "epoch": 5.51, + "grad_norm": 2.5538651055034833e-12, + "learning_rate": 6.337396538750941e-06, + "loss": 0.2549, + "step": 21950 + }, + { + "epoch": 5.52, + "grad_norm": 7.968230164578927e-08, + "learning_rate": 6.319714070729872e-06, + "loss": 0.36, + "step": 22000 + }, + { + "epoch": 5.53, + "grad_norm": 0.001464845146983862, + "learning_rate": 6.302031602708804e-06, + "loss": 0.0043, + "step": 22050 + }, + { + "epoch": 5.54, + "grad_norm": 0.5217474102973938, + "learning_rate": 6.2843491346877355e-06, + "loss": 0.3358, + "step": 22100 + }, + { + "epoch": 5.56, + "grad_norm": 2.1627647583954968e-05, + "learning_rate": 6.266666666666666e-06, + "loss": 0.4962, + "step": 22150 + }, + { + "epoch": 5.57, + "grad_norm": 0.0039770700968801975, + "learning_rate": 6.248984198645599e-06, + "loss": 0.0886, + "step": 22200 + }, + { + "epoch": 5.58, + "grad_norm": 0.028452860191464424, + "learning_rate": 6.23130173062453e-06, + "loss": 0.2919, + "step": 22250 + }, + { + "epoch": 5.59, + "grad_norm": 1.0354268550872803, + "learning_rate": 6.213619262603461e-06, + "loss": 0.1583, + "step": 22300 + }, + { + "epoch": 5.61, + "grad_norm": 0.0001276719121960923, + "learning_rate": 6.195936794582393e-06, + "loss": 0.1062, + "step": 22350 + }, + { + "epoch": 5.62, + "grad_norm": 213.48941040039062, + "learning_rate": 6.178254326561325e-06, + "loss": 0.377, + "step": 22400 + }, + { + "epoch": 5.63, + "grad_norm": 8.587969205109403e-06, + "learning_rate": 6.1605718585402555e-06, + "loss": 0.2043, + "step": 22450 + }, + { + "epoch": 5.64, + "grad_norm": 0.011805477552115917, + "learning_rate": 6.142889390519188e-06, + "loss": 0.2744, + "step": 22500 + }, + { + "epoch": 5.66, + "grad_norm": 1.4445524776363072e-08, + "learning_rate": 6.125206922498119e-06, + "loss": 0.0145, + "step": 22550 + }, + { + "epoch": 5.67, + "grad_norm": 136.72720336914062, + "learning_rate": 6.10752445447705e-06, + "loss": 0.1608, + "step": 22600 + }, + { + "epoch": 5.68, + "grad_norm": 8.377895937883295e-06, + "learning_rate": 6.0898419864559826e-06, + "loss": 0.1146, + "step": 22650 + }, + { + "epoch": 5.69, + "grad_norm": 0.0005771568394266069, + "learning_rate": 6.072159518434913e-06, + "loss": 0.3716, + "step": 22700 + }, + { + "epoch": 5.71, + "grad_norm": 0.0033020416740328074, + "learning_rate": 6.054477050413845e-06, + "loss": 0.1609, + "step": 22750 + }, + { + "epoch": 5.72, + "grad_norm": 0.014289168640971184, + "learning_rate": 6.036794582392777e-06, + "loss": 0.2873, + "step": 22800 + }, + { + "epoch": 5.73, + "grad_norm": 433.4857482910156, + "learning_rate": 6.019112114371708e-06, + "loss": 0.2766, + "step": 22850 + }, + { + "epoch": 5.74, + "grad_norm": 51.506011962890625, + "learning_rate": 6.0014296463506395e-06, + "loss": 0.2557, + "step": 22900 + }, + { + "epoch": 5.76, + "grad_norm": 2.9865319106647803e-07, + "learning_rate": 5.983747178329572e-06, + "loss": 0.052, + "step": 22950 + }, + { + "epoch": 5.77, + "grad_norm": 0.0004749756189994514, + "learning_rate": 5.9660647103085026e-06, + "loss": 0.048, + "step": 23000 + }, + { + "epoch": 5.78, + "grad_norm": 296.063720703125, + "learning_rate": 5.948382242287434e-06, + "loss": 0.1432, + "step": 23050 + }, + { + "epoch": 5.79, + "grad_norm": 0.002446663100272417, + "learning_rate": 5.9306997742663665e-06, + "loss": 0.3151, + "step": 23100 + }, + { + "epoch": 5.81, + "grad_norm": 0.012231925502419472, + "learning_rate": 5.913017306245297e-06, + "loss": 0.0295, + "step": 23150 + }, + { + "epoch": 5.82, + "grad_norm": 0.006459045223891735, + "learning_rate": 5.895334838224229e-06, + "loss": 0.0319, + "step": 23200 + }, + { + "epoch": 5.83, + "grad_norm": 5.6175377238787405e-08, + "learning_rate": 5.87765237020316e-06, + "loss": 0.1096, + "step": 23250 + }, + { + "epoch": 5.84, + "grad_norm": 9.727654060043278e-07, + "learning_rate": 5.859969902182092e-06, + "loss": 0.365, + "step": 23300 + }, + { + "epoch": 5.86, + "grad_norm": 167.01791381835938, + "learning_rate": 5.842287434161023e-06, + "loss": 0.0494, + "step": 23350 + }, + { + "epoch": 5.87, + "grad_norm": 0.05854243040084839, + "learning_rate": 5.824604966139955e-06, + "loss": 0.0218, + "step": 23400 + }, + { + "epoch": 5.88, + "grad_norm": 2.8002886676148364e-09, + "learning_rate": 5.8069224981188865e-06, + "loss": 0.0119, + "step": 23450 + }, + { + "epoch": 5.89, + "grad_norm": 455.8995361328125, + "learning_rate": 5.789240030097818e-06, + "loss": 0.3402, + "step": 23500 + }, + { + "epoch": 5.91, + "grad_norm": 0.0034980960190296173, + "learning_rate": 5.77155756207675e-06, + "loss": 0.1623, + "step": 23550 + }, + { + "epoch": 5.92, + "grad_norm": 0.048077382147312164, + "learning_rate": 5.753875094055681e-06, + "loss": 0.5028, + "step": 23600 + }, + { + "epoch": 5.93, + "grad_norm": 1.1395950317382812, + "learning_rate": 5.736192626034613e-06, + "loss": 0.1841, + "step": 23650 + }, + { + "epoch": 5.94, + "grad_norm": 3.0090935979387723e-05, + "learning_rate": 5.718510158013544e-06, + "loss": 0.5312, + "step": 23700 + }, + { + "epoch": 5.96, + "grad_norm": 4.985315626981901e-08, + "learning_rate": 5.700827689992476e-06, + "loss": 0.0867, + "step": 23750 + }, + { + "epoch": 5.97, + "grad_norm": 0.7515669465065002, + "learning_rate": 5.683145221971407e-06, + "loss": 0.3645, + "step": 23800 + }, + { + "epoch": 5.98, + "grad_norm": 14.448786735534668, + "learning_rate": 5.665462753950339e-06, + "loss": 0.0975, + "step": 23850 + }, + { + "epoch": 5.99, + "grad_norm": 0.58511883020401, + "learning_rate": 5.6477802859292704e-06, + "loss": 0.0981, + "step": 23900 + }, + { + "epoch": 6.01, + "grad_norm": 5.8292873291065916e-05, + "learning_rate": 5.630097817908202e-06, + "loss": 0.2598, + "step": 23950 + }, + { + "epoch": 6.02, + "grad_norm": 0.03704287111759186, + "learning_rate": 5.6124153498871335e-06, + "loss": 0.1594, + "step": 24000 + }, + { + "epoch": 6.03, + "grad_norm": 0.0010854690335690975, + "learning_rate": 5.594732881866065e-06, + "loss": 0.2415, + "step": 24050 + }, + { + "epoch": 6.04, + "grad_norm": 381.2314147949219, + "learning_rate": 5.577050413844996e-06, + "loss": 0.0477, + "step": 24100 + }, + { + "epoch": 6.06, + "grad_norm": 8.66334667080082e-05, + "learning_rate": 5.559367945823928e-06, + "loss": 0.0424, + "step": 24150 + }, + { + "epoch": 6.07, + "grad_norm": 0.019515322521328926, + "learning_rate": 5.54168547780286e-06, + "loss": 0.3617, + "step": 24200 + }, + { + "epoch": 6.08, + "grad_norm": 0.00011614364484557882, + "learning_rate": 5.52400300978179e-06, + "loss": 0.1944, + "step": 24250 + }, + { + "epoch": 6.09, + "grad_norm": 0.00019373864051885903, + "learning_rate": 5.506320541760723e-06, + "loss": 0.0011, + "step": 24300 + }, + { + "epoch": 6.11, + "grad_norm": 1.0937032612901021e-08, + "learning_rate": 5.488638073739654e-06, + "loss": 0.0014, + "step": 24350 + }, + { + "epoch": 6.12, + "grad_norm": 2.1784097691945198e-13, + "learning_rate": 5.470955605718585e-06, + "loss": 0.0055, + "step": 24400 + }, + { + "epoch": 6.13, + "grad_norm": 0.01839843951165676, + "learning_rate": 5.4532731376975175e-06, + "loss": 0.0042, + "step": 24450 + }, + { + "epoch": 6.14, + "grad_norm": 4.981990930907898e-10, + "learning_rate": 5.435590669676449e-06, + "loss": 0.103, + "step": 24500 + }, + { + "epoch": 6.16, + "grad_norm": 0.0047708419151604176, + "learning_rate": 5.41790820165538e-06, + "loss": 0.0022, + "step": 24550 + }, + { + "epoch": 6.17, + "grad_norm": 0.003085497999563813, + "learning_rate": 5.400225733634312e-06, + "loss": 0.0021, + "step": 24600 + }, + { + "epoch": 6.18, + "grad_norm": 6.570710642250788e-11, + "learning_rate": 5.382543265613244e-06, + "loss": 0.2051, + "step": 24650 + }, + { + "epoch": 6.2, + "grad_norm": 0.0029285515192896128, + "learning_rate": 5.364860797592174e-06, + "loss": 0.0012, + "step": 24700 + }, + { + "epoch": 6.21, + "grad_norm": 3.4288578376617806e-07, + "learning_rate": 5.347178329571107e-06, + "loss": 0.0001, + "step": 24750 + }, + { + "epoch": 6.22, + "grad_norm": 0.00539399404078722, + "learning_rate": 5.3294958615500375e-06, + "loss": 0.2899, + "step": 24800 + }, + { + "epoch": 6.23, + "grad_norm": 2.6356909188507416e-07, + "learning_rate": 5.311813393528969e-06, + "loss": 0.0019, + "step": 24850 + }, + { + "epoch": 6.25, + "grad_norm": 0.019658172503113747, + "learning_rate": 5.294130925507901e-06, + "loss": 0.1133, + "step": 24900 + }, + { + "epoch": 6.26, + "grad_norm": 4.7282670834203344e-11, + "learning_rate": 5.276448457486832e-06, + "loss": 0.0001, + "step": 24950 + }, + { + "epoch": 6.27, + "grad_norm": 1.2473710739868693e-06, + "learning_rate": 5.258765989465764e-06, + "loss": 0.1143, + "step": 25000 + }, + { + "epoch": 6.28, + "grad_norm": 0.38085153698921204, + "learning_rate": 5.241083521444696e-06, + "loss": 0.059, + "step": 25050 + }, + { + "epoch": 6.3, + "grad_norm": 5.584224224090576, + "learning_rate": 5.223401053423627e-06, + "loss": 0.0833, + "step": 25100 + }, + { + "epoch": 6.31, + "grad_norm": 9.337106348539237e-06, + "learning_rate": 5.205718585402558e-06, + "loss": 0.0876, + "step": 25150 + }, + { + "epoch": 6.32, + "grad_norm": 4.118080099146937e-08, + "learning_rate": 5.188036117381491e-06, + "loss": 0.0703, + "step": 25200 + }, + { + "epoch": 6.33, + "grad_norm": 1.8987177554663504e-06, + "learning_rate": 5.170353649360421e-06, + "loss": 0.0625, + "step": 25250 + }, + { + "epoch": 6.35, + "grad_norm": 4.3221673462490173e-10, + "learning_rate": 5.152671181339353e-06, + "loss": 0.0284, + "step": 25300 + }, + { + "epoch": 6.36, + "grad_norm": 0.000691065622959286, + "learning_rate": 5.134988713318285e-06, + "loss": 0.0422, + "step": 25350 + }, + { + "epoch": 6.37, + "grad_norm": 0.00046700576785951853, + "learning_rate": 5.117306245297216e-06, + "loss": 0.0001, + "step": 25400 + }, + { + "epoch": 6.38, + "grad_norm": 0.008938438259065151, + "learning_rate": 5.099623777276148e-06, + "loss": 0.0141, + "step": 25450 + }, + { + "epoch": 6.4, + "grad_norm": 0.16503383219242096, + "learning_rate": 5.081941309255079e-06, + "loss": 0.0646, + "step": 25500 + }, + { + "epoch": 6.41, + "grad_norm": 8.952581993071362e-06, + "learning_rate": 5.064258841234011e-06, + "loss": 0.036, + "step": 25550 + }, + { + "epoch": 6.42, + "grad_norm": 0.014195716008543968, + "learning_rate": 5.046576373212942e-06, + "loss": 0.0005, + "step": 25600 + }, + { + "epoch": 6.43, + "grad_norm": 0.00028850819217041135, + "learning_rate": 5.028893905191874e-06, + "loss": 0.0754, + "step": 25650 + }, + { + "epoch": 6.45, + "grad_norm": 0.00020963407587260008, + "learning_rate": 5.011211437170805e-06, + "loss": 0.0003, + "step": 25700 + }, + { + "epoch": 6.46, + "grad_norm": 0.0010497659677639604, + "learning_rate": 4.993528969149737e-06, + "loss": 0.6013, + "step": 25750 + }, + { + "epoch": 6.47, + "grad_norm": 1.387237716699019e-06, + "learning_rate": 4.975846501128668e-06, + "loss": 0.005, + "step": 25800 + }, + { + "epoch": 6.48, + "grad_norm": 1.8294354958925396e-05, + "learning_rate": 4.9581640331076e-06, + "loss": 0.0, + "step": 25850 + }, + { + "epoch": 6.5, + "grad_norm": 4.903622539131902e-06, + "learning_rate": 4.9404815650865315e-06, + "loss": 0.0003, + "step": 25900 + }, + { + "epoch": 6.51, + "grad_norm": 0.000930552021600306, + "learning_rate": 4.922799097065463e-06, + "loss": 0.0464, + "step": 25950 + }, + { + "epoch": 6.52, + "grad_norm": 2.9821951102348976e-05, + "learning_rate": 4.905116629044395e-06, + "loss": 0.0854, + "step": 26000 + }, + { + "epoch": 6.53, + "grad_norm": 0.19266781210899353, + "learning_rate": 4.887434161023326e-06, + "loss": 0.1578, + "step": 26050 + }, + { + "epoch": 6.55, + "grad_norm": 6.610630862269318e-06, + "learning_rate": 4.869751693002258e-06, + "loss": 0.0004, + "step": 26100 + }, + { + "epoch": 6.56, + "grad_norm": 6.910874503773812e-07, + "learning_rate": 4.852069224981189e-06, + "loss": 0.0013, + "step": 26150 + }, + { + "epoch": 6.57, + "grad_norm": 0.00030907560721971095, + "learning_rate": 4.834386756960121e-06, + "loss": 0.0005, + "step": 26200 + }, + { + "epoch": 6.58, + "grad_norm": 1.2135699112292286e-09, + "learning_rate": 4.816704288939052e-06, + "loss": 0.1581, + "step": 26250 + }, + { + "epoch": 6.6, + "grad_norm": 8.979808626463637e-06, + "learning_rate": 4.799021820917984e-06, + "loss": 0.1339, + "step": 26300 + }, + { + "epoch": 6.61, + "grad_norm": 8.109305053949356e-05, + "learning_rate": 4.781339352896915e-06, + "loss": 0.1607, + "step": 26350 + }, + { + "epoch": 6.62, + "grad_norm": 0.11362000554800034, + "learning_rate": 4.763656884875847e-06, + "loss": 0.0152, + "step": 26400 + }, + { + "epoch": 6.63, + "grad_norm": 3.168620969518088e-05, + "learning_rate": 4.7459744168547785e-06, + "loss": 0.062, + "step": 26450 + }, + { + "epoch": 6.65, + "grad_norm": 2.37572979927063, + "learning_rate": 4.728291948833709e-06, + "loss": 0.0001, + "step": 26500 + }, + { + "epoch": 6.66, + "grad_norm": 1.1477128509795875e-06, + "learning_rate": 4.710609480812642e-06, + "loss": 0.2304, + "step": 26550 + }, + { + "epoch": 6.67, + "grad_norm": 3.561492079029449e-08, + "learning_rate": 4.692927012791573e-06, + "loss": 0.1046, + "step": 26600 + }, + { + "epoch": 6.68, + "grad_norm": 1.6958483457565308, + "learning_rate": 4.675244544770504e-06, + "loss": 0.0273, + "step": 26650 + }, + { + "epoch": 6.7, + "grad_norm": 6.609186675632372e-05, + "learning_rate": 4.657562076749436e-06, + "loss": 0.0504, + "step": 26700 + }, + { + "epoch": 6.71, + "grad_norm": 0.02066265046596527, + "learning_rate": 4.639879608728368e-06, + "loss": 0.0845, + "step": 26750 + }, + { + "epoch": 6.72, + "grad_norm": 0.6868598461151123, + "learning_rate": 4.6221971407072985e-06, + "loss": 0.064, + "step": 26800 + }, + { + "epoch": 6.73, + "grad_norm": 4.525861463378078e-09, + "learning_rate": 4.604514672686231e-06, + "loss": 0.0372, + "step": 26850 + }, + { + "epoch": 6.75, + "grad_norm": 0.0018904170719906688, + "learning_rate": 4.5868322046651625e-06, + "loss": 0.171, + "step": 26900 + }, + { + "epoch": 6.76, + "grad_norm": 0.06831281632184982, + "learning_rate": 4.569149736644093e-06, + "loss": 0.0005, + "step": 26950 + }, + { + "epoch": 6.77, + "grad_norm": 2.7328371288604103e-05, + "learning_rate": 4.5514672686230256e-06, + "loss": 0.0834, + "step": 27000 + }, + { + "epoch": 6.78, + "grad_norm": 1.312251782792373e-07, + "learning_rate": 4.533784800601956e-06, + "loss": 0.0009, + "step": 27050 + }, + { + "epoch": 6.8, + "grad_norm": 0.006464004050940275, + "learning_rate": 4.516102332580888e-06, + "loss": 0.1302, + "step": 27100 + }, + { + "epoch": 6.81, + "grad_norm": 4.0537888601477334e-09, + "learning_rate": 4.49841986455982e-06, + "loss": 0.1255, + "step": 27150 + }, + { + "epoch": 6.82, + "grad_norm": 0.0004817073349840939, + "learning_rate": 4.480737396538751e-06, + "loss": 0.001, + "step": 27200 + }, + { + "epoch": 6.83, + "grad_norm": 0.014918695203959942, + "learning_rate": 4.4630549285176825e-06, + "loss": 0.0019, + "step": 27250 + }, + { + "epoch": 6.85, + "grad_norm": 6.75780752420712e-17, + "learning_rate": 4.445372460496614e-06, + "loss": 0.0179, + "step": 27300 + }, + { + "epoch": 6.86, + "grad_norm": 382.1897888183594, + "learning_rate": 4.4276899924755456e-06, + "loss": 0.0396, + "step": 27350 + }, + { + "epoch": 6.87, + "grad_norm": 0.30687054991722107, + "learning_rate": 4.410007524454477e-06, + "loss": 0.0576, + "step": 27400 + }, + { + "epoch": 6.88, + "grad_norm": 1.2169127785455203e-06, + "learning_rate": 4.392325056433409e-06, + "loss": 0.0002, + "step": 27450 + }, + { + "epoch": 6.9, + "grad_norm": 6.928129077377054e-12, + "learning_rate": 4.37464258841234e-06, + "loss": 0.0989, + "step": 27500 + }, + { + "epoch": 6.91, + "grad_norm": 7.992535522305388e-10, + "learning_rate": 4.356960120391272e-06, + "loss": 0.0014, + "step": 27550 + }, + { + "epoch": 6.92, + "grad_norm": 0.001016330672428012, + "learning_rate": 4.339277652370203e-06, + "loss": 0.0796, + "step": 27600 + }, + { + "epoch": 6.94, + "grad_norm": 9.33817503323553e-08, + "learning_rate": 4.321595184349135e-06, + "loss": 0.0031, + "step": 27650 + }, + { + "epoch": 6.95, + "grad_norm": 3.0769423120524664e-10, + "learning_rate": 4.303912716328066e-06, + "loss": 0.0482, + "step": 27700 + }, + { + "epoch": 6.96, + "grad_norm": 5.2930868577050205e-09, + "learning_rate": 4.286230248306998e-06, + "loss": 0.0241, + "step": 27750 + }, + { + "epoch": 6.97, + "grad_norm": 2.738467628660146e-05, + "learning_rate": 4.2685477802859295e-06, + "loss": 0.0094, + "step": 27800 + }, + { + "epoch": 6.99, + "grad_norm": 1.259439272871532e-06, + "learning_rate": 4.250865312264861e-06, + "loss": 0.0011, + "step": 27850 + }, + { + "epoch": 7.0, + "grad_norm": 433.7135925292969, + "learning_rate": 4.233182844243792e-06, + "loss": 0.4265, + "step": 27900 + }, + { + "epoch": 7.01, + "grad_norm": 0.000105952778540086, + "learning_rate": 4.215500376222724e-06, + "loss": 0.0048, + "step": 27950 + }, + { + "epoch": 7.02, + "grad_norm": 0.2630611062049866, + "learning_rate": 4.197817908201656e-06, + "loss": 0.083, + "step": 28000 + }, + { + "epoch": 7.04, + "grad_norm": 1.2784289252221193e-11, + "learning_rate": 4.180135440180586e-06, + "loss": 0.0003, + "step": 28050 + }, + { + "epoch": 7.05, + "grad_norm": 4.8076164577137703e-11, + "learning_rate": 4.162452972159519e-06, + "loss": 0.0, + "step": 28100 + }, + { + "epoch": 7.06, + "grad_norm": 2.940306558230077e-07, + "learning_rate": 4.14477050413845e-06, + "loss": 0.0001, + "step": 28150 + }, + { + "epoch": 7.07, + "grad_norm": 4.1964653064496815e-05, + "learning_rate": 4.127088036117381e-06, + "loss": 0.0005, + "step": 28200 + }, + { + "epoch": 7.09, + "grad_norm": 0.005852025002241135, + "learning_rate": 4.1094055680963134e-06, + "loss": 0.0049, + "step": 28250 + }, + { + "epoch": 7.1, + "grad_norm": 0.05330043286085129, + "learning_rate": 4.091723100075245e-06, + "loss": 0.0, + "step": 28300 + }, + { + "epoch": 7.11, + "grad_norm": 2.5323606323013337e-08, + "learning_rate": 4.074040632054176e-06, + "loss": 0.0001, + "step": 28350 + }, + { + "epoch": 7.12, + "grad_norm": 0.004866173956543207, + "learning_rate": 4.056358164033108e-06, + "loss": 0.0002, + "step": 28400 + }, + { + "epoch": 7.14, + "grad_norm": 1.1348839645819453e-09, + "learning_rate": 4.038675696012039e-06, + "loss": 0.0361, + "step": 28450 + }, + { + "epoch": 7.15, + "grad_norm": 4.0626005102240015e-06, + "learning_rate": 4.02099322799097e-06, + "loss": 0.0006, + "step": 28500 + }, + { + "epoch": 7.16, + "grad_norm": 1.4158376870909706e-07, + "learning_rate": 4.003310759969903e-06, + "loss": 0.0, + "step": 28550 + }, + { + "epoch": 7.17, + "grad_norm": 3.5035823202633765e-06, + "learning_rate": 3.9856282919488334e-06, + "loss": 0.0, + "step": 28600 + }, + { + "epoch": 7.19, + "grad_norm": 7.668052421649918e-05, + "learning_rate": 3.967945823927765e-06, + "loss": 0.1484, + "step": 28650 + }, + { + "epoch": 7.2, + "grad_norm": 0.0006498922011815012, + "learning_rate": 3.950263355906697e-06, + "loss": 0.0, + "step": 28700 + }, + { + "epoch": 7.21, + "grad_norm": 1.2344708920863923e-05, + "learning_rate": 3.932580887885628e-06, + "loss": 0.0001, + "step": 28750 + }, + { + "epoch": 7.22, + "grad_norm": 4.231491038808599e-05, + "learning_rate": 3.91489841986456e-06, + "loss": 0.0001, + "step": 28800 + }, + { + "epoch": 7.24, + "grad_norm": 0.008648673072457314, + "learning_rate": 3.897215951843492e-06, + "loss": 0.0, + "step": 28850 + }, + { + "epoch": 7.25, + "grad_norm": 0.0010539034847170115, + "learning_rate": 3.879533483822423e-06, + "loss": 0.0, + "step": 28900 + }, + { + "epoch": 7.26, + "grad_norm": 5.991931902826764e-05, + "learning_rate": 3.861851015801354e-06, + "loss": 0.0001, + "step": 28950 + }, + { + "epoch": 7.27, + "grad_norm": 0.017336919903755188, + "learning_rate": 3.844168547780287e-06, + "loss": 0.0206, + "step": 29000 + }, + { + "epoch": 7.29, + "grad_norm": 0.0004083296225871891, + "learning_rate": 3.826486079759217e-06, + "loss": 0.0002, + "step": 29050 + }, + { + "epoch": 7.3, + "grad_norm": 9.027652740478516, + "learning_rate": 3.808803611738149e-06, + "loss": 0.0067, + "step": 29100 + }, + { + "epoch": 7.31, + "grad_norm": 0.0003242001694161445, + "learning_rate": 3.791121143717081e-06, + "loss": 0.0, + "step": 29150 + }, + { + "epoch": 7.32, + "grad_norm": 2.259884604427498e-05, + "learning_rate": 3.773438675696012e-06, + "loss": 0.0002, + "step": 29200 + }, + { + "epoch": 7.34, + "grad_norm": 9.495877265930176, + "learning_rate": 3.7557562076749436e-06, + "loss": 0.0002, + "step": 29250 + }, + { + "epoch": 7.35, + "grad_norm": 0.0059493957087397575, + "learning_rate": 3.7380737396538755e-06, + "loss": 0.0, + "step": 29300 + }, + { + "epoch": 7.36, + "grad_norm": 0.004485088866204023, + "learning_rate": 3.7203912716328067e-06, + "loss": 0.0, + "step": 29350 + }, + { + "epoch": 7.37, + "grad_norm": 8.322012309412881e-15, + "learning_rate": 3.702708803611738e-06, + "loss": 0.0018, + "step": 29400 + }, + { + "epoch": 7.39, + "grad_norm": 0.0009153097053058445, + "learning_rate": 3.68502633559067e-06, + "loss": 0.0001, + "step": 29450 + }, + { + "epoch": 7.4, + "grad_norm": 2.3616248654434457e-05, + "learning_rate": 3.6673438675696013e-06, + "loss": 0.1638, + "step": 29500 + }, + { + "epoch": 7.41, + "grad_norm": 0.0017722542397677898, + "learning_rate": 3.6496613995485324e-06, + "loss": 0.0123, + "step": 29550 + }, + { + "epoch": 7.42, + "grad_norm": 0.06969759613275528, + "learning_rate": 3.631978931527465e-06, + "loss": 0.0155, + "step": 29600 + }, + { + "epoch": 7.44, + "grad_norm": 1.5746809367556125e-06, + "learning_rate": 3.614296463506396e-06, + "loss": 0.0001, + "step": 29650 + }, + { + "epoch": 7.45, + "grad_norm": 3.0426802744010217e-10, + "learning_rate": 3.596613995485327e-06, + "loss": 0.0, + "step": 29700 + }, + { + "epoch": 7.46, + "grad_norm": 0.5712952017784119, + "learning_rate": 3.578931527464259e-06, + "loss": 0.1067, + "step": 29750 + }, + { + "epoch": 7.47, + "grad_norm": 0.766385555267334, + "learning_rate": 3.5612490594431906e-06, + "loss": 0.0107, + "step": 29800 + }, + { + "epoch": 7.49, + "grad_norm": 0.05696748197078705, + "learning_rate": 3.5435665914221217e-06, + "loss": 0.0013, + "step": 29850 + }, + { + "epoch": 7.5, + "grad_norm": 7.25884137864341e-06, + "learning_rate": 3.5258841234010537e-06, + "loss": 0.0, + "step": 29900 + }, + { + "epoch": 7.51, + "grad_norm": 1.7060403479263186e-05, + "learning_rate": 3.5082016553799852e-06, + "loss": 0.0002, + "step": 29950 + }, + { + "epoch": 7.52, + "grad_norm": 0.012671858072280884, + "learning_rate": 3.4905191873589168e-06, + "loss": 0.0, + "step": 30000 + }, + { + "epoch": 7.54, + "grad_norm": 2.8193007928223324e-09, + "learning_rate": 3.472836719337848e-06, + "loss": 0.0001, + "step": 30050 + }, + { + "epoch": 7.55, + "grad_norm": 0.019155049696564674, + "learning_rate": 3.4551542513167795e-06, + "loss": 0.0, + "step": 30100 + }, + { + "epoch": 7.56, + "grad_norm": 0.0020516354124993086, + "learning_rate": 3.4374717832957114e-06, + "loss": 0.0002, + "step": 30150 + }, + { + "epoch": 7.57, + "grad_norm": 2.4088294594548643e-05, + "learning_rate": 3.4197893152746425e-06, + "loss": 0.0492, + "step": 30200 + }, + { + "epoch": 7.59, + "grad_norm": 1.9164204786648043e-05, + "learning_rate": 3.402106847253574e-06, + "loss": 0.0312, + "step": 30250 + }, + { + "epoch": 7.6, + "grad_norm": 0.0160346832126379, + "learning_rate": 3.384424379232506e-06, + "loss": 0.0007, + "step": 30300 + }, + { + "epoch": 7.61, + "grad_norm": 7.57160614739405e-06, + "learning_rate": 3.366741911211437e-06, + "loss": 0.0005, + "step": 30350 + }, + { + "epoch": 7.62, + "grad_norm": 1.1699286504851525e-11, + "learning_rate": 3.3490594431903687e-06, + "loss": 0.0027, + "step": 30400 + }, + { + "epoch": 7.64, + "grad_norm": 1.0412069286758197e-06, + "learning_rate": 3.3313769751693003e-06, + "loss": 0.0011, + "step": 30450 + }, + { + "epoch": 7.65, + "grad_norm": 1.0678839998945477e-06, + "learning_rate": 3.313694507148232e-06, + "loss": 0.0, + "step": 30500 + }, + { + "epoch": 7.66, + "grad_norm": 2.537229315535683e-09, + "learning_rate": 3.2960120391271634e-06, + "loss": 0.0, + "step": 30550 + }, + { + "epoch": 7.67, + "grad_norm": 8.23991967990878e-07, + "learning_rate": 3.278329571106095e-06, + "loss": 0.0001, + "step": 30600 + }, + { + "epoch": 7.69, + "grad_norm": 0.0006322423578239977, + "learning_rate": 3.2606471030850265e-06, + "loss": 0.0001, + "step": 30650 + }, + { + "epoch": 7.7, + "grad_norm": 1.3688865863059618e-07, + "learning_rate": 3.242964635063958e-06, + "loss": 0.062, + "step": 30700 + }, + { + "epoch": 7.71, + "grad_norm": 2.41971292780363e-06, + "learning_rate": 3.2252821670428896e-06, + "loss": 0.1235, + "step": 30750 + }, + { + "epoch": 7.73, + "grad_norm": 2.4634087480990274e-07, + "learning_rate": 3.207599699021821e-06, + "loss": 0.0035, + "step": 30800 + }, + { + "epoch": 7.74, + "grad_norm": 3.1068152566149365e-06, + "learning_rate": 3.1899172310007527e-06, + "loss": 0.0205, + "step": 30850 + }, + { + "epoch": 7.75, + "grad_norm": 5.763430177552209e-09, + "learning_rate": 3.1722347629796842e-06, + "loss": 0.0, + "step": 30900 + }, + { + "epoch": 7.76, + "grad_norm": 0.008364195004105568, + "learning_rate": 3.1545522949586153e-06, + "loss": 0.0009, + "step": 30950 + }, + { + "epoch": 7.78, + "grad_norm": 0.00012845598394051194, + "learning_rate": 3.1368698269375473e-06, + "loss": 0.0008, + "step": 31000 + }, + { + "epoch": 7.79, + "grad_norm": 0.001842482597567141, + "learning_rate": 3.119187358916479e-06, + "loss": 0.0007, + "step": 31050 + }, + { + "epoch": 7.8, + "grad_norm": 1.2641396263113336e-10, + "learning_rate": 3.10150489089541e-06, + "loss": 0.0019, + "step": 31100 + }, + { + "epoch": 7.81, + "grad_norm": 0.00033131783129647374, + "learning_rate": 3.083822422874342e-06, + "loss": 0.0002, + "step": 31150 + }, + { + "epoch": 7.83, + "grad_norm": 1.851675369834993e-05, + "learning_rate": 3.0661399548532735e-06, + "loss": 0.0009, + "step": 31200 + }, + { + "epoch": 7.84, + "grad_norm": 0.00795644149184227, + "learning_rate": 3.0484574868322046e-06, + "loss": 0.077, + "step": 31250 + }, + { + "epoch": 7.85, + "grad_norm": 0.07745194435119629, + "learning_rate": 3.030775018811136e-06, + "loss": 0.0001, + "step": 31300 + }, + { + "epoch": 7.86, + "grad_norm": 0.10175588726997375, + "learning_rate": 3.013092550790068e-06, + "loss": 0.0307, + "step": 31350 + }, + { + "epoch": 7.88, + "grad_norm": 8.556443935958669e-05, + "learning_rate": 2.9954100827689993e-06, + "loss": 0.0, + "step": 31400 + }, + { + "epoch": 7.89, + "grad_norm": 0.9275371432304382, + "learning_rate": 2.977727614747931e-06, + "loss": 0.1478, + "step": 31450 + }, + { + "epoch": 7.9, + "grad_norm": 5.1567803360796916e-09, + "learning_rate": 2.960045146726863e-06, + "loss": 0.0598, + "step": 31500 + }, + { + "epoch": 7.91, + "grad_norm": 6.67710139623523e-07, + "learning_rate": 2.942362678705794e-06, + "loss": 0.0094, + "step": 31550 + }, + { + "epoch": 7.93, + "grad_norm": 1.458290155298414e-10, + "learning_rate": 2.9246802106847255e-06, + "loss": 0.0009, + "step": 31600 + }, + { + "epoch": 7.94, + "grad_norm": 5.1869348681066185e-05, + "learning_rate": 2.906997742663657e-06, + "loss": 0.0007, + "step": 31650 + }, + { + "epoch": 7.95, + "grad_norm": 0.00036754223401658237, + "learning_rate": 2.8893152746425886e-06, + "loss": 0.1282, + "step": 31700 + }, + { + "epoch": 7.96, + "grad_norm": 0.0028616636991500854, + "learning_rate": 2.87163280662152e-06, + "loss": 0.1516, + "step": 31750 + }, + { + "epoch": 7.98, + "grad_norm": 0.0008008142467588186, + "learning_rate": 2.8539503386004512e-06, + "loss": 0.0004, + "step": 31800 + }, + { + "epoch": 7.99, + "grad_norm": 1.0718519405372717e-07, + "learning_rate": 2.8362678705793832e-06, + "loss": 0.0, + "step": 31850 + }, + { + "epoch": 8.0, + "grad_norm": 0.0009103859774768353, + "learning_rate": 2.8185854025583148e-06, + "loss": 0.0001, + "step": 31900 + }, + { + "epoch": 8.01, + "grad_norm": 0.0001856798044173047, + "learning_rate": 2.800902934537246e-06, + "loss": 0.0, + "step": 31950 + }, + { + "epoch": 8.03, + "grad_norm": 0.00011591133807087317, + "learning_rate": 2.7832204665161774e-06, + "loss": 0.0001, + "step": 32000 + }, + { + "epoch": 8.04, + "grad_norm": 0.00040982267819345, + "learning_rate": 2.7655379984951094e-06, + "loss": 0.0, + "step": 32050 + }, + { + "epoch": 8.05, + "grad_norm": 2.265534648770995e-09, + "learning_rate": 2.7478555304740405e-06, + "loss": 0.0001, + "step": 32100 + }, + { + "epoch": 8.06, + "grad_norm": 5.858885425424898e-13, + "learning_rate": 2.730173062452972e-06, + "loss": 0.0001, + "step": 32150 + }, + { + "epoch": 8.08, + "grad_norm": 2.8236866932730136e-18, + "learning_rate": 2.712490594431904e-06, + "loss": 0.0, + "step": 32200 + }, + { + "epoch": 8.09, + "grad_norm": 0.0001981940004043281, + "learning_rate": 2.694808126410835e-06, + "loss": 0.0, + "step": 32250 + }, + { + "epoch": 8.1, + "grad_norm": 3.2661256511856696e-12, + "learning_rate": 2.6771256583897667e-06, + "loss": 0.0, + "step": 32300 + }, + { + "epoch": 8.11, + "grad_norm": 1.1293546776869334e-05, + "learning_rate": 2.6594431903686983e-06, + "loss": 0.0, + "step": 32350 + }, + { + "epoch": 8.13, + "grad_norm": 0.0003391726640984416, + "learning_rate": 2.64176072234763e-06, + "loss": 0.0, + "step": 32400 + }, + { + "epoch": 8.14, + "grad_norm": 6.486132042482495e-05, + "learning_rate": 2.6240782543265614e-06, + "loss": 0.0, + "step": 32450 + }, + { + "epoch": 8.15, + "grad_norm": 2.1309777366695926e-05, + "learning_rate": 2.606395786305493e-06, + "loss": 0.0, + "step": 32500 + }, + { + "epoch": 8.16, + "grad_norm": 4.4795211806558655e-07, + "learning_rate": 2.5887133182844245e-06, + "loss": 0.0007, + "step": 32550 + }, + { + "epoch": 8.18, + "grad_norm": 2.0528705402256264e-09, + "learning_rate": 2.571030850263356e-06, + "loss": 0.0, + "step": 32600 + }, + { + "epoch": 8.19, + "grad_norm": 4.783522308571264e-05, + "learning_rate": 2.5533483822422876e-06, + "loss": 0.0, + "step": 32650 + }, + { + "epoch": 8.2, + "grad_norm": 1.7800081408836377e-08, + "learning_rate": 2.535665914221219e-06, + "loss": 0.0, + "step": 32700 + }, + { + "epoch": 8.21, + "grad_norm": 0.0003143524518236518, + "learning_rate": 2.5179834462001507e-06, + "loss": 0.0003, + "step": 32750 + }, + { + "epoch": 8.23, + "grad_norm": 9.409014455741271e-05, + "learning_rate": 2.500300978179082e-06, + "loss": 0.0, + "step": 32800 + }, + { + "epoch": 8.24, + "grad_norm": 3.097814449404268e-09, + "learning_rate": 2.4826185101580133e-06, + "loss": 0.0, + "step": 32850 + }, + { + "epoch": 8.25, + "grad_norm": 6.660656595158798e-07, + "learning_rate": 2.4649360421369453e-06, + "loss": 0.0, + "step": 32900 + }, + { + "epoch": 8.26, + "grad_norm": 0.04804990068078041, + "learning_rate": 2.447253574115877e-06, + "loss": 0.0, + "step": 32950 + }, + { + "epoch": 8.28, + "grad_norm": 3.926641234386352e-09, + "learning_rate": 2.429571106094808e-06, + "loss": 0.0, + "step": 33000 + }, + { + "epoch": 8.29, + "grad_norm": 1.5834859022183257e-20, + "learning_rate": 2.4118886380737395e-06, + "loss": 0.0001, + "step": 33050 + }, + { + "epoch": 8.3, + "grad_norm": 0.20250500738620758, + "learning_rate": 2.3942061700526715e-06, + "loss": 0.0004, + "step": 33100 + }, + { + "epoch": 8.31, + "grad_norm": 5.932114959250612e-07, + "learning_rate": 2.3765237020316026e-06, + "loss": 0.0001, + "step": 33150 + }, + { + "epoch": 8.33, + "grad_norm": 1.5223192498248217e-11, + "learning_rate": 2.358841234010534e-06, + "loss": 0.0, + "step": 33200 + }, + { + "epoch": 8.34, + "grad_norm": 1.2739813826101454e-07, + "learning_rate": 2.341158765989466e-06, + "loss": 0.0, + "step": 33250 + }, + { + "epoch": 8.35, + "grad_norm": 1.2789546310898459e-08, + "learning_rate": 2.3234762979683973e-06, + "loss": 0.0001, + "step": 33300 + }, + { + "epoch": 8.36, + "grad_norm": 1.4692803233629093e-05, + "learning_rate": 2.305793829947329e-06, + "loss": 0.0, + "step": 33350 + }, + { + "epoch": 8.38, + "grad_norm": 0.00019242956477683038, + "learning_rate": 2.2881113619262604e-06, + "loss": 0.0403, + "step": 33400 + }, + { + "epoch": 8.39, + "grad_norm": 0.0, + "learning_rate": 2.270428893905192e-06, + "loss": 0.0, + "step": 33450 + }, + { + "epoch": 8.4, + "grad_norm": 0.002393543953076005, + "learning_rate": 2.2527464258841235e-06, + "loss": 0.0399, + "step": 33500 + }, + { + "epoch": 8.41, + "grad_norm": 1.7551202802223997e-07, + "learning_rate": 2.235063957863055e-06, + "loss": 0.0, + "step": 33550 + }, + { + "epoch": 8.43, + "grad_norm": 2.735872639547665e-11, + "learning_rate": 2.2173814898419866e-06, + "loss": 0.0004, + "step": 33600 + }, + { + "epoch": 8.44, + "grad_norm": 0.0003994428552687168, + "learning_rate": 2.199699021820918e-06, + "loss": 0.0, + "step": 33650 + }, + { + "epoch": 8.45, + "grad_norm": 2.7801218032836914, + "learning_rate": 2.1820165537998497e-06, + "loss": 0.0, + "step": 33700 + }, + { + "epoch": 8.47, + "grad_norm": 1.100529516406823e-06, + "learning_rate": 2.164334085778781e-06, + "loss": 0.0, + "step": 33750 + }, + { + "epoch": 8.48, + "grad_norm": 0.02319416031241417, + "learning_rate": 2.1466516177577128e-06, + "loss": 0.0, + "step": 33800 + }, + { + "epoch": 8.49, + "grad_norm": 0.0017326247179880738, + "learning_rate": 2.1289691497366443e-06, + "loss": 0.0, + "step": 33850 + }, + { + "epoch": 8.5, + "grad_norm": 1.8130524859216735e-10, + "learning_rate": 2.1112866817155754e-06, + "loss": 0.0001, + "step": 33900 + }, + { + "epoch": 8.52, + "grad_norm": 0.0017156396061182022, + "learning_rate": 2.0936042136945074e-06, + "loss": 0.0004, + "step": 33950 + }, + { + "epoch": 8.53, + "grad_norm": 0.0018524077022448182, + "learning_rate": 2.075921745673439e-06, + "loss": 0.0, + "step": 34000 + }, + { + "epoch": 8.54, + "grad_norm": 1.1194772923772689e-05, + "learning_rate": 2.05823927765237e-06, + "loss": 0.0, + "step": 34050 + }, + { + "epoch": 8.55, + "grad_norm": 7.453370471921517e-06, + "learning_rate": 2.040556809631302e-06, + "loss": 0.0164, + "step": 34100 + }, + { + "epoch": 8.57, + "grad_norm": 6.412294029090049e-10, + "learning_rate": 2.0228743416102336e-06, + "loss": 0.0, + "step": 34150 + }, + { + "epoch": 8.58, + "grad_norm": 1.4134855689861236e-17, + "learning_rate": 2.0051918735891647e-06, + "loss": 0.0, + "step": 34200 + }, + { + "epoch": 8.59, + "grad_norm": 0.00038817909080535173, + "learning_rate": 1.9875094055680963e-06, + "loss": 0.0365, + "step": 34250 + }, + { + "epoch": 8.6, + "grad_norm": 2.2248328605201095e-05, + "learning_rate": 1.9698269375470282e-06, + "loss": 0.0, + "step": 34300 + }, + { + "epoch": 8.62, + "grad_norm": 0.010381842032074928, + "learning_rate": 1.9521444695259594e-06, + "loss": 0.0, + "step": 34350 + }, + { + "epoch": 8.63, + "grad_norm": 0.001246288768015802, + "learning_rate": 1.934462001504891e-06, + "loss": 0.0, + "step": 34400 + }, + { + "epoch": 8.64, + "grad_norm": 1.8006402254104614, + "learning_rate": 1.916779533483823e-06, + "loss": 0.0007, + "step": 34450 + }, + { + "epoch": 8.65, + "grad_norm": 1.644072100681626e-10, + "learning_rate": 1.899097065462754e-06, + "loss": 0.0, + "step": 34500 + }, + { + "epoch": 8.67, + "grad_norm": 5.652666779099036e-09, + "learning_rate": 1.8814145974416856e-06, + "loss": 0.0, + "step": 34550 + }, + { + "epoch": 8.68, + "grad_norm": 0.003141549648717046, + "learning_rate": 1.8637321294206173e-06, + "loss": 0.0, + "step": 34600 + }, + { + "epoch": 8.69, + "grad_norm": 1.1486420135042863e-06, + "learning_rate": 1.8460496613995484e-06, + "loss": 0.0003, + "step": 34650 + }, + { + "epoch": 8.7, + "grad_norm": 1.1713603271346074e-05, + "learning_rate": 1.8283671933784802e-06, + "loss": 0.0, + "step": 34700 + }, + { + "epoch": 8.72, + "grad_norm": 1.2204428685436142e-06, + "learning_rate": 1.8106847253574115e-06, + "loss": 0.0, + "step": 34750 + }, + { + "epoch": 8.73, + "grad_norm": 0.0014657212886959314, + "learning_rate": 1.793002257336343e-06, + "loss": 0.0, + "step": 34800 + }, + { + "epoch": 8.74, + "grad_norm": 0.017868679016828537, + "learning_rate": 1.7753197893152748e-06, + "loss": 0.0, + "step": 34850 + }, + { + "epoch": 8.75, + "grad_norm": 3.3499613891763147e-06, + "learning_rate": 1.7576373212942062e-06, + "loss": 0.0001, + "step": 34900 + }, + { + "epoch": 8.77, + "grad_norm": 6.1278524476904295e-09, + "learning_rate": 1.7399548532731377e-06, + "loss": 0.0001, + "step": 34950 + }, + { + "epoch": 8.78, + "grad_norm": 1.445396605959104e-06, + "learning_rate": 1.7222723852520693e-06, + "loss": 0.0001, + "step": 35000 + }, + { + "epoch": 8.79, + "grad_norm": 0.0017798148328438401, + "learning_rate": 1.7045899172310008e-06, + "loss": 0.1651, + "step": 35050 + }, + { + "epoch": 8.8, + "grad_norm": 3.6833380789857983e-08, + "learning_rate": 1.6869074492099324e-06, + "loss": 0.0, + "step": 35100 + }, + { + "epoch": 8.82, + "grad_norm": 9.361156988463293e-11, + "learning_rate": 1.669224981188864e-06, + "loss": 0.0, + "step": 35150 + }, + { + "epoch": 8.83, + "grad_norm": 7.828115933250501e-09, + "learning_rate": 1.6515425131677955e-06, + "loss": 0.0, + "step": 35200 + }, + { + "epoch": 8.84, + "grad_norm": 1.020300643972405e-08, + "learning_rate": 1.6338600451467268e-06, + "loss": 0.0, + "step": 35250 + }, + { + "epoch": 8.85, + "grad_norm": 0.000530413759406656, + "learning_rate": 1.6161775771256586e-06, + "loss": 0.0, + "step": 35300 + }, + { + "epoch": 8.87, + "grad_norm": 8.391751182834639e-10, + "learning_rate": 1.59849510910459e-06, + "loss": 0.0, + "step": 35350 + }, + { + "epoch": 8.88, + "grad_norm": 0.003899802453815937, + "learning_rate": 1.5808126410835214e-06, + "loss": 0.0, + "step": 35400 + }, + { + "epoch": 8.89, + "grad_norm": 2.3727285224595107e-05, + "learning_rate": 1.5631301730624532e-06, + "loss": 0.0, + "step": 35450 + }, + { + "epoch": 8.9, + "grad_norm": 4.068557245773263e-06, + "learning_rate": 1.5454477050413845e-06, + "loss": 0.0, + "step": 35500 + }, + { + "epoch": 8.92, + "grad_norm": 0.007758264895528555, + "learning_rate": 1.527765237020316e-06, + "loss": 0.0, + "step": 35550 + }, + { + "epoch": 8.93, + "grad_norm": 0.00016868404054548591, + "learning_rate": 1.5100827689992474e-06, + "loss": 0.0, + "step": 35600 + }, + { + "epoch": 8.94, + "grad_norm": 0.00011449763405835256, + "learning_rate": 1.4924003009781792e-06, + "loss": 0.0, + "step": 35650 + }, + { + "epoch": 8.95, + "grad_norm": 4.054548298881855e-06, + "learning_rate": 1.4747178329571107e-06, + "loss": 0.0, + "step": 35700 + }, + { + "epoch": 8.97, + "grad_norm": 0.0010476693278178573, + "learning_rate": 1.457035364936042e-06, + "loss": 0.0001, + "step": 35750 + }, + { + "epoch": 8.98, + "grad_norm": 0.06502784043550491, + "learning_rate": 1.4393528969149738e-06, + "loss": 0.0, + "step": 35800 + }, + { + "epoch": 8.99, + "grad_norm": 2.2866407789479126e-07, + "learning_rate": 1.4216704288939052e-06, + "loss": 0.0, + "step": 35850 + }, + { + "epoch": 9.0, + "grad_norm": 0.00021389636094681919, + "learning_rate": 1.4039879608728367e-06, + "loss": 0.0002, + "step": 35900 + }, + { + "epoch": 9.02, + "grad_norm": 1.3870979032049036e-08, + "learning_rate": 1.3863054928517683e-06, + "loss": 0.0, + "step": 35950 + }, + { + "epoch": 9.03, + "grad_norm": 6.817894586674811e-07, + "learning_rate": 1.3686230248306998e-06, + "loss": 0.0, + "step": 36000 + }, + { + "epoch": 9.04, + "grad_norm": 6.899564031215277e-09, + "learning_rate": 1.3509405568096314e-06, + "loss": 0.0, + "step": 36050 + }, + { + "epoch": 9.05, + "grad_norm": 5.222953859629342e-06, + "learning_rate": 1.333258088788563e-06, + "loss": 0.0, + "step": 36100 + }, + { + "epoch": 9.07, + "grad_norm": 1.425233087104516e-08, + "learning_rate": 1.3155756207674945e-06, + "loss": 0.0, + "step": 36150 + }, + { + "epoch": 9.08, + "grad_norm": 0.001089599565602839, + "learning_rate": 1.2978931527464258e-06, + "loss": 0.0, + "step": 36200 + }, + { + "epoch": 9.09, + "grad_norm": 5.556981932386407e-07, + "learning_rate": 1.2802106847253576e-06, + "loss": 0.0, + "step": 36250 + }, + { + "epoch": 9.1, + "grad_norm": 3.3812255423981696e-05, + "learning_rate": 1.2625282167042889e-06, + "loss": 0.0, + "step": 36300 + }, + { + "epoch": 9.12, + "grad_norm": 0.008249111473560333, + "learning_rate": 1.2448457486832204e-06, + "loss": 0.0, + "step": 36350 + }, + { + "epoch": 9.13, + "grad_norm": 1.3336196388991084e-05, + "learning_rate": 1.2271632806621522e-06, + "loss": 0.0, + "step": 36400 + }, + { + "epoch": 9.14, + "grad_norm": 7.238022403655009e-10, + "learning_rate": 1.2094808126410835e-06, + "loss": 0.0, + "step": 36450 + }, + { + "epoch": 9.15, + "grad_norm": 1.9307069831775436e-11, + "learning_rate": 1.191798344620015e-06, + "loss": 0.0, + "step": 36500 + }, + { + "epoch": 9.17, + "grad_norm": 2.09102075932055e-11, + "learning_rate": 1.1741158765989466e-06, + "loss": 0.0, + "step": 36550 + }, + { + "epoch": 9.18, + "grad_norm": 0.005663714837282896, + "learning_rate": 1.1564334085778782e-06, + "loss": 0.0, + "step": 36600 + }, + { + "epoch": 9.19, + "grad_norm": 0.0010162381222471595, + "learning_rate": 1.1387509405568097e-06, + "loss": 0.0, + "step": 36650 + }, + { + "epoch": 9.2, + "grad_norm": 6.629519339185208e-05, + "learning_rate": 1.1210684725357413e-06, + "loss": 0.0, + "step": 36700 + }, + { + "epoch": 9.22, + "grad_norm": 3.708991016537766e-06, + "learning_rate": 1.1033860045146728e-06, + "loss": 0.0, + "step": 36750 + }, + { + "epoch": 9.23, + "grad_norm": 1.2199451703054365e-05, + "learning_rate": 1.0857035364936042e-06, + "loss": 0.0, + "step": 36800 + }, + { + "epoch": 9.24, + "grad_norm": 3.44480326930352e-07, + "learning_rate": 1.068021068472536e-06, + "loss": 0.0, + "step": 36850 + }, + { + "epoch": 9.26, + "grad_norm": 4.88109819229976e-09, + "learning_rate": 1.0503386004514673e-06, + "loss": 0.0, + "step": 36900 + }, + { + "epoch": 9.27, + "grad_norm": 1.233081690088511e-07, + "learning_rate": 1.0326561324303988e-06, + "loss": 0.0, + "step": 36950 + }, + { + "epoch": 9.28, + "grad_norm": 4.49614967479306e-09, + "learning_rate": 1.0149736644093304e-06, + "loss": 0.0, + "step": 37000 + }, + { + "epoch": 9.29, + "grad_norm": 1.2748416793328943e-06, + "learning_rate": 9.97291196388262e-07, + "loss": 0.0, + "step": 37050 + }, + { + "epoch": 9.31, + "grad_norm": 2.0055947869265442e-14, + "learning_rate": 9.796087283671935e-07, + "loss": 0.0, + "step": 37100 + }, + { + "epoch": 9.32, + "grad_norm": 1.1864563075153988e-15, + "learning_rate": 9.619262603461248e-07, + "loss": 0.0, + "step": 37150 + }, + { + "epoch": 9.33, + "grad_norm": 3.7789734051330015e-05, + "learning_rate": 9.442437923250566e-07, + "loss": 0.0, + "step": 37200 + }, + { + "epoch": 9.34, + "grad_norm": 0.2207726538181305, + "learning_rate": 9.26561324303988e-07, + "loss": 0.0, + "step": 37250 + }, + { + "epoch": 9.36, + "grad_norm": 0.0002287498500663787, + "learning_rate": 9.088788562829194e-07, + "loss": 0.0, + "step": 37300 + }, + { + "epoch": 9.37, + "grad_norm": 1.4228673350658028e-08, + "learning_rate": 8.911963882618511e-07, + "loss": 0.0, + "step": 37350 + }, + { + "epoch": 9.38, + "grad_norm": 2.1739325584408233e-16, + "learning_rate": 8.735139202407825e-07, + "loss": 0.0, + "step": 37400 + }, + { + "epoch": 9.39, + "grad_norm": 1.1177444037002715e-07, + "learning_rate": 8.558314522197141e-07, + "loss": 0.0, + "step": 37450 + }, + { + "epoch": 9.41, + "grad_norm": 2.286371909576701e-06, + "learning_rate": 8.381489841986456e-07, + "loss": 0.0, + "step": 37500 + }, + { + "epoch": 9.42, + "grad_norm": 0.0007677926332689822, + "learning_rate": 8.204665161775772e-07, + "loss": 0.0, + "step": 37550 + }, + { + "epoch": 9.43, + "grad_norm": 7.146362435150877e-08, + "learning_rate": 8.027840481565087e-07, + "loss": 0.0, + "step": 37600 + }, + { + "epoch": 9.44, + "grad_norm": 7.620369160576956e-06, + "learning_rate": 7.851015801354402e-07, + "loss": 0.0, + "step": 37650 + }, + { + "epoch": 9.46, + "grad_norm": 1.6175413009023032e-07, + "learning_rate": 7.674191121143717e-07, + "loss": 0.0, + "step": 37700 + }, + { + "epoch": 9.47, + "grad_norm": 1.4307224773801863e-06, + "learning_rate": 7.497366440933033e-07, + "loss": 0.0, + "step": 37750 + }, + { + "epoch": 9.48, + "grad_norm": 1.4143168414193497e-07, + "learning_rate": 7.320541760722348e-07, + "loss": 0.0, + "step": 37800 + }, + { + "epoch": 9.49, + "grad_norm": 5.554405676719276e-13, + "learning_rate": 7.143717080511664e-07, + "loss": 0.0, + "step": 37850 + }, + { + "epoch": 9.51, + "grad_norm": 3.3866279225414075e-10, + "learning_rate": 6.966892400300979e-07, + "loss": 0.0941, + "step": 37900 + }, + { + "epoch": 9.52, + "grad_norm": 6.048647804846041e-08, + "learning_rate": 6.790067720090294e-07, + "loss": 0.0, + "step": 37950 + }, + { + "epoch": 9.53, + "grad_norm": 2.0648124632316467e-07, + "learning_rate": 6.613243039879609e-07, + "loss": 0.0, + "step": 38000 + }, + { + "epoch": 9.54, + "grad_norm": 0.0016063437797129154, + "learning_rate": 6.436418359668924e-07, + "loss": 0.0, + "step": 38050 + }, + { + "epoch": 9.56, + "grad_norm": 1.247152141559127e-07, + "learning_rate": 6.259593679458239e-07, + "loss": 0.0, + "step": 38100 + }, + { + "epoch": 9.57, + "grad_norm": 8.07224120880079e-10, + "learning_rate": 6.082768999247555e-07, + "loss": 0.0, + "step": 38150 + }, + { + "epoch": 9.58, + "grad_norm": 7.635571320184498e-13, + "learning_rate": 5.90594431903687e-07, + "loss": 0.0, + "step": 38200 + }, + { + "epoch": 9.59, + "grad_norm": 4.792551688836966e-09, + "learning_rate": 5.729119638826185e-07, + "loss": 0.0, + "step": 38250 + }, + { + "epoch": 9.61, + "grad_norm": 3.3811686535045737e-06, + "learning_rate": 5.552294958615501e-07, + "loss": 0.0, + "step": 38300 + }, + { + "epoch": 9.62, + "grad_norm": 1.0496427338413383e-10, + "learning_rate": 5.375470278404815e-07, + "loss": 0.0, + "step": 38350 + }, + { + "epoch": 9.63, + "grad_norm": 0.000780309725087136, + "learning_rate": 5.198645598194131e-07, + "loss": 0.0, + "step": 38400 + }, + { + "epoch": 9.64, + "grad_norm": 4.170356078248005e-06, + "learning_rate": 5.021820917983446e-07, + "loss": 0.0, + "step": 38450 + }, + { + "epoch": 9.66, + "grad_norm": 0.004391836933791637, + "learning_rate": 4.844996237772762e-07, + "loss": 0.0, + "step": 38500 + }, + { + "epoch": 9.67, + "grad_norm": 6.341772859741468e-06, + "learning_rate": 4.668171557562077e-07, + "loss": 0.0, + "step": 38550 + }, + { + "epoch": 9.68, + "grad_norm": 0.006547071970999241, + "learning_rate": 4.4913468773513927e-07, + "loss": 0.0, + "step": 38600 + }, + { + "epoch": 9.69, + "grad_norm": 4.025184352940414e-06, + "learning_rate": 4.3145221971407076e-07, + "loss": 0.0, + "step": 38650 + }, + { + "epoch": 9.71, + "grad_norm": 0.007137050852179527, + "learning_rate": 4.1376975169300226e-07, + "loss": 0.0, + "step": 38700 + }, + { + "epoch": 9.72, + "grad_norm": 2.464033421745171e-10, + "learning_rate": 3.960872836719338e-07, + "loss": 0.0, + "step": 38750 + }, + { + "epoch": 9.73, + "grad_norm": 1.651005368330516e-05, + "learning_rate": 3.784048156508653e-07, + "loss": 0.0, + "step": 38800 + }, + { + "epoch": 9.74, + "grad_norm": 8.712972184021783e-12, + "learning_rate": 3.6072234762979685e-07, + "loss": 0.0, + "step": 38850 + }, + { + "epoch": 9.76, + "grad_norm": 0.00013669347390532494, + "learning_rate": 3.430398796087284e-07, + "loss": 0.0, + "step": 38900 + }, + { + "epoch": 9.77, + "grad_norm": 2.920106635428965e-05, + "learning_rate": 3.253574115876599e-07, + "loss": 0.0, + "step": 38950 + }, + { + "epoch": 9.78, + "grad_norm": 3.793598768453421e-09, + "learning_rate": 3.0767494356659144e-07, + "loss": 0.0, + "step": 39000 + }, + { + "epoch": 9.79, + "grad_norm": 1.0695604402144454e-09, + "learning_rate": 2.89992475545523e-07, + "loss": 0.0, + "step": 39050 + }, + { + "epoch": 9.81, + "grad_norm": 1.2106751764691062e-16, + "learning_rate": 2.723100075244545e-07, + "loss": 0.0, + "step": 39100 + }, + { + "epoch": 9.82, + "grad_norm": 3.6123870472692943e-07, + "learning_rate": 2.54627539503386e-07, + "loss": 0.0, + "step": 39150 + }, + { + "epoch": 9.83, + "grad_norm": 3.495557336918864e-07, + "learning_rate": 2.3694507148231756e-07, + "loss": 0.0, + "step": 39200 + }, + { + "epoch": 9.84, + "grad_norm": 4.76532950415276e-05, + "learning_rate": 2.1926260346124908e-07, + "loss": 0.0, + "step": 39250 + }, + { + "epoch": 9.86, + "grad_norm": 8.871047612046823e-05, + "learning_rate": 2.015801354401806e-07, + "loss": 0.0, + "step": 39300 + } + ], + "logging_steps": 50, + "max_steps": 39870, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 50, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}