diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.4325610316580605, + "epoch": 0.9996485441617778, "eval_steps": 500, - "global_step": 1000, + "global_step": 2311, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -7007,6 +7007,9183 @@ "learning_rate": 1.1469816272965881e-05, "loss": 1.5889, "step": 1000 + }, + { + "epoch": 0.43299359268971854, + "grad_norm": 23.486230850219727, + "learning_rate": 1.1461067366579179e-05, + "loss": 1.7299, + "step": 1001 + }, + { + "epoch": 0.4334261537213766, + "grad_norm": 21.794849395751953, + "learning_rate": 1.1452318460192478e-05, + "loss": 1.6118, + "step": 1002 + }, + { + "epoch": 0.43385871475303467, + "grad_norm": 20.80159568786621, + "learning_rate": 1.1443569553805776e-05, + "loss": 1.5558, + "step": 1003 + }, + { + "epoch": 0.43429127578469273, + "grad_norm": 21.411245346069336, + "learning_rate": 1.1434820647419073e-05, + "loss": 1.5028, + "step": 1004 + }, + { + "epoch": 0.4347238368163508, + "grad_norm": 20.596715927124023, + "learning_rate": 1.1426071741032372e-05, + "loss": 1.5536, + "step": 1005 + }, + { + "epoch": 0.43515639784800886, + "grad_norm": 22.81339454650879, + "learning_rate": 1.141732283464567e-05, + "loss": 1.5613, + "step": 1006 + }, + { + "epoch": 0.4355889588796669, + "grad_norm": 20.697265625, + "learning_rate": 1.1408573928258967e-05, + "loss": 1.5837, + "step": 1007 + }, + { + "epoch": 0.436021519911325, + "grad_norm": 18.886920928955078, + "learning_rate": 1.1399825021872267e-05, + "loss": 1.542, + "step": 1008 + }, + { + "epoch": 0.43645408094298305, + "grad_norm": 22.665058135986328, + "learning_rate": 1.1391076115485564e-05, + "loss": 1.6007, + "step": 1009 + }, + { + "epoch": 0.4368866419746411, + "grad_norm": 20.35870361328125, + "learning_rate": 1.1382327209098865e-05, + "loss": 1.5594, + "step": 1010 + }, + { + "epoch": 0.4373192030062992, + "grad_norm": 19.82785987854004, + "learning_rate": 1.1373578302712163e-05, + "loss": 1.6128, + "step": 1011 + }, + { + "epoch": 0.43775176403795724, + "grad_norm": 21.4642276763916, + "learning_rate": 1.136482939632546e-05, + "loss": 1.647, + "step": 1012 + }, + { + "epoch": 0.4381843250696153, + "grad_norm": 24.7077693939209, + "learning_rate": 1.135608048993876e-05, + "loss": 1.5577, + "step": 1013 + }, + { + "epoch": 0.43861688610127336, + "grad_norm": 21.480487823486328, + "learning_rate": 1.1347331583552057e-05, + "loss": 1.5856, + "step": 1014 + }, + { + "epoch": 0.4390494471329314, + "grad_norm": 24.280683517456055, + "learning_rate": 1.1338582677165354e-05, + "loss": 1.5539, + "step": 1015 + }, + { + "epoch": 0.4394820081645895, + "grad_norm": 22.40320587158203, + "learning_rate": 1.1329833770778653e-05, + "loss": 1.5617, + "step": 1016 + }, + { + "epoch": 0.43991456919624755, + "grad_norm": 24.9234619140625, + "learning_rate": 1.1321084864391951e-05, + "loss": 1.5467, + "step": 1017 + }, + { + "epoch": 0.4403471302279056, + "grad_norm": 20.776636123657227, + "learning_rate": 1.131233595800525e-05, + "loss": 1.56, + "step": 1018 + }, + { + "epoch": 0.4407796912595637, + "grad_norm": 20.39991569519043, + "learning_rate": 1.1303587051618548e-05, + "loss": 1.557, + "step": 1019 + }, + { + "epoch": 0.44121225229122174, + "grad_norm": 19.826358795166016, + "learning_rate": 1.1294838145231849e-05, + "loss": 1.6674, + "step": 1020 + }, + { + "epoch": 0.44164481332287975, + "grad_norm": 19.581579208374023, + "learning_rate": 1.1286089238845146e-05, + "loss": 1.5441, + "step": 1021 + }, + { + "epoch": 0.4420773743545378, + "grad_norm": 22.186416625976562, + "learning_rate": 1.1277340332458444e-05, + "loss": 1.4218, + "step": 1022 + }, + { + "epoch": 0.4425099353861959, + "grad_norm": 24.520288467407227, + "learning_rate": 1.1268591426071743e-05, + "loss": 1.6456, + "step": 1023 + }, + { + "epoch": 0.44294249641785394, + "grad_norm": 18.8167781829834, + "learning_rate": 1.125984251968504e-05, + "loss": 1.5243, + "step": 1024 + }, + { + "epoch": 0.443375057449512, + "grad_norm": 20.95041275024414, + "learning_rate": 1.1251093613298338e-05, + "loss": 1.6067, + "step": 1025 + }, + { + "epoch": 0.44380761848117006, + "grad_norm": 22.097684860229492, + "learning_rate": 1.1242344706911637e-05, + "loss": 1.4956, + "step": 1026 + }, + { + "epoch": 0.4442401795128281, + "grad_norm": 21.358125686645508, + "learning_rate": 1.1233595800524935e-05, + "loss": 1.623, + "step": 1027 + }, + { + "epoch": 0.4446727405444862, + "grad_norm": 19.22201919555664, + "learning_rate": 1.1224846894138232e-05, + "loss": 1.5917, + "step": 1028 + }, + { + "epoch": 0.44510530157614425, + "grad_norm": 20.375303268432617, + "learning_rate": 1.1216097987751533e-05, + "loss": 1.658, + "step": 1029 + }, + { + "epoch": 0.4455378626078023, + "grad_norm": 22.127113342285156, + "learning_rate": 1.120734908136483e-05, + "loss": 1.5364, + "step": 1030 + }, + { + "epoch": 0.4459704236394604, + "grad_norm": 23.640071868896484, + "learning_rate": 1.119860017497813e-05, + "loss": 1.4492, + "step": 1031 + }, + { + "epoch": 0.44640298467111844, + "grad_norm": 20.21621322631836, + "learning_rate": 1.1189851268591427e-05, + "loss": 1.5371, + "step": 1032 + }, + { + "epoch": 0.4468355457027765, + "grad_norm": 18.869565963745117, + "learning_rate": 1.1181102362204725e-05, + "loss": 1.5825, + "step": 1033 + }, + { + "epoch": 0.44726810673443457, + "grad_norm": 19.211931228637695, + "learning_rate": 1.1172353455818024e-05, + "loss": 1.5023, + "step": 1034 + }, + { + "epoch": 0.44770066776609263, + "grad_norm": 20.949472427368164, + "learning_rate": 1.1163604549431321e-05, + "loss": 1.662, + "step": 1035 + }, + { + "epoch": 0.4481332287977507, + "grad_norm": 19.43843650817871, + "learning_rate": 1.1154855643044619e-05, + "loss": 1.6401, + "step": 1036 + }, + { + "epoch": 0.44856578982940876, + "grad_norm": 21.74172592163086, + "learning_rate": 1.1146106736657918e-05, + "loss": 1.6335, + "step": 1037 + }, + { + "epoch": 0.4489983508610668, + "grad_norm": 23.03577423095703, + "learning_rate": 1.1137357830271216e-05, + "loss": 1.5985, + "step": 1038 + }, + { + "epoch": 0.4494309118927249, + "grad_norm": 20.48150634765625, + "learning_rate": 1.1128608923884517e-05, + "loss": 1.6556, + "step": 1039 + }, + { + "epoch": 0.44986347292438295, + "grad_norm": 21.354633331298828, + "learning_rate": 1.1119860017497814e-05, + "loss": 1.5034, + "step": 1040 + }, + { + "epoch": 0.450296033956041, + "grad_norm": 22.22134017944336, + "learning_rate": 1.1111111111111113e-05, + "loss": 1.5633, + "step": 1041 + }, + { + "epoch": 0.45072859498769907, + "grad_norm": 20.304677963256836, + "learning_rate": 1.1102362204724411e-05, + "loss": 1.5714, + "step": 1042 + }, + { + "epoch": 0.4511611560193571, + "grad_norm": 19.70705795288086, + "learning_rate": 1.1093613298337708e-05, + "loss": 1.6236, + "step": 1043 + }, + { + "epoch": 0.45159371705101514, + "grad_norm": 20.76530647277832, + "learning_rate": 1.1084864391951008e-05, + "loss": 1.5905, + "step": 1044 + }, + { + "epoch": 0.4520262780826732, + "grad_norm": 19.45387840270996, + "learning_rate": 1.1076115485564305e-05, + "loss": 1.5035, + "step": 1045 + }, + { + "epoch": 0.45245883911433127, + "grad_norm": 19.863208770751953, + "learning_rate": 1.1067366579177603e-05, + "loss": 1.5407, + "step": 1046 + }, + { + "epoch": 0.45289140014598933, + "grad_norm": 21.128826141357422, + "learning_rate": 1.1058617672790902e-05, + "loss": 1.5474, + "step": 1047 + }, + { + "epoch": 0.4533239611776474, + "grad_norm": 21.49403953552246, + "learning_rate": 1.1049868766404201e-05, + "loss": 1.6163, + "step": 1048 + }, + { + "epoch": 0.45375652220930546, + "grad_norm": 23.149555206298828, + "learning_rate": 1.10411198600175e-05, + "loss": 1.5622, + "step": 1049 + }, + { + "epoch": 0.4541890832409635, + "grad_norm": 21.647279739379883, + "learning_rate": 1.1032370953630798e-05, + "loss": 1.6026, + "step": 1050 + }, + { + "epoch": 0.4546216442726216, + "grad_norm": 21.786270141601562, + "learning_rate": 1.1023622047244095e-05, + "loss": 1.5098, + "step": 1051 + }, + { + "epoch": 0.45505420530427965, + "grad_norm": 23.140596389770508, + "learning_rate": 1.1014873140857394e-05, + "loss": 1.5424, + "step": 1052 + }, + { + "epoch": 0.4554867663359377, + "grad_norm": 20.646238327026367, + "learning_rate": 1.1006124234470692e-05, + "loss": 1.5512, + "step": 1053 + }, + { + "epoch": 0.4559193273675958, + "grad_norm": 21.41969871520996, + "learning_rate": 1.099737532808399e-05, + "loss": 1.5698, + "step": 1054 + }, + { + "epoch": 0.45635188839925384, + "grad_norm": 20.478525161743164, + "learning_rate": 1.0988626421697289e-05, + "loss": 1.5936, + "step": 1055 + }, + { + "epoch": 0.4567844494309119, + "grad_norm": 21.19014549255371, + "learning_rate": 1.0979877515310586e-05, + "loss": 1.5705, + "step": 1056 + }, + { + "epoch": 0.45721701046256996, + "grad_norm": 21.32801055908203, + "learning_rate": 1.0971128608923884e-05, + "loss": 1.5308, + "step": 1057 + }, + { + "epoch": 0.457649571494228, + "grad_norm": 21.304298400878906, + "learning_rate": 1.0962379702537185e-05, + "loss": 1.5296, + "step": 1058 + }, + { + "epoch": 0.4580821325258861, + "grad_norm": 19.55242156982422, + "learning_rate": 1.0953630796150482e-05, + "loss": 1.6441, + "step": 1059 + }, + { + "epoch": 0.45851469355754415, + "grad_norm": 19.598514556884766, + "learning_rate": 1.0944881889763781e-05, + "loss": 1.665, + "step": 1060 + }, + { + "epoch": 0.4589472545892022, + "grad_norm": 21.648754119873047, + "learning_rate": 1.0936132983377079e-05, + "loss": 1.538, + "step": 1061 + }, + { + "epoch": 0.4593798156208603, + "grad_norm": 20.485759735107422, + "learning_rate": 1.0927384076990376e-05, + "loss": 1.5405, + "step": 1062 + }, + { + "epoch": 0.45981237665251834, + "grad_norm": 21.35075569152832, + "learning_rate": 1.0918635170603676e-05, + "loss": 1.5241, + "step": 1063 + }, + { + "epoch": 0.4602449376841764, + "grad_norm": 21.848602294921875, + "learning_rate": 1.0909886264216973e-05, + "loss": 1.5681, + "step": 1064 + }, + { + "epoch": 0.4606774987158344, + "grad_norm": 23.42026710510254, + "learning_rate": 1.0901137357830272e-05, + "loss": 1.5778, + "step": 1065 + }, + { + "epoch": 0.4611100597474925, + "grad_norm": 22.494203567504883, + "learning_rate": 1.089238845144357e-05, + "loss": 1.6443, + "step": 1066 + }, + { + "epoch": 0.46154262077915054, + "grad_norm": 20.862600326538086, + "learning_rate": 1.0883639545056867e-05, + "loss": 1.5278, + "step": 1067 + }, + { + "epoch": 0.4619751818108086, + "grad_norm": 20.454578399658203, + "learning_rate": 1.0874890638670168e-05, + "loss": 1.6292, + "step": 1068 + }, + { + "epoch": 0.46240774284246666, + "grad_norm": 20.307090759277344, + "learning_rate": 1.0866141732283466e-05, + "loss": 1.665, + "step": 1069 + }, + { + "epoch": 0.4628403038741247, + "grad_norm": 20.755657196044922, + "learning_rate": 1.0857392825896765e-05, + "loss": 1.6456, + "step": 1070 + }, + { + "epoch": 0.4632728649057828, + "grad_norm": 22.096176147460938, + "learning_rate": 1.0848643919510062e-05, + "loss": 1.5195, + "step": 1071 + }, + { + "epoch": 0.46370542593744085, + "grad_norm": 23.748300552368164, + "learning_rate": 1.083989501312336e-05, + "loss": 1.5168, + "step": 1072 + }, + { + "epoch": 0.4641379869690989, + "grad_norm": 20.105592727661133, + "learning_rate": 1.083114610673666e-05, + "loss": 1.5407, + "step": 1073 + }, + { + "epoch": 0.464570548000757, + "grad_norm": 19.950580596923828, + "learning_rate": 1.0822397200349957e-05, + "loss": 1.588, + "step": 1074 + }, + { + "epoch": 0.46500310903241504, + "grad_norm": 20.73526954650879, + "learning_rate": 1.0813648293963254e-05, + "loss": 1.6316, + "step": 1075 + }, + { + "epoch": 0.4654356700640731, + "grad_norm": 21.673297882080078, + "learning_rate": 1.0804899387576553e-05, + "loss": 1.5985, + "step": 1076 + }, + { + "epoch": 0.46586823109573117, + "grad_norm": 22.8228759765625, + "learning_rate": 1.0796150481189853e-05, + "loss": 1.5836, + "step": 1077 + }, + { + "epoch": 0.46630079212738923, + "grad_norm": 22.153406143188477, + "learning_rate": 1.0787401574803152e-05, + "loss": 1.6233, + "step": 1078 + }, + { + "epoch": 0.4667333531590473, + "grad_norm": 21.604236602783203, + "learning_rate": 1.077865266841645e-05, + "loss": 1.5601, + "step": 1079 + }, + { + "epoch": 0.46716591419070536, + "grad_norm": 21.942169189453125, + "learning_rate": 1.0769903762029747e-05, + "loss": 1.5527, + "step": 1080 + }, + { + "epoch": 0.4675984752223634, + "grad_norm": 21.710886001586914, + "learning_rate": 1.0761154855643046e-05, + "loss": 1.5555, + "step": 1081 + }, + { + "epoch": 0.4680310362540215, + "grad_norm": 21.006336212158203, + "learning_rate": 1.0752405949256344e-05, + "loss": 1.6011, + "step": 1082 + }, + { + "epoch": 0.46846359728567954, + "grad_norm": 21.56496810913086, + "learning_rate": 1.0743657042869641e-05, + "loss": 1.4629, + "step": 1083 + }, + { + "epoch": 0.4688961583173376, + "grad_norm": 22.86446189880371, + "learning_rate": 1.073490813648294e-05, + "loss": 1.521, + "step": 1084 + }, + { + "epoch": 0.46932871934899567, + "grad_norm": 23.123199462890625, + "learning_rate": 1.0726159230096238e-05, + "loss": 1.5685, + "step": 1085 + }, + { + "epoch": 0.46976128038065373, + "grad_norm": 22.753873825073242, + "learning_rate": 1.0717410323709537e-05, + "loss": 1.668, + "step": 1086 + }, + { + "epoch": 0.47019384141231174, + "grad_norm": 22.325475692749023, + "learning_rate": 1.0708661417322836e-05, + "loss": 1.4948, + "step": 1087 + }, + { + "epoch": 0.4706264024439698, + "grad_norm": 21.678844451904297, + "learning_rate": 1.0699912510936135e-05, + "loss": 1.5436, + "step": 1088 + }, + { + "epoch": 0.47105896347562787, + "grad_norm": 21.273897171020508, + "learning_rate": 1.0691163604549433e-05, + "loss": 1.6103, + "step": 1089 + }, + { + "epoch": 0.47149152450728593, + "grad_norm": 23.457490921020508, + "learning_rate": 1.068241469816273e-05, + "loss": 1.5893, + "step": 1090 + }, + { + "epoch": 0.471924085538944, + "grad_norm": 21.083839416503906, + "learning_rate": 1.067366579177603e-05, + "loss": 1.5555, + "step": 1091 + }, + { + "epoch": 0.47235664657060206, + "grad_norm": 25.2357120513916, + "learning_rate": 1.0664916885389327e-05, + "loss": 1.522, + "step": 1092 + }, + { + "epoch": 0.4727892076022601, + "grad_norm": 23.40666961669922, + "learning_rate": 1.0656167979002625e-05, + "loss": 1.4986, + "step": 1093 + }, + { + "epoch": 0.4732217686339182, + "grad_norm": 20.224884033203125, + "learning_rate": 1.0647419072615924e-05, + "loss": 1.6008, + "step": 1094 + }, + { + "epoch": 0.47365432966557625, + "grad_norm": 19.231700897216797, + "learning_rate": 1.0638670166229221e-05, + "loss": 1.5874, + "step": 1095 + }, + { + "epoch": 0.4740868906972343, + "grad_norm": 21.34067726135254, + "learning_rate": 1.0629921259842522e-05, + "loss": 1.5708, + "step": 1096 + }, + { + "epoch": 0.47451945172889237, + "grad_norm": 20.3139705657959, + "learning_rate": 1.062117235345582e-05, + "loss": 1.5125, + "step": 1097 + }, + { + "epoch": 0.47495201276055043, + "grad_norm": 25.667116165161133, + "learning_rate": 1.0612423447069117e-05, + "loss": 1.5303, + "step": 1098 + }, + { + "epoch": 0.4753845737922085, + "grad_norm": 22.950712203979492, + "learning_rate": 1.0603674540682417e-05, + "loss": 1.6314, + "step": 1099 + }, + { + "epoch": 0.47581713482386656, + "grad_norm": 20.83335304260254, + "learning_rate": 1.0594925634295714e-05, + "loss": 1.5334, + "step": 1100 + }, + { + "epoch": 0.4762496958555246, + "grad_norm": 19.24706268310547, + "learning_rate": 1.0586176727909012e-05, + "loss": 1.6256, + "step": 1101 + }, + { + "epoch": 0.4766822568871827, + "grad_norm": 20.814455032348633, + "learning_rate": 1.057742782152231e-05, + "loss": 1.5851, + "step": 1102 + }, + { + "epoch": 0.47711481791884075, + "grad_norm": 21.661409378051758, + "learning_rate": 1.0568678915135608e-05, + "loss": 1.5247, + "step": 1103 + }, + { + "epoch": 0.4775473789504988, + "grad_norm": 20.899145126342773, + "learning_rate": 1.0559930008748906e-05, + "loss": 1.5708, + "step": 1104 + }, + { + "epoch": 0.4779799399821569, + "grad_norm": 24.386003494262695, + "learning_rate": 1.0551181102362205e-05, + "loss": 1.5527, + "step": 1105 + }, + { + "epoch": 0.47841250101381494, + "grad_norm": 25.34111785888672, + "learning_rate": 1.0542432195975504e-05, + "loss": 1.5851, + "step": 1106 + }, + { + "epoch": 0.478845062045473, + "grad_norm": 24.537324905395508, + "learning_rate": 1.0533683289588803e-05, + "loss": 1.596, + "step": 1107 + }, + { + "epoch": 0.47927762307713107, + "grad_norm": 21.804595947265625, + "learning_rate": 1.0524934383202101e-05, + "loss": 1.5322, + "step": 1108 + }, + { + "epoch": 0.4797101841087891, + "grad_norm": 20.191699981689453, + "learning_rate": 1.0516185476815398e-05, + "loss": 1.5758, + "step": 1109 + }, + { + "epoch": 0.48014274514044714, + "grad_norm": 21.43497085571289, + "learning_rate": 1.0507436570428698e-05, + "loss": 1.508, + "step": 1110 + }, + { + "epoch": 0.4805753061721052, + "grad_norm": 20.207677841186523, + "learning_rate": 1.0498687664041995e-05, + "loss": 1.6158, + "step": 1111 + }, + { + "epoch": 0.48100786720376326, + "grad_norm": 19.458316802978516, + "learning_rate": 1.0489938757655294e-05, + "loss": 1.5859, + "step": 1112 + }, + { + "epoch": 0.4814404282354213, + "grad_norm": 24.825956344604492, + "learning_rate": 1.0481189851268592e-05, + "loss": 1.5391, + "step": 1113 + }, + { + "epoch": 0.4818729892670794, + "grad_norm": 22.562938690185547, + "learning_rate": 1.047244094488189e-05, + "loss": 1.5431, + "step": 1114 + }, + { + "epoch": 0.48230555029873745, + "grad_norm": 22.471332550048828, + "learning_rate": 1.0463692038495189e-05, + "loss": 1.5292, + "step": 1115 + }, + { + "epoch": 0.4827381113303955, + "grad_norm": 21.53379249572754, + "learning_rate": 1.0454943132108488e-05, + "loss": 1.5683, + "step": 1116 + }, + { + "epoch": 0.4831706723620536, + "grad_norm": 19.84793472290039, + "learning_rate": 1.0446194225721787e-05, + "loss": 1.4873, + "step": 1117 + }, + { + "epoch": 0.48360323339371164, + "grad_norm": 21.779499053955078, + "learning_rate": 1.0437445319335085e-05, + "loss": 1.5814, + "step": 1118 + }, + { + "epoch": 0.4840357944253697, + "grad_norm": 20.556554794311523, + "learning_rate": 1.0428696412948382e-05, + "loss": 1.5547, + "step": 1119 + }, + { + "epoch": 0.48446835545702777, + "grad_norm": 20.402372360229492, + "learning_rate": 1.0419947506561681e-05, + "loss": 1.5654, + "step": 1120 + }, + { + "epoch": 0.48490091648868583, + "grad_norm": 22.942962646484375, + "learning_rate": 1.0411198600174979e-05, + "loss": 1.6105, + "step": 1121 + }, + { + "epoch": 0.4853334775203439, + "grad_norm": 20.789451599121094, + "learning_rate": 1.0402449693788276e-05, + "loss": 1.5835, + "step": 1122 + }, + { + "epoch": 0.48576603855200196, + "grad_norm": 20.95735740661621, + "learning_rate": 1.0393700787401575e-05, + "loss": 1.6086, + "step": 1123 + }, + { + "epoch": 0.48619859958366, + "grad_norm": 24.3394775390625, + "learning_rate": 1.0384951881014873e-05, + "loss": 1.5987, + "step": 1124 + }, + { + "epoch": 0.4866311606153181, + "grad_norm": 19.394668579101562, + "learning_rate": 1.0376202974628174e-05, + "loss": 1.5567, + "step": 1125 + }, + { + "epoch": 0.48706372164697614, + "grad_norm": 23.74590492248535, + "learning_rate": 1.0367454068241471e-05, + "loss": 1.5349, + "step": 1126 + }, + { + "epoch": 0.4874962826786342, + "grad_norm": 19.871522903442383, + "learning_rate": 1.0358705161854769e-05, + "loss": 1.5797, + "step": 1127 + }, + { + "epoch": 0.48792884371029227, + "grad_norm": 20.489187240600586, + "learning_rate": 1.0349956255468068e-05, + "loss": 1.5805, + "step": 1128 + }, + { + "epoch": 0.48836140474195033, + "grad_norm": 22.527732849121094, + "learning_rate": 1.0341207349081366e-05, + "loss": 1.5333, + "step": 1129 + }, + { + "epoch": 0.4887939657736084, + "grad_norm": 23.267118453979492, + "learning_rate": 1.0332458442694663e-05, + "loss": 1.5872, + "step": 1130 + }, + { + "epoch": 0.4892265268052664, + "grad_norm": 21.272428512573242, + "learning_rate": 1.0323709536307962e-05, + "loss": 1.5837, + "step": 1131 + }, + { + "epoch": 0.48965908783692447, + "grad_norm": 24.80011558532715, + "learning_rate": 1.031496062992126e-05, + "loss": 1.6086, + "step": 1132 + }, + { + "epoch": 0.49009164886858253, + "grad_norm": 21.551164627075195, + "learning_rate": 1.0306211723534559e-05, + "loss": 1.5909, + "step": 1133 + }, + { + "epoch": 0.4905242099002406, + "grad_norm": 19.916059494018555, + "learning_rate": 1.0297462817147857e-05, + "loss": 1.5726, + "step": 1134 + }, + { + "epoch": 0.49095677093189866, + "grad_norm": 22.31676483154297, + "learning_rate": 1.0288713910761157e-05, + "loss": 1.5, + "step": 1135 + }, + { + "epoch": 0.4913893319635567, + "grad_norm": 21.02925682067871, + "learning_rate": 1.0279965004374455e-05, + "loss": 1.5122, + "step": 1136 + }, + { + "epoch": 0.4918218929952148, + "grad_norm": 20.82086181640625, + "learning_rate": 1.0271216097987753e-05, + "loss": 1.5684, + "step": 1137 + }, + { + "epoch": 0.49225445402687285, + "grad_norm": 20.781131744384766, + "learning_rate": 1.0262467191601052e-05, + "loss": 1.6631, + "step": 1138 + }, + { + "epoch": 0.4926870150585309, + "grad_norm": 20.078981399536133, + "learning_rate": 1.025371828521435e-05, + "loss": 1.5923, + "step": 1139 + }, + { + "epoch": 0.49311957609018897, + "grad_norm": 21.633108139038086, + "learning_rate": 1.0244969378827647e-05, + "loss": 1.5812, + "step": 1140 + }, + { + "epoch": 0.49355213712184703, + "grad_norm": 21.029897689819336, + "learning_rate": 1.0236220472440946e-05, + "loss": 1.5658, + "step": 1141 + }, + { + "epoch": 0.4939846981535051, + "grad_norm": 20.9033203125, + "learning_rate": 1.0227471566054243e-05, + "loss": 1.5775, + "step": 1142 + }, + { + "epoch": 0.49441725918516316, + "grad_norm": 21.990711212158203, + "learning_rate": 1.0218722659667541e-05, + "loss": 1.6188, + "step": 1143 + }, + { + "epoch": 0.4948498202168212, + "grad_norm": 21.208660125732422, + "learning_rate": 1.0209973753280842e-05, + "loss": 1.5635, + "step": 1144 + }, + { + "epoch": 0.4952823812484793, + "grad_norm": 23.516277313232422, + "learning_rate": 1.020122484689414e-05, + "loss": 1.6485, + "step": 1145 + }, + { + "epoch": 0.49571494228013735, + "grad_norm": 24.508100509643555, + "learning_rate": 1.0192475940507439e-05, + "loss": 1.5996, + "step": 1146 + }, + { + "epoch": 0.4961475033117954, + "grad_norm": 23.392810821533203, + "learning_rate": 1.0183727034120736e-05, + "loss": 1.5059, + "step": 1147 + }, + { + "epoch": 0.4965800643434535, + "grad_norm": 20.714237213134766, + "learning_rate": 1.0174978127734034e-05, + "loss": 1.533, + "step": 1148 + }, + { + "epoch": 0.49701262537511154, + "grad_norm": 23.301856994628906, + "learning_rate": 1.0166229221347333e-05, + "loss": 1.5304, + "step": 1149 + }, + { + "epoch": 0.4974451864067696, + "grad_norm": 21.122617721557617, + "learning_rate": 1.015748031496063e-05, + "loss": 1.4702, + "step": 1150 + }, + { + "epoch": 0.49787774743842766, + "grad_norm": 24.017070770263672, + "learning_rate": 1.0148731408573928e-05, + "loss": 1.5159, + "step": 1151 + }, + { + "epoch": 0.4983103084700857, + "grad_norm": 22.126039505004883, + "learning_rate": 1.0139982502187227e-05, + "loss": 1.5913, + "step": 1152 + }, + { + "epoch": 0.49874286950174374, + "grad_norm": 21.73690414428711, + "learning_rate": 1.0131233595800525e-05, + "loss": 1.625, + "step": 1153 + }, + { + "epoch": 0.4991754305334018, + "grad_norm": 21.988893508911133, + "learning_rate": 1.0122484689413825e-05, + "loss": 1.5914, + "step": 1154 + }, + { + "epoch": 0.49960799156505986, + "grad_norm": 20.745668411254883, + "learning_rate": 1.0113735783027123e-05, + "loss": 1.5551, + "step": 1155 + }, + { + "epoch": 0.500040552596718, + "grad_norm": 21.369321823120117, + "learning_rate": 1.010498687664042e-05, + "loss": 1.5585, + "step": 1156 + }, + { + "epoch": 0.500473113628376, + "grad_norm": 21.196489334106445, + "learning_rate": 1.009623797025372e-05, + "loss": 1.4679, + "step": 1157 + }, + { + "epoch": 0.5009056746600341, + "grad_norm": 20.842437744140625, + "learning_rate": 1.0087489063867017e-05, + "loss": 1.5539, + "step": 1158 + }, + { + "epoch": 0.5013382356916921, + "grad_norm": 22.951557159423828, + "learning_rate": 1.0078740157480316e-05, + "loss": 1.5833, + "step": 1159 + }, + { + "epoch": 0.5017707967233502, + "grad_norm": 24.61578941345215, + "learning_rate": 1.0069991251093614e-05, + "loss": 1.4874, + "step": 1160 + }, + { + "epoch": 0.5022033577550082, + "grad_norm": 20.68556785583496, + "learning_rate": 1.0061242344706911e-05, + "loss": 1.5937, + "step": 1161 + }, + { + "epoch": 0.5026359187866664, + "grad_norm": 23.46632957458496, + "learning_rate": 1.005249343832021e-05, + "loss": 1.46, + "step": 1162 + }, + { + "epoch": 0.5030684798183244, + "grad_norm": 21.403005599975586, + "learning_rate": 1.0043744531933508e-05, + "loss": 1.5609, + "step": 1163 + }, + { + "epoch": 0.5035010408499824, + "grad_norm": 23.224864959716797, + "learning_rate": 1.0034995625546809e-05, + "loss": 1.5493, + "step": 1164 + }, + { + "epoch": 0.5039336018816405, + "grad_norm": 22.939931869506836, + "learning_rate": 1.0026246719160107e-05, + "loss": 1.5497, + "step": 1165 + }, + { + "epoch": 0.5043661629132985, + "grad_norm": 21.148677825927734, + "learning_rate": 1.0017497812773404e-05, + "loss": 1.6063, + "step": 1166 + }, + { + "epoch": 0.5047987239449566, + "grad_norm": 24.93348503112793, + "learning_rate": 1.0008748906386703e-05, + "loss": 1.4764, + "step": 1167 + }, + { + "epoch": 0.5052312849766146, + "grad_norm": 22.30760955810547, + "learning_rate": 1e-05, + "loss": 1.557, + "step": 1168 + }, + { + "epoch": 0.5056638460082727, + "grad_norm": 23.62921142578125, + "learning_rate": 9.991251093613298e-06, + "loss": 1.6243, + "step": 1169 + }, + { + "epoch": 0.5060964070399308, + "grad_norm": 23.39278221130371, + "learning_rate": 9.982502187226598e-06, + "loss": 1.5429, + "step": 1170 + }, + { + "epoch": 0.5065289680715889, + "grad_norm": 21.39177703857422, + "learning_rate": 9.973753280839897e-06, + "loss": 1.5076, + "step": 1171 + }, + { + "epoch": 0.5069615291032469, + "grad_norm": 20.722728729248047, + "learning_rate": 9.965004374453194e-06, + "loss": 1.4859, + "step": 1172 + }, + { + "epoch": 0.507394090134905, + "grad_norm": 21.878719329833984, + "learning_rate": 9.956255468066492e-06, + "loss": 1.5704, + "step": 1173 + }, + { + "epoch": 0.507826651166563, + "grad_norm": 21.7126407623291, + "learning_rate": 9.947506561679791e-06, + "loss": 1.5717, + "step": 1174 + }, + { + "epoch": 0.5082592121982211, + "grad_norm": 24.65056037902832, + "learning_rate": 9.938757655293088e-06, + "loss": 1.5916, + "step": 1175 + }, + { + "epoch": 0.5086917732298791, + "grad_norm": 22.16351318359375, + "learning_rate": 9.930008748906388e-06, + "loss": 1.6863, + "step": 1176 + }, + { + "epoch": 0.5091243342615372, + "grad_norm": 21.595354080200195, + "learning_rate": 9.921259842519685e-06, + "loss": 1.5668, + "step": 1177 + }, + { + "epoch": 0.5095568952931953, + "grad_norm": 21.123828887939453, + "learning_rate": 9.912510936132984e-06, + "loss": 1.6015, + "step": 1178 + }, + { + "epoch": 0.5099894563248534, + "grad_norm": 19.92538833618164, + "learning_rate": 9.903762029746282e-06, + "loss": 1.5397, + "step": 1179 + }, + { + "epoch": 0.5104220173565114, + "grad_norm": 19.72816276550293, + "learning_rate": 9.895013123359581e-06, + "loss": 1.5909, + "step": 1180 + }, + { + "epoch": 0.5108545783881695, + "grad_norm": 23.017501831054688, + "learning_rate": 9.88626421697288e-06, + "loss": 1.5516, + "step": 1181 + }, + { + "epoch": 0.5112871394198275, + "grad_norm": 20.519594192504883, + "learning_rate": 9.877515310586178e-06, + "loss": 1.6408, + "step": 1182 + }, + { + "epoch": 0.5117197004514856, + "grad_norm": 23.874664306640625, + "learning_rate": 9.868766404199475e-06, + "loss": 1.5156, + "step": 1183 + }, + { + "epoch": 0.5121522614831436, + "grad_norm": 22.534914016723633, + "learning_rate": 9.860017497812775e-06, + "loss": 1.5178, + "step": 1184 + }, + { + "epoch": 0.5125848225148018, + "grad_norm": 23.634246826171875, + "learning_rate": 9.851268591426074e-06, + "loss": 1.5734, + "step": 1185 + }, + { + "epoch": 0.5130173835464598, + "grad_norm": 21.05173683166504, + "learning_rate": 9.842519685039371e-06, + "loss": 1.5886, + "step": 1186 + }, + { + "epoch": 0.5134499445781178, + "grad_norm": 22.97358512878418, + "learning_rate": 9.833770778652669e-06, + "loss": 1.5493, + "step": 1187 + }, + { + "epoch": 0.5138825056097759, + "grad_norm": 23.60778045654297, + "learning_rate": 9.825021872265968e-06, + "loss": 1.5308, + "step": 1188 + }, + { + "epoch": 0.5143150666414339, + "grad_norm": 19.591341018676758, + "learning_rate": 9.816272965879266e-06, + "loss": 1.6701, + "step": 1189 + }, + { + "epoch": 0.514747627673092, + "grad_norm": 22.59847640991211, + "learning_rate": 9.807524059492565e-06, + "loss": 1.7357, + "step": 1190 + }, + { + "epoch": 0.51518018870475, + "grad_norm": 21.31041145324707, + "learning_rate": 9.798775153105862e-06, + "loss": 1.6354, + "step": 1191 + }, + { + "epoch": 0.5156127497364081, + "grad_norm": 26.626367568969727, + "learning_rate": 9.790026246719161e-06, + "loss": 1.5627, + "step": 1192 + }, + { + "epoch": 0.5160453107680661, + "grad_norm": 20.857505798339844, + "learning_rate": 9.781277340332459e-06, + "loss": 1.5965, + "step": 1193 + }, + { + "epoch": 0.5164778717997243, + "grad_norm": 20.571149826049805, + "learning_rate": 9.772528433945756e-06, + "loss": 1.5661, + "step": 1194 + }, + { + "epoch": 0.5169104328313823, + "grad_norm": 23.989408493041992, + "learning_rate": 9.763779527559056e-06, + "loss": 1.5563, + "step": 1195 + }, + { + "epoch": 0.5173429938630404, + "grad_norm": 21.287750244140625, + "learning_rate": 9.755030621172355e-06, + "loss": 1.5813, + "step": 1196 + }, + { + "epoch": 0.5177755548946984, + "grad_norm": 19.994047164916992, + "learning_rate": 9.746281714785652e-06, + "loss": 1.4655, + "step": 1197 + }, + { + "epoch": 0.5182081159263565, + "grad_norm": 22.351661682128906, + "learning_rate": 9.73753280839895e-06, + "loss": 1.5156, + "step": 1198 + }, + { + "epoch": 0.5186406769580145, + "grad_norm": 21.73786163330078, + "learning_rate": 9.728783902012249e-06, + "loss": 1.5882, + "step": 1199 + }, + { + "epoch": 0.5190732379896726, + "grad_norm": 24.997068405151367, + "learning_rate": 9.720034995625548e-06, + "loss": 1.5392, + "step": 1200 + }, + { + "epoch": 0.5195057990213306, + "grad_norm": 24.620372772216797, + "learning_rate": 9.711286089238846e-06, + "loss": 1.6197, + "step": 1201 + }, + { + "epoch": 0.5199383600529888, + "grad_norm": 20.77781105041504, + "learning_rate": 9.702537182852143e-06, + "loss": 1.5079, + "step": 1202 + }, + { + "epoch": 0.5203709210846468, + "grad_norm": 23.355987548828125, + "learning_rate": 9.693788276465443e-06, + "loss": 1.5842, + "step": 1203 + }, + { + "epoch": 0.5208034821163049, + "grad_norm": 19.493812561035156, + "learning_rate": 9.68503937007874e-06, + "loss": 1.4915, + "step": 1204 + }, + { + "epoch": 0.5212360431479629, + "grad_norm": 20.286083221435547, + "learning_rate": 9.67629046369204e-06, + "loss": 1.5201, + "step": 1205 + }, + { + "epoch": 0.521668604179621, + "grad_norm": 19.314844131469727, + "learning_rate": 9.667541557305338e-06, + "loss": 1.6382, + "step": 1206 + }, + { + "epoch": 0.522101165211279, + "grad_norm": 24.922771453857422, + "learning_rate": 9.658792650918636e-06, + "loss": 1.5938, + "step": 1207 + }, + { + "epoch": 0.522533726242937, + "grad_norm": 23.530563354492188, + "learning_rate": 9.650043744531934e-06, + "loss": 1.5612, + "step": 1208 + }, + { + "epoch": 0.5229662872745952, + "grad_norm": 23.83565902709961, + "learning_rate": 9.641294838145233e-06, + "loss": 1.6478, + "step": 1209 + }, + { + "epoch": 0.5233988483062532, + "grad_norm": 22.341461181640625, + "learning_rate": 9.632545931758532e-06, + "loss": 1.5618, + "step": 1210 + }, + { + "epoch": 0.5238314093379113, + "grad_norm": 20.392044067382812, + "learning_rate": 9.62379702537183e-06, + "loss": 1.5519, + "step": 1211 + }, + { + "epoch": 0.5242639703695693, + "grad_norm": 22.426546096801758, + "learning_rate": 9.615048118985127e-06, + "loss": 1.5484, + "step": 1212 + }, + { + "epoch": 0.5246965314012274, + "grad_norm": 23.25884437561035, + "learning_rate": 9.606299212598426e-06, + "loss": 1.5351, + "step": 1213 + }, + { + "epoch": 0.5251290924328854, + "grad_norm": 20.078369140625, + "learning_rate": 9.597550306211725e-06, + "loss": 1.5788, + "step": 1214 + }, + { + "epoch": 0.5255616534645435, + "grad_norm": 21.024873733520508, + "learning_rate": 9.588801399825023e-06, + "loss": 1.5521, + "step": 1215 + }, + { + "epoch": 0.5259942144962015, + "grad_norm": 21.520021438598633, + "learning_rate": 9.58005249343832e-06, + "loss": 1.5283, + "step": 1216 + }, + { + "epoch": 0.5264267755278597, + "grad_norm": 23.06103515625, + "learning_rate": 9.57130358705162e-06, + "loss": 1.5912, + "step": 1217 + }, + { + "epoch": 0.5268593365595177, + "grad_norm": 23.135066986083984, + "learning_rate": 9.562554680664917e-06, + "loss": 1.5659, + "step": 1218 + }, + { + "epoch": 0.5272918975911758, + "grad_norm": 22.494586944580078, + "learning_rate": 9.553805774278216e-06, + "loss": 1.5495, + "step": 1219 + }, + { + "epoch": 0.5277244586228338, + "grad_norm": 24.494667053222656, + "learning_rate": 9.545056867891514e-06, + "loss": 1.4783, + "step": 1220 + }, + { + "epoch": 0.5281570196544919, + "grad_norm": 22.232500076293945, + "learning_rate": 9.536307961504813e-06, + "loss": 1.6506, + "step": 1221 + }, + { + "epoch": 0.5285895806861499, + "grad_norm": 20.534900665283203, + "learning_rate": 9.52755905511811e-06, + "loss": 1.4604, + "step": 1222 + }, + { + "epoch": 0.529022141717808, + "grad_norm": 22.456693649291992, + "learning_rate": 9.518810148731408e-06, + "loss": 1.5402, + "step": 1223 + }, + { + "epoch": 0.529454702749466, + "grad_norm": 24.285858154296875, + "learning_rate": 9.510061242344707e-06, + "loss": 1.6183, + "step": 1224 + }, + { + "epoch": 0.5298872637811242, + "grad_norm": 24.39087677001953, + "learning_rate": 9.501312335958006e-06, + "loss": 1.5955, + "step": 1225 + }, + { + "epoch": 0.5303198248127822, + "grad_norm": 23.711259841918945, + "learning_rate": 9.492563429571304e-06, + "loss": 1.4826, + "step": 1226 + }, + { + "epoch": 0.5307523858444403, + "grad_norm": 23.221359252929688, + "learning_rate": 9.483814523184603e-06, + "loss": 1.541, + "step": 1227 + }, + { + "epoch": 0.5311849468760983, + "grad_norm": 22.01033592224121, + "learning_rate": 9.4750656167979e-06, + "loss": 1.5117, + "step": 1228 + }, + { + "epoch": 0.5316175079077564, + "grad_norm": 22.62164306640625, + "learning_rate": 9.4663167104112e-06, + "loss": 1.6338, + "step": 1229 + }, + { + "epoch": 0.5320500689394144, + "grad_norm": 25.81968879699707, + "learning_rate": 9.457567804024497e-06, + "loss": 1.5945, + "step": 1230 + }, + { + "epoch": 0.5324826299710724, + "grad_norm": 22.638782501220703, + "learning_rate": 9.448818897637797e-06, + "loss": 1.5698, + "step": 1231 + }, + { + "epoch": 0.5329151910027305, + "grad_norm": 22.303646087646484, + "learning_rate": 9.440069991251094e-06, + "loss": 1.5867, + "step": 1232 + }, + { + "epoch": 0.5333477520343886, + "grad_norm": 22.231616973876953, + "learning_rate": 9.431321084864393e-06, + "loss": 1.5598, + "step": 1233 + }, + { + "epoch": 0.5337803130660467, + "grad_norm": 21.435476303100586, + "learning_rate": 9.422572178477691e-06, + "loss": 1.5487, + "step": 1234 + }, + { + "epoch": 0.5342128740977047, + "grad_norm": 20.271909713745117, + "learning_rate": 9.41382327209099e-06, + "loss": 1.564, + "step": 1235 + }, + { + "epoch": 0.5346454351293628, + "grad_norm": 21.8601016998291, + "learning_rate": 9.405074365704288e-06, + "loss": 1.5758, + "step": 1236 + }, + { + "epoch": 0.5350779961610208, + "grad_norm": 18.850753784179688, + "learning_rate": 9.396325459317585e-06, + "loss": 1.6328, + "step": 1237 + }, + { + "epoch": 0.5355105571926789, + "grad_norm": 20.55095672607422, + "learning_rate": 9.387576552930884e-06, + "loss": 1.5706, + "step": 1238 + }, + { + "epoch": 0.5359431182243369, + "grad_norm": 22.207855224609375, + "learning_rate": 9.378827646544184e-06, + "loss": 1.5461, + "step": 1239 + }, + { + "epoch": 0.536375679255995, + "grad_norm": 22.02487564086914, + "learning_rate": 9.370078740157481e-06, + "loss": 1.5796, + "step": 1240 + }, + { + "epoch": 0.5368082402876531, + "grad_norm": 22.791046142578125, + "learning_rate": 9.361329833770779e-06, + "loss": 1.5041, + "step": 1241 + }, + { + "epoch": 0.5372408013193112, + "grad_norm": 21.210519790649414, + "learning_rate": 9.352580927384078e-06, + "loss": 1.541, + "step": 1242 + }, + { + "epoch": 0.5376733623509692, + "grad_norm": 23.86467933654785, + "learning_rate": 9.343832020997377e-06, + "loss": 1.5611, + "step": 1243 + }, + { + "epoch": 0.5381059233826273, + "grad_norm": 26.53850555419922, + "learning_rate": 9.335083114610674e-06, + "loss": 1.4797, + "step": 1244 + }, + { + "epoch": 0.5385384844142853, + "grad_norm": 22.709075927734375, + "learning_rate": 9.326334208223972e-06, + "loss": 1.5384, + "step": 1245 + }, + { + "epoch": 0.5389710454459434, + "grad_norm": 22.626386642456055, + "learning_rate": 9.317585301837271e-06, + "loss": 1.533, + "step": 1246 + }, + { + "epoch": 0.5394036064776014, + "grad_norm": 22.51513671875, + "learning_rate": 9.308836395450569e-06, + "loss": 1.4891, + "step": 1247 + }, + { + "epoch": 0.5398361675092596, + "grad_norm": 25.217557907104492, + "learning_rate": 9.300087489063868e-06, + "loss": 1.5106, + "step": 1248 + }, + { + "epoch": 0.5402687285409176, + "grad_norm": 21.37566566467285, + "learning_rate": 9.291338582677165e-06, + "loss": 1.5115, + "step": 1249 + }, + { + "epoch": 0.5407012895725757, + "grad_norm": 21.221803665161133, + "learning_rate": 9.282589676290465e-06, + "loss": 1.6049, + "step": 1250 + }, + { + "epoch": 0.5411338506042337, + "grad_norm": 22.976816177368164, + "learning_rate": 9.273840769903762e-06, + "loss": 1.4837, + "step": 1251 + }, + { + "epoch": 0.5415664116358917, + "grad_norm": 20.9698543548584, + "learning_rate": 9.265091863517061e-06, + "loss": 1.5605, + "step": 1252 + }, + { + "epoch": 0.5419989726675498, + "grad_norm": 24.19113540649414, + "learning_rate": 9.25634295713036e-06, + "loss": 1.5536, + "step": 1253 + }, + { + "epoch": 0.5424315336992078, + "grad_norm": 21.84588623046875, + "learning_rate": 9.247594050743658e-06, + "loss": 1.4345, + "step": 1254 + }, + { + "epoch": 0.5428640947308659, + "grad_norm": 23.569164276123047, + "learning_rate": 9.238845144356956e-06, + "loss": 1.595, + "step": 1255 + }, + { + "epoch": 0.543296655762524, + "grad_norm": 24.85906982421875, + "learning_rate": 9.230096237970255e-06, + "loss": 1.5155, + "step": 1256 + }, + { + "epoch": 0.5437292167941821, + "grad_norm": 19.473554611206055, + "learning_rate": 9.221347331583554e-06, + "loss": 1.5188, + "step": 1257 + }, + { + "epoch": 0.5441617778258401, + "grad_norm": 23.21884536743164, + "learning_rate": 9.212598425196852e-06, + "loss": 1.506, + "step": 1258 + }, + { + "epoch": 0.5445943388574982, + "grad_norm": 24.165271759033203, + "learning_rate": 9.203849518810149e-06, + "loss": 1.6034, + "step": 1259 + }, + { + "epoch": 0.5450268998891562, + "grad_norm": 22.93230628967285, + "learning_rate": 9.195100612423448e-06, + "loss": 1.5442, + "step": 1260 + }, + { + "epoch": 0.5454594609208143, + "grad_norm": 24.058223724365234, + "learning_rate": 9.186351706036746e-06, + "loss": 1.5162, + "step": 1261 + }, + { + "epoch": 0.5458920219524723, + "grad_norm": 22.84027862548828, + "learning_rate": 9.177602799650045e-06, + "loss": 1.4725, + "step": 1262 + }, + { + "epoch": 0.5463245829841304, + "grad_norm": 20.591529846191406, + "learning_rate": 9.168853893263342e-06, + "loss": 1.5473, + "step": 1263 + }, + { + "epoch": 0.5467571440157885, + "grad_norm": 23.851757049560547, + "learning_rate": 9.160104986876642e-06, + "loss": 1.664, + "step": 1264 + }, + { + "epoch": 0.5471897050474466, + "grad_norm": 21.493534088134766, + "learning_rate": 9.15135608048994e-06, + "loss": 1.5328, + "step": 1265 + }, + { + "epoch": 0.5476222660791046, + "grad_norm": 20.198074340820312, + "learning_rate": 9.142607174103237e-06, + "loss": 1.5544, + "step": 1266 + }, + { + "epoch": 0.5480548271107627, + "grad_norm": 22.380517959594727, + "learning_rate": 9.133858267716536e-06, + "loss": 1.4572, + "step": 1267 + }, + { + "epoch": 0.5484873881424207, + "grad_norm": 22.481168746948242, + "learning_rate": 9.125109361329835e-06, + "loss": 1.5962, + "step": 1268 + }, + { + "epoch": 0.5489199491740788, + "grad_norm": 22.247299194335938, + "learning_rate": 9.116360454943133e-06, + "loss": 1.5528, + "step": 1269 + }, + { + "epoch": 0.5493525102057368, + "grad_norm": 19.119709014892578, + "learning_rate": 9.10761154855643e-06, + "loss": 1.4795, + "step": 1270 + }, + { + "epoch": 0.549785071237395, + "grad_norm": 23.676963806152344, + "learning_rate": 9.09886264216973e-06, + "loss": 1.6428, + "step": 1271 + }, + { + "epoch": 0.550217632269053, + "grad_norm": 22.227872848510742, + "learning_rate": 9.090113735783029e-06, + "loss": 1.5312, + "step": 1272 + }, + { + "epoch": 0.5506501933007111, + "grad_norm": 22.329296112060547, + "learning_rate": 9.081364829396326e-06, + "loss": 1.5003, + "step": 1273 + }, + { + "epoch": 0.5510827543323691, + "grad_norm": 24.662952423095703, + "learning_rate": 9.072615923009625e-06, + "loss": 1.5753, + "step": 1274 + }, + { + "epoch": 0.5515153153640271, + "grad_norm": 23.11702537536621, + "learning_rate": 9.063867016622923e-06, + "loss": 1.535, + "step": 1275 + }, + { + "epoch": 0.5519478763956852, + "grad_norm": 21.952640533447266, + "learning_rate": 9.05511811023622e-06, + "loss": 1.6321, + "step": 1276 + }, + { + "epoch": 0.5523804374273432, + "grad_norm": 22.44942283630371, + "learning_rate": 9.04636920384952e-06, + "loss": 1.5539, + "step": 1277 + }, + { + "epoch": 0.5528129984590013, + "grad_norm": 21.685388565063477, + "learning_rate": 9.037620297462819e-06, + "loss": 1.6138, + "step": 1278 + }, + { + "epoch": 0.5532455594906593, + "grad_norm": 22.420637130737305, + "learning_rate": 9.028871391076116e-06, + "loss": 1.5373, + "step": 1279 + }, + { + "epoch": 0.5536781205223175, + "grad_norm": 23.004444122314453, + "learning_rate": 9.020122484689414e-06, + "loss": 1.5647, + "step": 1280 + }, + { + "epoch": 0.5541106815539755, + "grad_norm": 22.256057739257812, + "learning_rate": 9.011373578302713e-06, + "loss": 1.4894, + "step": 1281 + }, + { + "epoch": 0.5545432425856336, + "grad_norm": 26.35243034362793, + "learning_rate": 9.002624671916012e-06, + "loss": 1.4642, + "step": 1282 + }, + { + "epoch": 0.5549758036172916, + "grad_norm": 22.870492935180664, + "learning_rate": 8.99387576552931e-06, + "loss": 1.4622, + "step": 1283 + }, + { + "epoch": 0.5554083646489497, + "grad_norm": 21.987051010131836, + "learning_rate": 8.985126859142607e-06, + "loss": 1.598, + "step": 1284 + }, + { + "epoch": 0.5558409256806077, + "grad_norm": 21.95711898803711, + "learning_rate": 8.976377952755906e-06, + "loss": 1.557, + "step": 1285 + }, + { + "epoch": 0.5562734867122658, + "grad_norm": 22.57794189453125, + "learning_rate": 8.967629046369206e-06, + "loss": 1.5029, + "step": 1286 + }, + { + "epoch": 0.5567060477439238, + "grad_norm": 21.112510681152344, + "learning_rate": 8.958880139982503e-06, + "loss": 1.5741, + "step": 1287 + }, + { + "epoch": 0.557138608775582, + "grad_norm": 24.124717712402344, + "learning_rate": 8.9501312335958e-06, + "loss": 1.613, + "step": 1288 + }, + { + "epoch": 0.55757116980724, + "grad_norm": 31.32720375061035, + "learning_rate": 8.9413823272091e-06, + "loss": 1.6521, + "step": 1289 + }, + { + "epoch": 0.5580037308388981, + "grad_norm": 23.76942253112793, + "learning_rate": 8.932633420822397e-06, + "loss": 1.5079, + "step": 1290 + }, + { + "epoch": 0.5584362918705561, + "grad_norm": 22.088144302368164, + "learning_rate": 8.923884514435697e-06, + "loss": 1.5162, + "step": 1291 + }, + { + "epoch": 0.5588688529022142, + "grad_norm": 21.93614387512207, + "learning_rate": 8.915135608048994e-06, + "loss": 1.4684, + "step": 1292 + }, + { + "epoch": 0.5593014139338722, + "grad_norm": 23.71426773071289, + "learning_rate": 8.906386701662293e-06, + "loss": 1.6234, + "step": 1293 + }, + { + "epoch": 0.5597339749655303, + "grad_norm": 23.15918731689453, + "learning_rate": 8.89763779527559e-06, + "loss": 1.5453, + "step": 1294 + }, + { + "epoch": 0.5601665359971884, + "grad_norm": 24.360671997070312, + "learning_rate": 8.888888888888888e-06, + "loss": 1.5688, + "step": 1295 + }, + { + "epoch": 0.5605990970288464, + "grad_norm": 22.142484664916992, + "learning_rate": 8.880139982502188e-06, + "loss": 1.5051, + "step": 1296 + }, + { + "epoch": 0.5610316580605045, + "grad_norm": 23.395851135253906, + "learning_rate": 8.871391076115487e-06, + "loss": 1.6383, + "step": 1297 + }, + { + "epoch": 0.5614642190921625, + "grad_norm": 22.678421020507812, + "learning_rate": 8.862642169728784e-06, + "loss": 1.5335, + "step": 1298 + }, + { + "epoch": 0.5618967801238206, + "grad_norm": 23.081527709960938, + "learning_rate": 8.853893263342083e-06, + "loss": 1.6206, + "step": 1299 + }, + { + "epoch": 0.5623293411554786, + "grad_norm": 22.97568702697754, + "learning_rate": 8.845144356955381e-06, + "loss": 1.4891, + "step": 1300 + }, + { + "epoch": 0.5627619021871367, + "grad_norm": 22.3316707611084, + "learning_rate": 8.83639545056868e-06, + "loss": 1.6181, + "step": 1301 + }, + { + "epoch": 0.5631944632187947, + "grad_norm": 21.807104110717773, + "learning_rate": 8.827646544181978e-06, + "loss": 1.5856, + "step": 1302 + }, + { + "epoch": 0.5636270242504529, + "grad_norm": 23.432767868041992, + "learning_rate": 8.818897637795277e-06, + "loss": 1.5661, + "step": 1303 + }, + { + "epoch": 0.5640595852821109, + "grad_norm": 23.371694564819336, + "learning_rate": 8.810148731408574e-06, + "loss": 1.5358, + "step": 1304 + }, + { + "epoch": 0.564492146313769, + "grad_norm": 23.72265625, + "learning_rate": 8.801399825021874e-06, + "loss": 1.5152, + "step": 1305 + }, + { + "epoch": 0.564924707345427, + "grad_norm": 20.524398803710938, + "learning_rate": 8.792650918635171e-06, + "loss": 1.4969, + "step": 1306 + }, + { + "epoch": 0.5653572683770851, + "grad_norm": 24.223081588745117, + "learning_rate": 8.78390201224847e-06, + "loss": 1.5267, + "step": 1307 + }, + { + "epoch": 0.5657898294087431, + "grad_norm": 21.053010940551758, + "learning_rate": 8.775153105861768e-06, + "loss": 1.5373, + "step": 1308 + }, + { + "epoch": 0.5662223904404012, + "grad_norm": 24.919174194335938, + "learning_rate": 8.766404199475065e-06, + "loss": 1.5295, + "step": 1309 + }, + { + "epoch": 0.5666549514720592, + "grad_norm": 20.551292419433594, + "learning_rate": 8.757655293088365e-06, + "loss": 1.5628, + "step": 1310 + }, + { + "epoch": 0.5670875125037174, + "grad_norm": 22.388694763183594, + "learning_rate": 8.748906386701664e-06, + "loss": 1.5315, + "step": 1311 + }, + { + "epoch": 0.5675200735353754, + "grad_norm": 22.377506256103516, + "learning_rate": 8.740157480314961e-06, + "loss": 1.5567, + "step": 1312 + }, + { + "epoch": 0.5679526345670335, + "grad_norm": 21.797183990478516, + "learning_rate": 8.731408573928259e-06, + "loss": 1.524, + "step": 1313 + }, + { + "epoch": 0.5683851955986915, + "grad_norm": 25.161802291870117, + "learning_rate": 8.722659667541558e-06, + "loss": 1.5708, + "step": 1314 + }, + { + "epoch": 0.5688177566303496, + "grad_norm": 22.526390075683594, + "learning_rate": 8.713910761154857e-06, + "loss": 1.543, + "step": 1315 + }, + { + "epoch": 0.5692503176620076, + "grad_norm": 22.93593978881836, + "learning_rate": 8.705161854768155e-06, + "loss": 1.5886, + "step": 1316 + }, + { + "epoch": 0.5696828786936657, + "grad_norm": 21.80086326599121, + "learning_rate": 8.696412948381452e-06, + "loss": 1.5182, + "step": 1317 + }, + { + "epoch": 0.5701154397253237, + "grad_norm": 26.52342987060547, + "learning_rate": 8.687664041994751e-06, + "loss": 1.5266, + "step": 1318 + }, + { + "epoch": 0.5705480007569818, + "grad_norm": 23.737064361572266, + "learning_rate": 8.678915135608049e-06, + "loss": 1.5446, + "step": 1319 + }, + { + "epoch": 0.5709805617886399, + "grad_norm": 23.06212615966797, + "learning_rate": 8.670166229221348e-06, + "loss": 1.5367, + "step": 1320 + }, + { + "epoch": 0.5714131228202979, + "grad_norm": 22.22346305847168, + "learning_rate": 8.661417322834647e-06, + "loss": 1.5132, + "step": 1321 + }, + { + "epoch": 0.571845683851956, + "grad_norm": 23.046924591064453, + "learning_rate": 8.652668416447945e-06, + "loss": 1.6097, + "step": 1322 + }, + { + "epoch": 0.572278244883614, + "grad_norm": 23.361909866333008, + "learning_rate": 8.643919510061242e-06, + "loss": 1.5469, + "step": 1323 + }, + { + "epoch": 0.5727108059152721, + "grad_norm": 23.01518440246582, + "learning_rate": 8.635170603674542e-06, + "loss": 1.5453, + "step": 1324 + }, + { + "epoch": 0.5731433669469301, + "grad_norm": 22.828317642211914, + "learning_rate": 8.62642169728784e-06, + "loss": 1.4921, + "step": 1325 + }, + { + "epoch": 0.5735759279785883, + "grad_norm": 27.066354751586914, + "learning_rate": 8.617672790901138e-06, + "loss": 1.4901, + "step": 1326 + }, + { + "epoch": 0.5740084890102463, + "grad_norm": 23.641551971435547, + "learning_rate": 8.608923884514436e-06, + "loss": 1.5495, + "step": 1327 + }, + { + "epoch": 0.5744410500419044, + "grad_norm": 22.956571578979492, + "learning_rate": 8.600174978127735e-06, + "loss": 1.5145, + "step": 1328 + }, + { + "epoch": 0.5748736110735624, + "grad_norm": 21.42229461669922, + "learning_rate": 8.591426071741034e-06, + "loss": 1.4778, + "step": 1329 + }, + { + "epoch": 0.5753061721052205, + "grad_norm": 23.688533782958984, + "learning_rate": 8.582677165354332e-06, + "loss": 1.4983, + "step": 1330 + }, + { + "epoch": 0.5757387331368785, + "grad_norm": 24.6751708984375, + "learning_rate": 8.57392825896763e-06, + "loss": 1.4807, + "step": 1331 + }, + { + "epoch": 0.5761712941685366, + "grad_norm": 25.89187240600586, + "learning_rate": 8.565179352580928e-06, + "loss": 1.6793, + "step": 1332 + }, + { + "epoch": 0.5766038552001946, + "grad_norm": 21.923139572143555, + "learning_rate": 8.556430446194226e-06, + "loss": 1.4787, + "step": 1333 + }, + { + "epoch": 0.5770364162318528, + "grad_norm": 27.02901840209961, + "learning_rate": 8.547681539807525e-06, + "loss": 1.5807, + "step": 1334 + }, + { + "epoch": 0.5774689772635108, + "grad_norm": 24.652677536010742, + "learning_rate": 8.538932633420823e-06, + "loss": 1.5291, + "step": 1335 + }, + { + "epoch": 0.5779015382951689, + "grad_norm": 22.28380584716797, + "learning_rate": 8.530183727034122e-06, + "loss": 1.5341, + "step": 1336 + }, + { + "epoch": 0.5783340993268269, + "grad_norm": 27.593717575073242, + "learning_rate": 8.52143482064742e-06, + "loss": 1.5058, + "step": 1337 + }, + { + "epoch": 0.578766660358485, + "grad_norm": 23.606679916381836, + "learning_rate": 8.512685914260717e-06, + "loss": 1.5453, + "step": 1338 + }, + { + "epoch": 0.579199221390143, + "grad_norm": 19.93859100341797, + "learning_rate": 8.503937007874016e-06, + "loss": 1.6098, + "step": 1339 + }, + { + "epoch": 0.579631782421801, + "grad_norm": 21.376535415649414, + "learning_rate": 8.495188101487315e-06, + "loss": 1.6096, + "step": 1340 + }, + { + "epoch": 0.5800643434534591, + "grad_norm": 22.244482040405273, + "learning_rate": 8.486439195100613e-06, + "loss": 1.5739, + "step": 1341 + }, + { + "epoch": 0.5804969044851171, + "grad_norm": 21.95546531677246, + "learning_rate": 8.47769028871391e-06, + "loss": 1.5556, + "step": 1342 + }, + { + "epoch": 0.5809294655167753, + "grad_norm": 23.818946838378906, + "learning_rate": 8.46894138232721e-06, + "loss": 1.6165, + "step": 1343 + }, + { + "epoch": 0.5813620265484333, + "grad_norm": 24.244600296020508, + "learning_rate": 8.460192475940509e-06, + "loss": 1.5368, + "step": 1344 + }, + { + "epoch": 0.5817945875800914, + "grad_norm": 23.75138282775879, + "learning_rate": 8.451443569553806e-06, + "loss": 1.5739, + "step": 1345 + }, + { + "epoch": 0.5822271486117494, + "grad_norm": 21.759607315063477, + "learning_rate": 8.442694663167106e-06, + "loss": 1.5693, + "step": 1346 + }, + { + "epoch": 0.5826597096434075, + "grad_norm": 21.369449615478516, + "learning_rate": 8.433945756780403e-06, + "loss": 1.509, + "step": 1347 + }, + { + "epoch": 0.5830922706750655, + "grad_norm": 21.341304779052734, + "learning_rate": 8.4251968503937e-06, + "loss": 1.5313, + "step": 1348 + }, + { + "epoch": 0.5835248317067236, + "grad_norm": 21.93410873413086, + "learning_rate": 8.416447944007e-06, + "loss": 1.4952, + "step": 1349 + }, + { + "epoch": 0.5839573927383817, + "grad_norm": 21.560096740722656, + "learning_rate": 8.407699037620299e-06, + "loss": 1.587, + "step": 1350 + }, + { + "epoch": 0.5843899537700398, + "grad_norm": 22.302555084228516, + "learning_rate": 8.398950131233596e-06, + "loss": 1.4969, + "step": 1351 + }, + { + "epoch": 0.5848225148016978, + "grad_norm": 24.07124137878418, + "learning_rate": 8.390201224846894e-06, + "loss": 1.6227, + "step": 1352 + }, + { + "epoch": 0.5852550758333559, + "grad_norm": 21.081422805786133, + "learning_rate": 8.381452318460193e-06, + "loss": 1.5545, + "step": 1353 + }, + { + "epoch": 0.5856876368650139, + "grad_norm": 20.232650756835938, + "learning_rate": 8.372703412073492e-06, + "loss": 1.5344, + "step": 1354 + }, + { + "epoch": 0.586120197896672, + "grad_norm": 21.190690994262695, + "learning_rate": 8.36395450568679e-06, + "loss": 1.6478, + "step": 1355 + }, + { + "epoch": 0.58655275892833, + "grad_norm": 21.58733367919922, + "learning_rate": 8.355205599300087e-06, + "loss": 1.577, + "step": 1356 + }, + { + "epoch": 0.5869853199599881, + "grad_norm": 21.453083038330078, + "learning_rate": 8.346456692913387e-06, + "loss": 1.5533, + "step": 1357 + }, + { + "epoch": 0.5874178809916462, + "grad_norm": 20.776641845703125, + "learning_rate": 8.337707786526686e-06, + "loss": 1.5206, + "step": 1358 + }, + { + "epoch": 0.5878504420233043, + "grad_norm": 24.146244049072266, + "learning_rate": 8.328958880139983e-06, + "loss": 1.5626, + "step": 1359 + }, + { + "epoch": 0.5882830030549623, + "grad_norm": 19.88344383239746, + "learning_rate": 8.320209973753281e-06, + "loss": 1.5167, + "step": 1360 + }, + { + "epoch": 0.5887155640866204, + "grad_norm": 21.336252212524414, + "learning_rate": 8.31146106736658e-06, + "loss": 1.5685, + "step": 1361 + }, + { + "epoch": 0.5891481251182784, + "grad_norm": 23.71305274963379, + "learning_rate": 8.302712160979878e-06, + "loss": 1.5559, + "step": 1362 + }, + { + "epoch": 0.5895806861499364, + "grad_norm": 24.64626693725586, + "learning_rate": 8.293963254593177e-06, + "loss": 1.5773, + "step": 1363 + }, + { + "epoch": 0.5900132471815945, + "grad_norm": 24.751689910888672, + "learning_rate": 8.285214348206474e-06, + "loss": 1.5788, + "step": 1364 + }, + { + "epoch": 0.5904458082132525, + "grad_norm": 22.55054473876953, + "learning_rate": 8.276465441819773e-06, + "loss": 1.6049, + "step": 1365 + }, + { + "epoch": 0.5908783692449107, + "grad_norm": 23.089393615722656, + "learning_rate": 8.267716535433071e-06, + "loss": 1.5444, + "step": 1366 + }, + { + "epoch": 0.5913109302765687, + "grad_norm": 20.179182052612305, + "learning_rate": 8.25896762904637e-06, + "loss": 1.5167, + "step": 1367 + }, + { + "epoch": 0.5917434913082268, + "grad_norm": 22.71483039855957, + "learning_rate": 8.25021872265967e-06, + "loss": 1.5535, + "step": 1368 + }, + { + "epoch": 0.5921760523398848, + "grad_norm": 23.300128936767578, + "learning_rate": 8.241469816272967e-06, + "loss": 1.5337, + "step": 1369 + }, + { + "epoch": 0.5926086133715429, + "grad_norm": 23.242700576782227, + "learning_rate": 8.232720909886264e-06, + "loss": 1.5624, + "step": 1370 + }, + { + "epoch": 0.5930411744032009, + "grad_norm": 22.0483341217041, + "learning_rate": 8.223972003499564e-06, + "loss": 1.5575, + "step": 1371 + }, + { + "epoch": 0.593473735434859, + "grad_norm": 24.035541534423828, + "learning_rate": 8.215223097112861e-06, + "loss": 1.4662, + "step": 1372 + }, + { + "epoch": 0.593906296466517, + "grad_norm": 21.212360382080078, + "learning_rate": 8.20647419072616e-06, + "loss": 1.5534, + "step": 1373 + }, + { + "epoch": 0.5943388574981752, + "grad_norm": 23.624034881591797, + "learning_rate": 8.197725284339458e-06, + "loss": 1.5738, + "step": 1374 + }, + { + "epoch": 0.5947714185298332, + "grad_norm": 23.664756774902344, + "learning_rate": 8.188976377952757e-06, + "loss": 1.5385, + "step": 1375 + }, + { + "epoch": 0.5952039795614913, + "grad_norm": 23.286773681640625, + "learning_rate": 8.180227471566055e-06, + "loss": 1.5215, + "step": 1376 + }, + { + "epoch": 0.5956365405931493, + "grad_norm": 24.892528533935547, + "learning_rate": 8.171478565179354e-06, + "loss": 1.471, + "step": 1377 + }, + { + "epoch": 0.5960691016248074, + "grad_norm": 25.03875160217285, + "learning_rate": 8.162729658792651e-06, + "loss": 1.6867, + "step": 1378 + }, + { + "epoch": 0.5965016626564654, + "grad_norm": 24.291664123535156, + "learning_rate": 8.15398075240595e-06, + "loss": 1.5568, + "step": 1379 + }, + { + "epoch": 0.5969342236881235, + "grad_norm": 21.74358367919922, + "learning_rate": 8.145231846019248e-06, + "loss": 1.4808, + "step": 1380 + }, + { + "epoch": 0.5973667847197816, + "grad_norm": 20.80985450744629, + "learning_rate": 8.136482939632546e-06, + "loss": 1.6065, + "step": 1381 + }, + { + "epoch": 0.5977993457514397, + "grad_norm": 23.85978126525879, + "learning_rate": 8.127734033245845e-06, + "loss": 1.5018, + "step": 1382 + }, + { + "epoch": 0.5982319067830977, + "grad_norm": 23.80008316040039, + "learning_rate": 8.118985126859144e-06, + "loss": 1.4797, + "step": 1383 + }, + { + "epoch": 0.5986644678147557, + "grad_norm": 21.72329330444336, + "learning_rate": 8.110236220472441e-06, + "loss": 1.5553, + "step": 1384 + }, + { + "epoch": 0.5990970288464138, + "grad_norm": 23.285947799682617, + "learning_rate": 8.101487314085739e-06, + "loss": 1.4434, + "step": 1385 + }, + { + "epoch": 0.5995295898780718, + "grad_norm": 24.382734298706055, + "learning_rate": 8.092738407699038e-06, + "loss": 1.6215, + "step": 1386 + }, + { + "epoch": 0.5999621509097299, + "grad_norm": 22.842208862304688, + "learning_rate": 8.083989501312337e-06, + "loss": 1.5772, + "step": 1387 + }, + { + "epoch": 0.6003947119413879, + "grad_norm": 22.464563369750977, + "learning_rate": 8.075240594925635e-06, + "loss": 1.5454, + "step": 1388 + }, + { + "epoch": 0.6008272729730461, + "grad_norm": 23.2418155670166, + "learning_rate": 8.066491688538932e-06, + "loss": 1.4773, + "step": 1389 + }, + { + "epoch": 0.6012598340047041, + "grad_norm": 24.3996639251709, + "learning_rate": 8.057742782152232e-06, + "loss": 1.4524, + "step": 1390 + }, + { + "epoch": 0.6016923950363622, + "grad_norm": 21.880083084106445, + "learning_rate": 8.04899387576553e-06, + "loss": 1.5388, + "step": 1391 + }, + { + "epoch": 0.6021249560680202, + "grad_norm": 21.42142105102539, + "learning_rate": 8.040244969378828e-06, + "loss": 1.5757, + "step": 1392 + }, + { + "epoch": 0.6025575170996783, + "grad_norm": 24.134017944335938, + "learning_rate": 8.031496062992128e-06, + "loss": 1.5229, + "step": 1393 + }, + { + "epoch": 0.6029900781313363, + "grad_norm": 23.24073600769043, + "learning_rate": 8.022747156605425e-06, + "loss": 1.5801, + "step": 1394 + }, + { + "epoch": 0.6034226391629944, + "grad_norm": 20.904178619384766, + "learning_rate": 8.013998250218723e-06, + "loss": 1.5884, + "step": 1395 + }, + { + "epoch": 0.6038552001946524, + "grad_norm": 23.131088256835938, + "learning_rate": 8.005249343832022e-06, + "loss": 1.489, + "step": 1396 + }, + { + "epoch": 0.6042877612263106, + "grad_norm": 22.88324546813965, + "learning_rate": 7.996500437445321e-06, + "loss": 1.5062, + "step": 1397 + }, + { + "epoch": 0.6047203222579686, + "grad_norm": 22.445697784423828, + "learning_rate": 7.987751531058619e-06, + "loss": 1.5516, + "step": 1398 + }, + { + "epoch": 0.6051528832896267, + "grad_norm": 21.125619888305664, + "learning_rate": 7.979002624671916e-06, + "loss": 1.5458, + "step": 1399 + }, + { + "epoch": 0.6055854443212847, + "grad_norm": 23.79015350341797, + "learning_rate": 7.970253718285215e-06, + "loss": 1.4498, + "step": 1400 + }, + { + "epoch": 0.6060180053529428, + "grad_norm": 22.167564392089844, + "learning_rate": 7.961504811898514e-06, + "loss": 1.5495, + "step": 1401 + }, + { + "epoch": 0.6064505663846008, + "grad_norm": 22.303564071655273, + "learning_rate": 7.952755905511812e-06, + "loss": 1.5881, + "step": 1402 + }, + { + "epoch": 0.6068831274162589, + "grad_norm": 24.354480743408203, + "learning_rate": 7.94400699912511e-06, + "loss": 1.4995, + "step": 1403 + }, + { + "epoch": 0.607315688447917, + "grad_norm": 26.439773559570312, + "learning_rate": 7.935258092738409e-06, + "loss": 1.5314, + "step": 1404 + }, + { + "epoch": 0.607748249479575, + "grad_norm": 25.109752655029297, + "learning_rate": 7.926509186351706e-06, + "loss": 1.4808, + "step": 1405 + }, + { + "epoch": 0.6081808105112331, + "grad_norm": 21.283794403076172, + "learning_rate": 7.917760279965005e-06, + "loss": 1.5175, + "step": 1406 + }, + { + "epoch": 0.6086133715428911, + "grad_norm": 22.0447998046875, + "learning_rate": 7.909011373578303e-06, + "loss": 1.4999, + "step": 1407 + }, + { + "epoch": 0.6090459325745492, + "grad_norm": 21.30337905883789, + "learning_rate": 7.900262467191602e-06, + "loss": 1.4887, + "step": 1408 + }, + { + "epoch": 0.6094784936062072, + "grad_norm": 24.14295768737793, + "learning_rate": 7.8915135608049e-06, + "loss": 1.4664, + "step": 1409 + }, + { + "epoch": 0.6099110546378653, + "grad_norm": 22.371164321899414, + "learning_rate": 7.882764654418197e-06, + "loss": 1.5782, + "step": 1410 + }, + { + "epoch": 0.6103436156695233, + "grad_norm": 23.566246032714844, + "learning_rate": 7.874015748031496e-06, + "loss": 1.519, + "step": 1411 + }, + { + "epoch": 0.6107761767011815, + "grad_norm": 21.215904235839844, + "learning_rate": 7.865266841644796e-06, + "loss": 1.5402, + "step": 1412 + }, + { + "epoch": 0.6112087377328395, + "grad_norm": 21.46411895751953, + "learning_rate": 7.856517935258093e-06, + "loss": 1.4525, + "step": 1413 + }, + { + "epoch": 0.6116412987644976, + "grad_norm": 23.8530216217041, + "learning_rate": 7.847769028871392e-06, + "loss": 1.4487, + "step": 1414 + }, + { + "epoch": 0.6120738597961556, + "grad_norm": 23.55060386657715, + "learning_rate": 7.83902012248469e-06, + "loss": 1.5548, + "step": 1415 + }, + { + "epoch": 0.6125064208278137, + "grad_norm": 31.455612182617188, + "learning_rate": 7.830271216097989e-06, + "loss": 1.6577, + "step": 1416 + }, + { + "epoch": 0.6129389818594717, + "grad_norm": 26.714149475097656, + "learning_rate": 7.821522309711287e-06, + "loss": 1.5606, + "step": 1417 + }, + { + "epoch": 0.6133715428911298, + "grad_norm": 24.780031204223633, + "learning_rate": 7.812773403324586e-06, + "loss": 1.5787, + "step": 1418 + }, + { + "epoch": 0.6138041039227878, + "grad_norm": 23.39690589904785, + "learning_rate": 7.804024496937883e-06, + "loss": 1.5447, + "step": 1419 + }, + { + "epoch": 0.614236664954446, + "grad_norm": 21.379409790039062, + "learning_rate": 7.79527559055118e-06, + "loss": 1.4938, + "step": 1420 + }, + { + "epoch": 0.614669225986104, + "grad_norm": 22.49406623840332, + "learning_rate": 7.78652668416448e-06, + "loss": 1.4945, + "step": 1421 + }, + { + "epoch": 0.6151017870177621, + "grad_norm": 21.45618438720703, + "learning_rate": 7.77777777777778e-06, + "loss": 1.6223, + "step": 1422 + }, + { + "epoch": 0.6155343480494201, + "grad_norm": 21.71502113342285, + "learning_rate": 7.769028871391077e-06, + "loss": 1.594, + "step": 1423 + }, + { + "epoch": 0.6159669090810782, + "grad_norm": 23.39321517944336, + "learning_rate": 7.760279965004374e-06, + "loss": 1.4467, + "step": 1424 + }, + { + "epoch": 0.6163994701127362, + "grad_norm": 22.757183074951172, + "learning_rate": 7.751531058617673e-06, + "loss": 1.478, + "step": 1425 + }, + { + "epoch": 0.6168320311443943, + "grad_norm": 34.57364273071289, + "learning_rate": 7.742782152230973e-06, + "loss": 1.4439, + "step": 1426 + }, + { + "epoch": 0.6172645921760523, + "grad_norm": 24.04800796508789, + "learning_rate": 7.73403324584427e-06, + "loss": 1.5368, + "step": 1427 + }, + { + "epoch": 0.6176971532077103, + "grad_norm": 25.531238555908203, + "learning_rate": 7.725284339457568e-06, + "loss": 1.6016, + "step": 1428 + }, + { + "epoch": 0.6181297142393685, + "grad_norm": 25.783323287963867, + "learning_rate": 7.716535433070867e-06, + "loss": 1.5348, + "step": 1429 + }, + { + "epoch": 0.6185622752710265, + "grad_norm": 24.652729034423828, + "learning_rate": 7.707786526684166e-06, + "loss": 1.5232, + "step": 1430 + }, + { + "epoch": 0.6189948363026846, + "grad_norm": 24.11018180847168, + "learning_rate": 7.699037620297464e-06, + "loss": 1.5591, + "step": 1431 + }, + { + "epoch": 0.6194273973343426, + "grad_norm": 24.16909408569336, + "learning_rate": 7.690288713910761e-06, + "loss": 1.6298, + "step": 1432 + }, + { + "epoch": 0.6198599583660007, + "grad_norm": 23.931461334228516, + "learning_rate": 7.68153980752406e-06, + "loss": 1.4956, + "step": 1433 + }, + { + "epoch": 0.6202925193976587, + "grad_norm": 25.672365188598633, + "learning_rate": 7.672790901137358e-06, + "loss": 1.5273, + "step": 1434 + }, + { + "epoch": 0.6207250804293168, + "grad_norm": 22.84345817565918, + "learning_rate": 7.664041994750657e-06, + "loss": 1.5427, + "step": 1435 + }, + { + "epoch": 0.6211576414609749, + "grad_norm": 22.920881271362305, + "learning_rate": 7.655293088363955e-06, + "loss": 1.549, + "step": 1436 + }, + { + "epoch": 0.621590202492633, + "grad_norm": 22.238073348999023, + "learning_rate": 7.646544181977254e-06, + "loss": 1.5617, + "step": 1437 + }, + { + "epoch": 0.622022763524291, + "grad_norm": 21.899429321289062, + "learning_rate": 7.637795275590551e-06, + "loss": 1.4983, + "step": 1438 + }, + { + "epoch": 0.6224553245559491, + "grad_norm": 21.77754783630371, + "learning_rate": 7.62904636920385e-06, + "loss": 1.5032, + "step": 1439 + }, + { + "epoch": 0.6228878855876071, + "grad_norm": 24.998987197875977, + "learning_rate": 7.620297462817149e-06, + "loss": 1.3883, + "step": 1440 + }, + { + "epoch": 0.6233204466192652, + "grad_norm": 23.322284698486328, + "learning_rate": 7.611548556430447e-06, + "loss": 1.5162, + "step": 1441 + }, + { + "epoch": 0.6237530076509232, + "grad_norm": 21.041311264038086, + "learning_rate": 7.602799650043745e-06, + "loss": 1.4762, + "step": 1442 + }, + { + "epoch": 0.6241855686825813, + "grad_norm": 23.653440475463867, + "learning_rate": 7.594050743657043e-06, + "loss": 1.531, + "step": 1443 + }, + { + "epoch": 0.6246181297142394, + "grad_norm": 20.211307525634766, + "learning_rate": 7.585301837270341e-06, + "loss": 1.5188, + "step": 1444 + }, + { + "epoch": 0.6250506907458975, + "grad_norm": 25.027973175048828, + "learning_rate": 7.576552930883641e-06, + "loss": 1.5473, + "step": 1445 + }, + { + "epoch": 0.6254832517775555, + "grad_norm": 23.346525192260742, + "learning_rate": 7.567804024496939e-06, + "loss": 1.4762, + "step": 1446 + }, + { + "epoch": 0.6259158128092136, + "grad_norm": 24.292959213256836, + "learning_rate": 7.5590551181102365e-06, + "loss": 1.5621, + "step": 1447 + }, + { + "epoch": 0.6263483738408716, + "grad_norm": 23.621158599853516, + "learning_rate": 7.550306211723535e-06, + "loss": 1.4602, + "step": 1448 + }, + { + "epoch": 0.6267809348725296, + "grad_norm": 23.259824752807617, + "learning_rate": 7.541557305336834e-06, + "loss": 1.5399, + "step": 1449 + }, + { + "epoch": 0.6272134959041877, + "grad_norm": 24.204967498779297, + "learning_rate": 7.532808398950132e-06, + "loss": 1.5119, + "step": 1450 + }, + { + "epoch": 0.6276460569358457, + "grad_norm": 22.07662582397461, + "learning_rate": 7.52405949256343e-06, + "loss": 1.5115, + "step": 1451 + }, + { + "epoch": 0.6280786179675039, + "grad_norm": 23.451147079467773, + "learning_rate": 7.515310586176728e-06, + "loss": 1.6045, + "step": 1452 + }, + { + "epoch": 0.6285111789991619, + "grad_norm": 23.66141128540039, + "learning_rate": 7.506561679790027e-06, + "loss": 1.6329, + "step": 1453 + }, + { + "epoch": 0.62894374003082, + "grad_norm": 22.44936180114746, + "learning_rate": 7.497812773403326e-06, + "loss": 1.5992, + "step": 1454 + }, + { + "epoch": 0.629376301062478, + "grad_norm": 22.108919143676758, + "learning_rate": 7.489063867016624e-06, + "loss": 1.5392, + "step": 1455 + }, + { + "epoch": 0.6298088620941361, + "grad_norm": 24.86928939819336, + "learning_rate": 7.480314960629922e-06, + "loss": 1.5499, + "step": 1456 + }, + { + "epoch": 0.6302414231257941, + "grad_norm": 21.16976547241211, + "learning_rate": 7.47156605424322e-06, + "loss": 1.588, + "step": 1457 + }, + { + "epoch": 0.6306739841574522, + "grad_norm": 21.789356231689453, + "learning_rate": 7.4628171478565184e-06, + "loss": 1.5114, + "step": 1458 + }, + { + "epoch": 0.6311065451891102, + "grad_norm": 23.476760864257812, + "learning_rate": 7.454068241469818e-06, + "loss": 1.5496, + "step": 1459 + }, + { + "epoch": 0.6315391062207684, + "grad_norm": 25.469806671142578, + "learning_rate": 7.445319335083115e-06, + "loss": 1.6105, + "step": 1460 + }, + { + "epoch": 0.6319716672524264, + "grad_norm": 25.634347915649414, + "learning_rate": 7.4365704286964135e-06, + "loss": 1.5353, + "step": 1461 + }, + { + "epoch": 0.6324042282840845, + "grad_norm": 27.508028030395508, + "learning_rate": 7.427821522309712e-06, + "loss": 1.5767, + "step": 1462 + }, + { + "epoch": 0.6328367893157425, + "grad_norm": 23.841432571411133, + "learning_rate": 7.419072615923009e-06, + "loss": 1.6054, + "step": 1463 + }, + { + "epoch": 0.6332693503474006, + "grad_norm": 22.372793197631836, + "learning_rate": 7.410323709536309e-06, + "loss": 1.5289, + "step": 1464 + }, + { + "epoch": 0.6337019113790586, + "grad_norm": 23.19795036315918, + "learning_rate": 7.401574803149607e-06, + "loss": 1.5856, + "step": 1465 + }, + { + "epoch": 0.6341344724107167, + "grad_norm": 22.361799240112305, + "learning_rate": 7.392825896762905e-06, + "loss": 1.5092, + "step": 1466 + }, + { + "epoch": 0.6345670334423748, + "grad_norm": 22.069835662841797, + "learning_rate": 7.384076990376204e-06, + "loss": 1.6036, + "step": 1467 + }, + { + "epoch": 0.6349995944740329, + "grad_norm": 23.586353302001953, + "learning_rate": 7.375328083989501e-06, + "loss": 1.518, + "step": 1468 + }, + { + "epoch": 0.6354321555056909, + "grad_norm": 22.216527938842773, + "learning_rate": 7.3665791776028e-06, + "loss": 1.5903, + "step": 1469 + }, + { + "epoch": 0.635864716537349, + "grad_norm": 23.356054306030273, + "learning_rate": 7.357830271216099e-06, + "loss": 1.4951, + "step": 1470 + }, + { + "epoch": 0.636297277569007, + "grad_norm": 21.026004791259766, + "learning_rate": 7.349081364829397e-06, + "loss": 1.618, + "step": 1471 + }, + { + "epoch": 0.636729838600665, + "grad_norm": 23.06012535095215, + "learning_rate": 7.340332458442695e-06, + "loss": 1.5405, + "step": 1472 + }, + { + "epoch": 0.6371623996323231, + "grad_norm": 23.857873916625977, + "learning_rate": 7.331583552055994e-06, + "loss": 1.5567, + "step": 1473 + }, + { + "epoch": 0.6375949606639811, + "grad_norm": 23.048160552978516, + "learning_rate": 7.322834645669292e-06, + "loss": 1.5534, + "step": 1474 + }, + { + "epoch": 0.6380275216956393, + "grad_norm": 24.181964874267578, + "learning_rate": 7.3140857392825906e-06, + "loss": 1.5192, + "step": 1475 + }, + { + "epoch": 0.6384600827272973, + "grad_norm": 22.832210540771484, + "learning_rate": 7.305336832895888e-06, + "loss": 1.5114, + "step": 1476 + }, + { + "epoch": 0.6388926437589554, + "grad_norm": 23.764835357666016, + "learning_rate": 7.2965879265091864e-06, + "loss": 1.4807, + "step": 1477 + }, + { + "epoch": 0.6393252047906134, + "grad_norm": 24.419448852539062, + "learning_rate": 7.287839020122486e-06, + "loss": 1.4671, + "step": 1478 + }, + { + "epoch": 0.6397577658222715, + "grad_norm": 25.90771484375, + "learning_rate": 7.279090113735784e-06, + "loss": 1.5565, + "step": 1479 + }, + { + "epoch": 0.6401903268539295, + "grad_norm": 24.24135398864746, + "learning_rate": 7.270341207349082e-06, + "loss": 1.4725, + "step": 1480 + }, + { + "epoch": 0.6406228878855876, + "grad_norm": 23.335277557373047, + "learning_rate": 7.26159230096238e-06, + "loss": 1.5215, + "step": 1481 + }, + { + "epoch": 0.6410554489172456, + "grad_norm": 23.67999839782715, + "learning_rate": 7.252843394575678e-06, + "loss": 1.5626, + "step": 1482 + }, + { + "epoch": 0.6414880099489038, + "grad_norm": 26.6694393157959, + "learning_rate": 7.2440944881889774e-06, + "loss": 1.4421, + "step": 1483 + }, + { + "epoch": 0.6419205709805618, + "grad_norm": 23.231159210205078, + "learning_rate": 7.235345581802276e-06, + "loss": 1.5262, + "step": 1484 + }, + { + "epoch": 0.6423531320122199, + "grad_norm": 24.250080108642578, + "learning_rate": 7.226596675415573e-06, + "loss": 1.5133, + "step": 1485 + }, + { + "epoch": 0.6427856930438779, + "grad_norm": 21.954505920410156, + "learning_rate": 7.217847769028872e-06, + "loss": 1.5227, + "step": 1486 + }, + { + "epoch": 0.643218254075536, + "grad_norm": 22.040283203125, + "learning_rate": 7.20909886264217e-06, + "loss": 1.59, + "step": 1487 + }, + { + "epoch": 0.643650815107194, + "grad_norm": 23.680068969726562, + "learning_rate": 7.200349956255469e-06, + "loss": 1.5092, + "step": 1488 + }, + { + "epoch": 0.6440833761388521, + "grad_norm": 25.096607208251953, + "learning_rate": 7.191601049868768e-06, + "loss": 1.468, + "step": 1489 + }, + { + "epoch": 0.6445159371705101, + "grad_norm": 22.989320755004883, + "learning_rate": 7.182852143482065e-06, + "loss": 1.5895, + "step": 1490 + }, + { + "epoch": 0.6449484982021683, + "grad_norm": 25.26317024230957, + "learning_rate": 7.1741032370953635e-06, + "loss": 1.4943, + "step": 1491 + }, + { + "epoch": 0.6453810592338263, + "grad_norm": 25.946033477783203, + "learning_rate": 7.165354330708662e-06, + "loss": 1.635, + "step": 1492 + }, + { + "epoch": 0.6458136202654843, + "grad_norm": 23.764036178588867, + "learning_rate": 7.156605424321961e-06, + "loss": 1.6082, + "step": 1493 + }, + { + "epoch": 0.6462461812971424, + "grad_norm": 26.842212677001953, + "learning_rate": 7.1478565179352585e-06, + "loss": 1.5848, + "step": 1494 + }, + { + "epoch": 0.6466787423288004, + "grad_norm": 24.90704917907715, + "learning_rate": 7.139107611548557e-06, + "loss": 1.5782, + "step": 1495 + }, + { + "epoch": 0.6471113033604585, + "grad_norm": 26.029441833496094, + "learning_rate": 7.130358705161855e-06, + "loss": 1.5036, + "step": 1496 + }, + { + "epoch": 0.6475438643921165, + "grad_norm": 25.079856872558594, + "learning_rate": 7.1216097987751545e-06, + "loss": 1.4376, + "step": 1497 + }, + { + "epoch": 0.6479764254237746, + "grad_norm": 25.471174240112305, + "learning_rate": 7.112860892388452e-06, + "loss": 1.3823, + "step": 1498 + }, + { + "epoch": 0.6484089864554327, + "grad_norm": 21.90082359313965, + "learning_rate": 7.10411198600175e-06, + "loss": 1.5895, + "step": 1499 + }, + { + "epoch": 0.6488415474870908, + "grad_norm": 22.75554656982422, + "learning_rate": 7.095363079615049e-06, + "loss": 1.6006, + "step": 1500 + }, + { + "epoch": 0.6492741085187488, + "grad_norm": 21.22007179260254, + "learning_rate": 7.086614173228347e-06, + "loss": 1.5194, + "step": 1501 + }, + { + "epoch": 0.6497066695504069, + "grad_norm": 22.74795913696289, + "learning_rate": 7.077865266841646e-06, + "loss": 1.5761, + "step": 1502 + }, + { + "epoch": 0.6501392305820649, + "grad_norm": 22.385860443115234, + "learning_rate": 7.069116360454944e-06, + "loss": 1.5468, + "step": 1503 + }, + { + "epoch": 0.650571791613723, + "grad_norm": 23.7584228515625, + "learning_rate": 7.060367454068242e-06, + "loss": 1.5242, + "step": 1504 + }, + { + "epoch": 0.651004352645381, + "grad_norm": 25.09479522705078, + "learning_rate": 7.0516185476815405e-06, + "loss": 1.5615, + "step": 1505 + }, + { + "epoch": 0.6514369136770392, + "grad_norm": 24.615421295166016, + "learning_rate": 7.042869641294838e-06, + "loss": 1.5362, + "step": 1506 + }, + { + "epoch": 0.6518694747086972, + "grad_norm": 21.421411514282227, + "learning_rate": 7.034120734908137e-06, + "loss": 1.5744, + "step": 1507 + }, + { + "epoch": 0.6523020357403553, + "grad_norm": 23.396808624267578, + "learning_rate": 7.025371828521436e-06, + "loss": 1.5602, + "step": 1508 + }, + { + "epoch": 0.6527345967720133, + "grad_norm": 22.569921493530273, + "learning_rate": 7.016622922134734e-06, + "loss": 1.5355, + "step": 1509 + }, + { + "epoch": 0.6531671578036714, + "grad_norm": 24.40047264099121, + "learning_rate": 7.0078740157480315e-06, + "loss": 1.5158, + "step": 1510 + }, + { + "epoch": 0.6535997188353294, + "grad_norm": 24.43023681640625, + "learning_rate": 6.99912510936133e-06, + "loss": 1.5551, + "step": 1511 + }, + { + "epoch": 0.6540322798669875, + "grad_norm": 21.310428619384766, + "learning_rate": 6.990376202974629e-06, + "loss": 1.5114, + "step": 1512 + }, + { + "epoch": 0.6544648408986455, + "grad_norm": 21.625455856323242, + "learning_rate": 6.981627296587927e-06, + "loss": 1.5002, + "step": 1513 + }, + { + "epoch": 0.6548974019303037, + "grad_norm": 24.013227462768555, + "learning_rate": 6.972878390201226e-06, + "loss": 1.5115, + "step": 1514 + }, + { + "epoch": 0.6553299629619617, + "grad_norm": 20.982900619506836, + "learning_rate": 6.964129483814523e-06, + "loss": 1.4118, + "step": 1515 + }, + { + "epoch": 0.6557625239936197, + "grad_norm": 23.199033737182617, + "learning_rate": 6.955380577427822e-06, + "loss": 1.5651, + "step": 1516 + }, + { + "epoch": 0.6561950850252778, + "grad_norm": 21.069751739501953, + "learning_rate": 6.946631671041121e-06, + "loss": 1.5955, + "step": 1517 + }, + { + "epoch": 0.6566276460569358, + "grad_norm": 22.66398811340332, + "learning_rate": 6.937882764654419e-06, + "loss": 1.4432, + "step": 1518 + }, + { + "epoch": 0.6570602070885939, + "grad_norm": 22.937976837158203, + "learning_rate": 6.929133858267717e-06, + "loss": 1.5522, + "step": 1519 + }, + { + "epoch": 0.6574927681202519, + "grad_norm": 22.356679916381836, + "learning_rate": 6.920384951881015e-06, + "loss": 1.5067, + "step": 1520 + }, + { + "epoch": 0.65792532915191, + "grad_norm": 23.95052719116211, + "learning_rate": 6.911636045494314e-06, + "loss": 1.5553, + "step": 1521 + }, + { + "epoch": 0.658357890183568, + "grad_norm": 24.117918014526367, + "learning_rate": 6.902887139107613e-06, + "loss": 1.4865, + "step": 1522 + }, + { + "epoch": 0.6587904512152262, + "grad_norm": 23.363407135009766, + "learning_rate": 6.89413823272091e-06, + "loss": 1.4588, + "step": 1523 + }, + { + "epoch": 0.6592230122468842, + "grad_norm": 24.435625076293945, + "learning_rate": 6.8853893263342085e-06, + "loss": 1.4985, + "step": 1524 + }, + { + "epoch": 0.6596555732785423, + "grad_norm": 25.14236831665039, + "learning_rate": 6.876640419947507e-06, + "loss": 1.5323, + "step": 1525 + }, + { + "epoch": 0.6600881343102003, + "grad_norm": 22.674089431762695, + "learning_rate": 6.867891513560806e-06, + "loss": 1.5134, + "step": 1526 + }, + { + "epoch": 0.6605206953418584, + "grad_norm": 24.138879776000977, + "learning_rate": 6.859142607174104e-06, + "loss": 1.5683, + "step": 1527 + }, + { + "epoch": 0.6609532563735164, + "grad_norm": 24.909719467163086, + "learning_rate": 6.850393700787402e-06, + "loss": 1.5579, + "step": 1528 + }, + { + "epoch": 0.6613858174051745, + "grad_norm": 23.420795440673828, + "learning_rate": 6.8416447944007e-06, + "loss": 1.4783, + "step": 1529 + }, + { + "epoch": 0.6618183784368326, + "grad_norm": 22.931936264038086, + "learning_rate": 6.832895888013999e-06, + "loss": 1.4801, + "step": 1530 + }, + { + "epoch": 0.6622509394684907, + "grad_norm": 21.55242156982422, + "learning_rate": 6.824146981627298e-06, + "loss": 1.4999, + "step": 1531 + }, + { + "epoch": 0.6626835005001487, + "grad_norm": 21.650358200073242, + "learning_rate": 6.815398075240595e-06, + "loss": 1.5101, + "step": 1532 + }, + { + "epoch": 0.6631160615318068, + "grad_norm": 23.8084659576416, + "learning_rate": 6.806649168853894e-06, + "loss": 1.4968, + "step": 1533 + }, + { + "epoch": 0.6635486225634648, + "grad_norm": 25.13161849975586, + "learning_rate": 6.797900262467192e-06, + "loss": 1.4169, + "step": 1534 + }, + { + "epoch": 0.6639811835951229, + "grad_norm": 22.81217384338379, + "learning_rate": 6.78915135608049e-06, + "loss": 1.4825, + "step": 1535 + }, + { + "epoch": 0.6644137446267809, + "grad_norm": 25.360429763793945, + "learning_rate": 6.78040244969379e-06, + "loss": 1.6435, + "step": 1536 + }, + { + "epoch": 0.6648463056584389, + "grad_norm": 22.05304527282715, + "learning_rate": 6.771653543307087e-06, + "loss": 1.5476, + "step": 1537 + }, + { + "epoch": 0.6652788666900971, + "grad_norm": 24.10348892211914, + "learning_rate": 6.7629046369203855e-06, + "loss": 1.4382, + "step": 1538 + }, + { + "epoch": 0.6657114277217551, + "grad_norm": 22.268739700317383, + "learning_rate": 6.754155730533684e-06, + "loss": 1.618, + "step": 1539 + }, + { + "epoch": 0.6661439887534132, + "grad_norm": 21.813589096069336, + "learning_rate": 6.745406824146981e-06, + "loss": 1.5424, + "step": 1540 + }, + { + "epoch": 0.6665765497850712, + "grad_norm": 24.29230308532715, + "learning_rate": 6.736657917760281e-06, + "loss": 1.5056, + "step": 1541 + }, + { + "epoch": 0.6670091108167293, + "grad_norm": 21.526540756225586, + "learning_rate": 6.727909011373579e-06, + "loss": 1.531, + "step": 1542 + }, + { + "epoch": 0.6674416718483873, + "grad_norm": 22.080156326293945, + "learning_rate": 6.719160104986877e-06, + "loss": 1.5371, + "step": 1543 + }, + { + "epoch": 0.6678742328800454, + "grad_norm": 22.51006507873535, + "learning_rate": 6.710411198600175e-06, + "loss": 1.5069, + "step": 1544 + }, + { + "epoch": 0.6683067939117034, + "grad_norm": 25.575986862182617, + "learning_rate": 6.701662292213474e-06, + "loss": 1.511, + "step": 1545 + }, + { + "epoch": 0.6687393549433616, + "grad_norm": 22.93172836303711, + "learning_rate": 6.692913385826772e-06, + "loss": 1.501, + "step": 1546 + }, + { + "epoch": 0.6691719159750196, + "grad_norm": 22.480873107910156, + "learning_rate": 6.684164479440071e-06, + "loss": 1.5077, + "step": 1547 + }, + { + "epoch": 0.6696044770066777, + "grad_norm": 22.858264923095703, + "learning_rate": 6.675415573053369e-06, + "loss": 1.4774, + "step": 1548 + }, + { + "epoch": 0.6700370380383357, + "grad_norm": 24.259767532348633, + "learning_rate": 6.666666666666667e-06, + "loss": 1.6461, + "step": 1549 + }, + { + "epoch": 0.6704695990699938, + "grad_norm": 24.198890686035156, + "learning_rate": 6.657917760279966e-06, + "loss": 1.5393, + "step": 1550 + }, + { + "epoch": 0.6709021601016518, + "grad_norm": 24.402128219604492, + "learning_rate": 6.649168853893264e-06, + "loss": 1.4286, + "step": 1551 + }, + { + "epoch": 0.6713347211333099, + "grad_norm": 25.59145736694336, + "learning_rate": 6.6404199475065626e-06, + "loss": 1.4993, + "step": 1552 + }, + { + "epoch": 0.671767282164968, + "grad_norm": 23.374197006225586, + "learning_rate": 6.63167104111986e-06, + "loss": 1.5453, + "step": 1553 + }, + { + "epoch": 0.6721998431966261, + "grad_norm": 26.37010955810547, + "learning_rate": 6.6229221347331584e-06, + "loss": 1.5741, + "step": 1554 + }, + { + "epoch": 0.6726324042282841, + "grad_norm": 25.05316162109375, + "learning_rate": 6.614173228346458e-06, + "loss": 1.539, + "step": 1555 + }, + { + "epoch": 0.6730649652599422, + "grad_norm": 23.657917022705078, + "learning_rate": 6.605424321959756e-06, + "loss": 1.5029, + "step": 1556 + }, + { + "epoch": 0.6734975262916002, + "grad_norm": 22.25530433654785, + "learning_rate": 6.5966754155730535e-06, + "loss": 1.538, + "step": 1557 + }, + { + "epoch": 0.6739300873232583, + "grad_norm": 26.930683135986328, + "learning_rate": 6.587926509186352e-06, + "loss": 1.5289, + "step": 1558 + }, + { + "epoch": 0.6743626483549163, + "grad_norm": 23.665882110595703, + "learning_rate": 6.57917760279965e-06, + "loss": 1.5161, + "step": 1559 + }, + { + "epoch": 0.6747952093865743, + "grad_norm": 22.633747100830078, + "learning_rate": 6.5704286964129495e-06, + "loss": 1.4837, + "step": 1560 + }, + { + "epoch": 0.6752277704182325, + "grad_norm": 27.529674530029297, + "learning_rate": 6.561679790026248e-06, + "loss": 1.5903, + "step": 1561 + }, + { + "epoch": 0.6756603314498905, + "grad_norm": 23.555315017700195, + "learning_rate": 6.552930883639545e-06, + "loss": 1.5309, + "step": 1562 + }, + { + "epoch": 0.6760928924815486, + "grad_norm": 22.861183166503906, + "learning_rate": 6.544181977252844e-06, + "loss": 1.5248, + "step": 1563 + }, + { + "epoch": 0.6765254535132066, + "grad_norm": 23.239778518676758, + "learning_rate": 6.535433070866142e-06, + "loss": 1.4878, + "step": 1564 + }, + { + "epoch": 0.6769580145448647, + "grad_norm": 24.08327293395996, + "learning_rate": 6.526684164479441e-06, + "loss": 1.4779, + "step": 1565 + }, + { + "epoch": 0.6773905755765227, + "grad_norm": 24.711755752563477, + "learning_rate": 6.517935258092739e-06, + "loss": 1.6168, + "step": 1566 + }, + { + "epoch": 0.6778231366081808, + "grad_norm": 26.82582664489746, + "learning_rate": 6.509186351706037e-06, + "loss": 1.4592, + "step": 1567 + }, + { + "epoch": 0.6782556976398388, + "grad_norm": 25.213279724121094, + "learning_rate": 6.5004374453193355e-06, + "loss": 1.5868, + "step": 1568 + }, + { + "epoch": 0.678688258671497, + "grad_norm": 22.981929779052734, + "learning_rate": 6.491688538932633e-06, + "loss": 1.6304, + "step": 1569 + }, + { + "epoch": 0.679120819703155, + "grad_norm": 22.704452514648438, + "learning_rate": 6.482939632545932e-06, + "loss": 1.4529, + "step": 1570 + }, + { + "epoch": 0.6795533807348131, + "grad_norm": 25.932876586914062, + "learning_rate": 6.4741907261592306e-06, + "loss": 1.5983, + "step": 1571 + }, + { + "epoch": 0.6799859417664711, + "grad_norm": 23.21229362487793, + "learning_rate": 6.465441819772529e-06, + "loss": 1.4824, + "step": 1572 + }, + { + "epoch": 0.6804185027981292, + "grad_norm": 23.76874351501465, + "learning_rate": 6.456692913385827e-06, + "loss": 1.4459, + "step": 1573 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 24.482866287231445, + "learning_rate": 6.4479440069991265e-06, + "loss": 1.5083, + "step": 1574 + }, + { + "epoch": 0.6812836248614453, + "grad_norm": 23.35637855529785, + "learning_rate": 6.439195100612424e-06, + "loss": 1.5173, + "step": 1575 + }, + { + "epoch": 0.6817161858931033, + "grad_norm": 22.728713989257812, + "learning_rate": 6.430446194225722e-06, + "loss": 1.5578, + "step": 1576 + }, + { + "epoch": 0.6821487469247615, + "grad_norm": 27.757644653320312, + "learning_rate": 6.421697287839021e-06, + "loss": 1.557, + "step": 1577 + }, + { + "epoch": 0.6825813079564195, + "grad_norm": 24.904579162597656, + "learning_rate": 6.412948381452318e-06, + "loss": 1.4613, + "step": 1578 + }, + { + "epoch": 0.6830138689880776, + "grad_norm": 24.260875701904297, + "learning_rate": 6.4041994750656174e-06, + "loss": 1.5102, + "step": 1579 + }, + { + "epoch": 0.6834464300197356, + "grad_norm": 25.12516975402832, + "learning_rate": 6.395450568678916e-06, + "loss": 1.5472, + "step": 1580 + }, + { + "epoch": 0.6838789910513936, + "grad_norm": 23.9464054107666, + "learning_rate": 6.386701662292214e-06, + "loss": 1.5611, + "step": 1581 + }, + { + "epoch": 0.6843115520830517, + "grad_norm": 22.76926040649414, + "learning_rate": 6.3779527559055125e-06, + "loss": 1.5379, + "step": 1582 + }, + { + "epoch": 0.6847441131147097, + "grad_norm": 24.24293327331543, + "learning_rate": 6.36920384951881e-06, + "loss": 1.5672, + "step": 1583 + }, + { + "epoch": 0.6851766741463678, + "grad_norm": 24.250621795654297, + "learning_rate": 6.360454943132109e-06, + "loss": 1.4307, + "step": 1584 + }, + { + "epoch": 0.6856092351780259, + "grad_norm": 25.41415786743164, + "learning_rate": 6.351706036745408e-06, + "loss": 1.5174, + "step": 1585 + }, + { + "epoch": 0.686041796209684, + "grad_norm": 21.934829711914062, + "learning_rate": 6.342957130358706e-06, + "loss": 1.5329, + "step": 1586 + }, + { + "epoch": 0.686474357241342, + "grad_norm": 23.609012603759766, + "learning_rate": 6.3342082239720035e-06, + "loss": 1.5426, + "step": 1587 + }, + { + "epoch": 0.6869069182730001, + "grad_norm": 25.188135147094727, + "learning_rate": 6.325459317585302e-06, + "loss": 1.5221, + "step": 1588 + }, + { + "epoch": 0.6873394793046581, + "grad_norm": 23.130638122558594, + "learning_rate": 6.316710411198601e-06, + "loss": 1.5202, + "step": 1589 + }, + { + "epoch": 0.6877720403363162, + "grad_norm": 24.72777557373047, + "learning_rate": 6.307961504811899e-06, + "loss": 1.5004, + "step": 1590 + }, + { + "epoch": 0.6882046013679742, + "grad_norm": 23.27838134765625, + "learning_rate": 6.299212598425197e-06, + "loss": 1.5503, + "step": 1591 + }, + { + "epoch": 0.6886371623996324, + "grad_norm": 20.87567710876465, + "learning_rate": 6.290463692038495e-06, + "loss": 1.5856, + "step": 1592 + }, + { + "epoch": 0.6890697234312904, + "grad_norm": 23.302906036376953, + "learning_rate": 6.281714785651794e-06, + "loss": 1.524, + "step": 1593 + }, + { + "epoch": 0.6895022844629485, + "grad_norm": 24.006710052490234, + "learning_rate": 6.272965879265093e-06, + "loss": 1.4691, + "step": 1594 + }, + { + "epoch": 0.6899348454946065, + "grad_norm": 24.12818145751953, + "learning_rate": 6.264216972878391e-06, + "loss": 1.5727, + "step": 1595 + }, + { + "epoch": 0.6903674065262646, + "grad_norm": 25.554643630981445, + "learning_rate": 6.255468066491689e-06, + "loss": 1.5754, + "step": 1596 + }, + { + "epoch": 0.6907999675579226, + "grad_norm": 21.56632423400879, + "learning_rate": 6.246719160104987e-06, + "loss": 1.5766, + "step": 1597 + }, + { + "epoch": 0.6912325285895807, + "grad_norm": 26.7813720703125, + "learning_rate": 6.237970253718286e-06, + "loss": 1.5778, + "step": 1598 + }, + { + "epoch": 0.6916650896212387, + "grad_norm": 24.2366943359375, + "learning_rate": 6.229221347331585e-06, + "loss": 1.5162, + "step": 1599 + }, + { + "epoch": 0.6920976506528969, + "grad_norm": 24.078445434570312, + "learning_rate": 6.220472440944882e-06, + "loss": 1.4297, + "step": 1600 + }, + { + "epoch": 0.6925302116845549, + "grad_norm": 26.070178985595703, + "learning_rate": 6.2117235345581805e-06, + "loss": 1.5393, + "step": 1601 + }, + { + "epoch": 0.692962772716213, + "grad_norm": 27.011350631713867, + "learning_rate": 6.202974628171479e-06, + "loss": 1.4795, + "step": 1602 + }, + { + "epoch": 0.693395333747871, + "grad_norm": 27.4875431060791, + "learning_rate": 6.194225721784778e-06, + "loss": 1.5485, + "step": 1603 + }, + { + "epoch": 0.693827894779529, + "grad_norm": 22.1238956451416, + "learning_rate": 6.185476815398076e-06, + "loss": 1.428, + "step": 1604 + }, + { + "epoch": 0.6942604558111871, + "grad_norm": 23.28427505493164, + "learning_rate": 6.176727909011374e-06, + "loss": 1.6064, + "step": 1605 + }, + { + "epoch": 0.6946930168428451, + "grad_norm": 24.525197982788086, + "learning_rate": 6.167979002624672e-06, + "loss": 1.5426, + "step": 1606 + }, + { + "epoch": 0.6951255778745032, + "grad_norm": 26.526762008666992, + "learning_rate": 6.159230096237971e-06, + "loss": 1.4851, + "step": 1607 + }, + { + "epoch": 0.6955581389061612, + "grad_norm": 23.86712646484375, + "learning_rate": 6.15048118985127e-06, + "loss": 1.5635, + "step": 1608 + }, + { + "epoch": 0.6959906999378194, + "grad_norm": 22.40746307373047, + "learning_rate": 6.141732283464567e-06, + "loss": 1.5988, + "step": 1609 + }, + { + "epoch": 0.6964232609694774, + "grad_norm": 22.69624137878418, + "learning_rate": 6.132983377077866e-06, + "loss": 1.444, + "step": 1610 + }, + { + "epoch": 0.6968558220011355, + "grad_norm": 22.52001953125, + "learning_rate": 6.124234470691164e-06, + "loss": 1.4884, + "step": 1611 + }, + { + "epoch": 0.6972883830327935, + "grad_norm": 25.607025146484375, + "learning_rate": 6.115485564304462e-06, + "loss": 1.4535, + "step": 1612 + }, + { + "epoch": 0.6977209440644516, + "grad_norm": 22.519060134887695, + "learning_rate": 6.106736657917761e-06, + "loss": 1.6537, + "step": 1613 + }, + { + "epoch": 0.6981535050961096, + "grad_norm": 22.56156349182129, + "learning_rate": 6.097987751531059e-06, + "loss": 1.5571, + "step": 1614 + }, + { + "epoch": 0.6985860661277677, + "grad_norm": 22.961685180664062, + "learning_rate": 6.0892388451443576e-06, + "loss": 1.5163, + "step": 1615 + }, + { + "epoch": 0.6990186271594258, + "grad_norm": 24.06474494934082, + "learning_rate": 6.080489938757655e-06, + "loss": 1.5118, + "step": 1616 + }, + { + "epoch": 0.6994511881910839, + "grad_norm": 23.049427032470703, + "learning_rate": 6.0717410323709534e-06, + "loss": 1.5018, + "step": 1617 + }, + { + "epoch": 0.6998837492227419, + "grad_norm": 25.137136459350586, + "learning_rate": 6.062992125984253e-06, + "loss": 1.5839, + "step": 1618 + }, + { + "epoch": 0.7003163102544, + "grad_norm": 23.529890060424805, + "learning_rate": 6.054243219597551e-06, + "loss": 1.5558, + "step": 1619 + }, + { + "epoch": 0.700748871286058, + "grad_norm": 22.19614601135254, + "learning_rate": 6.045494313210849e-06, + "loss": 1.5381, + "step": 1620 + }, + { + "epoch": 0.7011814323177161, + "grad_norm": 26.739168167114258, + "learning_rate": 6.036745406824147e-06, + "loss": 1.5358, + "step": 1621 + }, + { + "epoch": 0.7016139933493741, + "grad_norm": 25.52427864074707, + "learning_rate": 6.027996500437446e-06, + "loss": 1.5421, + "step": 1622 + }, + { + "epoch": 0.7020465543810323, + "grad_norm": 23.286373138427734, + "learning_rate": 6.0192475940507444e-06, + "loss": 1.4983, + "step": 1623 + }, + { + "epoch": 0.7024791154126903, + "grad_norm": 25.0222225189209, + "learning_rate": 6.010498687664043e-06, + "loss": 1.4992, + "step": 1624 + }, + { + "epoch": 0.7029116764443483, + "grad_norm": 26.569257736206055, + "learning_rate": 6.00174978127734e-06, + "loss": 1.5621, + "step": 1625 + }, + { + "epoch": 0.7033442374760064, + "grad_norm": 23.545291900634766, + "learning_rate": 5.993000874890639e-06, + "loss": 1.5807, + "step": 1626 + }, + { + "epoch": 0.7037767985076644, + "grad_norm": 25.80244255065918, + "learning_rate": 5.984251968503938e-06, + "loss": 1.5377, + "step": 1627 + }, + { + "epoch": 0.7042093595393225, + "grad_norm": 24.305273056030273, + "learning_rate": 5.975503062117236e-06, + "loss": 1.5414, + "step": 1628 + }, + { + "epoch": 0.7046419205709805, + "grad_norm": 24.8024845123291, + "learning_rate": 5.966754155730535e-06, + "loss": 1.5594, + "step": 1629 + }, + { + "epoch": 0.7050744816026386, + "grad_norm": 22.716646194458008, + "learning_rate": 5.958005249343832e-06, + "loss": 1.5006, + "step": 1630 + }, + { + "epoch": 0.7055070426342966, + "grad_norm": 25.36906623840332, + "learning_rate": 5.9492563429571305e-06, + "loss": 1.5169, + "step": 1631 + }, + { + "epoch": 0.7059396036659548, + "grad_norm": 21.435344696044922, + "learning_rate": 5.94050743657043e-06, + "loss": 1.4967, + "step": 1632 + }, + { + "epoch": 0.7063721646976128, + "grad_norm": 25.601346969604492, + "learning_rate": 5.931758530183728e-06, + "loss": 1.4927, + "step": 1633 + }, + { + "epoch": 0.7068047257292709, + "grad_norm": 23.794708251953125, + "learning_rate": 5.9230096237970256e-06, + "loss": 1.4312, + "step": 1634 + }, + { + "epoch": 0.7072372867609289, + "grad_norm": 26.20401954650879, + "learning_rate": 5.914260717410324e-06, + "loss": 1.488, + "step": 1635 + }, + { + "epoch": 0.707669847792587, + "grad_norm": 23.398365020751953, + "learning_rate": 5.905511811023622e-06, + "loss": 1.4526, + "step": 1636 + }, + { + "epoch": 0.708102408824245, + "grad_norm": 24.44811248779297, + "learning_rate": 5.8967629046369215e-06, + "loss": 1.5993, + "step": 1637 + }, + { + "epoch": 0.7085349698559031, + "grad_norm": 24.23762321472168, + "learning_rate": 5.888013998250219e-06, + "loss": 1.5917, + "step": 1638 + }, + { + "epoch": 0.7089675308875611, + "grad_norm": 27.343517303466797, + "learning_rate": 5.879265091863517e-06, + "loss": 1.4431, + "step": 1639 + }, + { + "epoch": 0.7094000919192193, + "grad_norm": 26.040502548217773, + "learning_rate": 5.870516185476816e-06, + "loss": 1.5874, + "step": 1640 + }, + { + "epoch": 0.7098326529508773, + "grad_norm": 25.721101760864258, + "learning_rate": 5.861767279090114e-06, + "loss": 1.5325, + "step": 1641 + }, + { + "epoch": 0.7102652139825354, + "grad_norm": 24.612388610839844, + "learning_rate": 5.853018372703413e-06, + "loss": 1.4986, + "step": 1642 + }, + { + "epoch": 0.7106977750141934, + "grad_norm": 24.61174964904785, + "learning_rate": 5.844269466316711e-06, + "loss": 1.5429, + "step": 1643 + }, + { + "epoch": 0.7111303360458515, + "grad_norm": 31.583274841308594, + "learning_rate": 5.835520559930009e-06, + "loss": 1.5787, + "step": 1644 + }, + { + "epoch": 0.7115628970775095, + "grad_norm": 23.573389053344727, + "learning_rate": 5.8267716535433075e-06, + "loss": 1.5006, + "step": 1645 + }, + { + "epoch": 0.7119954581091676, + "grad_norm": 28.629196166992188, + "learning_rate": 5.818022747156607e-06, + "loss": 1.5066, + "step": 1646 + }, + { + "epoch": 0.7124280191408257, + "grad_norm": 24.07172966003418, + "learning_rate": 5.809273840769904e-06, + "loss": 1.5167, + "step": 1647 + }, + { + "epoch": 0.7128605801724837, + "grad_norm": 24.85234260559082, + "learning_rate": 5.800524934383203e-06, + "loss": 1.4958, + "step": 1648 + }, + { + "epoch": 0.7132931412041418, + "grad_norm": 24.3563232421875, + "learning_rate": 5.791776027996501e-06, + "loss": 1.5272, + "step": 1649 + }, + { + "epoch": 0.7137257022357998, + "grad_norm": 24.525150299072266, + "learning_rate": 5.7830271216097985e-06, + "loss": 1.4952, + "step": 1650 + }, + { + "epoch": 0.7141582632674579, + "grad_norm": 26.782798767089844, + "learning_rate": 5.774278215223098e-06, + "loss": 1.5519, + "step": 1651 + }, + { + "epoch": 0.7145908242991159, + "grad_norm": 22.02875518798828, + "learning_rate": 5.765529308836396e-06, + "loss": 1.5515, + "step": 1652 + }, + { + "epoch": 0.715023385330774, + "grad_norm": 24.18636703491211, + "learning_rate": 5.756780402449694e-06, + "loss": 1.6241, + "step": 1653 + }, + { + "epoch": 0.715455946362432, + "grad_norm": 20.57954216003418, + "learning_rate": 5.748031496062993e-06, + "loss": 1.4652, + "step": 1654 + }, + { + "epoch": 0.7158885073940902, + "grad_norm": 24.09521484375, + "learning_rate": 5.73928258967629e-06, + "loss": 1.3606, + "step": 1655 + }, + { + "epoch": 0.7163210684257482, + "grad_norm": 26.569869995117188, + "learning_rate": 5.7305336832895895e-06, + "loss": 1.5271, + "step": 1656 + }, + { + "epoch": 0.7167536294574063, + "grad_norm": 24.32697296142578, + "learning_rate": 5.721784776902888e-06, + "loss": 1.5423, + "step": 1657 + }, + { + "epoch": 0.7171861904890643, + "grad_norm": 24.233909606933594, + "learning_rate": 5.713035870516186e-06, + "loss": 1.6511, + "step": 1658 + }, + { + "epoch": 0.7176187515207224, + "grad_norm": 26.0461483001709, + "learning_rate": 5.704286964129484e-06, + "loss": 1.4889, + "step": 1659 + }, + { + "epoch": 0.7180513125523804, + "grad_norm": 24.910232543945312, + "learning_rate": 5.695538057742782e-06, + "loss": 1.5006, + "step": 1660 + }, + { + "epoch": 0.7184838735840385, + "grad_norm": 25.941944122314453, + "learning_rate": 5.686789151356081e-06, + "loss": 1.4548, + "step": 1661 + }, + { + "epoch": 0.7189164346156965, + "grad_norm": 21.430110931396484, + "learning_rate": 5.67804024496938e-06, + "loss": 1.4481, + "step": 1662 + }, + { + "epoch": 0.7193489956473547, + "grad_norm": 27.890432357788086, + "learning_rate": 5.669291338582677e-06, + "loss": 1.4916, + "step": 1663 + }, + { + "epoch": 0.7197815566790127, + "grad_norm": 25.04297637939453, + "learning_rate": 5.6605424321959755e-06, + "loss": 1.4469, + "step": 1664 + }, + { + "epoch": 0.7202141177106708, + "grad_norm": 24.669153213500977, + "learning_rate": 5.651793525809274e-06, + "loss": 1.4939, + "step": 1665 + }, + { + "epoch": 0.7206466787423288, + "grad_norm": 27.076704025268555, + "learning_rate": 5.643044619422573e-06, + "loss": 1.569, + "step": 1666 + }, + { + "epoch": 0.7210792397739869, + "grad_norm": 22.596973419189453, + "learning_rate": 5.6342957130358714e-06, + "loss": 1.5492, + "step": 1667 + }, + { + "epoch": 0.7215118008056449, + "grad_norm": 25.775602340698242, + "learning_rate": 5.625546806649169e-06, + "loss": 1.6141, + "step": 1668 + }, + { + "epoch": 0.7219443618373029, + "grad_norm": 23.130680084228516, + "learning_rate": 5.616797900262467e-06, + "loss": 1.5137, + "step": 1669 + }, + { + "epoch": 0.722376922868961, + "grad_norm": 32.87105941772461, + "learning_rate": 5.6080489938757665e-06, + "loss": 1.5107, + "step": 1670 + }, + { + "epoch": 0.722809483900619, + "grad_norm": 22.43234634399414, + "learning_rate": 5.599300087489065e-06, + "loss": 1.5348, + "step": 1671 + }, + { + "epoch": 0.7232420449322772, + "grad_norm": 24.693084716796875, + "learning_rate": 5.590551181102362e-06, + "loss": 1.5212, + "step": 1672 + }, + { + "epoch": 0.7236746059639352, + "grad_norm": 27.20313835144043, + "learning_rate": 5.581802274715661e-06, + "loss": 1.5393, + "step": 1673 + }, + { + "epoch": 0.7241071669955933, + "grad_norm": 23.293119430541992, + "learning_rate": 5.573053368328959e-06, + "loss": 1.4915, + "step": 1674 + }, + { + "epoch": 0.7245397280272513, + "grad_norm": 26.227306365966797, + "learning_rate": 5.564304461942258e-06, + "loss": 1.513, + "step": 1675 + }, + { + "epoch": 0.7249722890589094, + "grad_norm": 24.33916473388672, + "learning_rate": 5.555555555555557e-06, + "loss": 1.4589, + "step": 1676 + }, + { + "epoch": 0.7254048500905674, + "grad_norm": 24.176958084106445, + "learning_rate": 5.546806649168854e-06, + "loss": 1.5841, + "step": 1677 + }, + { + "epoch": 0.7258374111222256, + "grad_norm": 22.859975814819336, + "learning_rate": 5.5380577427821525e-06, + "loss": 1.5555, + "step": 1678 + }, + { + "epoch": 0.7262699721538836, + "grad_norm": 24.679033279418945, + "learning_rate": 5.529308836395451e-06, + "loss": 1.544, + "step": 1679 + }, + { + "epoch": 0.7267025331855417, + "grad_norm": 23.848819732666016, + "learning_rate": 5.52055993000875e-06, + "loss": 1.4953, + "step": 1680 + }, + { + "epoch": 0.7271350942171997, + "grad_norm": 23.183744430541992, + "learning_rate": 5.511811023622048e-06, + "loss": 1.55, + "step": 1681 + }, + { + "epoch": 0.7275676552488578, + "grad_norm": 25.85765838623047, + "learning_rate": 5.503062117235346e-06, + "loss": 1.543, + "step": 1682 + }, + { + "epoch": 0.7280002162805158, + "grad_norm": 25.841501235961914, + "learning_rate": 5.494313210848644e-06, + "loss": 1.5411, + "step": 1683 + }, + { + "epoch": 0.7284327773121739, + "grad_norm": 25.476608276367188, + "learning_rate": 5.485564304461942e-06, + "loss": 1.502, + "step": 1684 + }, + { + "epoch": 0.7288653383438319, + "grad_norm": 24.201326370239258, + "learning_rate": 5.476815398075241e-06, + "loss": 1.5492, + "step": 1685 + }, + { + "epoch": 0.72929789937549, + "grad_norm": 26.180383682250977, + "learning_rate": 5.468066491688539e-06, + "loss": 1.5663, + "step": 1686 + }, + { + "epoch": 0.7297304604071481, + "grad_norm": 23.471723556518555, + "learning_rate": 5.459317585301838e-06, + "loss": 1.4523, + "step": 1687 + }, + { + "epoch": 0.7301630214388062, + "grad_norm": 28.88964080810547, + "learning_rate": 5.450568678915136e-06, + "loss": 1.5238, + "step": 1688 + }, + { + "epoch": 0.7305955824704642, + "grad_norm": 24.187883377075195, + "learning_rate": 5.441819772528434e-06, + "loss": 1.6256, + "step": 1689 + }, + { + "epoch": 0.7310281435021223, + "grad_norm": 23.76744270324707, + "learning_rate": 5.433070866141733e-06, + "loss": 1.5351, + "step": 1690 + }, + { + "epoch": 0.7314607045337803, + "grad_norm": 25.365325927734375, + "learning_rate": 5.424321959755031e-06, + "loss": 1.4795, + "step": 1691 + }, + { + "epoch": 0.7318932655654383, + "grad_norm": 27.68093490600586, + "learning_rate": 5.41557305336833e-06, + "loss": 1.6068, + "step": 1692 + }, + { + "epoch": 0.7323258265970964, + "grad_norm": 26.40992546081543, + "learning_rate": 5.406824146981627e-06, + "loss": 1.497, + "step": 1693 + }, + { + "epoch": 0.7327583876287544, + "grad_norm": 26.072214126586914, + "learning_rate": 5.398075240594926e-06, + "loss": 1.5387, + "step": 1694 + }, + { + "epoch": 0.7331909486604126, + "grad_norm": 22.766809463500977, + "learning_rate": 5.389326334208225e-06, + "loss": 1.6233, + "step": 1695 + }, + { + "epoch": 0.7336235096920706, + "grad_norm": 24.374536514282227, + "learning_rate": 5.380577427821523e-06, + "loss": 1.5207, + "step": 1696 + }, + { + "epoch": 0.7340560707237287, + "grad_norm": 26.02762794494629, + "learning_rate": 5.3718285214348205e-06, + "loss": 1.5178, + "step": 1697 + }, + { + "epoch": 0.7344886317553867, + "grad_norm": 24.251422882080078, + "learning_rate": 5.363079615048119e-06, + "loss": 1.5194, + "step": 1698 + }, + { + "epoch": 0.7349211927870448, + "grad_norm": 28.283960342407227, + "learning_rate": 5.354330708661418e-06, + "loss": 1.5407, + "step": 1699 + }, + { + "epoch": 0.7353537538187028, + "grad_norm": 24.403696060180664, + "learning_rate": 5.3455818022747165e-06, + "loss": 1.5313, + "step": 1700 + }, + { + "epoch": 0.735786314850361, + "grad_norm": 25.62765884399414, + "learning_rate": 5.336832895888015e-06, + "loss": 1.5413, + "step": 1701 + }, + { + "epoch": 0.736218875882019, + "grad_norm": 26.185192108154297, + "learning_rate": 5.328083989501312e-06, + "loss": 1.4735, + "step": 1702 + }, + { + "epoch": 0.7366514369136771, + "grad_norm": 22.629499435424805, + "learning_rate": 5.319335083114611e-06, + "loss": 1.4404, + "step": 1703 + }, + { + "epoch": 0.7370839979453351, + "grad_norm": 21.76143455505371, + "learning_rate": 5.31058617672791e-06, + "loss": 1.5389, + "step": 1704 + }, + { + "epoch": 0.7375165589769932, + "grad_norm": 24.350290298461914, + "learning_rate": 5.301837270341208e-06, + "loss": 1.5272, + "step": 1705 + }, + { + "epoch": 0.7379491200086512, + "grad_norm": 25.934980392456055, + "learning_rate": 5.293088363954506e-06, + "loss": 1.5434, + "step": 1706 + }, + { + "epoch": 0.7383816810403093, + "grad_norm": 26.150161743164062, + "learning_rate": 5.284339457567804e-06, + "loss": 1.4915, + "step": 1707 + }, + { + "epoch": 0.7388142420719673, + "grad_norm": 26.145160675048828, + "learning_rate": 5.2755905511811025e-06, + "loss": 1.4885, + "step": 1708 + }, + { + "epoch": 0.7392468031036254, + "grad_norm": 27.158536911010742, + "learning_rate": 5.266841644794402e-06, + "loss": 1.4621, + "step": 1709 + }, + { + "epoch": 0.7396793641352835, + "grad_norm": 27.394515991210938, + "learning_rate": 5.258092738407699e-06, + "loss": 1.5549, + "step": 1710 + }, + { + "epoch": 0.7401119251669416, + "grad_norm": 24.41299819946289, + "learning_rate": 5.2493438320209976e-06, + "loss": 1.4812, + "step": 1711 + }, + { + "epoch": 0.7405444861985996, + "grad_norm": 26.23542022705078, + "learning_rate": 5.240594925634296e-06, + "loss": 1.452, + "step": 1712 + }, + { + "epoch": 0.7409770472302576, + "grad_norm": 23.603107452392578, + "learning_rate": 5.231846019247594e-06, + "loss": 1.5045, + "step": 1713 + }, + { + "epoch": 0.7414096082619157, + "grad_norm": 25.918743133544922, + "learning_rate": 5.2230971128608935e-06, + "loss": 1.5866, + "step": 1714 + }, + { + "epoch": 0.7418421692935737, + "grad_norm": 23.324562072753906, + "learning_rate": 5.214348206474191e-06, + "loss": 1.4572, + "step": 1715 + }, + { + "epoch": 0.7422747303252318, + "grad_norm": 23.02033233642578, + "learning_rate": 5.205599300087489e-06, + "loss": 1.4901, + "step": 1716 + }, + { + "epoch": 0.7427072913568898, + "grad_norm": 25.430767059326172, + "learning_rate": 5.196850393700788e-06, + "loss": 1.4732, + "step": 1717 + }, + { + "epoch": 0.743139852388548, + "grad_norm": 25.643844604492188, + "learning_rate": 5.188101487314087e-06, + "loss": 1.5468, + "step": 1718 + }, + { + "epoch": 0.743572413420206, + "grad_norm": 22.182497024536133, + "learning_rate": 5.1793525809273845e-06, + "loss": 1.5145, + "step": 1719 + }, + { + "epoch": 0.7440049744518641, + "grad_norm": 25.15062713623047, + "learning_rate": 5.170603674540683e-06, + "loss": 1.5308, + "step": 1720 + }, + { + "epoch": 0.7444375354835221, + "grad_norm": 27.65828514099121, + "learning_rate": 5.161854768153981e-06, + "loss": 1.5002, + "step": 1721 + }, + { + "epoch": 0.7448700965151802, + "grad_norm": 23.610389709472656, + "learning_rate": 5.1531058617672795e-06, + "loss": 1.593, + "step": 1722 + }, + { + "epoch": 0.7453026575468382, + "grad_norm": 22.645122528076172, + "learning_rate": 5.144356955380579e-06, + "loss": 1.5042, + "step": 1723 + }, + { + "epoch": 0.7457352185784963, + "grad_norm": 26.383073806762695, + "learning_rate": 5.135608048993876e-06, + "loss": 1.5823, + "step": 1724 + }, + { + "epoch": 0.7461677796101543, + "grad_norm": 24.08680534362793, + "learning_rate": 5.126859142607175e-06, + "loss": 1.4827, + "step": 1725 + }, + { + "epoch": 0.7466003406418125, + "grad_norm": 28.29230308532715, + "learning_rate": 5.118110236220473e-06, + "loss": 1.5249, + "step": 1726 + }, + { + "epoch": 0.7470329016734705, + "grad_norm": 24.57362937927246, + "learning_rate": 5.1093613298337705e-06, + "loss": 1.4491, + "step": 1727 + }, + { + "epoch": 0.7474654627051286, + "grad_norm": 23.962614059448242, + "learning_rate": 5.10061242344707e-06, + "loss": 1.5894, + "step": 1728 + }, + { + "epoch": 0.7478980237367866, + "grad_norm": 23.7990779876709, + "learning_rate": 5.091863517060368e-06, + "loss": 1.5585, + "step": 1729 + }, + { + "epoch": 0.7483305847684447, + "grad_norm": 24.12856674194336, + "learning_rate": 5.083114610673666e-06, + "loss": 1.5773, + "step": 1730 + }, + { + "epoch": 0.7487631458001027, + "grad_norm": 23.70834732055664, + "learning_rate": 5.074365704286964e-06, + "loss": 1.4753, + "step": 1731 + }, + { + "epoch": 0.7491957068317608, + "grad_norm": 23.08877182006836, + "learning_rate": 5.065616797900262e-06, + "loss": 1.6544, + "step": 1732 + }, + { + "epoch": 0.7496282678634189, + "grad_norm": 26.548851013183594, + "learning_rate": 5.0568678915135615e-06, + "loss": 1.5633, + "step": 1733 + }, + { + "epoch": 0.750060828895077, + "grad_norm": 24.939672470092773, + "learning_rate": 5.04811898512686e-06, + "loss": 1.5656, + "step": 1734 + }, + { + "epoch": 0.750493389926735, + "grad_norm": 23.20163345336914, + "learning_rate": 5.039370078740158e-06, + "loss": 1.5713, + "step": 1735 + }, + { + "epoch": 0.750925950958393, + "grad_norm": 22.919689178466797, + "learning_rate": 5.030621172353456e-06, + "loss": 1.6136, + "step": 1736 + }, + { + "epoch": 0.7513585119900511, + "grad_norm": 24.276479721069336, + "learning_rate": 5.021872265966754e-06, + "loss": 1.5749, + "step": 1737 + }, + { + "epoch": 0.7517910730217091, + "grad_norm": 23.846363067626953, + "learning_rate": 5.013123359580053e-06, + "loss": 1.4735, + "step": 1738 + }, + { + "epoch": 0.7522236340533672, + "grad_norm": 25.617586135864258, + "learning_rate": 5.004374453193352e-06, + "loss": 1.5511, + "step": 1739 + }, + { + "epoch": 0.7526561950850252, + "grad_norm": 22.6473445892334, + "learning_rate": 4.995625546806649e-06, + "loss": 1.504, + "step": 1740 + }, + { + "epoch": 0.7530887561166834, + "grad_norm": 24.413610458374023, + "learning_rate": 4.986876640419948e-06, + "loss": 1.5588, + "step": 1741 + }, + { + "epoch": 0.7535213171483414, + "grad_norm": 24.66851806640625, + "learning_rate": 4.978127734033246e-06, + "loss": 1.5151, + "step": 1742 + }, + { + "epoch": 0.7539538781799995, + "grad_norm": 23.771665573120117, + "learning_rate": 4.969378827646544e-06, + "loss": 1.4847, + "step": 1743 + }, + { + "epoch": 0.7543864392116575, + "grad_norm": 24.447561264038086, + "learning_rate": 4.960629921259843e-06, + "loss": 1.55, + "step": 1744 + }, + { + "epoch": 0.7548190002433156, + "grad_norm": 22.75802230834961, + "learning_rate": 4.951881014873141e-06, + "loss": 1.5186, + "step": 1745 + }, + { + "epoch": 0.7552515612749736, + "grad_norm": 26.083370208740234, + "learning_rate": 4.94313210848644e-06, + "loss": 1.4842, + "step": 1746 + }, + { + "epoch": 0.7556841223066317, + "grad_norm": 29.3764705657959, + "learning_rate": 4.934383202099738e-06, + "loss": 1.4959, + "step": 1747 + }, + { + "epoch": 0.7561166833382897, + "grad_norm": 21.326786041259766, + "learning_rate": 4.925634295713037e-06, + "loss": 1.5026, + "step": 1748 + }, + { + "epoch": 0.7565492443699479, + "grad_norm": 23.675289154052734, + "learning_rate": 4.916885389326334e-06, + "loss": 1.5059, + "step": 1749 + }, + { + "epoch": 0.7569818054016059, + "grad_norm": 25.60411834716797, + "learning_rate": 4.908136482939633e-06, + "loss": 1.5985, + "step": 1750 + }, + { + "epoch": 0.757414366433264, + "grad_norm": 24.007932662963867, + "learning_rate": 4.899387576552931e-06, + "loss": 1.4974, + "step": 1751 + }, + { + "epoch": 0.757846927464922, + "grad_norm": 28.738752365112305, + "learning_rate": 4.8906386701662295e-06, + "loss": 1.4758, + "step": 1752 + }, + { + "epoch": 0.7582794884965801, + "grad_norm": 24.201730728149414, + "learning_rate": 4.881889763779528e-06, + "loss": 1.5549, + "step": 1753 + }, + { + "epoch": 0.7587120495282381, + "grad_norm": 24.409114837646484, + "learning_rate": 4.873140857392826e-06, + "loss": 1.5193, + "step": 1754 + }, + { + "epoch": 0.7591446105598962, + "grad_norm": 22.474170684814453, + "learning_rate": 4.8643919510061246e-06, + "loss": 1.6365, + "step": 1755 + }, + { + "epoch": 0.7595771715915542, + "grad_norm": 22.524370193481445, + "learning_rate": 4.855643044619423e-06, + "loss": 1.6074, + "step": 1756 + }, + { + "epoch": 0.7600097326232123, + "grad_norm": 26.538591384887695, + "learning_rate": 4.846894138232721e-06, + "loss": 1.5673, + "step": 1757 + }, + { + "epoch": 0.7604422936548704, + "grad_norm": 25.404539108276367, + "learning_rate": 4.83814523184602e-06, + "loss": 1.5393, + "step": 1758 + }, + { + "epoch": 0.7608748546865284, + "grad_norm": 23.63199806213379, + "learning_rate": 4.829396325459318e-06, + "loss": 1.5652, + "step": 1759 + }, + { + "epoch": 0.7613074157181865, + "grad_norm": 26.102270126342773, + "learning_rate": 4.820647419072616e-06, + "loss": 1.5968, + "step": 1760 + }, + { + "epoch": 0.7617399767498445, + "grad_norm": 27.475393295288086, + "learning_rate": 4.811898512685915e-06, + "loss": 1.5657, + "step": 1761 + }, + { + "epoch": 0.7621725377815026, + "grad_norm": 24.71074676513672, + "learning_rate": 4.803149606299213e-06, + "loss": 1.5442, + "step": 1762 + }, + { + "epoch": 0.7626050988131606, + "grad_norm": 25.49890899658203, + "learning_rate": 4.7944006999125114e-06, + "loss": 1.4888, + "step": 1763 + }, + { + "epoch": 0.7630376598448187, + "grad_norm": 23.629859924316406, + "learning_rate": 4.78565179352581e-06, + "loss": 1.5163, + "step": 1764 + }, + { + "epoch": 0.7634702208764768, + "grad_norm": 24.619083404541016, + "learning_rate": 4.776902887139108e-06, + "loss": 1.5097, + "step": 1765 + }, + { + "epoch": 0.7639027819081349, + "grad_norm": 21.8817138671875, + "learning_rate": 4.7681539807524065e-06, + "loss": 1.5555, + "step": 1766 + }, + { + "epoch": 0.7643353429397929, + "grad_norm": 23.76637840270996, + "learning_rate": 4.759405074365704e-06, + "loss": 1.4962, + "step": 1767 + }, + { + "epoch": 0.764767903971451, + "grad_norm": 24.75080108642578, + "learning_rate": 4.750656167979003e-06, + "loss": 1.5414, + "step": 1768 + }, + { + "epoch": 0.765200465003109, + "grad_norm": 24.92739486694336, + "learning_rate": 4.741907261592302e-06, + "loss": 1.5096, + "step": 1769 + }, + { + "epoch": 0.7656330260347671, + "grad_norm": 24.84958839416504, + "learning_rate": 4.7331583552056e-06, + "loss": 1.5318, + "step": 1770 + }, + { + "epoch": 0.7660655870664251, + "grad_norm": 24.568798065185547, + "learning_rate": 4.724409448818898e-06, + "loss": 1.5618, + "step": 1771 + }, + { + "epoch": 0.7664981480980833, + "grad_norm": 23.898578643798828, + "learning_rate": 4.715660542432197e-06, + "loss": 1.58, + "step": 1772 + }, + { + "epoch": 0.7669307091297413, + "grad_norm": 24.494558334350586, + "learning_rate": 4.706911636045495e-06, + "loss": 1.4997, + "step": 1773 + }, + { + "epoch": 0.7673632701613994, + "grad_norm": 21.843421936035156, + "learning_rate": 4.6981627296587926e-06, + "loss": 1.5825, + "step": 1774 + }, + { + "epoch": 0.7677958311930574, + "grad_norm": 25.82838249206543, + "learning_rate": 4.689413823272092e-06, + "loss": 1.5319, + "step": 1775 + }, + { + "epoch": 0.7682283922247155, + "grad_norm": 24.003957748413086, + "learning_rate": 4.680664916885389e-06, + "loss": 1.4995, + "step": 1776 + }, + { + "epoch": 0.7686609532563735, + "grad_norm": 24.655353546142578, + "learning_rate": 4.6719160104986885e-06, + "loss": 1.456, + "step": 1777 + }, + { + "epoch": 0.7690935142880316, + "grad_norm": 24.714584350585938, + "learning_rate": 4.663167104111986e-06, + "loss": 1.4928, + "step": 1778 + }, + { + "epoch": 0.7695260753196896, + "grad_norm": 26.724472045898438, + "learning_rate": 4.654418197725284e-06, + "loss": 1.5625, + "step": 1779 + }, + { + "epoch": 0.7699586363513476, + "grad_norm": 24.749422073364258, + "learning_rate": 4.645669291338583e-06, + "loss": 1.5352, + "step": 1780 + }, + { + "epoch": 0.7703911973830058, + "grad_norm": 25.769168853759766, + "learning_rate": 4.636920384951881e-06, + "loss": 1.525, + "step": 1781 + }, + { + "epoch": 0.7708237584146638, + "grad_norm": 27.238039016723633, + "learning_rate": 4.62817147856518e-06, + "loss": 1.4711, + "step": 1782 + }, + { + "epoch": 0.7712563194463219, + "grad_norm": 28.644994735717773, + "learning_rate": 4.619422572178478e-06, + "loss": 1.5006, + "step": 1783 + }, + { + "epoch": 0.7716888804779799, + "grad_norm": 27.155855178833008, + "learning_rate": 4.610673665791777e-06, + "loss": 1.5224, + "step": 1784 + }, + { + "epoch": 0.772121441509638, + "grad_norm": 24.1883487701416, + "learning_rate": 4.6019247594050745e-06, + "loss": 1.5639, + "step": 1785 + }, + { + "epoch": 0.772554002541296, + "grad_norm": 23.588848114013672, + "learning_rate": 4.593175853018373e-06, + "loss": 1.4729, + "step": 1786 + }, + { + "epoch": 0.7729865635729541, + "grad_norm": 23.275426864624023, + "learning_rate": 4.584426946631671e-06, + "loss": 1.5298, + "step": 1787 + }, + { + "epoch": 0.7734191246046122, + "grad_norm": 24.45703887939453, + "learning_rate": 4.57567804024497e-06, + "loss": 1.5589, + "step": 1788 + }, + { + "epoch": 0.7738516856362703, + "grad_norm": 25.52694320678711, + "learning_rate": 4.566929133858268e-06, + "loss": 1.4867, + "step": 1789 + }, + { + "epoch": 0.7742842466679283, + "grad_norm": 24.09499740600586, + "learning_rate": 4.558180227471566e-06, + "loss": 1.5231, + "step": 1790 + }, + { + "epoch": 0.7747168076995864, + "grad_norm": 24.523792266845703, + "learning_rate": 4.549431321084865e-06, + "loss": 1.5179, + "step": 1791 + }, + { + "epoch": 0.7751493687312444, + "grad_norm": 23.561460494995117, + "learning_rate": 4.540682414698163e-06, + "loss": 1.5205, + "step": 1792 + }, + { + "epoch": 0.7755819297629025, + "grad_norm": 26.378007888793945, + "learning_rate": 4.531933508311461e-06, + "loss": 1.5417, + "step": 1793 + }, + { + "epoch": 0.7760144907945605, + "grad_norm": 23.561283111572266, + "learning_rate": 4.52318460192476e-06, + "loss": 1.589, + "step": 1794 + }, + { + "epoch": 0.7764470518262186, + "grad_norm": 27.071557998657227, + "learning_rate": 4.514435695538058e-06, + "loss": 1.4608, + "step": 1795 + }, + { + "epoch": 0.7768796128578767, + "grad_norm": 26.57014274597168, + "learning_rate": 4.5056867891513565e-06, + "loss": 1.4033, + "step": 1796 + }, + { + "epoch": 0.7773121738895348, + "grad_norm": 24.375045776367188, + "learning_rate": 4.496937882764655e-06, + "loss": 1.5522, + "step": 1797 + }, + { + "epoch": 0.7777447349211928, + "grad_norm": 25.525135040283203, + "learning_rate": 4.488188976377953e-06, + "loss": 1.6045, + "step": 1798 + }, + { + "epoch": 0.7781772959528509, + "grad_norm": 28.482580184936523, + "learning_rate": 4.4794400699912516e-06, + "loss": 1.5458, + "step": 1799 + }, + { + "epoch": 0.7786098569845089, + "grad_norm": 25.094606399536133, + "learning_rate": 4.47069116360455e-06, + "loss": 1.4793, + "step": 1800 + }, + { + "epoch": 0.7790424180161669, + "grad_norm": 26.066743850708008, + "learning_rate": 4.461942257217848e-06, + "loss": 1.4703, + "step": 1801 + }, + { + "epoch": 0.779474979047825, + "grad_norm": 25.871166229248047, + "learning_rate": 4.453193350831147e-06, + "loss": 1.5241, + "step": 1802 + }, + { + "epoch": 0.779907540079483, + "grad_norm": 24.948637008666992, + "learning_rate": 4.444444444444444e-06, + "loss": 1.517, + "step": 1803 + }, + { + "epoch": 0.7803401011111412, + "grad_norm": 25.19872283935547, + "learning_rate": 4.435695538057743e-06, + "loss": 1.552, + "step": 1804 + }, + { + "epoch": 0.7807726621427992, + "grad_norm": 24.39336395263672, + "learning_rate": 4.426946631671042e-06, + "loss": 1.3931, + "step": 1805 + }, + { + "epoch": 0.7812052231744573, + "grad_norm": 24.73307228088379, + "learning_rate": 4.41819772528434e-06, + "loss": 1.5159, + "step": 1806 + }, + { + "epoch": 0.7816377842061153, + "grad_norm": 24.064722061157227, + "learning_rate": 4.4094488188976384e-06, + "loss": 1.5075, + "step": 1807 + }, + { + "epoch": 0.7820703452377734, + "grad_norm": 24.808164596557617, + "learning_rate": 4.400699912510937e-06, + "loss": 1.4825, + "step": 1808 + }, + { + "epoch": 0.7825029062694314, + "grad_norm": 27.774396896362305, + "learning_rate": 4.391951006124235e-06, + "loss": 1.5035, + "step": 1809 + }, + { + "epoch": 0.7829354673010895, + "grad_norm": 27.53242301940918, + "learning_rate": 4.383202099737533e-06, + "loss": 1.5277, + "step": 1810 + }, + { + "epoch": 0.7833680283327475, + "grad_norm": 24.68132781982422, + "learning_rate": 4.374453193350832e-06, + "loss": 1.4304, + "step": 1811 + }, + { + "epoch": 0.7838005893644057, + "grad_norm": 26.517210006713867, + "learning_rate": 4.365704286964129e-06, + "loss": 1.5441, + "step": 1812 + }, + { + "epoch": 0.7842331503960637, + "grad_norm": 25.836271286010742, + "learning_rate": 4.356955380577429e-06, + "loss": 1.46, + "step": 1813 + }, + { + "epoch": 0.7846657114277218, + "grad_norm": 22.400842666625977, + "learning_rate": 4.348206474190726e-06, + "loss": 1.5226, + "step": 1814 + }, + { + "epoch": 0.7850982724593798, + "grad_norm": 23.194393157958984, + "learning_rate": 4.3394575678040245e-06, + "loss": 1.5745, + "step": 1815 + }, + { + "epoch": 0.7855308334910379, + "grad_norm": 23.99626922607422, + "learning_rate": 4.330708661417324e-06, + "loss": 1.4476, + "step": 1816 + }, + { + "epoch": 0.7859633945226959, + "grad_norm": 25.947378158569336, + "learning_rate": 4.321959755030621e-06, + "loss": 1.5264, + "step": 1817 + }, + { + "epoch": 0.786395955554354, + "grad_norm": 26.823701858520508, + "learning_rate": 4.31321084864392e-06, + "loss": 1.6003, + "step": 1818 + }, + { + "epoch": 0.786828516586012, + "grad_norm": 26.935449600219727, + "learning_rate": 4.304461942257218e-06, + "loss": 1.5052, + "step": 1819 + }, + { + "epoch": 0.7872610776176702, + "grad_norm": 24.033824920654297, + "learning_rate": 4.295713035870517e-06, + "loss": 1.5205, + "step": 1820 + }, + { + "epoch": 0.7876936386493282, + "grad_norm": 21.987449645996094, + "learning_rate": 4.286964129483815e-06, + "loss": 1.461, + "step": 1821 + }, + { + "epoch": 0.7881261996809863, + "grad_norm": 28.15187644958496, + "learning_rate": 4.278215223097113e-06, + "loss": 1.4807, + "step": 1822 + }, + { + "epoch": 0.7885587607126443, + "grad_norm": 23.641508102416992, + "learning_rate": 4.269466316710411e-06, + "loss": 1.5484, + "step": 1823 + }, + { + "epoch": 0.7889913217443023, + "grad_norm": 24.58562469482422, + "learning_rate": 4.26071741032371e-06, + "loss": 1.4434, + "step": 1824 + }, + { + "epoch": 0.7894238827759604, + "grad_norm": 23.53512954711914, + "learning_rate": 4.251968503937008e-06, + "loss": 1.5811, + "step": 1825 + }, + { + "epoch": 0.7898564438076184, + "grad_norm": 23.88713264465332, + "learning_rate": 4.2432195975503064e-06, + "loss": 1.5309, + "step": 1826 + }, + { + "epoch": 0.7902890048392766, + "grad_norm": 25.259674072265625, + "learning_rate": 4.234470691163605e-06, + "loss": 1.6166, + "step": 1827 + }, + { + "epoch": 0.7907215658709346, + "grad_norm": 22.053977966308594, + "learning_rate": 4.225721784776903e-06, + "loss": 1.5886, + "step": 1828 + }, + { + "epoch": 0.7911541269025927, + "grad_norm": 23.540071487426758, + "learning_rate": 4.2169728783902015e-06, + "loss": 1.5388, + "step": 1829 + }, + { + "epoch": 0.7915866879342507, + "grad_norm": 23.088363647460938, + "learning_rate": 4.2082239720035e-06, + "loss": 1.5588, + "step": 1830 + }, + { + "epoch": 0.7920192489659088, + "grad_norm": 24.821250915527344, + "learning_rate": 4.199475065616798e-06, + "loss": 1.5518, + "step": 1831 + }, + { + "epoch": 0.7924518099975668, + "grad_norm": 25.494277954101562, + "learning_rate": 4.190726159230097e-06, + "loss": 1.5499, + "step": 1832 + }, + { + "epoch": 0.7928843710292249, + "grad_norm": 25.74853515625, + "learning_rate": 4.181977252843395e-06, + "loss": 1.5049, + "step": 1833 + }, + { + "epoch": 0.7933169320608829, + "grad_norm": 24.494779586791992, + "learning_rate": 4.173228346456693e-06, + "loss": 1.4597, + "step": 1834 + }, + { + "epoch": 0.7937494930925411, + "grad_norm": 23.52466583251953, + "learning_rate": 4.164479440069992e-06, + "loss": 1.4917, + "step": 1835 + }, + { + "epoch": 0.7941820541241991, + "grad_norm": 24.15924072265625, + "learning_rate": 4.15573053368329e-06, + "loss": 1.5182, + "step": 1836 + }, + { + "epoch": 0.7946146151558572, + "grad_norm": 24.49439239501953, + "learning_rate": 4.146981627296588e-06, + "loss": 1.5663, + "step": 1837 + }, + { + "epoch": 0.7950471761875152, + "grad_norm": 25.374263763427734, + "learning_rate": 4.138232720909887e-06, + "loss": 1.4795, + "step": 1838 + }, + { + "epoch": 0.7954797372191733, + "grad_norm": 25.588665008544922, + "learning_rate": 4.129483814523185e-06, + "loss": 1.531, + "step": 1839 + }, + { + "epoch": 0.7959122982508313, + "grad_norm": 26.306419372558594, + "learning_rate": 4.1207349081364835e-06, + "loss": 1.5243, + "step": 1840 + }, + { + "epoch": 0.7963448592824894, + "grad_norm": 24.638086318969727, + "learning_rate": 4.111986001749782e-06, + "loss": 1.4911, + "step": 1841 + }, + { + "epoch": 0.7967774203141474, + "grad_norm": 25.27682113647461, + "learning_rate": 4.10323709536308e-06, + "loss": 1.4854, + "step": 1842 + }, + { + "epoch": 0.7972099813458056, + "grad_norm": 28.576251983642578, + "learning_rate": 4.0944881889763785e-06, + "loss": 1.4369, + "step": 1843 + }, + { + "epoch": 0.7976425423774636, + "grad_norm": 26.261287689208984, + "learning_rate": 4.085739282589677e-06, + "loss": 1.5401, + "step": 1844 + }, + { + "epoch": 0.7980751034091216, + "grad_norm": 24.996610641479492, + "learning_rate": 4.076990376202975e-06, + "loss": 1.5306, + "step": 1845 + }, + { + "epoch": 0.7985076644407797, + "grad_norm": 23.960554122924805, + "learning_rate": 4.068241469816273e-06, + "loss": 1.5065, + "step": 1846 + }, + { + "epoch": 0.7989402254724377, + "grad_norm": 31.743310928344727, + "learning_rate": 4.059492563429572e-06, + "loss": 1.59, + "step": 1847 + }, + { + "epoch": 0.7993727865040958, + "grad_norm": 23.445873260498047, + "learning_rate": 4.0507436570428695e-06, + "loss": 1.5049, + "step": 1848 + }, + { + "epoch": 0.7998053475357538, + "grad_norm": 24.140777587890625, + "learning_rate": 4.041994750656169e-06, + "loss": 1.5721, + "step": 1849 + }, + { + "epoch": 0.800237908567412, + "grad_norm": 23.796100616455078, + "learning_rate": 4.033245844269466e-06, + "loss": 1.5244, + "step": 1850 + }, + { + "epoch": 0.80067046959907, + "grad_norm": 29.071531295776367, + "learning_rate": 4.024496937882765e-06, + "loss": 1.5071, + "step": 1851 + }, + { + "epoch": 0.8011030306307281, + "grad_norm": 24.83522605895996, + "learning_rate": 4.015748031496064e-06, + "loss": 1.5093, + "step": 1852 + }, + { + "epoch": 0.8015355916623861, + "grad_norm": 25.007980346679688, + "learning_rate": 4.006999125109361e-06, + "loss": 1.51, + "step": 1853 + }, + { + "epoch": 0.8019681526940442, + "grad_norm": 23.728660583496094, + "learning_rate": 3.9982502187226605e-06, + "loss": 1.475, + "step": 1854 + }, + { + "epoch": 0.8024007137257022, + "grad_norm": 26.348461151123047, + "learning_rate": 3.989501312335958e-06, + "loss": 1.5531, + "step": 1855 + }, + { + "epoch": 0.8028332747573603, + "grad_norm": 25.381500244140625, + "learning_rate": 3.980752405949257e-06, + "loss": 1.4599, + "step": 1856 + }, + { + "epoch": 0.8032658357890183, + "grad_norm": 24.115501403808594, + "learning_rate": 3.972003499562555e-06, + "loss": 1.5345, + "step": 1857 + }, + { + "epoch": 0.8036983968206765, + "grad_norm": 24.597143173217773, + "learning_rate": 3.963254593175853e-06, + "loss": 1.4688, + "step": 1858 + }, + { + "epoch": 0.8041309578523345, + "grad_norm": 24.78520965576172, + "learning_rate": 3.9545056867891515e-06, + "loss": 1.4823, + "step": 1859 + }, + { + "epoch": 0.8045635188839926, + "grad_norm": 23.59719467163086, + "learning_rate": 3.94575678040245e-06, + "loss": 1.4657, + "step": 1860 + }, + { + "epoch": 0.8049960799156506, + "grad_norm": 26.260265350341797, + "learning_rate": 3.937007874015748e-06, + "loss": 1.5238, + "step": 1861 + }, + { + "epoch": 0.8054286409473087, + "grad_norm": 26.81787872314453, + "learning_rate": 3.9282589676290465e-06, + "loss": 1.5246, + "step": 1862 + }, + { + "epoch": 0.8058612019789667, + "grad_norm": 24.150634765625, + "learning_rate": 3.919510061242345e-06, + "loss": 1.4304, + "step": 1863 + }, + { + "epoch": 0.8062937630106248, + "grad_norm": 25.50913429260254, + "learning_rate": 3.910761154855643e-06, + "loss": 1.482, + "step": 1864 + }, + { + "epoch": 0.8067263240422828, + "grad_norm": 27.578847885131836, + "learning_rate": 3.902012248468942e-06, + "loss": 1.4806, + "step": 1865 + }, + { + "epoch": 0.8071588850739408, + "grad_norm": 28.148900985717773, + "learning_rate": 3.89326334208224e-06, + "loss": 1.5452, + "step": 1866 + }, + { + "epoch": 0.807591446105599, + "grad_norm": 25.29563331604004, + "learning_rate": 3.884514435695538e-06, + "loss": 1.4855, + "step": 1867 + }, + { + "epoch": 0.808024007137257, + "grad_norm": 26.5310001373291, + "learning_rate": 3.875765529308837e-06, + "loss": 1.4845, + "step": 1868 + }, + { + "epoch": 0.8084565681689151, + "grad_norm": 24.8538818359375, + "learning_rate": 3.867016622922135e-06, + "loss": 1.5439, + "step": 1869 + }, + { + "epoch": 0.8088891292005731, + "grad_norm": 24.38702964782715, + "learning_rate": 3.858267716535433e-06, + "loss": 1.524, + "step": 1870 + }, + { + "epoch": 0.8093216902322312, + "grad_norm": 25.538375854492188, + "learning_rate": 3.849518810148732e-06, + "loss": 1.5586, + "step": 1871 + }, + { + "epoch": 0.8097542512638892, + "grad_norm": 24.558469772338867, + "learning_rate": 3.84076990376203e-06, + "loss": 1.5687, + "step": 1872 + }, + { + "epoch": 0.8101868122955473, + "grad_norm": 24.07853126525879, + "learning_rate": 3.8320209973753285e-06, + "loss": 1.5702, + "step": 1873 + }, + { + "epoch": 0.8106193733272054, + "grad_norm": 23.100337982177734, + "learning_rate": 3.823272090988627e-06, + "loss": 1.5732, + "step": 1874 + }, + { + "epoch": 0.8110519343588635, + "grad_norm": 24.077980041503906, + "learning_rate": 3.814523184601925e-06, + "loss": 1.4838, + "step": 1875 + }, + { + "epoch": 0.8114844953905215, + "grad_norm": 24.32474708557129, + "learning_rate": 3.8057742782152236e-06, + "loss": 1.5323, + "step": 1876 + }, + { + "epoch": 0.8119170564221796, + "grad_norm": 26.63454246520996, + "learning_rate": 3.7970253718285215e-06, + "loss": 1.581, + "step": 1877 + }, + { + "epoch": 0.8123496174538376, + "grad_norm": 26.129480361938477, + "learning_rate": 3.7882764654418203e-06, + "loss": 1.5964, + "step": 1878 + }, + { + "epoch": 0.8127821784854957, + "grad_norm": 25.31804847717285, + "learning_rate": 3.7795275590551182e-06, + "loss": 1.5183, + "step": 1879 + }, + { + "epoch": 0.8132147395171537, + "grad_norm": 25.525976181030273, + "learning_rate": 3.770778652668417e-06, + "loss": 1.5983, + "step": 1880 + }, + { + "epoch": 0.8136473005488118, + "grad_norm": 23.734786987304688, + "learning_rate": 3.762029746281715e-06, + "loss": 1.555, + "step": 1881 + }, + { + "epoch": 0.8140798615804699, + "grad_norm": 23.824871063232422, + "learning_rate": 3.7532808398950133e-06, + "loss": 1.5189, + "step": 1882 + }, + { + "epoch": 0.814512422612128, + "grad_norm": 26.2841854095459, + "learning_rate": 3.744531933508312e-06, + "loss": 1.543, + "step": 1883 + }, + { + "epoch": 0.814944983643786, + "grad_norm": 27.476755142211914, + "learning_rate": 3.73578302712161e-06, + "loss": 1.5171, + "step": 1884 + }, + { + "epoch": 0.8153775446754441, + "grad_norm": 24.633068084716797, + "learning_rate": 3.727034120734909e-06, + "loss": 1.5069, + "step": 1885 + }, + { + "epoch": 0.8158101057071021, + "grad_norm": 26.17974281311035, + "learning_rate": 3.7182852143482068e-06, + "loss": 1.5122, + "step": 1886 + }, + { + "epoch": 0.8162426667387602, + "grad_norm": 24.839109420776367, + "learning_rate": 3.7095363079615047e-06, + "loss": 1.5153, + "step": 1887 + }, + { + "epoch": 0.8166752277704182, + "grad_norm": 21.22071647644043, + "learning_rate": 3.7007874015748035e-06, + "loss": 1.5247, + "step": 1888 + }, + { + "epoch": 0.8171077888020762, + "grad_norm": 25.769725799560547, + "learning_rate": 3.692038495188102e-06, + "loss": 1.4744, + "step": 1889 + }, + { + "epoch": 0.8175403498337344, + "grad_norm": 24.709821701049805, + "learning_rate": 3.6832895888014e-06, + "loss": 1.5333, + "step": 1890 + }, + { + "epoch": 0.8179729108653924, + "grad_norm": 25.408071517944336, + "learning_rate": 3.6745406824146986e-06, + "loss": 1.5651, + "step": 1891 + }, + { + "epoch": 0.8184054718970505, + "grad_norm": 24.688087463378906, + "learning_rate": 3.665791776027997e-06, + "loss": 1.4822, + "step": 1892 + }, + { + "epoch": 0.8188380329287085, + "grad_norm": 24.41063117980957, + "learning_rate": 3.6570428696412953e-06, + "loss": 1.5928, + "step": 1893 + }, + { + "epoch": 0.8192705939603666, + "grad_norm": 25.023773193359375, + "learning_rate": 3.6482939632545932e-06, + "loss": 1.5182, + "step": 1894 + }, + { + "epoch": 0.8197031549920246, + "grad_norm": 26.05735969543457, + "learning_rate": 3.639545056867892e-06, + "loss": 1.5007, + "step": 1895 + }, + { + "epoch": 0.8201357160236827, + "grad_norm": 27.816280364990234, + "learning_rate": 3.63079615048119e-06, + "loss": 1.4621, + "step": 1896 + }, + { + "epoch": 0.8205682770553407, + "grad_norm": 27.703954696655273, + "learning_rate": 3.6220472440944887e-06, + "loss": 1.4638, + "step": 1897 + }, + { + "epoch": 0.8210008380869989, + "grad_norm": 26.362079620361328, + "learning_rate": 3.6132983377077867e-06, + "loss": 1.5282, + "step": 1898 + }, + { + "epoch": 0.8214333991186569, + "grad_norm": 25.99664306640625, + "learning_rate": 3.604549431321085e-06, + "loss": 1.4904, + "step": 1899 + }, + { + "epoch": 0.821865960150315, + "grad_norm": 24.36591911315918, + "learning_rate": 3.595800524934384e-06, + "loss": 1.555, + "step": 1900 + }, + { + "epoch": 0.822298521181973, + "grad_norm": 25.207164764404297, + "learning_rate": 3.5870516185476817e-06, + "loss": 1.5043, + "step": 1901 + }, + { + "epoch": 0.8227310822136311, + "grad_norm": 27.927532196044922, + "learning_rate": 3.5783027121609805e-06, + "loss": 1.5988, + "step": 1902 + }, + { + "epoch": 0.8231636432452891, + "grad_norm": 23.66588020324707, + "learning_rate": 3.5695538057742785e-06, + "loss": 1.5167, + "step": 1903 + }, + { + "epoch": 0.8235962042769472, + "grad_norm": 25.874834060668945, + "learning_rate": 3.5608048993875772e-06, + "loss": 1.4365, + "step": 1904 + }, + { + "epoch": 0.8240287653086052, + "grad_norm": 23.785438537597656, + "learning_rate": 3.552055993000875e-06, + "loss": 1.5468, + "step": 1905 + }, + { + "epoch": 0.8244613263402634, + "grad_norm": 27.696264266967773, + "learning_rate": 3.5433070866141735e-06, + "loss": 1.5845, + "step": 1906 + }, + { + "epoch": 0.8248938873719214, + "grad_norm": 24.913143157958984, + "learning_rate": 3.534558180227472e-06, + "loss": 1.6031, + "step": 1907 + }, + { + "epoch": 0.8253264484035795, + "grad_norm": 25.623226165771484, + "learning_rate": 3.5258092738407703e-06, + "loss": 1.4782, + "step": 1908 + }, + { + "epoch": 0.8257590094352375, + "grad_norm": 26.468833923339844, + "learning_rate": 3.5170603674540686e-06, + "loss": 1.5557, + "step": 1909 + }, + { + "epoch": 0.8261915704668955, + "grad_norm": 25.889429092407227, + "learning_rate": 3.508311461067367e-06, + "loss": 1.4884, + "step": 1910 + }, + { + "epoch": 0.8266241314985536, + "grad_norm": 23.6556453704834, + "learning_rate": 3.499562554680665e-06, + "loss": 1.5173, + "step": 1911 + }, + { + "epoch": 0.8270566925302116, + "grad_norm": 26.38465690612793, + "learning_rate": 3.4908136482939637e-06, + "loss": 1.4916, + "step": 1912 + }, + { + "epoch": 0.8274892535618698, + "grad_norm": 22.687150955200195, + "learning_rate": 3.4820647419072616e-06, + "loss": 1.4722, + "step": 1913 + }, + { + "epoch": 0.8279218145935278, + "grad_norm": 23.410667419433594, + "learning_rate": 3.4733158355205604e-06, + "loss": 1.4913, + "step": 1914 + }, + { + "epoch": 0.8283543756251859, + "grad_norm": 28.485891342163086, + "learning_rate": 3.4645669291338583e-06, + "loss": 1.4458, + "step": 1915 + }, + { + "epoch": 0.8287869366568439, + "grad_norm": 25.982906341552734, + "learning_rate": 3.455818022747157e-06, + "loss": 1.4411, + "step": 1916 + }, + { + "epoch": 0.829219497688502, + "grad_norm": 25.231077194213867, + "learning_rate": 3.447069116360455e-06, + "loss": 1.4979, + "step": 1917 + }, + { + "epoch": 0.82965205872016, + "grad_norm": 23.16214370727539, + "learning_rate": 3.4383202099737534e-06, + "loss": 1.4828, + "step": 1918 + }, + { + "epoch": 0.8300846197518181, + "grad_norm": 26.24925994873047, + "learning_rate": 3.429571303587052e-06, + "loss": 1.5102, + "step": 1919 + }, + { + "epoch": 0.8305171807834761, + "grad_norm": 21.732011795043945, + "learning_rate": 3.42082239720035e-06, + "loss": 1.5264, + "step": 1920 + }, + { + "epoch": 0.8309497418151343, + "grad_norm": 26.923465728759766, + "learning_rate": 3.412073490813649e-06, + "loss": 1.4386, + "step": 1921 + }, + { + "epoch": 0.8313823028467923, + "grad_norm": 26.28173828125, + "learning_rate": 3.403324584426947e-06, + "loss": 1.5147, + "step": 1922 + }, + { + "epoch": 0.8318148638784504, + "grad_norm": 27.070018768310547, + "learning_rate": 3.394575678040245e-06, + "loss": 1.4853, + "step": 1923 + }, + { + "epoch": 0.8322474249101084, + "grad_norm": 23.679351806640625, + "learning_rate": 3.3858267716535436e-06, + "loss": 1.5418, + "step": 1924 + }, + { + "epoch": 0.8326799859417665, + "grad_norm": 27.61638832092285, + "learning_rate": 3.377077865266842e-06, + "loss": 1.5201, + "step": 1925 + }, + { + "epoch": 0.8331125469734245, + "grad_norm": 26.128068923950195, + "learning_rate": 3.3683289588801403e-06, + "loss": 1.5359, + "step": 1926 + }, + { + "epoch": 0.8335451080050826, + "grad_norm": 24.252161026000977, + "learning_rate": 3.3595800524934387e-06, + "loss": 1.4833, + "step": 1927 + }, + { + "epoch": 0.8339776690367406, + "grad_norm": 26.1506404876709, + "learning_rate": 3.350831146106737e-06, + "loss": 1.5333, + "step": 1928 + }, + { + "epoch": 0.8344102300683988, + "grad_norm": 25.713539123535156, + "learning_rate": 3.3420822397200354e-06, + "loss": 1.5724, + "step": 1929 + }, + { + "epoch": 0.8348427911000568, + "grad_norm": 24.772716522216797, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.5876, + "step": 1930 + }, + { + "epoch": 0.8352753521317149, + "grad_norm": 26.96317481994629, + "learning_rate": 3.324584426946632e-06, + "loss": 1.6016, + "step": 1931 + }, + { + "epoch": 0.8357079131633729, + "grad_norm": 25.352603912353516, + "learning_rate": 3.31583552055993e-06, + "loss": 1.4939, + "step": 1932 + }, + { + "epoch": 0.8361404741950309, + "grad_norm": 24.081235885620117, + "learning_rate": 3.307086614173229e-06, + "loss": 1.4755, + "step": 1933 + }, + { + "epoch": 0.836573035226689, + "grad_norm": 26.306793212890625, + "learning_rate": 3.2983377077865268e-06, + "loss": 1.5863, + "step": 1934 + }, + { + "epoch": 0.837005596258347, + "grad_norm": 27.727188110351562, + "learning_rate": 3.289588801399825e-06, + "loss": 1.5823, + "step": 1935 + }, + { + "epoch": 0.8374381572900051, + "grad_norm": 25.65025520324707, + "learning_rate": 3.280839895013124e-06, + "loss": 1.4594, + "step": 1936 + }, + { + "epoch": 0.8378707183216632, + "grad_norm": 23.665359497070312, + "learning_rate": 3.272090988626422e-06, + "loss": 1.6313, + "step": 1937 + }, + { + "epoch": 0.8383032793533213, + "grad_norm": 23.80894660949707, + "learning_rate": 3.2633420822397206e-06, + "loss": 1.5172, + "step": 1938 + }, + { + "epoch": 0.8387358403849793, + "grad_norm": 26.22355842590332, + "learning_rate": 3.2545931758530186e-06, + "loss": 1.564, + "step": 1939 + }, + { + "epoch": 0.8391684014166374, + "grad_norm": 24.528112411499023, + "learning_rate": 3.2458442694663165e-06, + "loss": 1.519, + "step": 1940 + }, + { + "epoch": 0.8396009624482954, + "grad_norm": 26.274965286254883, + "learning_rate": 3.2370953630796153e-06, + "loss": 1.5012, + "step": 1941 + }, + { + "epoch": 0.8400335234799535, + "grad_norm": 26.261157989501953, + "learning_rate": 3.2283464566929136e-06, + "loss": 1.481, + "step": 1942 + }, + { + "epoch": 0.8404660845116115, + "grad_norm": 27.06733512878418, + "learning_rate": 3.219597550306212e-06, + "loss": 1.5262, + "step": 1943 + }, + { + "epoch": 0.8408986455432697, + "grad_norm": 23.958738327026367, + "learning_rate": 3.2108486439195104e-06, + "loss": 1.5564, + "step": 1944 + }, + { + "epoch": 0.8413312065749277, + "grad_norm": 24.236982345581055, + "learning_rate": 3.2020997375328087e-06, + "loss": 1.5062, + "step": 1945 + }, + { + "epoch": 0.8417637676065858, + "grad_norm": 24.281795501708984, + "learning_rate": 3.193350831146107e-06, + "loss": 1.5618, + "step": 1946 + }, + { + "epoch": 0.8421963286382438, + "grad_norm": 23.478912353515625, + "learning_rate": 3.184601924759405e-06, + "loss": 1.5643, + "step": 1947 + }, + { + "epoch": 0.8426288896699019, + "grad_norm": 24.562108993530273, + "learning_rate": 3.175853018372704e-06, + "loss": 1.4998, + "step": 1948 + }, + { + "epoch": 0.8430614507015599, + "grad_norm": 24.686317443847656, + "learning_rate": 3.1671041119860017e-06, + "loss": 1.4872, + "step": 1949 + }, + { + "epoch": 0.843494011733218, + "grad_norm": 25.728281021118164, + "learning_rate": 3.1583552055993005e-06, + "loss": 1.505, + "step": 1950 + }, + { + "epoch": 0.843926572764876, + "grad_norm": 26.217914581298828, + "learning_rate": 3.1496062992125985e-06, + "loss": 1.4911, + "step": 1951 + }, + { + "epoch": 0.8443591337965342, + "grad_norm": 25.863449096679688, + "learning_rate": 3.140857392825897e-06, + "loss": 1.6666, + "step": 1952 + }, + { + "epoch": 0.8447916948281922, + "grad_norm": 25.692018508911133, + "learning_rate": 3.1321084864391956e-06, + "loss": 1.5049, + "step": 1953 + }, + { + "epoch": 0.8452242558598502, + "grad_norm": 25.147676467895508, + "learning_rate": 3.1233595800524935e-06, + "loss": 1.5504, + "step": 1954 + }, + { + "epoch": 0.8456568168915083, + "grad_norm": 26.268417358398438, + "learning_rate": 3.1146106736657923e-06, + "loss": 1.4939, + "step": 1955 + }, + { + "epoch": 0.8460893779231663, + "grad_norm": 22.62088394165039, + "learning_rate": 3.1058617672790903e-06, + "loss": 1.5196, + "step": 1956 + }, + { + "epoch": 0.8465219389548244, + "grad_norm": 23.995553970336914, + "learning_rate": 3.097112860892389e-06, + "loss": 1.4776, + "step": 1957 + }, + { + "epoch": 0.8469544999864824, + "grad_norm": 24.5943546295166, + "learning_rate": 3.088363954505687e-06, + "loss": 1.5271, + "step": 1958 + }, + { + "epoch": 0.8473870610181405, + "grad_norm": 24.14809799194336, + "learning_rate": 3.0796150481189853e-06, + "loss": 1.5425, + "step": 1959 + }, + { + "epoch": 0.8478196220497985, + "grad_norm": 22.18802833557129, + "learning_rate": 3.0708661417322837e-06, + "loss": 1.5071, + "step": 1960 + }, + { + "epoch": 0.8482521830814567, + "grad_norm": 23.465879440307617, + "learning_rate": 3.062117235345582e-06, + "loss": 1.4322, + "step": 1961 + }, + { + "epoch": 0.8486847441131147, + "grad_norm": 25.830198287963867, + "learning_rate": 3.0533683289588804e-06, + "loss": 1.5651, + "step": 1962 + }, + { + "epoch": 0.8491173051447728, + "grad_norm": 28.602500915527344, + "learning_rate": 3.0446194225721788e-06, + "loss": 1.3828, + "step": 1963 + }, + { + "epoch": 0.8495498661764308, + "grad_norm": 24.01179313659668, + "learning_rate": 3.0358705161854767e-06, + "loss": 1.5234, + "step": 1964 + }, + { + "epoch": 0.8499824272080889, + "grad_norm": 26.218347549438477, + "learning_rate": 3.0271216097987755e-06, + "loss": 1.4781, + "step": 1965 + }, + { + "epoch": 0.8504149882397469, + "grad_norm": 26.33125877380371, + "learning_rate": 3.0183727034120734e-06, + "loss": 1.4859, + "step": 1966 + }, + { + "epoch": 0.850847549271405, + "grad_norm": 25.678081512451172, + "learning_rate": 3.0096237970253722e-06, + "loss": 1.4866, + "step": 1967 + }, + { + "epoch": 0.851280110303063, + "grad_norm": 29.468505859375, + "learning_rate": 3.00087489063867e-06, + "loss": 1.5086, + "step": 1968 + }, + { + "epoch": 0.8517126713347212, + "grad_norm": 23.861677169799805, + "learning_rate": 2.992125984251969e-06, + "loss": 1.5575, + "step": 1969 + }, + { + "epoch": 0.8521452323663792, + "grad_norm": 25.250822067260742, + "learning_rate": 2.9833770778652673e-06, + "loss": 1.4855, + "step": 1970 + }, + { + "epoch": 0.8525777933980373, + "grad_norm": 26.805891036987305, + "learning_rate": 2.9746281714785652e-06, + "loss": 1.4851, + "step": 1971 + }, + { + "epoch": 0.8530103544296953, + "grad_norm": 23.66602897644043, + "learning_rate": 2.965879265091864e-06, + "loss": 1.5238, + "step": 1972 + }, + { + "epoch": 0.8534429154613534, + "grad_norm": 25.112504959106445, + "learning_rate": 2.957130358705162e-06, + "loss": 1.4722, + "step": 1973 + }, + { + "epoch": 0.8538754764930114, + "grad_norm": 24.553518295288086, + "learning_rate": 2.9483814523184607e-06, + "loss": 1.5377, + "step": 1974 + }, + { + "epoch": 0.8543080375246696, + "grad_norm": 27.337066650390625, + "learning_rate": 2.9396325459317587e-06, + "loss": 1.5673, + "step": 1975 + }, + { + "epoch": 0.8547405985563276, + "grad_norm": 28.423484802246094, + "learning_rate": 2.930883639545057e-06, + "loss": 1.5152, + "step": 1976 + }, + { + "epoch": 0.8551731595879856, + "grad_norm": 25.569091796875, + "learning_rate": 2.9221347331583554e-06, + "loss": 1.409, + "step": 1977 + }, + { + "epoch": 0.8556057206196437, + "grad_norm": 22.94341468811035, + "learning_rate": 2.9133858267716538e-06, + "loss": 1.4882, + "step": 1978 + }, + { + "epoch": 0.8560382816513017, + "grad_norm": 23.691856384277344, + "learning_rate": 2.904636920384952e-06, + "loss": 1.4557, + "step": 1979 + }, + { + "epoch": 0.8564708426829598, + "grad_norm": 27.209903717041016, + "learning_rate": 2.8958880139982505e-06, + "loss": 1.51, + "step": 1980 + }, + { + "epoch": 0.8569034037146178, + "grad_norm": 26.904014587402344, + "learning_rate": 2.887139107611549e-06, + "loss": 1.6156, + "step": 1981 + }, + { + "epoch": 0.8573359647462759, + "grad_norm": 24.399385452270508, + "learning_rate": 2.878390201224847e-06, + "loss": 1.5173, + "step": 1982 + }, + { + "epoch": 0.8577685257779339, + "grad_norm": 23.10108184814453, + "learning_rate": 2.869641294838145e-06, + "loss": 1.4796, + "step": 1983 + }, + { + "epoch": 0.8582010868095921, + "grad_norm": 27.3771915435791, + "learning_rate": 2.860892388451444e-06, + "loss": 1.5105, + "step": 1984 + }, + { + "epoch": 0.8586336478412501, + "grad_norm": 24.214082717895508, + "learning_rate": 2.852143482064742e-06, + "loss": 1.5859, + "step": 1985 + }, + { + "epoch": 0.8590662088729082, + "grad_norm": 25.662439346313477, + "learning_rate": 2.8433945756780406e-06, + "loss": 1.4917, + "step": 1986 + }, + { + "epoch": 0.8594987699045662, + "grad_norm": 26.082138061523438, + "learning_rate": 2.8346456692913386e-06, + "loss": 1.5926, + "step": 1987 + }, + { + "epoch": 0.8599313309362243, + "grad_norm": 25.768564224243164, + "learning_rate": 2.825896762904637e-06, + "loss": 1.5071, + "step": 1988 + }, + { + "epoch": 0.8603638919678823, + "grad_norm": 26.158370971679688, + "learning_rate": 2.8171478565179357e-06, + "loss": 1.4536, + "step": 1989 + }, + { + "epoch": 0.8607964529995404, + "grad_norm": 25.917757034301758, + "learning_rate": 2.8083989501312337e-06, + "loss": 1.4675, + "step": 1990 + }, + { + "epoch": 0.8612290140311984, + "grad_norm": 24.782541275024414, + "learning_rate": 2.7996500437445324e-06, + "loss": 1.4918, + "step": 1991 + }, + { + "epoch": 0.8616615750628566, + "grad_norm": 24.238004684448242, + "learning_rate": 2.7909011373578304e-06, + "loss": 1.55, + "step": 1992 + }, + { + "epoch": 0.8620941360945146, + "grad_norm": 28.761098861694336, + "learning_rate": 2.782152230971129e-06, + "loss": 1.5032, + "step": 1993 + }, + { + "epoch": 0.8625266971261727, + "grad_norm": 28.440311431884766, + "learning_rate": 2.773403324584427e-06, + "loss": 1.4544, + "step": 1994 + }, + { + "epoch": 0.8629592581578307, + "grad_norm": 24.347803115844727, + "learning_rate": 2.7646544181977255e-06, + "loss": 1.4856, + "step": 1995 + }, + { + "epoch": 0.8633918191894888, + "grad_norm": 23.017044067382812, + "learning_rate": 2.755905511811024e-06, + "loss": 1.513, + "step": 1996 + }, + { + "epoch": 0.8638243802211468, + "grad_norm": 24.277076721191406, + "learning_rate": 2.747156605424322e-06, + "loss": 1.5532, + "step": 1997 + }, + { + "epoch": 0.8642569412528048, + "grad_norm": 26.928024291992188, + "learning_rate": 2.7384076990376205e-06, + "loss": 1.5431, + "step": 1998 + }, + { + "epoch": 0.864689502284463, + "grad_norm": 25.287187576293945, + "learning_rate": 2.729658792650919e-06, + "loss": 1.5137, + "step": 1999 + }, + { + "epoch": 0.865122063316121, + "grad_norm": 23.46804428100586, + "learning_rate": 2.720909886264217e-06, + "loss": 1.5105, + "step": 2000 + }, + { + "epoch": 0.8655546243477791, + "grad_norm": 26.317270278930664, + "learning_rate": 2.7121609798775156e-06, + "loss": 1.4855, + "step": 2001 + }, + { + "epoch": 0.8659871853794371, + "grad_norm": 23.94223403930664, + "learning_rate": 2.7034120734908135e-06, + "loss": 1.5016, + "step": 2002 + }, + { + "epoch": 0.8664197464110952, + "grad_norm": 25.011159896850586, + "learning_rate": 2.6946631671041123e-06, + "loss": 1.5259, + "step": 2003 + }, + { + "epoch": 0.8668523074427532, + "grad_norm": 26.059602737426758, + "learning_rate": 2.6859142607174103e-06, + "loss": 1.4911, + "step": 2004 + }, + { + "epoch": 0.8672848684744113, + "grad_norm": 25.560832977294922, + "learning_rate": 2.677165354330709e-06, + "loss": 1.5273, + "step": 2005 + }, + { + "epoch": 0.8677174295060693, + "grad_norm": 24.12883186340332, + "learning_rate": 2.6684164479440074e-06, + "loss": 1.5653, + "step": 2006 + }, + { + "epoch": 0.8681499905377275, + "grad_norm": 24.72055435180664, + "learning_rate": 2.6596675415573053e-06, + "loss": 1.532, + "step": 2007 + }, + { + "epoch": 0.8685825515693855, + "grad_norm": 23.121793746948242, + "learning_rate": 2.650918635170604e-06, + "loss": 1.532, + "step": 2008 + }, + { + "epoch": 0.8690151126010436, + "grad_norm": 27.0283145904541, + "learning_rate": 2.642169728783902e-06, + "loss": 1.4829, + "step": 2009 + }, + { + "epoch": 0.8694476736327016, + "grad_norm": 25.010900497436523, + "learning_rate": 2.633420822397201e-06, + "loss": 1.5846, + "step": 2010 + }, + { + "epoch": 0.8698802346643597, + "grad_norm": 26.372800827026367, + "learning_rate": 2.6246719160104988e-06, + "loss": 1.5037, + "step": 2011 + }, + { + "epoch": 0.8703127956960177, + "grad_norm": 24.331981658935547, + "learning_rate": 2.615923009623797e-06, + "loss": 1.431, + "step": 2012 + }, + { + "epoch": 0.8707453567276758, + "grad_norm": 24.65950584411621, + "learning_rate": 2.6071741032370955e-06, + "loss": 1.4707, + "step": 2013 + }, + { + "epoch": 0.8711779177593338, + "grad_norm": 25.584592819213867, + "learning_rate": 2.598425196850394e-06, + "loss": 1.4639, + "step": 2014 + }, + { + "epoch": 0.871610478790992, + "grad_norm": 24.871679306030273, + "learning_rate": 2.5896762904636922e-06, + "loss": 1.4434, + "step": 2015 + }, + { + "epoch": 0.87204303982265, + "grad_norm": 28.186784744262695, + "learning_rate": 2.5809273840769906e-06, + "loss": 1.5004, + "step": 2016 + }, + { + "epoch": 0.8724756008543081, + "grad_norm": 23.437673568725586, + "learning_rate": 2.5721784776902894e-06, + "loss": 1.5752, + "step": 2017 + }, + { + "epoch": 0.8729081618859661, + "grad_norm": 24.34725570678711, + "learning_rate": 2.5634295713035873e-06, + "loss": 1.5573, + "step": 2018 + }, + { + "epoch": 0.8733407229176242, + "grad_norm": 26.968994140625, + "learning_rate": 2.5546806649168852e-06, + "loss": 1.466, + "step": 2019 + }, + { + "epoch": 0.8737732839492822, + "grad_norm": 23.812091827392578, + "learning_rate": 2.545931758530184e-06, + "loss": 1.4728, + "step": 2020 + }, + { + "epoch": 0.8742058449809402, + "grad_norm": 26.316484451293945, + "learning_rate": 2.537182852143482e-06, + "loss": 1.495, + "step": 2021 + }, + { + "epoch": 0.8746384060125983, + "grad_norm": 26.36723518371582, + "learning_rate": 2.5284339457567807e-06, + "loss": 1.4384, + "step": 2022 + }, + { + "epoch": 0.8750709670442564, + "grad_norm": 27.980514526367188, + "learning_rate": 2.519685039370079e-06, + "loss": 1.5268, + "step": 2023 + }, + { + "epoch": 0.8755035280759145, + "grad_norm": 25.934511184692383, + "learning_rate": 2.510936132983377e-06, + "loss": 1.5593, + "step": 2024 + }, + { + "epoch": 0.8759360891075725, + "grad_norm": 24.022119522094727, + "learning_rate": 2.502187226596676e-06, + "loss": 1.5023, + "step": 2025 + }, + { + "epoch": 0.8763686501392306, + "grad_norm": 24.290563583374023, + "learning_rate": 2.493438320209974e-06, + "loss": 1.4827, + "step": 2026 + }, + { + "epoch": 0.8768012111708886, + "grad_norm": 24.21749496459961, + "learning_rate": 2.484689413823272e-06, + "loss": 1.425, + "step": 2027 + }, + { + "epoch": 0.8772337722025467, + "grad_norm": 26.60518455505371, + "learning_rate": 2.4759405074365705e-06, + "loss": 1.5017, + "step": 2028 + }, + { + "epoch": 0.8776663332342047, + "grad_norm": 25.764251708984375, + "learning_rate": 2.467191601049869e-06, + "loss": 1.5647, + "step": 2029 + }, + { + "epoch": 0.8780988942658629, + "grad_norm": 24.295669555664062, + "learning_rate": 2.458442694663167e-06, + "loss": 1.4745, + "step": 2030 + }, + { + "epoch": 0.8785314552975209, + "grad_norm": 28.384014129638672, + "learning_rate": 2.4496937882764656e-06, + "loss": 1.554, + "step": 2031 + }, + { + "epoch": 0.878964016329179, + "grad_norm": 23.786191940307617, + "learning_rate": 2.440944881889764e-06, + "loss": 1.5333, + "step": 2032 + }, + { + "epoch": 0.879396577360837, + "grad_norm": 25.375789642333984, + "learning_rate": 2.4321959755030623e-06, + "loss": 1.5183, + "step": 2033 + }, + { + "epoch": 0.8798291383924951, + "grad_norm": 24.33387565612793, + "learning_rate": 2.4234470691163606e-06, + "loss": 1.4466, + "step": 2034 + }, + { + "epoch": 0.8802616994241531, + "grad_norm": 23.24261474609375, + "learning_rate": 2.414698162729659e-06, + "loss": 1.5362, + "step": 2035 + }, + { + "epoch": 0.8806942604558112, + "grad_norm": 24.91956329345703, + "learning_rate": 2.4059492563429574e-06, + "loss": 1.4217, + "step": 2036 + }, + { + "epoch": 0.8811268214874692, + "grad_norm": 24.583356857299805, + "learning_rate": 2.3972003499562557e-06, + "loss": 1.4938, + "step": 2037 + }, + { + "epoch": 0.8815593825191274, + "grad_norm": 25.635778427124023, + "learning_rate": 2.388451443569554e-06, + "loss": 1.5102, + "step": 2038 + }, + { + "epoch": 0.8819919435507854, + "grad_norm": 24.82320785522461, + "learning_rate": 2.379702537182852e-06, + "loss": 1.5763, + "step": 2039 + }, + { + "epoch": 0.8824245045824435, + "grad_norm": 24.23630714416504, + "learning_rate": 2.370953630796151e-06, + "loss": 1.5198, + "step": 2040 + }, + { + "epoch": 0.8828570656141015, + "grad_norm": 22.778940200805664, + "learning_rate": 2.362204724409449e-06, + "loss": 1.5602, + "step": 2041 + }, + { + "epoch": 0.8832896266457595, + "grad_norm": 23.935001373291016, + "learning_rate": 2.3534558180227475e-06, + "loss": 1.4802, + "step": 2042 + }, + { + "epoch": 0.8837221876774176, + "grad_norm": 27.67841148376465, + "learning_rate": 2.344706911636046e-06, + "loss": 1.5749, + "step": 2043 + }, + { + "epoch": 0.8841547487090756, + "grad_norm": 25.99941635131836, + "learning_rate": 2.3359580052493442e-06, + "loss": 1.4838, + "step": 2044 + }, + { + "epoch": 0.8845873097407337, + "grad_norm": 26.464733123779297, + "learning_rate": 2.327209098862642e-06, + "loss": 1.4711, + "step": 2045 + }, + { + "epoch": 0.8850198707723917, + "grad_norm": 24.384105682373047, + "learning_rate": 2.3184601924759405e-06, + "loss": 1.5368, + "step": 2046 + }, + { + "epoch": 0.8854524318040499, + "grad_norm": 26.473554611206055, + "learning_rate": 2.309711286089239e-06, + "loss": 1.5003, + "step": 2047 + }, + { + "epoch": 0.8858849928357079, + "grad_norm": 23.190519332885742, + "learning_rate": 2.3009623797025373e-06, + "loss": 1.4441, + "step": 2048 + }, + { + "epoch": 0.886317553867366, + "grad_norm": 25.446199417114258, + "learning_rate": 2.2922134733158356e-06, + "loss": 1.5295, + "step": 2049 + }, + { + "epoch": 0.886750114899024, + "grad_norm": 25.416261672973633, + "learning_rate": 2.283464566929134e-06, + "loss": 1.4706, + "step": 2050 + }, + { + "epoch": 0.8871826759306821, + "grad_norm": 23.452497482299805, + "learning_rate": 2.2747156605424323e-06, + "loss": 1.54, + "step": 2051 + }, + { + "epoch": 0.8876152369623401, + "grad_norm": 26.014728546142578, + "learning_rate": 2.2659667541557307e-06, + "loss": 1.531, + "step": 2052 + }, + { + "epoch": 0.8880477979939982, + "grad_norm": 28.156362533569336, + "learning_rate": 2.257217847769029e-06, + "loss": 1.6611, + "step": 2053 + }, + { + "epoch": 0.8884803590256563, + "grad_norm": 28.15045928955078, + "learning_rate": 2.2484689413823274e-06, + "loss": 1.4967, + "step": 2054 + }, + { + "epoch": 0.8889129200573144, + "grad_norm": 26.710493087768555, + "learning_rate": 2.2397200349956258e-06, + "loss": 1.5291, + "step": 2055 + }, + { + "epoch": 0.8893454810889724, + "grad_norm": 26.1806640625, + "learning_rate": 2.230971128608924e-06, + "loss": 1.6146, + "step": 2056 + }, + { + "epoch": 0.8897780421206305, + "grad_norm": 28.079967498779297, + "learning_rate": 2.222222222222222e-06, + "loss": 1.5331, + "step": 2057 + }, + { + "epoch": 0.8902106031522885, + "grad_norm": 26.610944747924805, + "learning_rate": 2.213473315835521e-06, + "loss": 1.4869, + "step": 2058 + }, + { + "epoch": 0.8906431641839466, + "grad_norm": 24.159992218017578, + "learning_rate": 2.2047244094488192e-06, + "loss": 1.5627, + "step": 2059 + }, + { + "epoch": 0.8910757252156046, + "grad_norm": 25.268817901611328, + "learning_rate": 2.1959755030621176e-06, + "loss": 1.513, + "step": 2060 + }, + { + "epoch": 0.8915082862472627, + "grad_norm": 23.28951644897461, + "learning_rate": 2.187226596675416e-06, + "loss": 1.5356, + "step": 2061 + }, + { + "epoch": 0.8919408472789208, + "grad_norm": 24.559032440185547, + "learning_rate": 2.1784776902887143e-06, + "loss": 1.5505, + "step": 2062 + }, + { + "epoch": 0.8923734083105789, + "grad_norm": 22.86376190185547, + "learning_rate": 2.1697287839020122e-06, + "loss": 1.4958, + "step": 2063 + }, + { + "epoch": 0.8928059693422369, + "grad_norm": 25.818510055541992, + "learning_rate": 2.1609798775153106e-06, + "loss": 1.5579, + "step": 2064 + }, + { + "epoch": 0.8932385303738949, + "grad_norm": 23.554603576660156, + "learning_rate": 2.152230971128609e-06, + "loss": 1.5331, + "step": 2065 + }, + { + "epoch": 0.893671091405553, + "grad_norm": 22.988496780395508, + "learning_rate": 2.1434820647419073e-06, + "loss": 1.4828, + "step": 2066 + }, + { + "epoch": 0.894103652437211, + "grad_norm": 25.738300323486328, + "learning_rate": 2.1347331583552057e-06, + "loss": 1.5182, + "step": 2067 + }, + { + "epoch": 0.8945362134688691, + "grad_norm": 26.744293212890625, + "learning_rate": 2.125984251968504e-06, + "loss": 1.4402, + "step": 2068 + }, + { + "epoch": 0.8949687745005271, + "grad_norm": 26.04250144958496, + "learning_rate": 2.1172353455818024e-06, + "loss": 1.4826, + "step": 2069 + }, + { + "epoch": 0.8954013355321853, + "grad_norm": 24.584280014038086, + "learning_rate": 2.1084864391951008e-06, + "loss": 1.5665, + "step": 2070 + }, + { + "epoch": 0.8958338965638433, + "grad_norm": 28.59994888305664, + "learning_rate": 2.099737532808399e-06, + "loss": 1.4376, + "step": 2071 + }, + { + "epoch": 0.8962664575955014, + "grad_norm": 24.084014892578125, + "learning_rate": 2.0909886264216975e-06, + "loss": 1.4553, + "step": 2072 + }, + { + "epoch": 0.8966990186271594, + "grad_norm": 25.38064193725586, + "learning_rate": 2.082239720034996e-06, + "loss": 1.5537, + "step": 2073 + }, + { + "epoch": 0.8971315796588175, + "grad_norm": 24.919994354248047, + "learning_rate": 2.073490813648294e-06, + "loss": 1.508, + "step": 2074 + }, + { + "epoch": 0.8975641406904755, + "grad_norm": 24.744308471679688, + "learning_rate": 2.0647419072615926e-06, + "loss": 1.5346, + "step": 2075 + }, + { + "epoch": 0.8979967017221336, + "grad_norm": 24.991436004638672, + "learning_rate": 2.055993000874891e-06, + "loss": 1.5086, + "step": 2076 + }, + { + "epoch": 0.8984292627537916, + "grad_norm": 26.734596252441406, + "learning_rate": 2.0472440944881893e-06, + "loss": 1.4796, + "step": 2077 + }, + { + "epoch": 0.8988618237854498, + "grad_norm": 24.347599029541016, + "learning_rate": 2.0384951881014876e-06, + "loss": 1.5146, + "step": 2078 + }, + { + "epoch": 0.8992943848171078, + "grad_norm": 24.754878997802734, + "learning_rate": 2.029746281714786e-06, + "loss": 1.5116, + "step": 2079 + }, + { + "epoch": 0.8997269458487659, + "grad_norm": 23.960243225097656, + "learning_rate": 2.0209973753280844e-06, + "loss": 1.5233, + "step": 2080 + }, + { + "epoch": 0.9001595068804239, + "grad_norm": 24.335981369018555, + "learning_rate": 2.0122484689413823e-06, + "loss": 1.5004, + "step": 2081 + }, + { + "epoch": 0.900592067912082, + "grad_norm": 25.127904891967773, + "learning_rate": 2.0034995625546807e-06, + "loss": 1.4853, + "step": 2082 + }, + { + "epoch": 0.90102462894374, + "grad_norm": 27.572782516479492, + "learning_rate": 1.994750656167979e-06, + "loss": 1.4948, + "step": 2083 + }, + { + "epoch": 0.9014571899753981, + "grad_norm": 25.628870010375977, + "learning_rate": 1.9860017497812774e-06, + "loss": 1.5031, + "step": 2084 + }, + { + "epoch": 0.9018897510070562, + "grad_norm": 24.450708389282227, + "learning_rate": 1.9772528433945757e-06, + "loss": 1.4799, + "step": 2085 + }, + { + "epoch": 0.9023223120387142, + "grad_norm": 23.836322784423828, + "learning_rate": 1.968503937007874e-06, + "loss": 1.571, + "step": 2086 + }, + { + "epoch": 0.9027548730703723, + "grad_norm": 26.30670928955078, + "learning_rate": 1.9597550306211725e-06, + "loss": 1.6103, + "step": 2087 + }, + { + "epoch": 0.9031874341020303, + "grad_norm": 25.850881576538086, + "learning_rate": 1.951006124234471e-06, + "loss": 1.5946, + "step": 2088 + }, + { + "epoch": 0.9036199951336884, + "grad_norm": 21.866104125976562, + "learning_rate": 1.942257217847769e-06, + "loss": 1.5118, + "step": 2089 + }, + { + "epoch": 0.9040525561653464, + "grad_norm": 24.595714569091797, + "learning_rate": 1.9335083114610675e-06, + "loss": 1.4713, + "step": 2090 + }, + { + "epoch": 0.9044851171970045, + "grad_norm": 28.41547393798828, + "learning_rate": 1.924759405074366e-06, + "loss": 1.6027, + "step": 2091 + }, + { + "epoch": 0.9049176782286625, + "grad_norm": 27.03108787536621, + "learning_rate": 1.9160104986876642e-06, + "loss": 1.5333, + "step": 2092 + }, + { + "epoch": 0.9053502392603207, + "grad_norm": 24.61553955078125, + "learning_rate": 1.9072615923009624e-06, + "loss": 1.4534, + "step": 2093 + }, + { + "epoch": 0.9057828002919787, + "grad_norm": 23.870311737060547, + "learning_rate": 1.8985126859142608e-06, + "loss": 1.5434, + "step": 2094 + }, + { + "epoch": 0.9062153613236368, + "grad_norm": 23.868675231933594, + "learning_rate": 1.8897637795275591e-06, + "loss": 1.5228, + "step": 2095 + }, + { + "epoch": 0.9066479223552948, + "grad_norm": 24.33563804626465, + "learning_rate": 1.8810148731408575e-06, + "loss": 1.5551, + "step": 2096 + }, + { + "epoch": 0.9070804833869529, + "grad_norm": 24.130783081054688, + "learning_rate": 1.872265966754156e-06, + "loss": 1.5063, + "step": 2097 + }, + { + "epoch": 0.9075130444186109, + "grad_norm": 22.190542221069336, + "learning_rate": 1.8635170603674544e-06, + "loss": 1.4745, + "step": 2098 + }, + { + "epoch": 0.907945605450269, + "grad_norm": 27.16103744506836, + "learning_rate": 1.8547681539807523e-06, + "loss": 1.4179, + "step": 2099 + }, + { + "epoch": 0.908378166481927, + "grad_norm": 23.1556339263916, + "learning_rate": 1.846019247594051e-06, + "loss": 1.5758, + "step": 2100 + }, + { + "epoch": 0.9088107275135852, + "grad_norm": 23.74553871154785, + "learning_rate": 1.8372703412073493e-06, + "loss": 1.5291, + "step": 2101 + }, + { + "epoch": 0.9092432885452432, + "grad_norm": 24.70638084411621, + "learning_rate": 1.8285214348206476e-06, + "loss": 1.5172, + "step": 2102 + }, + { + "epoch": 0.9096758495769013, + "grad_norm": 28.004220962524414, + "learning_rate": 1.819772528433946e-06, + "loss": 1.4554, + "step": 2103 + }, + { + "epoch": 0.9101084106085593, + "grad_norm": 25.194591522216797, + "learning_rate": 1.8110236220472444e-06, + "loss": 1.4265, + "step": 2104 + }, + { + "epoch": 0.9105409716402174, + "grad_norm": 26.836763381958008, + "learning_rate": 1.8022747156605425e-06, + "loss": 1.4578, + "step": 2105 + }, + { + "epoch": 0.9109735326718754, + "grad_norm": 22.51131248474121, + "learning_rate": 1.7935258092738409e-06, + "loss": 1.445, + "step": 2106 + }, + { + "epoch": 0.9114060937035335, + "grad_norm": 23.413801193237305, + "learning_rate": 1.7847769028871392e-06, + "loss": 1.5159, + "step": 2107 + }, + { + "epoch": 0.9118386547351915, + "grad_norm": 25.643709182739258, + "learning_rate": 1.7760279965004376e-06, + "loss": 1.4713, + "step": 2108 + }, + { + "epoch": 0.9122712157668496, + "grad_norm": 26.521026611328125, + "learning_rate": 1.767279090113736e-06, + "loss": 1.5231, + "step": 2109 + }, + { + "epoch": 0.9127037767985077, + "grad_norm": 26.597665786743164, + "learning_rate": 1.7585301837270343e-06, + "loss": 1.4962, + "step": 2110 + }, + { + "epoch": 0.9131363378301657, + "grad_norm": 24.662185668945312, + "learning_rate": 1.7497812773403325e-06, + "loss": 1.4917, + "step": 2111 + }, + { + "epoch": 0.9135688988618238, + "grad_norm": 25.956602096557617, + "learning_rate": 1.7410323709536308e-06, + "loss": 1.4278, + "step": 2112 + }, + { + "epoch": 0.9140014598934818, + "grad_norm": 25.572887420654297, + "learning_rate": 1.7322834645669292e-06, + "loss": 1.5098, + "step": 2113 + }, + { + "epoch": 0.9144340209251399, + "grad_norm": 26.01473045349121, + "learning_rate": 1.7235345581802275e-06, + "loss": 1.5062, + "step": 2114 + }, + { + "epoch": 0.9148665819567979, + "grad_norm": 23.53647804260254, + "learning_rate": 1.714785651793526e-06, + "loss": 1.5157, + "step": 2115 + }, + { + "epoch": 0.915299142988456, + "grad_norm": 26.578235626220703, + "learning_rate": 1.7060367454068245e-06, + "loss": 1.5472, + "step": 2116 + }, + { + "epoch": 0.9157317040201141, + "grad_norm": 24.396869659423828, + "learning_rate": 1.6972878390201224e-06, + "loss": 1.5047, + "step": 2117 + }, + { + "epoch": 0.9161642650517722, + "grad_norm": 21.657743453979492, + "learning_rate": 1.688538932633421e-06, + "loss": 1.5496, + "step": 2118 + }, + { + "epoch": 0.9165968260834302, + "grad_norm": 26.7194766998291, + "learning_rate": 1.6797900262467193e-06, + "loss": 1.5533, + "step": 2119 + }, + { + "epoch": 0.9170293871150883, + "grad_norm": 28.471969604492188, + "learning_rate": 1.6710411198600177e-06, + "loss": 1.5192, + "step": 2120 + }, + { + "epoch": 0.9174619481467463, + "grad_norm": 25.8294620513916, + "learning_rate": 1.662292213473316e-06, + "loss": 1.572, + "step": 2121 + }, + { + "epoch": 0.9178945091784044, + "grad_norm": 24.569931030273438, + "learning_rate": 1.6535433070866144e-06, + "loss": 1.4803, + "step": 2122 + }, + { + "epoch": 0.9183270702100624, + "grad_norm": 28.031354904174805, + "learning_rate": 1.6447944006999126e-06, + "loss": 1.402, + "step": 2123 + }, + { + "epoch": 0.9187596312417206, + "grad_norm": 23.158405303955078, + "learning_rate": 1.636045494313211e-06, + "loss": 1.5053, + "step": 2124 + }, + { + "epoch": 0.9191921922733786, + "grad_norm": 24.734914779663086, + "learning_rate": 1.6272965879265093e-06, + "loss": 1.4882, + "step": 2125 + }, + { + "epoch": 0.9196247533050367, + "grad_norm": 25.3793888092041, + "learning_rate": 1.6185476815398076e-06, + "loss": 1.5247, + "step": 2126 + }, + { + "epoch": 0.9200573143366947, + "grad_norm": 26.39560890197754, + "learning_rate": 1.609798775153106e-06, + "loss": 1.5502, + "step": 2127 + }, + { + "epoch": 0.9204898753683528, + "grad_norm": 25.682659149169922, + "learning_rate": 1.6010498687664044e-06, + "loss": 1.4543, + "step": 2128 + }, + { + "epoch": 0.9209224364000108, + "grad_norm": 25.748123168945312, + "learning_rate": 1.5923009623797025e-06, + "loss": 1.4761, + "step": 2129 + }, + { + "epoch": 0.9213549974316688, + "grad_norm": 23.998931884765625, + "learning_rate": 1.5835520559930009e-06, + "loss": 1.4683, + "step": 2130 + }, + { + "epoch": 0.9217875584633269, + "grad_norm": 31.975799560546875, + "learning_rate": 1.5748031496062992e-06, + "loss": 1.5012, + "step": 2131 + }, + { + "epoch": 0.922220119494985, + "grad_norm": 26.674463272094727, + "learning_rate": 1.5660542432195978e-06, + "loss": 1.4607, + "step": 2132 + }, + { + "epoch": 0.9226526805266431, + "grad_norm": 22.74907684326172, + "learning_rate": 1.5573053368328962e-06, + "loss": 1.4775, + "step": 2133 + }, + { + "epoch": 0.9230852415583011, + "grad_norm": 28.162485122680664, + "learning_rate": 1.5485564304461945e-06, + "loss": 1.4859, + "step": 2134 + }, + { + "epoch": 0.9235178025899592, + "grad_norm": 25.198293685913086, + "learning_rate": 1.5398075240594927e-06, + "loss": 1.4851, + "step": 2135 + }, + { + "epoch": 0.9239503636216172, + "grad_norm": 27.7397403717041, + "learning_rate": 1.531058617672791e-06, + "loss": 1.5154, + "step": 2136 + }, + { + "epoch": 0.9243829246532753, + "grad_norm": 26.483898162841797, + "learning_rate": 1.5223097112860894e-06, + "loss": 1.5139, + "step": 2137 + }, + { + "epoch": 0.9248154856849333, + "grad_norm": 23.793954849243164, + "learning_rate": 1.5135608048993877e-06, + "loss": 1.5656, + "step": 2138 + }, + { + "epoch": 0.9252480467165914, + "grad_norm": 27.676443099975586, + "learning_rate": 1.5048118985126861e-06, + "loss": 1.5341, + "step": 2139 + }, + { + "epoch": 0.9256806077482495, + "grad_norm": 27.942245483398438, + "learning_rate": 1.4960629921259845e-06, + "loss": 1.5453, + "step": 2140 + }, + { + "epoch": 0.9261131687799076, + "grad_norm": 23.83132553100586, + "learning_rate": 1.4873140857392826e-06, + "loss": 1.5125, + "step": 2141 + }, + { + "epoch": 0.9265457298115656, + "grad_norm": 25.71536636352539, + "learning_rate": 1.478565179352581e-06, + "loss": 1.4835, + "step": 2142 + }, + { + "epoch": 0.9269782908432237, + "grad_norm": 22.844636917114258, + "learning_rate": 1.4698162729658793e-06, + "loss": 1.5344, + "step": 2143 + }, + { + "epoch": 0.9274108518748817, + "grad_norm": 26.83110237121582, + "learning_rate": 1.4610673665791777e-06, + "loss": 1.4896, + "step": 2144 + }, + { + "epoch": 0.9278434129065398, + "grad_norm": 23.741893768310547, + "learning_rate": 1.452318460192476e-06, + "loss": 1.5107, + "step": 2145 + }, + { + "epoch": 0.9282759739381978, + "grad_norm": 29.269779205322266, + "learning_rate": 1.4435695538057744e-06, + "loss": 1.5069, + "step": 2146 + }, + { + "epoch": 0.928708534969856, + "grad_norm": 24.56169319152832, + "learning_rate": 1.4348206474190726e-06, + "loss": 1.4771, + "step": 2147 + }, + { + "epoch": 0.929141096001514, + "grad_norm": 26.20133399963379, + "learning_rate": 1.426071741032371e-06, + "loss": 1.5259, + "step": 2148 + }, + { + "epoch": 0.9295736570331721, + "grad_norm": 25.875614166259766, + "learning_rate": 1.4173228346456693e-06, + "loss": 1.4957, + "step": 2149 + }, + { + "epoch": 0.9300062180648301, + "grad_norm": 28.533090591430664, + "learning_rate": 1.4085739282589679e-06, + "loss": 1.5897, + "step": 2150 + }, + { + "epoch": 0.9304387790964882, + "grad_norm": 26.25223731994629, + "learning_rate": 1.3998250218722662e-06, + "loss": 1.5214, + "step": 2151 + }, + { + "epoch": 0.9308713401281462, + "grad_norm": 23.350339889526367, + "learning_rate": 1.3910761154855646e-06, + "loss": 1.5911, + "step": 2152 + }, + { + "epoch": 0.9313039011598042, + "grad_norm": 25.860279083251953, + "learning_rate": 1.3823272090988627e-06, + "loss": 1.4472, + "step": 2153 + }, + { + "epoch": 0.9317364621914623, + "grad_norm": 25.473173141479492, + "learning_rate": 1.373578302712161e-06, + "loss": 1.5506, + "step": 2154 + }, + { + "epoch": 0.9321690232231203, + "grad_norm": 24.590129852294922, + "learning_rate": 1.3648293963254594e-06, + "loss": 1.5507, + "step": 2155 + }, + { + "epoch": 0.9326015842547785, + "grad_norm": 26.397546768188477, + "learning_rate": 1.3560804899387578e-06, + "loss": 1.5377, + "step": 2156 + }, + { + "epoch": 0.9330341452864365, + "grad_norm": 27.98323631286621, + "learning_rate": 1.3473315835520562e-06, + "loss": 1.5116, + "step": 2157 + }, + { + "epoch": 0.9334667063180946, + "grad_norm": 24.400859832763672, + "learning_rate": 1.3385826771653545e-06, + "loss": 1.5775, + "step": 2158 + }, + { + "epoch": 0.9338992673497526, + "grad_norm": 26.30158233642578, + "learning_rate": 1.3298337707786527e-06, + "loss": 1.4523, + "step": 2159 + }, + { + "epoch": 0.9343318283814107, + "grad_norm": 23.896175384521484, + "learning_rate": 1.321084864391951e-06, + "loss": 1.4708, + "step": 2160 + }, + { + "epoch": 0.9347643894130687, + "grad_norm": 24.987720489501953, + "learning_rate": 1.3123359580052494e-06, + "loss": 1.4932, + "step": 2161 + }, + { + "epoch": 0.9351969504447268, + "grad_norm": 23.302629470825195, + "learning_rate": 1.3035870516185478e-06, + "loss": 1.463, + "step": 2162 + }, + { + "epoch": 0.9356295114763848, + "grad_norm": 26.809837341308594, + "learning_rate": 1.2948381452318461e-06, + "loss": 1.5367, + "step": 2163 + }, + { + "epoch": 0.936062072508043, + "grad_norm": 23.170120239257812, + "learning_rate": 1.2860892388451447e-06, + "loss": 1.4773, + "step": 2164 + }, + { + "epoch": 0.936494633539701, + "grad_norm": 24.36250114440918, + "learning_rate": 1.2773403324584426e-06, + "loss": 1.5286, + "step": 2165 + }, + { + "epoch": 0.9369271945713591, + "grad_norm": 26.31621551513672, + "learning_rate": 1.268591426071741e-06, + "loss": 1.4818, + "step": 2166 + }, + { + "epoch": 0.9373597556030171, + "grad_norm": 27.618297576904297, + "learning_rate": 1.2598425196850396e-06, + "loss": 1.4788, + "step": 2167 + }, + { + "epoch": 0.9377923166346752, + "grad_norm": 25.09479522705078, + "learning_rate": 1.251093613298338e-06, + "loss": 1.5178, + "step": 2168 + }, + { + "epoch": 0.9382248776663332, + "grad_norm": 26.131237030029297, + "learning_rate": 1.242344706911636e-06, + "loss": 1.5872, + "step": 2169 + }, + { + "epoch": 0.9386574386979913, + "grad_norm": 25.003849029541016, + "learning_rate": 1.2335958005249344e-06, + "loss": 1.5635, + "step": 2170 + }, + { + "epoch": 0.9390899997296493, + "grad_norm": 26.515464782714844, + "learning_rate": 1.2248468941382328e-06, + "loss": 1.5046, + "step": 2171 + }, + { + "epoch": 0.9395225607613075, + "grad_norm": 26.55370330810547, + "learning_rate": 1.2160979877515311e-06, + "loss": 1.5933, + "step": 2172 + }, + { + "epoch": 0.9399551217929655, + "grad_norm": 24.8226375579834, + "learning_rate": 1.2073490813648295e-06, + "loss": 1.5164, + "step": 2173 + }, + { + "epoch": 0.9403876828246235, + "grad_norm": 24.112558364868164, + "learning_rate": 1.1986001749781279e-06, + "loss": 1.5325, + "step": 2174 + }, + { + "epoch": 0.9408202438562816, + "grad_norm": 25.355792999267578, + "learning_rate": 1.189851268591426e-06, + "loss": 1.5334, + "step": 2175 + }, + { + "epoch": 0.9412528048879396, + "grad_norm": 26.321578979492188, + "learning_rate": 1.1811023622047246e-06, + "loss": 1.4804, + "step": 2176 + }, + { + "epoch": 0.9416853659195977, + "grad_norm": 25.188034057617188, + "learning_rate": 1.172353455818023e-06, + "loss": 1.5346, + "step": 2177 + }, + { + "epoch": 0.9421179269512557, + "grad_norm": 26.958105087280273, + "learning_rate": 1.163604549431321e-06, + "loss": 1.5058, + "step": 2178 + }, + { + "epoch": 0.9425504879829139, + "grad_norm": 23.981430053710938, + "learning_rate": 1.1548556430446194e-06, + "loss": 1.5336, + "step": 2179 + }, + { + "epoch": 0.9429830490145719, + "grad_norm": 24.413782119750977, + "learning_rate": 1.1461067366579178e-06, + "loss": 1.5212, + "step": 2180 + }, + { + "epoch": 0.94341561004623, + "grad_norm": 25.38534164428711, + "learning_rate": 1.1373578302712162e-06, + "loss": 1.4876, + "step": 2181 + }, + { + "epoch": 0.943848171077888, + "grad_norm": 27.410280227661133, + "learning_rate": 1.1286089238845145e-06, + "loss": 1.495, + "step": 2182 + }, + { + "epoch": 0.9442807321095461, + "grad_norm": 24.01247787475586, + "learning_rate": 1.1198600174978129e-06, + "loss": 1.4992, + "step": 2183 + }, + { + "epoch": 0.9447132931412041, + "grad_norm": 28.51532554626465, + "learning_rate": 1.111111111111111e-06, + "loss": 1.4806, + "step": 2184 + }, + { + "epoch": 0.9451458541728622, + "grad_norm": 24.75975227355957, + "learning_rate": 1.1023622047244096e-06, + "loss": 1.5124, + "step": 2185 + }, + { + "epoch": 0.9455784152045202, + "grad_norm": 24.5733585357666, + "learning_rate": 1.093613298337708e-06, + "loss": 1.5007, + "step": 2186 + }, + { + "epoch": 0.9460109762361784, + "grad_norm": 25.291410446166992, + "learning_rate": 1.0848643919510061e-06, + "loss": 1.4037, + "step": 2187 + }, + { + "epoch": 0.9464435372678364, + "grad_norm": 22.40592384338379, + "learning_rate": 1.0761154855643045e-06, + "loss": 1.506, + "step": 2188 + }, + { + "epoch": 0.9468760982994945, + "grad_norm": 25.94204330444336, + "learning_rate": 1.0673665791776028e-06, + "loss": 1.4871, + "step": 2189 + }, + { + "epoch": 0.9473086593311525, + "grad_norm": 23.761117935180664, + "learning_rate": 1.0586176727909012e-06, + "loss": 1.5323, + "step": 2190 + }, + { + "epoch": 0.9477412203628106, + "grad_norm": 25.79743766784668, + "learning_rate": 1.0498687664041996e-06, + "loss": 1.5, + "step": 2191 + }, + { + "epoch": 0.9481737813944686, + "grad_norm": 28.8542423248291, + "learning_rate": 1.041119860017498e-06, + "loss": 1.4627, + "step": 2192 + }, + { + "epoch": 0.9486063424261267, + "grad_norm": 24.279600143432617, + "learning_rate": 1.0323709536307963e-06, + "loss": 1.5207, + "step": 2193 + }, + { + "epoch": 0.9490389034577847, + "grad_norm": 32.73613357543945, + "learning_rate": 1.0236220472440946e-06, + "loss": 1.5151, + "step": 2194 + }, + { + "epoch": 0.9494714644894429, + "grad_norm": 25.603696823120117, + "learning_rate": 1.014873140857393e-06, + "loss": 1.4946, + "step": 2195 + }, + { + "epoch": 0.9499040255211009, + "grad_norm": 29.79012680053711, + "learning_rate": 1.0061242344706911e-06, + "loss": 1.5321, + "step": 2196 + }, + { + "epoch": 0.9503365865527589, + "grad_norm": 25.095794677734375, + "learning_rate": 9.973753280839895e-07, + "loss": 1.5347, + "step": 2197 + }, + { + "epoch": 0.950769147584417, + "grad_norm": 24.920623779296875, + "learning_rate": 9.886264216972879e-07, + "loss": 1.4609, + "step": 2198 + }, + { + "epoch": 0.951201708616075, + "grad_norm": 24.81159019470215, + "learning_rate": 9.798775153105862e-07, + "loss": 1.4245, + "step": 2199 + }, + { + "epoch": 0.9516342696477331, + "grad_norm": 25.507144927978516, + "learning_rate": 9.711286089238846e-07, + "loss": 1.5234, + "step": 2200 + }, + { + "epoch": 0.9520668306793911, + "grad_norm": 25.74333381652832, + "learning_rate": 9.62379702537183e-07, + "loss": 1.4334, + "step": 2201 + }, + { + "epoch": 0.9524993917110492, + "grad_norm": 24.9305477142334, + "learning_rate": 9.536307961504812e-07, + "loss": 1.4945, + "step": 2202 + }, + { + "epoch": 0.9529319527427073, + "grad_norm": 25.96180534362793, + "learning_rate": 9.448818897637796e-07, + "loss": 1.4937, + "step": 2203 + }, + { + "epoch": 0.9533645137743654, + "grad_norm": 27.482013702392578, + "learning_rate": 9.36132983377078e-07, + "loss": 1.5735, + "step": 2204 + }, + { + "epoch": 0.9537970748060234, + "grad_norm": 27.310945510864258, + "learning_rate": 9.273840769903762e-07, + "loss": 1.6501, + "step": 2205 + }, + { + "epoch": 0.9542296358376815, + "grad_norm": 23.459266662597656, + "learning_rate": 9.186351706036746e-07, + "loss": 1.4727, + "step": 2206 + }, + { + "epoch": 0.9546621968693395, + "grad_norm": 23.028839111328125, + "learning_rate": 9.09886264216973e-07, + "loss": 1.5571, + "step": 2207 + }, + { + "epoch": 0.9550947579009976, + "grad_norm": 24.47319984436035, + "learning_rate": 9.011373578302713e-07, + "loss": 1.4652, + "step": 2208 + }, + { + "epoch": 0.9555273189326556, + "grad_norm": 27.27988052368164, + "learning_rate": 8.923884514435696e-07, + "loss": 1.6036, + "step": 2209 + }, + { + "epoch": 0.9559598799643138, + "grad_norm": 27.1883544921875, + "learning_rate": 8.83639545056868e-07, + "loss": 1.4815, + "step": 2210 + }, + { + "epoch": 0.9563924409959718, + "grad_norm": 27.201946258544922, + "learning_rate": 8.748906386701662e-07, + "loss": 1.4541, + "step": 2211 + }, + { + "epoch": 0.9568250020276299, + "grad_norm": 27.259296417236328, + "learning_rate": 8.661417322834646e-07, + "loss": 1.5087, + "step": 2212 + }, + { + "epoch": 0.9572575630592879, + "grad_norm": 25.546985626220703, + "learning_rate": 8.57392825896763e-07, + "loss": 1.5417, + "step": 2213 + }, + { + "epoch": 0.957690124090946, + "grad_norm": 26.166793823242188, + "learning_rate": 8.486439195100612e-07, + "loss": 1.4723, + "step": 2214 + }, + { + "epoch": 0.958122685122604, + "grad_norm": 26.597272872924805, + "learning_rate": 8.398950131233597e-07, + "loss": 1.5238, + "step": 2215 + }, + { + "epoch": 0.9585552461542621, + "grad_norm": 24.605998992919922, + "learning_rate": 8.31146106736658e-07, + "loss": 1.5571, + "step": 2216 + }, + { + "epoch": 0.9589878071859201, + "grad_norm": 24.375625610351562, + "learning_rate": 8.223972003499563e-07, + "loss": 1.4745, + "step": 2217 + }, + { + "epoch": 0.9594203682175781, + "grad_norm": 26.005847930908203, + "learning_rate": 8.136482939632546e-07, + "loss": 1.5447, + "step": 2218 + }, + { + "epoch": 0.9598529292492363, + "grad_norm": 27.76963996887207, + "learning_rate": 8.04899387576553e-07, + "loss": 1.5089, + "step": 2219 + }, + { + "epoch": 0.9602854902808943, + "grad_norm": 25.300994873046875, + "learning_rate": 7.961504811898513e-07, + "loss": 1.5734, + "step": 2220 + }, + { + "epoch": 0.9607180513125524, + "grad_norm": 26.90599822998047, + "learning_rate": 7.874015748031496e-07, + "loss": 1.5282, + "step": 2221 + }, + { + "epoch": 0.9611506123442104, + "grad_norm": 25.654613494873047, + "learning_rate": 7.786526684164481e-07, + "loss": 1.5522, + "step": 2222 + }, + { + "epoch": 0.9615831733758685, + "grad_norm": 23.53044319152832, + "learning_rate": 7.699037620297463e-07, + "loss": 1.575, + "step": 2223 + }, + { + "epoch": 0.9620157344075265, + "grad_norm": 25.62672996520996, + "learning_rate": 7.611548556430447e-07, + "loss": 1.5661, + "step": 2224 + }, + { + "epoch": 0.9624482954391846, + "grad_norm": 24.500530242919922, + "learning_rate": 7.524059492563431e-07, + "loss": 1.5151, + "step": 2225 + }, + { + "epoch": 0.9628808564708426, + "grad_norm": 25.327373504638672, + "learning_rate": 7.436570428696413e-07, + "loss": 1.5006, + "step": 2226 + }, + { + "epoch": 0.9633134175025008, + "grad_norm": 28.519956588745117, + "learning_rate": 7.349081364829397e-07, + "loss": 1.5537, + "step": 2227 + }, + { + "epoch": 0.9637459785341588, + "grad_norm": 25.847875595092773, + "learning_rate": 7.26159230096238e-07, + "loss": 1.4917, + "step": 2228 + }, + { + "epoch": 0.9641785395658169, + "grad_norm": 22.345125198364258, + "learning_rate": 7.174103237095363e-07, + "loss": 1.4354, + "step": 2229 + }, + { + "epoch": 0.9646111005974749, + "grad_norm": 24.792699813842773, + "learning_rate": 7.086614173228346e-07, + "loss": 1.4301, + "step": 2230 + }, + { + "epoch": 0.965043661629133, + "grad_norm": 25.965551376342773, + "learning_rate": 6.999125109361331e-07, + "loss": 1.5495, + "step": 2231 + }, + { + "epoch": 0.965476222660791, + "grad_norm": 26.518917083740234, + "learning_rate": 6.911636045494314e-07, + "loss": 1.5347, + "step": 2232 + }, + { + "epoch": 0.9659087836924491, + "grad_norm": 22.920303344726562, + "learning_rate": 6.824146981627297e-07, + "loss": 1.4384, + "step": 2233 + }, + { + "epoch": 0.9663413447241072, + "grad_norm": 23.388620376586914, + "learning_rate": 6.736657917760281e-07, + "loss": 1.4741, + "step": 2234 + }, + { + "epoch": 0.9667739057557653, + "grad_norm": 22.90789222717285, + "learning_rate": 6.649168853893263e-07, + "loss": 1.4389, + "step": 2235 + }, + { + "epoch": 0.9672064667874233, + "grad_norm": 27.07576560974121, + "learning_rate": 6.561679790026247e-07, + "loss": 1.5521, + "step": 2236 + }, + { + "epoch": 0.9676390278190814, + "grad_norm": 25.001564025878906, + "learning_rate": 6.474190726159231e-07, + "loss": 1.5137, + "step": 2237 + }, + { + "epoch": 0.9680715888507394, + "grad_norm": 23.1292724609375, + "learning_rate": 6.386701662292213e-07, + "loss": 1.4152, + "step": 2238 + }, + { + "epoch": 0.9685041498823975, + "grad_norm": 24.822486877441406, + "learning_rate": 6.299212598425198e-07, + "loss": 1.4223, + "step": 2239 + }, + { + "epoch": 0.9689367109140555, + "grad_norm": 23.44011116027832, + "learning_rate": 6.21172353455818e-07, + "loss": 1.5192, + "step": 2240 + }, + { + "epoch": 0.9693692719457135, + "grad_norm": 25.917972564697266, + "learning_rate": 6.124234470691164e-07, + "loss": 1.5659, + "step": 2241 + }, + { + "epoch": 0.9698018329773717, + "grad_norm": 24.592409133911133, + "learning_rate": 6.036745406824148e-07, + "loss": 1.5472, + "step": 2242 + }, + { + "epoch": 0.9702343940090297, + "grad_norm": 25.710756301879883, + "learning_rate": 5.94925634295713e-07, + "loss": 1.4849, + "step": 2243 + }, + { + "epoch": 0.9706669550406878, + "grad_norm": 30.253639221191406, + "learning_rate": 5.861767279090115e-07, + "loss": 1.5312, + "step": 2244 + }, + { + "epoch": 0.9710995160723458, + "grad_norm": 27.50519371032715, + "learning_rate": 5.774278215223097e-07, + "loss": 1.54, + "step": 2245 + }, + { + "epoch": 0.9715320771040039, + "grad_norm": 26.179685592651367, + "learning_rate": 5.686789151356081e-07, + "loss": 1.5439, + "step": 2246 + }, + { + "epoch": 0.9719646381356619, + "grad_norm": 24.04256248474121, + "learning_rate": 5.599300087489064e-07, + "loss": 1.5165, + "step": 2247 + }, + { + "epoch": 0.97239719916732, + "grad_norm": 25.094362258911133, + "learning_rate": 5.511811023622048e-07, + "loss": 1.5274, + "step": 2248 + }, + { + "epoch": 0.972829760198978, + "grad_norm": 22.677230834960938, + "learning_rate": 5.424321959755031e-07, + "loss": 1.4912, + "step": 2249 + }, + { + "epoch": 0.9732623212306362, + "grad_norm": 27.18683624267578, + "learning_rate": 5.336832895888014e-07, + "loss": 1.4782, + "step": 2250 + }, + { + "epoch": 0.9736948822622942, + "grad_norm": 25.23105239868164, + "learning_rate": 5.249343832020998e-07, + "loss": 1.5043, + "step": 2251 + }, + { + "epoch": 0.9741274432939523, + "grad_norm": 24.919349670410156, + "learning_rate": 5.161854768153981e-07, + "loss": 1.5297, + "step": 2252 + }, + { + "epoch": 0.9745600043256103, + "grad_norm": 26.34563636779785, + "learning_rate": 5.074365704286965e-07, + "loss": 1.4319, + "step": 2253 + }, + { + "epoch": 0.9749925653572684, + "grad_norm": 24.52495002746582, + "learning_rate": 4.986876640419948e-07, + "loss": 1.6465, + "step": 2254 + }, + { + "epoch": 0.9754251263889264, + "grad_norm": 24.07614517211914, + "learning_rate": 4.899387576552931e-07, + "loss": 1.6281, + "step": 2255 + }, + { + "epoch": 0.9758576874205845, + "grad_norm": 26.345617294311523, + "learning_rate": 4.811898512685915e-07, + "loss": 1.5466, + "step": 2256 + }, + { + "epoch": 0.9762902484522425, + "grad_norm": 27.2408390045166, + "learning_rate": 4.724409448818898e-07, + "loss": 1.5557, + "step": 2257 + }, + { + "epoch": 0.9767228094839007, + "grad_norm": 23.67082977294922, + "learning_rate": 4.636920384951881e-07, + "loss": 1.5431, + "step": 2258 + }, + { + "epoch": 0.9771553705155587, + "grad_norm": 23.821247100830078, + "learning_rate": 4.549431321084865e-07, + "loss": 1.5224, + "step": 2259 + }, + { + "epoch": 0.9775879315472168, + "grad_norm": 24.50645637512207, + "learning_rate": 4.461942257217848e-07, + "loss": 1.4193, + "step": 2260 + }, + { + "epoch": 0.9780204925788748, + "grad_norm": 24.018909454345703, + "learning_rate": 4.374453193350831e-07, + "loss": 1.5108, + "step": 2261 + }, + { + "epoch": 0.9784530536105328, + "grad_norm": 24.982769012451172, + "learning_rate": 4.286964129483815e-07, + "loss": 1.4694, + "step": 2262 + }, + { + "epoch": 0.9788856146421909, + "grad_norm": 26.493566513061523, + "learning_rate": 4.1994750656167983e-07, + "loss": 1.4441, + "step": 2263 + }, + { + "epoch": 0.9793181756738489, + "grad_norm": 22.410762786865234, + "learning_rate": 4.1119860017497814e-07, + "loss": 1.5065, + "step": 2264 + }, + { + "epoch": 0.979750736705507, + "grad_norm": 24.40163230895996, + "learning_rate": 4.024496937882765e-07, + "loss": 1.5389, + "step": 2265 + }, + { + "epoch": 0.9801832977371651, + "grad_norm": 26.319671630859375, + "learning_rate": 3.937007874015748e-07, + "loss": 1.4334, + "step": 2266 + }, + { + "epoch": 0.9806158587688232, + "grad_norm": 22.440717697143555, + "learning_rate": 3.8495188101487317e-07, + "loss": 1.5234, + "step": 2267 + }, + { + "epoch": 0.9810484198004812, + "grad_norm": 23.569488525390625, + "learning_rate": 3.7620297462817153e-07, + "loss": 1.4278, + "step": 2268 + }, + { + "epoch": 0.9814809808321393, + "grad_norm": 24.376298904418945, + "learning_rate": 3.6745406824146983e-07, + "loss": 1.5538, + "step": 2269 + }, + { + "epoch": 0.9819135418637973, + "grad_norm": 25.128238677978516, + "learning_rate": 3.5870516185476814e-07, + "loss": 1.5044, + "step": 2270 + }, + { + "epoch": 0.9823461028954554, + "grad_norm": 24.355140686035156, + "learning_rate": 3.4995625546806655e-07, + "loss": 1.4869, + "step": 2271 + }, + { + "epoch": 0.9827786639271134, + "grad_norm": 26.226696014404297, + "learning_rate": 3.4120734908136486e-07, + "loss": 1.4047, + "step": 2272 + }, + { + "epoch": 0.9832112249587716, + "grad_norm": 24.68053436279297, + "learning_rate": 3.3245844269466317e-07, + "loss": 1.4378, + "step": 2273 + }, + { + "epoch": 0.9836437859904296, + "grad_norm": 23.87020492553711, + "learning_rate": 3.2370953630796153e-07, + "loss": 1.4567, + "step": 2274 + }, + { + "epoch": 0.9840763470220877, + "grad_norm": 26.111167907714844, + "learning_rate": 3.149606299212599e-07, + "loss": 1.4861, + "step": 2275 + }, + { + "epoch": 0.9845089080537457, + "grad_norm": 24.464523315429688, + "learning_rate": 3.062117235345582e-07, + "loss": 1.4386, + "step": 2276 + }, + { + "epoch": 0.9849414690854038, + "grad_norm": 25.67591094970703, + "learning_rate": 2.974628171478565e-07, + "loss": 1.4829, + "step": 2277 + }, + { + "epoch": 0.9853740301170618, + "grad_norm": 25.558279037475586, + "learning_rate": 2.8871391076115486e-07, + "loss": 1.487, + "step": 2278 + }, + { + "epoch": 0.9858065911487199, + "grad_norm": 27.101078033447266, + "learning_rate": 2.799650043744532e-07, + "loss": 1.5293, + "step": 2279 + }, + { + "epoch": 0.9862391521803779, + "grad_norm": 23.139122009277344, + "learning_rate": 2.7121609798775153e-07, + "loss": 1.491, + "step": 2280 + }, + { + "epoch": 0.9866717132120361, + "grad_norm": 24.572038650512695, + "learning_rate": 2.624671916010499e-07, + "loss": 1.4598, + "step": 2281 + }, + { + "epoch": 0.9871042742436941, + "grad_norm": 24.457639694213867, + "learning_rate": 2.5371828521434825e-07, + "loss": 1.5327, + "step": 2282 + }, + { + "epoch": 0.9875368352753522, + "grad_norm": 26.919748306274414, + "learning_rate": 2.4496937882764656e-07, + "loss": 1.5191, + "step": 2283 + }, + { + "epoch": 0.9879693963070102, + "grad_norm": 24.509489059448242, + "learning_rate": 2.362204724409449e-07, + "loss": 1.5107, + "step": 2284 + }, + { + "epoch": 0.9884019573386682, + "grad_norm": 24.133081436157227, + "learning_rate": 2.2747156605424325e-07, + "loss": 1.4597, + "step": 2285 + }, + { + "epoch": 0.9888345183703263, + "grad_norm": 27.456026077270508, + "learning_rate": 2.1872265966754156e-07, + "loss": 1.5723, + "step": 2286 + }, + { + "epoch": 0.9892670794019843, + "grad_norm": 25.23147964477539, + "learning_rate": 2.0997375328083992e-07, + "loss": 1.472, + "step": 2287 + }, + { + "epoch": 0.9896996404336424, + "grad_norm": 25.515771865844727, + "learning_rate": 2.0122484689413825e-07, + "loss": 1.5353, + "step": 2288 + }, + { + "epoch": 0.9901322014653005, + "grad_norm": 25.060243606567383, + "learning_rate": 1.9247594050743658e-07, + "loss": 1.4872, + "step": 2289 + }, + { + "epoch": 0.9905647624969586, + "grad_norm": 25.204627990722656, + "learning_rate": 1.8372703412073492e-07, + "loss": 1.5119, + "step": 2290 + }, + { + "epoch": 0.9909973235286166, + "grad_norm": 26.539810180664062, + "learning_rate": 1.7497812773403328e-07, + "loss": 1.5112, + "step": 2291 + }, + { + "epoch": 0.9914298845602747, + "grad_norm": 25.467899322509766, + "learning_rate": 1.6622922134733158e-07, + "loss": 1.4897, + "step": 2292 + }, + { + "epoch": 0.9918624455919327, + "grad_norm": 26.030517578125, + "learning_rate": 1.5748031496062994e-07, + "loss": 1.5333, + "step": 2293 + }, + { + "epoch": 0.9922950066235908, + "grad_norm": 26.485551834106445, + "learning_rate": 1.4873140857392825e-07, + "loss": 1.5446, + "step": 2294 + }, + { + "epoch": 0.9927275676552488, + "grad_norm": 26.391727447509766, + "learning_rate": 1.399825021872266e-07, + "loss": 1.4711, + "step": 2295 + }, + { + "epoch": 0.993160128686907, + "grad_norm": 23.88796615600586, + "learning_rate": 1.3123359580052494e-07, + "loss": 1.4731, + "step": 2296 + }, + { + "epoch": 0.993592689718565, + "grad_norm": 24.405637741088867, + "learning_rate": 1.2248468941382328e-07, + "loss": 1.5611, + "step": 2297 + }, + { + "epoch": 0.9940252507502231, + "grad_norm": 26.79961585998535, + "learning_rate": 1.1373578302712162e-07, + "loss": 1.547, + "step": 2298 + }, + { + "epoch": 0.9944578117818811, + "grad_norm": 27.189603805541992, + "learning_rate": 1.0498687664041996e-07, + "loss": 1.5443, + "step": 2299 + }, + { + "epoch": 0.9948903728135392, + "grad_norm": 27.315834045410156, + "learning_rate": 9.623797025371829e-08, + "loss": 1.3623, + "step": 2300 + }, + { + "epoch": 0.9953229338451972, + "grad_norm": 27.901859283447266, + "learning_rate": 8.748906386701664e-08, + "loss": 1.6198, + "step": 2301 + }, + { + "epoch": 0.9957554948768553, + "grad_norm": 26.05302619934082, + "learning_rate": 7.874015748031497e-08, + "loss": 1.4631, + "step": 2302 + }, + { + "epoch": 0.9961880559085133, + "grad_norm": 25.083433151245117, + "learning_rate": 6.99912510936133e-08, + "loss": 1.5496, + "step": 2303 + }, + { + "epoch": 0.9966206169401715, + "grad_norm": 25.853899002075195, + "learning_rate": 6.124234470691164e-08, + "loss": 1.5728, + "step": 2304 + }, + { + "epoch": 0.9970531779718295, + "grad_norm": 24.412639617919922, + "learning_rate": 5.249343832020998e-08, + "loss": 1.5394, + "step": 2305 + }, + { + "epoch": 0.9974857390034875, + "grad_norm": 26.720706939697266, + "learning_rate": 4.374453193350832e-08, + "loss": 1.4967, + "step": 2306 + }, + { + "epoch": 0.9979183000351456, + "grad_norm": 27.0684814453125, + "learning_rate": 3.499562554680665e-08, + "loss": 1.5429, + "step": 2307 + }, + { + "epoch": 0.9983508610668036, + "grad_norm": 24.51413917541504, + "learning_rate": 2.624671916010499e-08, + "loss": 1.5263, + "step": 2308 + }, + { + "epoch": 0.9987834220984617, + "grad_norm": 27.829641342163086, + "learning_rate": 1.7497812773403326e-08, + "loss": 1.492, + "step": 2309 + }, + { + "epoch": 0.9992159831301197, + "grad_norm": 24.01323127746582, + "learning_rate": 8.748906386701663e-09, + "loss": 1.5018, + "step": 2310 + }, + { + "epoch": 0.9996485441617778, + "grad_norm": 24.6170597076416, + "learning_rate": 0.0, + "loss": 1.533, + "step": 2311 } ], "logging_steps": 1, @@ -7021,12 +16198,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 1.2407280868764634e+17, + "total_flos": 2.868910410465792e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null