{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 9480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010548523206751054, "grad_norm": 1.3075357675552368, "learning_rate": 0.00015822784810126583, "loss": 7.5213, "step": 10 }, { "epoch": 0.002109704641350211, "grad_norm": 1.180863380432129, "learning_rate": 0.00031645569620253165, "loss": 6.8987, "step": 20 }, { "epoch": 0.0031645569620253164, "grad_norm": 0.8489615321159363, "learning_rate": 0.00047468354430379745, "loss": 6.2384, "step": 30 }, { "epoch": 0.004219409282700422, "grad_norm": 0.9192500114440918, "learning_rate": 0.0006329113924050633, "loss": 5.747, "step": 40 }, { "epoch": 0.005274261603375527, "grad_norm": 0.9643535614013672, "learning_rate": 0.0007911392405063291, "loss": 5.2708, "step": 50 }, { "epoch": 0.006329113924050633, "grad_norm": 1.082902193069458, "learning_rate": 0.0009493670886075949, "loss": 4.7644, "step": 60 }, { "epoch": 0.007383966244725738, "grad_norm": 1.1981812715530396, "learning_rate": 0.0011075949367088608, "loss": 4.3723, "step": 70 }, { "epoch": 0.008438818565400843, "grad_norm": 0.9766468405723572, "learning_rate": 0.0012658227848101266, "loss": 4.1225, "step": 80 }, { "epoch": 0.00949367088607595, "grad_norm": 1.1699119806289673, "learning_rate": 0.0014240506329113926, "loss": 3.9235, "step": 90 }, { "epoch": 0.010548523206751054, "grad_norm": 1.0279523134231567, "learning_rate": 0.0015, "loss": 3.7655, "step": 100 }, { "epoch": 0.011603375527426161, "grad_norm": 0.9181833863258362, "learning_rate": 0.0015, "loss": 3.6204, "step": 110 }, { "epoch": 0.012658227848101266, "grad_norm": 0.8613824248313904, "learning_rate": 0.0015, "loss": 3.5071, "step": 120 }, { "epoch": 0.013713080168776372, "grad_norm": 1.1336807012557983, "learning_rate": 0.0015, "loss": 3.395, "step": 130 }, { "epoch": 0.014767932489451477, "grad_norm": 0.9157042503356934, 
"learning_rate": 0.0015, "loss": 3.2994, "step": 140 }, { "epoch": 0.015822784810126583, "grad_norm": 0.9386192560195923, "learning_rate": 0.0015, "loss": 3.2244, "step": 150 }, { "epoch": 0.016877637130801686, "grad_norm": 0.9380351901054382, "learning_rate": 0.0015, "loss": 3.1574, "step": 160 }, { "epoch": 0.017932489451476793, "grad_norm": 0.6966649889945984, "learning_rate": 0.0015, "loss": 3.0937, "step": 170 }, { "epoch": 0.0189873417721519, "grad_norm": 0.7090355157852173, "learning_rate": 0.0015, "loss": 3.0498, "step": 180 }, { "epoch": 0.020042194092827006, "grad_norm": 0.7136836647987366, "learning_rate": 0.0015, "loss": 2.9903, "step": 190 }, { "epoch": 0.02109704641350211, "grad_norm": 0.8697147369384766, "learning_rate": 0.0015, "loss": 2.9377, "step": 200 }, { "epoch": 0.022151898734177215, "grad_norm": 0.7215303182601929, "learning_rate": 0.0015, "loss": 2.9076, "step": 210 }, { "epoch": 0.023206751054852322, "grad_norm": 0.8969533443450928, "learning_rate": 0.0015, "loss": 2.8731, "step": 220 }, { "epoch": 0.024261603375527425, "grad_norm": 0.6747064590454102, "learning_rate": 0.0015, "loss": 2.8266, "step": 230 }, { "epoch": 0.02531645569620253, "grad_norm": 0.9318302273750305, "learning_rate": 0.0015, "loss": 2.7889, "step": 240 }, { "epoch": 0.026371308016877638, "grad_norm": 0.71550452709198, "learning_rate": 0.0015, "loss": 2.7549, "step": 250 }, { "epoch": 0.027426160337552744, "grad_norm": 0.7597529292106628, "learning_rate": 0.0015, "loss": 2.7197, "step": 260 }, { "epoch": 0.028481012658227847, "grad_norm": 0.7744061946868896, "learning_rate": 0.0015, "loss": 2.6953, "step": 270 }, { "epoch": 0.029535864978902954, "grad_norm": 0.8805778622627258, "learning_rate": 0.0015, "loss": 2.6567, "step": 280 }, { "epoch": 0.03059071729957806, "grad_norm": 0.8690669536590576, "learning_rate": 0.0015, "loss": 2.6336, "step": 290 }, { "epoch": 0.03164556962025317, "grad_norm": 1.0534539222717285, "learning_rate": 0.0015, "loss": 2.6088, "step": 300 }, 
{ "epoch": 0.03270042194092827, "grad_norm": 0.9530712366104126, "learning_rate": 0.0015, "loss": 2.5892, "step": 310 }, { "epoch": 0.03375527426160337, "grad_norm": 1.4720412492752075, "learning_rate": 0.0015, "loss": 2.5548, "step": 320 }, { "epoch": 0.03481012658227848, "grad_norm": 0.9948314428329468, "learning_rate": 0.0015, "loss": 2.5482, "step": 330 }, { "epoch": 0.035864978902953586, "grad_norm": 0.9229395985603333, "learning_rate": 0.0015, "loss": 2.5241, "step": 340 }, { "epoch": 0.03691983122362869, "grad_norm": 1.1244056224822998, "learning_rate": 0.0015, "loss": 2.4848, "step": 350 }, { "epoch": 0.0379746835443038, "grad_norm": 0.7936338186264038, "learning_rate": 0.0015, "loss": 2.4668, "step": 360 }, { "epoch": 0.039029535864978905, "grad_norm": 0.9502090215682983, "learning_rate": 0.0015, "loss": 2.4538, "step": 370 }, { "epoch": 0.04008438818565401, "grad_norm": 0.7836471796035767, "learning_rate": 0.0015, "loss": 2.445, "step": 380 }, { "epoch": 0.04113924050632911, "grad_norm": 0.8590785264968872, "learning_rate": 0.0015, "loss": 2.4195, "step": 390 }, { "epoch": 0.04219409282700422, "grad_norm": 1.0463407039642334, "learning_rate": 0.0015, "loss": 2.3997, "step": 400 }, { "epoch": 0.043248945147679324, "grad_norm": 1.039176344871521, "learning_rate": 0.0015, "loss": 2.3899, "step": 410 }, { "epoch": 0.04430379746835443, "grad_norm": 0.8505009412765503, "learning_rate": 0.0015, "loss": 2.3748, "step": 420 }, { "epoch": 0.04535864978902954, "grad_norm": 0.8631711006164551, "learning_rate": 0.0015, "loss": 2.3399, "step": 430 }, { "epoch": 0.046413502109704644, "grad_norm": 0.9587793946266174, "learning_rate": 0.0015, "loss": 2.326, "step": 440 }, { "epoch": 0.04746835443037975, "grad_norm": 0.7890628576278687, "learning_rate": 0.0015, "loss": 2.3267, "step": 450 }, { "epoch": 0.04852320675105485, "grad_norm": 0.763205349445343, "learning_rate": 0.0015, "loss": 2.3002, "step": 460 }, { "epoch": 0.049578059071729956, "grad_norm": 
0.7925857901573181, "learning_rate": 0.0015, "loss": 2.2826, "step": 470 }, { "epoch": 0.05063291139240506, "grad_norm": 0.7465478181838989, "learning_rate": 0.0015, "loss": 2.2844, "step": 480 }, { "epoch": 0.05168776371308017, "grad_norm": 0.9607719779014587, "learning_rate": 0.0015, "loss": 2.2541, "step": 490 }, { "epoch": 0.052742616033755275, "grad_norm": 0.7803492546081543, "learning_rate": 0.0015, "loss": 2.2435, "step": 500 }, { "epoch": 0.05379746835443038, "grad_norm": 0.9835359454154968, "learning_rate": 0.0015, "loss": 2.2327, "step": 510 }, { "epoch": 0.05485232067510549, "grad_norm": 1.1225404739379883, "learning_rate": 0.0015, "loss": 2.2231, "step": 520 }, { "epoch": 0.05590717299578059, "grad_norm": 0.8247981667518616, "learning_rate": 0.0015, "loss": 2.2064, "step": 530 }, { "epoch": 0.056962025316455694, "grad_norm": 0.8044012188911438, "learning_rate": 0.0015, "loss": 2.1905, "step": 540 }, { "epoch": 0.0580168776371308, "grad_norm": 0.8903504610061646, "learning_rate": 0.0015, "loss": 2.1921, "step": 550 }, { "epoch": 0.05907172995780591, "grad_norm": 0.8271088600158691, "learning_rate": 0.0015, "loss": 2.1587, "step": 560 }, { "epoch": 0.060126582278481014, "grad_norm": 0.6447353363037109, "learning_rate": 0.0015, "loss": 2.1608, "step": 570 }, { "epoch": 0.06118143459915612, "grad_norm": 1.0156145095825195, "learning_rate": 0.0015, "loss": 2.162, "step": 580 }, { "epoch": 0.06223628691983123, "grad_norm": 0.8628018498420715, "learning_rate": 0.0015, "loss": 2.1352, "step": 590 }, { "epoch": 0.06329113924050633, "grad_norm": 0.7424734830856323, "learning_rate": 0.0015, "loss": 2.1174, "step": 600 }, { "epoch": 0.06434599156118144, "grad_norm": 0.8701589107513428, "learning_rate": 0.0015, "loss": 2.1179, "step": 610 }, { "epoch": 0.06540084388185655, "grad_norm": 0.76248699426651, "learning_rate": 0.0015, "loss": 2.1089, "step": 620 }, { "epoch": 0.06645569620253164, "grad_norm": 0.8957002758979797, "learning_rate": 0.0015, "loss": 2.0911, 
"step": 630 }, { "epoch": 0.06751054852320675, "grad_norm": 0.9295898675918579, "learning_rate": 0.0015, "loss": 2.1067, "step": 640 }, { "epoch": 0.06856540084388185, "grad_norm": 0.792677104473114, "learning_rate": 0.0015, "loss": 2.0931, "step": 650 }, { "epoch": 0.06962025316455696, "grad_norm": 0.877686083316803, "learning_rate": 0.0015, "loss": 2.083, "step": 660 }, { "epoch": 0.07067510548523206, "grad_norm": 0.8630133271217346, "learning_rate": 0.0015, "loss": 2.0581, "step": 670 }, { "epoch": 0.07172995780590717, "grad_norm": 0.8127005696296692, "learning_rate": 0.0015, "loss": 2.0547, "step": 680 }, { "epoch": 0.07278481012658228, "grad_norm": 0.7369611263275146, "learning_rate": 0.0015, "loss": 2.0627, "step": 690 }, { "epoch": 0.07383966244725738, "grad_norm": 0.9251187443733215, "learning_rate": 0.0015, "loss": 2.0457, "step": 700 }, { "epoch": 0.07489451476793249, "grad_norm": 0.752349853515625, "learning_rate": 0.0015, "loss": 2.0381, "step": 710 }, { "epoch": 0.0759493670886076, "grad_norm": 0.7315734624862671, "learning_rate": 0.0015, "loss": 2.0194, "step": 720 }, { "epoch": 0.0770042194092827, "grad_norm": 0.926827609539032, "learning_rate": 0.0015, "loss": 2.0147, "step": 730 }, { "epoch": 0.07805907172995781, "grad_norm": 0.8080822229385376, "learning_rate": 0.0015, "loss": 2.0244, "step": 740 }, { "epoch": 0.07911392405063292, "grad_norm": 0.8982332348823547, "learning_rate": 0.0015, "loss": 2.001, "step": 750 }, { "epoch": 0.08016877637130802, "grad_norm": 1.2816462516784668, "learning_rate": 0.0015, "loss": 2.0012, "step": 760 }, { "epoch": 0.08122362869198312, "grad_norm": 0.6519502997398376, "learning_rate": 0.0015, "loss": 1.9916, "step": 770 }, { "epoch": 0.08227848101265822, "grad_norm": 0.7490759491920471, "learning_rate": 0.0015, "loss": 1.9894, "step": 780 }, { "epoch": 0.08333333333333333, "grad_norm": 0.8788653612136841, "learning_rate": 0.0015, "loss": 1.9916, "step": 790 }, { "epoch": 0.08438818565400844, "grad_norm": 
0.837259829044342, "learning_rate": 0.0015, "loss": 1.9636, "step": 800 }, { "epoch": 0.08544303797468354, "grad_norm": 0.9153476357460022, "learning_rate": 0.0015, "loss": 1.9544, "step": 810 }, { "epoch": 0.08649789029535865, "grad_norm": 0.7409590482711792, "learning_rate": 0.0015, "loss": 1.9623, "step": 820 }, { "epoch": 0.08755274261603375, "grad_norm": 0.8565800786018372, "learning_rate": 0.0015, "loss": 1.9658, "step": 830 }, { "epoch": 0.08860759493670886, "grad_norm": 0.9489105939865112, "learning_rate": 0.0015, "loss": 1.9448, "step": 840 }, { "epoch": 0.08966244725738397, "grad_norm": 0.6747156381607056, "learning_rate": 0.0015, "loss": 1.9454, "step": 850 }, { "epoch": 0.09071729957805907, "grad_norm": 0.73032146692276, "learning_rate": 0.0015, "loss": 1.9453, "step": 860 }, { "epoch": 0.09177215189873418, "grad_norm": 1.097557544708252, "learning_rate": 0.0015, "loss": 1.9388, "step": 870 }, { "epoch": 0.09282700421940929, "grad_norm": 0.7871012091636658, "learning_rate": 0.0015, "loss": 1.9311, "step": 880 }, { "epoch": 0.0938818565400844, "grad_norm": 0.9786339998245239, "learning_rate": 0.0015, "loss": 1.9229, "step": 890 }, { "epoch": 0.0949367088607595, "grad_norm": 1.0217005014419556, "learning_rate": 0.0015, "loss": 1.9212, "step": 900 }, { "epoch": 0.09599156118143459, "grad_norm": 0.6946851015090942, "learning_rate": 0.0015, "loss": 1.9254, "step": 910 }, { "epoch": 0.0970464135021097, "grad_norm": 0.6087393760681152, "learning_rate": 0.0015, "loss": 1.914, "step": 920 }, { "epoch": 0.0981012658227848, "grad_norm": 0.9960092902183533, "learning_rate": 0.0015, "loss": 1.897, "step": 930 }, { "epoch": 0.09915611814345991, "grad_norm": 0.7217803001403809, "learning_rate": 0.0015, "loss": 1.9084, "step": 940 }, { "epoch": 0.10021097046413502, "grad_norm": 0.8259333372116089, "learning_rate": 0.0015, "loss": 1.8858, "step": 950 }, { "epoch": 0.10126582278481013, "grad_norm": 0.7509357333183289, "learning_rate": 0.0015, "loss": 1.8917, "step": 960 
}, { "epoch": 0.10232067510548523, "grad_norm": 0.8337006568908691, "learning_rate": 0.0015, "loss": 1.8975, "step": 970 }, { "epoch": 0.10337552742616034, "grad_norm": 0.7414672374725342, "learning_rate": 0.0015, "loss": 1.876, "step": 980 }, { "epoch": 0.10443037974683544, "grad_norm": 1.2368232011795044, "learning_rate": 0.0015, "loss": 1.8823, "step": 990 }, { "epoch": 0.10548523206751055, "grad_norm": 0.7442910075187683, "learning_rate": 0.0015, "loss": 1.879, "step": 1000 }, { "epoch": 0.10654008438818566, "grad_norm": 0.7759189605712891, "learning_rate": 0.0015, "loss": 1.8702, "step": 1010 }, { "epoch": 0.10759493670886076, "grad_norm": 0.9012283682823181, "learning_rate": 0.0015, "loss": 1.8629, "step": 1020 }, { "epoch": 0.10864978902953587, "grad_norm": 0.8135190606117249, "learning_rate": 0.0015, "loss": 1.8652, "step": 1030 }, { "epoch": 0.10970464135021098, "grad_norm": 0.6859939694404602, "learning_rate": 0.0015, "loss": 1.858, "step": 1040 }, { "epoch": 0.11075949367088607, "grad_norm": 1.2869263887405396, "learning_rate": 0.0015, "loss": 1.8613, "step": 1050 }, { "epoch": 0.11181434599156118, "grad_norm": 0.7284495830535889, "learning_rate": 0.0015, "loss": 1.8531, "step": 1060 }, { "epoch": 0.11286919831223628, "grad_norm": 0.8010578751564026, "learning_rate": 0.0015, "loss": 1.8417, "step": 1070 }, { "epoch": 0.11392405063291139, "grad_norm": 1.0110236406326294, "learning_rate": 0.0015, "loss": 1.841, "step": 1080 }, { "epoch": 0.1149789029535865, "grad_norm": 0.6968798637390137, "learning_rate": 0.0015, "loss": 1.8465, "step": 1090 }, { "epoch": 0.1160337552742616, "grad_norm": 0.6804802417755127, "learning_rate": 0.0015, "loss": 1.8316, "step": 1100 }, { "epoch": 0.11708860759493671, "grad_norm": 0.6878808736801147, "learning_rate": 0.0015, "loss": 1.8405, "step": 1110 }, { "epoch": 0.11814345991561181, "grad_norm": 0.7354303002357483, "learning_rate": 0.0015, "loss": 1.8306, "step": 1120 }, { "epoch": 0.11919831223628692, "grad_norm": 
0.9895655512809753, "learning_rate": 0.0015, "loss": 1.8165, "step": 1130 }, { "epoch": 0.12025316455696203, "grad_norm": 0.8318318724632263, "learning_rate": 0.0015, "loss": 1.8319, "step": 1140 }, { "epoch": 0.12130801687763713, "grad_norm": 1.169092059135437, "learning_rate": 0.0015, "loss": 1.8156, "step": 1150 }, { "epoch": 0.12236286919831224, "grad_norm": 0.9082102179527283, "learning_rate": 0.0015, "loss": 1.8278, "step": 1160 }, { "epoch": 0.12341772151898735, "grad_norm": 0.8206037878990173, "learning_rate": 0.0015, "loss": 1.8203, "step": 1170 }, { "epoch": 0.12447257383966245, "grad_norm": 0.6738736033439636, "learning_rate": 0.0015, "loss": 1.7983, "step": 1180 }, { "epoch": 0.12552742616033755, "grad_norm": 0.6992834210395813, "learning_rate": 0.0015, "loss": 1.8003, "step": 1190 }, { "epoch": 0.12658227848101267, "grad_norm": 0.9246848821640015, "learning_rate": 0.0015, "loss": 1.8136, "step": 1200 }, { "epoch": 0.12763713080168776, "grad_norm": 0.7774592041969299, "learning_rate": 0.0015, "loss": 1.8062, "step": 1210 }, { "epoch": 0.12869198312236288, "grad_norm": 0.6720554232597351, "learning_rate": 0.0015, "loss": 1.7889, "step": 1220 }, { "epoch": 0.12974683544303797, "grad_norm": 0.6804599761962891, "learning_rate": 0.0015, "loss": 1.78, "step": 1230 }, { "epoch": 0.1308016877637131, "grad_norm": 0.6456968188285828, "learning_rate": 0.0015, "loss": 1.7974, "step": 1240 }, { "epoch": 0.13185654008438819, "grad_norm": 0.6627588272094727, "learning_rate": 0.0015, "loss": 1.7982, "step": 1250 }, { "epoch": 0.13291139240506328, "grad_norm": 0.8834444284439087, "learning_rate": 0.0015, "loss": 1.781, "step": 1260 }, { "epoch": 0.1339662447257384, "grad_norm": 0.7322362065315247, "learning_rate": 0.0015, "loss": 1.7838, "step": 1270 }, { "epoch": 0.1350210970464135, "grad_norm": 0.9749155640602112, "learning_rate": 0.0015, "loss": 1.8028, "step": 1280 }, { "epoch": 0.1360759493670886, "grad_norm": 0.7019423842430115, "learning_rate": 0.0015, "loss": 
1.7743, "step": 1290 }, { "epoch": 0.1371308016877637, "grad_norm": 0.686527669429779, "learning_rate": 0.0015, "loss": 1.7653, "step": 1300 }, { "epoch": 0.13818565400843882, "grad_norm": 0.6187251806259155, "learning_rate": 0.0015, "loss": 1.7691, "step": 1310 }, { "epoch": 0.13924050632911392, "grad_norm": 0.9021650552749634, "learning_rate": 0.0015, "loss": 1.7711, "step": 1320 }, { "epoch": 0.14029535864978904, "grad_norm": 0.9765862822532654, "learning_rate": 0.0015, "loss": 1.7638, "step": 1330 }, { "epoch": 0.14135021097046413, "grad_norm": 0.7598349452018738, "learning_rate": 0.0015, "loss": 1.7652, "step": 1340 }, { "epoch": 0.14240506329113925, "grad_norm": 0.7436577677726746, "learning_rate": 0.0015, "loss": 1.77, "step": 1350 }, { "epoch": 0.14345991561181434, "grad_norm": 0.7047667503356934, "learning_rate": 0.0015, "loss": 1.76, "step": 1360 }, { "epoch": 0.14451476793248946, "grad_norm": 1.006347417831421, "learning_rate": 0.0015, "loss": 1.7616, "step": 1370 }, { "epoch": 0.14556962025316456, "grad_norm": 0.7401061058044434, "learning_rate": 0.0015, "loss": 1.7485, "step": 1380 }, { "epoch": 0.14662447257383968, "grad_norm": 1.2228999137878418, "learning_rate": 0.0015, "loss": 1.7608, "step": 1390 }, { "epoch": 0.14767932489451477, "grad_norm": 0.6520518660545349, "learning_rate": 0.0015, "loss": 1.7426, "step": 1400 }, { "epoch": 0.14873417721518986, "grad_norm": 0.8179234266281128, "learning_rate": 0.0015, "loss": 1.7431, "step": 1410 }, { "epoch": 0.14978902953586498, "grad_norm": 0.6742578148841858, "learning_rate": 0.0015, "loss": 1.7486, "step": 1420 }, { "epoch": 0.15084388185654007, "grad_norm": 0.6870294809341431, "learning_rate": 0.0015, "loss": 1.7452, "step": 1430 }, { "epoch": 0.1518987341772152, "grad_norm": 0.7341146469116211, "learning_rate": 0.0015, "loss": 1.7434, "step": 1440 }, { "epoch": 0.1529535864978903, "grad_norm": 0.6733317375183105, "learning_rate": 0.0015, "loss": 1.7385, "step": 1450 }, { "epoch": 0.1540084388185654, 
"grad_norm": 0.6803165078163147, "learning_rate": 0.0015, "loss": 1.7352, "step": 1460 }, { "epoch": 0.1550632911392405, "grad_norm": 1.4749171733856201, "learning_rate": 0.0015, "loss": 1.7365, "step": 1470 }, { "epoch": 0.15611814345991562, "grad_norm": 1.347564935684204, "learning_rate": 0.0015, "loss": 1.7489, "step": 1480 }, { "epoch": 0.1571729957805907, "grad_norm": 0.7148910164833069, "learning_rate": 0.0015, "loss": 1.7339, "step": 1490 }, { "epoch": 0.15822784810126583, "grad_norm": 0.8039283752441406, "learning_rate": 0.0015, "loss": 1.7286, "step": 1500 }, { "epoch": 0.15928270042194093, "grad_norm": 1.3167616128921509, "learning_rate": 0.0015, "loss": 1.7352, "step": 1510 }, { "epoch": 0.16033755274261605, "grad_norm": 1.0010050535202026, "learning_rate": 0.0015, "loss": 1.7346, "step": 1520 }, { "epoch": 0.16139240506329114, "grad_norm": 0.6768319606781006, "learning_rate": 0.0015, "loss": 1.7145, "step": 1530 }, { "epoch": 0.16244725738396623, "grad_norm": 0.6522865891456604, "learning_rate": 0.0015, "loss": 1.7039, "step": 1540 }, { "epoch": 0.16350210970464135, "grad_norm": 0.9729596376419067, "learning_rate": 0.0015, "loss": 1.7063, "step": 1550 }, { "epoch": 0.16455696202531644, "grad_norm": 0.7230920791625977, "learning_rate": 0.0015, "loss": 1.7116, "step": 1560 }, { "epoch": 0.16561181434599156, "grad_norm": 0.7281738519668579, "learning_rate": 0.0015, "loss": 1.7023, "step": 1570 }, { "epoch": 0.16666666666666666, "grad_norm": 0.8933695554733276, "learning_rate": 0.0015, "loss": 1.7135, "step": 1580 }, { "epoch": 0.16772151898734178, "grad_norm": 0.8795140981674194, "learning_rate": 0.0015, "loss": 1.7093, "step": 1590 }, { "epoch": 0.16877637130801687, "grad_norm": 0.702780544757843, "learning_rate": 0.0015, "loss": 1.7153, "step": 1600 }, { "epoch": 0.169831223628692, "grad_norm": 0.6340116858482361, "learning_rate": 0.0015, "loss": 1.7115, "step": 1610 }, { "epoch": 0.17088607594936708, "grad_norm": 0.6516766548156738, "learning_rate": 
0.0015, "loss": 1.6822, "step": 1620 }, { "epoch": 0.1719409282700422, "grad_norm": 0.7751317620277405, "learning_rate": 0.0015, "loss": 1.6931, "step": 1630 }, { "epoch": 0.1729957805907173, "grad_norm": 0.8222350478172302, "learning_rate": 0.0015, "loss": 1.7007, "step": 1640 }, { "epoch": 0.17405063291139242, "grad_norm": 0.9241092205047607, "learning_rate": 0.0015, "loss": 1.6929, "step": 1650 }, { "epoch": 0.1751054852320675, "grad_norm": 0.758102536201477, "learning_rate": 0.0015, "loss": 1.6892, "step": 1660 }, { "epoch": 0.17616033755274263, "grad_norm": 0.742407500743866, "learning_rate": 0.0015, "loss": 1.7018, "step": 1670 }, { "epoch": 0.17721518987341772, "grad_norm": 1.2279940843582153, "learning_rate": 0.0015, "loss": 1.6859, "step": 1680 }, { "epoch": 0.17827004219409281, "grad_norm": 0.8028030395507812, "learning_rate": 0.0015, "loss": 1.6828, "step": 1690 }, { "epoch": 0.17932489451476794, "grad_norm": 0.9893059730529785, "learning_rate": 0.0015, "loss": 1.6995, "step": 1700 }, { "epoch": 0.18037974683544303, "grad_norm": 0.623496949672699, "learning_rate": 0.0015, "loss": 1.6756, "step": 1710 }, { "epoch": 0.18143459915611815, "grad_norm": 0.654557466506958, "learning_rate": 0.0015, "loss": 1.6716, "step": 1720 }, { "epoch": 0.18248945147679324, "grad_norm": 0.6914863586425781, "learning_rate": 0.0015, "loss": 1.6772, "step": 1730 }, { "epoch": 0.18354430379746836, "grad_norm": 0.68093341588974, "learning_rate": 0.0015, "loss": 1.6788, "step": 1740 }, { "epoch": 0.18459915611814345, "grad_norm": 0.6601630449295044, "learning_rate": 0.0015, "loss": 1.6985, "step": 1750 }, { "epoch": 0.18565400843881857, "grad_norm": 0.677162766456604, "learning_rate": 0.0015, "loss": 1.6771, "step": 1760 }, { "epoch": 0.18670886075949367, "grad_norm": 0.6156236529350281, "learning_rate": 0.0015, "loss": 1.6796, "step": 1770 }, { "epoch": 0.1877637130801688, "grad_norm": 0.7677043676376343, "learning_rate": 0.0015, "loss": 1.6823, "step": 1780 }, { "epoch": 
0.18881856540084388, "grad_norm": 0.7379671335220337, "learning_rate": 0.0015, "loss": 1.6842, "step": 1790 }, { "epoch": 0.189873417721519, "grad_norm": 0.6247692108154297, "learning_rate": 0.0015, "loss": 1.6766, "step": 1800 }, { "epoch": 0.1909282700421941, "grad_norm": 0.9126911759376526, "learning_rate": 0.0015, "loss": 1.6644, "step": 1810 }, { "epoch": 0.19198312236286919, "grad_norm": 0.6720184087753296, "learning_rate": 0.0015, "loss": 1.6601, "step": 1820 }, { "epoch": 0.1930379746835443, "grad_norm": 0.7707266211509705, "learning_rate": 0.0015, "loss": 1.6621, "step": 1830 }, { "epoch": 0.1940928270042194, "grad_norm": 0.5960696935653687, "learning_rate": 0.0015, "loss": 1.6582, "step": 1840 }, { "epoch": 0.19514767932489452, "grad_norm": 0.9515610933303833, "learning_rate": 0.0015, "loss": 1.6641, "step": 1850 }, { "epoch": 0.1962025316455696, "grad_norm": 0.6111634969711304, "learning_rate": 0.0015, "loss": 1.6679, "step": 1860 }, { "epoch": 0.19725738396624473, "grad_norm": 0.7194884419441223, "learning_rate": 0.0015, "loss": 1.6666, "step": 1870 }, { "epoch": 0.19831223628691982, "grad_norm": 0.6791244149208069, "learning_rate": 0.0015, "loss": 1.6587, "step": 1880 }, { "epoch": 0.19936708860759494, "grad_norm": 0.8795470595359802, "learning_rate": 0.0015, "loss": 1.662, "step": 1890 }, { "epoch": 0.20042194092827004, "grad_norm": 0.6828414797782898, "learning_rate": 0.0015, "loss": 1.6587, "step": 1900 }, { "epoch": 0.20147679324894516, "grad_norm": 0.8992465138435364, "learning_rate": 0.0015, "loss": 1.6628, "step": 1910 }, { "epoch": 0.20253164556962025, "grad_norm": 0.6946663856506348, "learning_rate": 0.0015, "loss": 1.6554, "step": 1920 }, { "epoch": 0.20358649789029537, "grad_norm": 0.765633761882782, "learning_rate": 0.0015, "loss": 1.651, "step": 1930 }, { "epoch": 0.20464135021097046, "grad_norm": 0.6072712540626526, "learning_rate": 0.0015, "loss": 1.6436, "step": 1940 }, { "epoch": 0.20569620253164558, "grad_norm": 0.6243547201156616, 
"learning_rate": 0.0015, "loss": 1.6451, "step": 1950 }, { "epoch": 0.20675105485232068, "grad_norm": 0.602359414100647, "learning_rate": 0.0015, "loss": 1.6494, "step": 1960 }, { "epoch": 0.20780590717299577, "grad_norm": 0.6976801753044128, "learning_rate": 0.0015, "loss": 1.6617, "step": 1970 }, { "epoch": 0.2088607594936709, "grad_norm": 0.7268016934394836, "learning_rate": 0.0015, "loss": 1.642, "step": 1980 }, { "epoch": 0.20991561181434598, "grad_norm": 0.6199648380279541, "learning_rate": 0.0015, "loss": 1.6393, "step": 1990 }, { "epoch": 0.2109704641350211, "grad_norm": 0.8665949702262878, "learning_rate": 0.0015, "loss": 1.6367, "step": 2000 }, { "epoch": 0.2120253164556962, "grad_norm": 0.6769253611564636, "learning_rate": 0.0015, "loss": 1.6474, "step": 2010 }, { "epoch": 0.21308016877637131, "grad_norm": 0.9666388630867004, "learning_rate": 0.0015, "loss": 1.657, "step": 2020 }, { "epoch": 0.2141350210970464, "grad_norm": 0.6327275633811951, "learning_rate": 0.0015, "loss": 1.6425, "step": 2030 }, { "epoch": 0.21518987341772153, "grad_norm": 0.6882089376449585, "learning_rate": 0.0015, "loss": 1.639, "step": 2040 }, { "epoch": 0.21624472573839662, "grad_norm": 0.7690213918685913, "learning_rate": 0.0015, "loss": 1.64, "step": 2050 }, { "epoch": 0.21729957805907174, "grad_norm": 0.6751629710197449, "learning_rate": 0.0015, "loss": 1.6321, "step": 2060 }, { "epoch": 0.21835443037974683, "grad_norm": 1.079460859298706, "learning_rate": 0.0015, "loss": 1.633, "step": 2070 }, { "epoch": 0.21940928270042195, "grad_norm": 1.1796042919158936, "learning_rate": 0.0015, "loss": 1.6458, "step": 2080 }, { "epoch": 0.22046413502109705, "grad_norm": 0.6678057312965393, "learning_rate": 0.0015, "loss": 1.6244, "step": 2090 }, { "epoch": 0.22151898734177214, "grad_norm": 0.7857685089111328, "learning_rate": 0.0015, "loss": 1.6326, "step": 2100 }, { "epoch": 0.22257383966244726, "grad_norm": 0.7463678121566772, "learning_rate": 0.0015, "loss": 1.6266, "step": 2110 }, { 
"epoch": 0.22362869198312235, "grad_norm": 0.6068580746650696, "learning_rate": 0.0015, "loss": 1.6228, "step": 2120 }, { "epoch": 0.22468354430379747, "grad_norm": 0.6354256272315979, "learning_rate": 0.0015, "loss": 1.6263, "step": 2130 }, { "epoch": 0.22573839662447256, "grad_norm": 0.6060561537742615, "learning_rate": 0.0015, "loss": 1.6196, "step": 2140 }, { "epoch": 0.22679324894514769, "grad_norm": 0.9400723576545715, "learning_rate": 0.0015, "loss": 1.6286, "step": 2150 }, { "epoch": 0.22784810126582278, "grad_norm": 1.2904577255249023, "learning_rate": 0.0015, "loss": 1.6442, "step": 2160 }, { "epoch": 0.2289029535864979, "grad_norm": 1.4418649673461914, "learning_rate": 0.0015, "loss": 1.6288, "step": 2170 }, { "epoch": 0.229957805907173, "grad_norm": 1.1155215501785278, "learning_rate": 0.0015, "loss": 1.6123, "step": 2180 }, { "epoch": 0.2310126582278481, "grad_norm": 0.7112396359443665, "learning_rate": 0.0015, "loss": 1.6179, "step": 2190 }, { "epoch": 0.2320675105485232, "grad_norm": 0.6585453152656555, "learning_rate": 0.0015, "loss": 1.621, "step": 2200 }, { "epoch": 0.23312236286919832, "grad_norm": 0.6442099213600159, "learning_rate": 0.0015, "loss": 1.6067, "step": 2210 }, { "epoch": 0.23417721518987342, "grad_norm": 0.5695006251335144, "learning_rate": 0.0015, "loss": 1.6236, "step": 2220 }, { "epoch": 0.23523206751054854, "grad_norm": 0.6747961044311523, "learning_rate": 0.0015, "loss": 1.6154, "step": 2230 }, { "epoch": 0.23628691983122363, "grad_norm": 0.809987485408783, "learning_rate": 0.0015, "loss": 1.6164, "step": 2240 }, { "epoch": 0.23734177215189872, "grad_norm": 0.7946650385856628, "learning_rate": 0.0015, "loss": 1.6143, "step": 2250 }, { "epoch": 0.23839662447257384, "grad_norm": 0.8359740376472473, "learning_rate": 0.0015, "loss": 1.6009, "step": 2260 }, { "epoch": 0.23945147679324894, "grad_norm": 0.756756603717804, "learning_rate": 0.0015, "loss": 1.6346, "step": 2270 }, { "epoch": 0.24050632911392406, "grad_norm": 
0.9783498048782349, "learning_rate": 0.0015, "loss": 1.6222, "step": 2280 }, { "epoch": 0.24156118143459915, "grad_norm": 0.6283764243125916, "learning_rate": 0.0015, "loss": 1.608, "step": 2290 }, { "epoch": 0.24261603375527427, "grad_norm": 0.6477097272872925, "learning_rate": 0.0015, "loss": 1.6049, "step": 2300 }, { "epoch": 0.24367088607594936, "grad_norm": 0.5955271124839783, "learning_rate": 0.0015, "loss": 1.6075, "step": 2310 }, { "epoch": 0.24472573839662448, "grad_norm": 0.7311010360717773, "learning_rate": 0.0015, "loss": 1.6151, "step": 2320 }, { "epoch": 0.24578059071729957, "grad_norm": 0.8355544209480286, "learning_rate": 0.0015, "loss": 1.5992, "step": 2330 }, { "epoch": 0.2468354430379747, "grad_norm": 0.8351632952690125, "learning_rate": 0.0015, "loss": 1.5954, "step": 2340 }, { "epoch": 0.2478902953586498, "grad_norm": 0.6420930027961731, "learning_rate": 0.0015, "loss": 1.5986, "step": 2350 }, { "epoch": 0.2489451476793249, "grad_norm": 0.6042001247406006, "learning_rate": 0.0015, "loss": 1.6036, "step": 2360 }, { "epoch": 0.25, "grad_norm": 0.7402383089065552, "learning_rate": 0.0015, "loss": 1.6031, "step": 2370 }, { "epoch": 0.2510548523206751, "grad_norm": 0.6115971207618713, "learning_rate": 0.0015, "loss": 1.6009, "step": 2380 }, { "epoch": 0.2521097046413502, "grad_norm": 0.6082727909088135, "learning_rate": 0.0015, "loss": 1.6004, "step": 2390 }, { "epoch": 0.25316455696202533, "grad_norm": 0.6307869553565979, "learning_rate": 0.0015, "loss": 1.6019, "step": 2400 }, { "epoch": 0.2542194092827004, "grad_norm": 0.6242306232452393, "learning_rate": 0.0015, "loss": 1.6032, "step": 2410 }, { "epoch": 0.2552742616033755, "grad_norm": 0.7853202223777771, "learning_rate": 0.0015, "loss": 1.5959, "step": 2420 }, { "epoch": 0.2563291139240506, "grad_norm": 0.6395176649093628, "learning_rate": 0.0015, "loss": 1.5949, "step": 2430 }, { "epoch": 0.25738396624472576, "grad_norm": 0.6518899202346802, "learning_rate": 0.0015, "loss": 1.6045, "step": 
2440 }, { "epoch": 0.25843881856540085, "grad_norm": 0.575395941734314, "learning_rate": 0.0015, "loss": 1.5894, "step": 2450 }, { "epoch": 0.25949367088607594, "grad_norm": 0.6950863599777222, "learning_rate": 0.0015, "loss": 1.6033, "step": 2460 }, { "epoch": 0.26054852320675104, "grad_norm": 0.952235996723175, "learning_rate": 0.0015, "loss": 1.5943, "step": 2470 }, { "epoch": 0.2616033755274262, "grad_norm": 0.6923096179962158, "learning_rate": 0.0015, "loss": 1.5995, "step": 2480 }, { "epoch": 0.2626582278481013, "grad_norm": 0.6995301246643066, "learning_rate": 0.0015, "loss": 1.5874, "step": 2490 }, { "epoch": 0.26371308016877637, "grad_norm": 0.6522321701049805, "learning_rate": 0.0015, "loss": 1.5907, "step": 2500 }, { "epoch": 0.26476793248945146, "grad_norm": 0.5644729137420654, "learning_rate": 0.0015, "loss": 1.5911, "step": 2510 }, { "epoch": 0.26582278481012656, "grad_norm": 0.7484664916992188, "learning_rate": 0.0015, "loss": 1.5943, "step": 2520 }, { "epoch": 0.2668776371308017, "grad_norm": 0.6378340125083923, "learning_rate": 0.0015, "loss": 1.5892, "step": 2530 }, { "epoch": 0.2679324894514768, "grad_norm": 0.9534720778465271, "learning_rate": 0.0015, "loss": 1.5907, "step": 2540 }, { "epoch": 0.2689873417721519, "grad_norm": 0.6464568972587585, "learning_rate": 0.0015, "loss": 1.5843, "step": 2550 }, { "epoch": 0.270042194092827, "grad_norm": 0.8279476761817932, "learning_rate": 0.0015, "loss": 1.5937, "step": 2560 }, { "epoch": 0.27109704641350213, "grad_norm": 0.7225745320320129, "learning_rate": 0.0015, "loss": 1.5865, "step": 2570 }, { "epoch": 0.2721518987341772, "grad_norm": 0.6332727670669556, "learning_rate": 0.0015, "loss": 1.5872, "step": 2580 }, { "epoch": 0.2732067510548523, "grad_norm": 0.5766628384590149, "learning_rate": 0.0015, "loss": 1.5933, "step": 2590 }, { "epoch": 0.2742616033755274, "grad_norm": 0.6092755794525146, "learning_rate": 0.0015, "loss": 1.5898, "step": 2600 }, { "epoch": 0.27531645569620256, "grad_norm": 
0.7969714999198914, "learning_rate": 0.0015, "loss": 1.588, "step": 2610 }, { "epoch": 0.27637130801687765, "grad_norm": 0.6936610341072083, "learning_rate": 0.0015, "loss": 1.5855, "step": 2620 }, { "epoch": 0.27742616033755274, "grad_norm": 0.6494637727737427, "learning_rate": 0.0015, "loss": 1.5805, "step": 2630 }, { "epoch": 0.27848101265822783, "grad_norm": 1.3193278312683105, "learning_rate": 0.0015, "loss": 1.5948, "step": 2640 }, { "epoch": 0.2795358649789029, "grad_norm": 0.6159821152687073, "learning_rate": 0.0015, "loss": 1.5799, "step": 2650 }, { "epoch": 0.2805907172995781, "grad_norm": 0.5923860669136047, "learning_rate": 0.0015, "loss": 1.5702, "step": 2660 }, { "epoch": 0.28164556962025317, "grad_norm": 0.626798152923584, "learning_rate": 0.0015, "loss": 1.5768, "step": 2670 }, { "epoch": 0.28270042194092826, "grad_norm": 0.5723259449005127, "learning_rate": 0.0015, "loss": 1.5735, "step": 2680 }, { "epoch": 0.28375527426160335, "grad_norm": 0.6969932913780212, "learning_rate": 0.0015, "loss": 1.5726, "step": 2690 }, { "epoch": 0.2848101265822785, "grad_norm": 0.6743830442428589, "learning_rate": 0.0015, "loss": 1.569, "step": 2700 }, { "epoch": 0.2858649789029536, "grad_norm": 0.6520766615867615, "learning_rate": 0.0015, "loss": 1.5784, "step": 2710 }, { "epoch": 0.2869198312236287, "grad_norm": 0.6736236214637756, "learning_rate": 0.0015, "loss": 1.5783, "step": 2720 }, { "epoch": 0.2879746835443038, "grad_norm": 0.5693079233169556, "learning_rate": 0.0015, "loss": 1.5557, "step": 2730 }, { "epoch": 0.2890295358649789, "grad_norm": 1.0078039169311523, "learning_rate": 0.0015, "loss": 1.5693, "step": 2740 }, { "epoch": 0.290084388185654, "grad_norm": 0.5904132127761841, "learning_rate": 0.0015, "loss": 1.5736, "step": 2750 }, { "epoch": 0.2911392405063291, "grad_norm": 0.6193687319755554, "learning_rate": 0.0015, "loss": 1.5841, "step": 2760 }, { "epoch": 0.2921940928270042, "grad_norm": 0.6845226287841797, "learning_rate": 0.0015, "loss": 1.5785, 
"step": 2770 }, { "epoch": 0.29324894514767935, "grad_norm": 0.5668240785598755, "learning_rate": 0.0015, "loss": 1.5748, "step": 2780 }, { "epoch": 0.29430379746835444, "grad_norm": 0.6082599759101868, "learning_rate": 0.0015, "loss": 1.5727, "step": 2790 }, { "epoch": 0.29535864978902954, "grad_norm": 0.9084484577178955, "learning_rate": 0.0015, "loss": 1.5677, "step": 2800 }, { "epoch": 0.29641350210970463, "grad_norm": 0.6633672714233398, "learning_rate": 0.0015, "loss": 1.5699, "step": 2810 }, { "epoch": 0.2974683544303797, "grad_norm": 0.6207756400108337, "learning_rate": 0.0015, "loss": 1.577, "step": 2820 }, { "epoch": 0.29852320675105487, "grad_norm": 0.9366081357002258, "learning_rate": 0.0015, "loss": 1.5507, "step": 2830 }, { "epoch": 0.29957805907172996, "grad_norm": 1.137453317642212, "learning_rate": 0.0015, "loss": 1.5553, "step": 2840 }, { "epoch": 0.30063291139240506, "grad_norm": 0.5851289629936218, "learning_rate": 0.0015, "loss": 1.5616, "step": 2850 }, { "epoch": 0.30168776371308015, "grad_norm": 0.5535052418708801, "learning_rate": 0.0015, "loss": 1.5642, "step": 2860 }, { "epoch": 0.3027426160337553, "grad_norm": 0.968014657497406, "learning_rate": 0.0015, "loss": 1.5655, "step": 2870 }, { "epoch": 0.3037974683544304, "grad_norm": 0.7300151586532593, "learning_rate": 0.0015, "loss": 1.5521, "step": 2880 }, { "epoch": 0.3048523206751055, "grad_norm": 0.629642903804779, "learning_rate": 0.0015, "loss": 1.5599, "step": 2890 }, { "epoch": 0.3059071729957806, "grad_norm": 0.8954982161521912, "learning_rate": 0.0015, "loss": 1.5642, "step": 2900 }, { "epoch": 0.3069620253164557, "grad_norm": 0.7289522886276245, "learning_rate": 0.0015, "loss": 1.5622, "step": 2910 }, { "epoch": 0.3080168776371308, "grad_norm": 0.6284463405609131, "learning_rate": 0.0015, "loss": 1.5567, "step": 2920 }, { "epoch": 0.3090717299578059, "grad_norm": 0.679797887802124, "learning_rate": 0.0015, "loss": 1.5544, "step": 2930 }, { "epoch": 0.310126582278481, "grad_norm": 
0.5774010419845581, "learning_rate": 0.0015, "loss": 1.5625, "step": 2940 }, { "epoch": 0.3111814345991561, "grad_norm": 0.8260715007781982, "learning_rate": 0.0015, "loss": 1.5688, "step": 2950 }, { "epoch": 0.31223628691983124, "grad_norm": 0.7063205242156982, "learning_rate": 0.0015, "loss": 1.5584, "step": 2960 }, { "epoch": 0.31329113924050633, "grad_norm": 0.9290918111801147, "learning_rate": 0.0015, "loss": 1.539, "step": 2970 }, { "epoch": 0.3143459915611814, "grad_norm": 0.5621309876441956, "learning_rate": 0.0015, "loss": 1.5439, "step": 2980 }, { "epoch": 0.3154008438818565, "grad_norm": 0.7395849823951721, "learning_rate": 0.0015, "loss": 1.5643, "step": 2990 }, { "epoch": 0.31645569620253167, "grad_norm": 0.6708174347877502, "learning_rate": 0.0015, "loss": 1.5452, "step": 3000 }, { "epoch": 0.31751054852320676, "grad_norm": 0.6967434883117676, "learning_rate": 0.0015, "loss": 1.553, "step": 3010 }, { "epoch": 0.31856540084388185, "grad_norm": 0.7203395366668701, "learning_rate": 0.0015, "loss": 1.5571, "step": 3020 }, { "epoch": 0.31962025316455694, "grad_norm": 0.6107951402664185, "learning_rate": 0.0015, "loss": 1.5397, "step": 3030 }, { "epoch": 0.3206751054852321, "grad_norm": 0.6455133557319641, "learning_rate": 0.0015, "loss": 1.5494, "step": 3040 }, { "epoch": 0.3217299578059072, "grad_norm": 0.559616208076477, "learning_rate": 0.0015, "loss": 1.5457, "step": 3050 }, { "epoch": 0.3227848101265823, "grad_norm": 0.6274845600128174, "learning_rate": 0.0015, "loss": 1.5574, "step": 3060 }, { "epoch": 0.32383966244725737, "grad_norm": 0.5791829228401184, "learning_rate": 0.0015, "loss": 1.547, "step": 3070 }, { "epoch": 0.32489451476793246, "grad_norm": 0.7367576360702515, "learning_rate": 0.0015, "loss": 1.5377, "step": 3080 }, { "epoch": 0.3259493670886076, "grad_norm": 0.5722604393959045, "learning_rate": 0.0015, "loss": 1.5517, "step": 3090 }, { "epoch": 0.3270042194092827, "grad_norm": 0.6680841445922852, "learning_rate": 0.0015, "loss": 
1.5399, "step": 3100 }, { "epoch": 0.3280590717299578, "grad_norm": 0.7069068551063538, "learning_rate": 0.0015, "loss": 1.5465, "step": 3110 }, { "epoch": 0.3291139240506329, "grad_norm": 0.5908547639846802, "learning_rate": 0.0015, "loss": 1.5464, "step": 3120 }, { "epoch": 0.33016877637130804, "grad_norm": 0.5545928478240967, "learning_rate": 0.0015, "loss": 1.558, "step": 3130 }, { "epoch": 0.33122362869198313, "grad_norm": 0.6596634984016418, "learning_rate": 0.0015, "loss": 1.5356, "step": 3140 }, { "epoch": 0.3322784810126582, "grad_norm": 0.6853034496307373, "learning_rate": 0.0015, "loss": 1.5465, "step": 3150 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5891876220703125, "learning_rate": 0.0015, "loss": 1.5466, "step": 3160 }, { "epoch": 0.33438818565400846, "grad_norm": 0.6577898263931274, "learning_rate": 0.0015, "loss": 1.5521, "step": 3170 }, { "epoch": 0.33544303797468356, "grad_norm": 0.9291559457778931, "learning_rate": 0.0015, "loss": 1.5427, "step": 3180 }, { "epoch": 0.33649789029535865, "grad_norm": 0.7408413290977478, "learning_rate": 0.0015, "loss": 1.5436, "step": 3190 }, { "epoch": 0.33755274261603374, "grad_norm": 0.6851930022239685, "learning_rate": 0.0015, "loss": 1.537, "step": 3200 }, { "epoch": 0.33860759493670883, "grad_norm": 1.133357048034668, "learning_rate": 0.0015, "loss": 1.5526, "step": 3210 }, { "epoch": 0.339662447257384, "grad_norm": 0.6708566546440125, "learning_rate": 0.0015, "loss": 1.5415, "step": 3220 }, { "epoch": 0.3407172995780591, "grad_norm": 0.7159783244132996, "learning_rate": 0.0015, "loss": 1.532, "step": 3230 }, { "epoch": 0.34177215189873417, "grad_norm": 0.5234776139259338, "learning_rate": 0.0015, "loss": 1.516, "step": 3240 }, { "epoch": 0.34282700421940926, "grad_norm": 0.5658848285675049, "learning_rate": 0.0015, "loss": 1.5417, "step": 3250 }, { "epoch": 0.3438818565400844, "grad_norm": 0.5536794066429138, "learning_rate": 0.0015, "loss": 1.5261, "step": 3260 }, { "epoch": 0.3449367088607595, 
"grad_norm": 0.5448280572891235, "learning_rate": 0.0015, "loss": 1.546, "step": 3270 }, { "epoch": 0.3459915611814346, "grad_norm": 0.6722742319107056, "learning_rate": 0.0015, "loss": 1.5487, "step": 3280 }, { "epoch": 0.3470464135021097, "grad_norm": 0.8077312707901001, "learning_rate": 0.0015, "loss": 1.529, "step": 3290 }, { "epoch": 0.34810126582278483, "grad_norm": 0.5687286853790283, "learning_rate": 0.0015, "loss": 1.5384, "step": 3300 }, { "epoch": 0.3491561181434599, "grad_norm": 0.5500969886779785, "learning_rate": 0.0015, "loss": 1.5391, "step": 3310 }, { "epoch": 0.350210970464135, "grad_norm": 0.5969871282577515, "learning_rate": 0.0015, "loss": 1.5442, "step": 3320 }, { "epoch": 0.3512658227848101, "grad_norm": 0.6553987264633179, "learning_rate": 0.0015, "loss": 1.5301, "step": 3330 }, { "epoch": 0.35232067510548526, "grad_norm": 0.6266419291496277, "learning_rate": 0.0015, "loss": 1.5368, "step": 3340 }, { "epoch": 0.35337552742616035, "grad_norm": 0.6171289682388306, "learning_rate": 0.0015, "loss": 1.5289, "step": 3350 }, { "epoch": 0.35443037974683544, "grad_norm": 0.6962433457374573, "learning_rate": 0.0015, "loss": 1.5242, "step": 3360 }, { "epoch": 0.35548523206751054, "grad_norm": 0.8192803859710693, "learning_rate": 0.0015, "loss": 1.5352, "step": 3370 }, { "epoch": 0.35654008438818563, "grad_norm": 0.6736462712287903, "learning_rate": 0.0015, "loss": 1.532, "step": 3380 }, { "epoch": 0.3575949367088608, "grad_norm": 0.5962381958961487, "learning_rate": 0.0015, "loss": 1.5257, "step": 3390 }, { "epoch": 0.35864978902953587, "grad_norm": 0.6057121753692627, "learning_rate": 0.0015, "loss": 1.5357, "step": 3400 }, { "epoch": 0.35970464135021096, "grad_norm": 0.564690113067627, "learning_rate": 0.0015, "loss": 1.5301, "step": 3410 }, { "epoch": 0.36075949367088606, "grad_norm": 0.8714531660079956, "learning_rate": 0.0015, "loss": 1.512, "step": 3420 }, { "epoch": 0.3618143459915612, "grad_norm": 0.5465991497039795, "learning_rate": 0.0015, 
"loss": 1.5288, "step": 3430 }, { "epoch": 0.3628691983122363, "grad_norm": 0.6148397922515869, "learning_rate": 0.0015, "loss": 1.5248, "step": 3440 }, { "epoch": 0.3639240506329114, "grad_norm": 0.6495022177696228, "learning_rate": 0.0015, "loss": 1.5226, "step": 3450 }, { "epoch": 0.3649789029535865, "grad_norm": 1.464675784111023, "learning_rate": 0.0015, "loss": 1.5173, "step": 3460 }, { "epoch": 0.36603375527426163, "grad_norm": 0.7403339743614197, "learning_rate": 0.0015, "loss": 1.54, "step": 3470 }, { "epoch": 0.3670886075949367, "grad_norm": 0.6511922478675842, "learning_rate": 0.0015, "loss": 1.5349, "step": 3480 }, { "epoch": 0.3681434599156118, "grad_norm": 0.5631682872772217, "learning_rate": 0.0015, "loss": 1.5281, "step": 3490 }, { "epoch": 0.3691983122362869, "grad_norm": 0.6367287039756775, "learning_rate": 0.0015, "loss": 1.5106, "step": 3500 }, { "epoch": 0.370253164556962, "grad_norm": 0.6354228854179382, "learning_rate": 0.0015, "loss": 1.5302, "step": 3510 }, { "epoch": 0.37130801687763715, "grad_norm": 0.6151289343833923, "learning_rate": 0.0015, "loss": 1.5257, "step": 3520 }, { "epoch": 0.37236286919831224, "grad_norm": 0.7224065661430359, "learning_rate": 0.0015, "loss": 1.536, "step": 3530 }, { "epoch": 0.37341772151898733, "grad_norm": 1.1419929265975952, "learning_rate": 0.0015, "loss": 1.5298, "step": 3540 }, { "epoch": 0.3744725738396624, "grad_norm": 0.7123492956161499, "learning_rate": 0.0015, "loss": 1.5215, "step": 3550 }, { "epoch": 0.3755274261603376, "grad_norm": 0.5686929225921631, "learning_rate": 0.0015, "loss": 1.5196, "step": 3560 }, { "epoch": 0.37658227848101267, "grad_norm": 1.0632009506225586, "learning_rate": 0.0015, "loss": 1.5181, "step": 3570 }, { "epoch": 0.37763713080168776, "grad_norm": 0.9212638139724731, "learning_rate": 0.0015, "loss": 1.5188, "step": 3580 }, { "epoch": 0.37869198312236285, "grad_norm": 0.560971200466156, "learning_rate": 0.0015, "loss": 1.5254, "step": 3590 }, { "epoch": 0.379746835443038, 
"grad_norm": 0.5467254519462585, "learning_rate": 0.0015, "loss": 1.5096, "step": 3600 }, { "epoch": 0.3808016877637131, "grad_norm": 0.599979817867279, "learning_rate": 0.0015, "loss": 1.5132, "step": 3610 }, { "epoch": 0.3818565400843882, "grad_norm": 0.6306496858596802, "learning_rate": 0.0015, "loss": 1.5147, "step": 3620 }, { "epoch": 0.3829113924050633, "grad_norm": 0.879368007183075, "learning_rate": 0.0015, "loss": 1.5174, "step": 3630 }, { "epoch": 0.38396624472573837, "grad_norm": 0.8159598112106323, "learning_rate": 0.0015, "loss": 1.518, "step": 3640 }, { "epoch": 0.3850210970464135, "grad_norm": 0.6509981155395508, "learning_rate": 0.0015, "loss": 1.5275, "step": 3650 }, { "epoch": 0.3860759493670886, "grad_norm": 0.6290169358253479, "learning_rate": 0.0015, "loss": 1.5173, "step": 3660 }, { "epoch": 0.3871308016877637, "grad_norm": 0.6807018518447876, "learning_rate": 0.0015, "loss": 1.5122, "step": 3670 }, { "epoch": 0.3881856540084388, "grad_norm": 0.7448897361755371, "learning_rate": 0.0015, "loss": 1.5149, "step": 3680 }, { "epoch": 0.38924050632911394, "grad_norm": 0.5696559548377991, "learning_rate": 0.0015, "loss": 1.5254, "step": 3690 }, { "epoch": 0.39029535864978904, "grad_norm": 0.6409304738044739, "learning_rate": 0.0015, "loss": 1.527, "step": 3700 }, { "epoch": 0.39135021097046413, "grad_norm": 0.6279841065406799, "learning_rate": 0.0015, "loss": 1.5162, "step": 3710 }, { "epoch": 0.3924050632911392, "grad_norm": 0.6621096730232239, "learning_rate": 0.0015, "loss": 1.4963, "step": 3720 }, { "epoch": 0.39345991561181437, "grad_norm": 0.6872855424880981, "learning_rate": 0.0015, "loss": 1.5135, "step": 3730 }, { "epoch": 0.39451476793248946, "grad_norm": 0.5671707391738892, "learning_rate": 0.0015, "loss": 1.5123, "step": 3740 }, { "epoch": 0.39556962025316456, "grad_norm": 0.8312143087387085, "learning_rate": 0.0015, "loss": 1.5174, "step": 3750 }, { "epoch": 0.39662447257383965, "grad_norm": 0.995048463344574, "learning_rate": 0.0015, 
"loss": 1.5071, "step": 3760 }, { "epoch": 0.39767932489451474, "grad_norm": 0.6123533248901367, "learning_rate": 0.0015, "loss": 1.5221, "step": 3770 }, { "epoch": 0.3987341772151899, "grad_norm": 0.5263500809669495, "learning_rate": 0.0015, "loss": 1.507, "step": 3780 }, { "epoch": 0.399789029535865, "grad_norm": 0.7213681936264038, "learning_rate": 0.0015, "loss": 1.5124, "step": 3790 }, { "epoch": 0.4008438818565401, "grad_norm": 0.6691168546676636, "learning_rate": 0.0015, "loss": 1.5206, "step": 3800 }, { "epoch": 0.40189873417721517, "grad_norm": 0.5371190309524536, "learning_rate": 0.0015, "loss": 1.5025, "step": 3810 }, { "epoch": 0.4029535864978903, "grad_norm": 0.5635185837745667, "learning_rate": 0.0015, "loss": 1.4976, "step": 3820 }, { "epoch": 0.4040084388185654, "grad_norm": 0.535293698310852, "learning_rate": 0.0015, "loss": 1.4994, "step": 3830 }, { "epoch": 0.4050632911392405, "grad_norm": 0.48217204213142395, "learning_rate": 0.0015, "loss": 1.5166, "step": 3840 }, { "epoch": 0.4061181434599156, "grad_norm": 0.5847092270851135, "learning_rate": 0.0015, "loss": 1.499, "step": 3850 }, { "epoch": 0.40717299578059074, "grad_norm": 0.5289793014526367, "learning_rate": 0.0015, "loss": 1.5136, "step": 3860 }, { "epoch": 0.40822784810126583, "grad_norm": 1.0073657035827637, "learning_rate": 0.0015, "loss": 1.5029, "step": 3870 }, { "epoch": 0.4092827004219409, "grad_norm": 0.9138542413711548, "learning_rate": 0.0015, "loss": 1.4957, "step": 3880 }, { "epoch": 0.410337552742616, "grad_norm": 0.5173036456108093, "learning_rate": 0.0015, "loss": 1.4944, "step": 3890 }, { "epoch": 0.41139240506329117, "grad_norm": 1.1745268106460571, "learning_rate": 0.0015, "loss": 1.4964, "step": 3900 }, { "epoch": 0.41244725738396626, "grad_norm": 0.5849131941795349, "learning_rate": 0.0015, "loss": 1.5001, "step": 3910 }, { "epoch": 0.41350210970464135, "grad_norm": 0.5363203287124634, "learning_rate": 0.0015, "loss": 1.4939, "step": 3920 }, { "epoch": 
0.41455696202531644, "grad_norm": 0.6402507424354553, "learning_rate": 0.0015, "loss": 1.5021, "step": 3930 }, { "epoch": 0.41561181434599154, "grad_norm": 0.688439130783081, "learning_rate": 0.0015, "loss": 1.4966, "step": 3940 }, { "epoch": 0.4166666666666667, "grad_norm": 0.8214023113250732, "learning_rate": 0.0015, "loss": 1.5042, "step": 3950 }, { "epoch": 0.4177215189873418, "grad_norm": 0.6306218504905701, "learning_rate": 0.0015, "loss": 1.5045, "step": 3960 }, { "epoch": 0.41877637130801687, "grad_norm": 0.6060662269592285, "learning_rate": 0.0015, "loss": 1.492, "step": 3970 }, { "epoch": 0.41983122362869196, "grad_norm": 0.6276333928108215, "learning_rate": 0.0015, "loss": 1.5038, "step": 3980 }, { "epoch": 0.4208860759493671, "grad_norm": 0.5927751064300537, "learning_rate": 0.0015, "loss": 1.4971, "step": 3990 }, { "epoch": 0.4219409282700422, "grad_norm": 0.5483892560005188, "learning_rate": 0.0015, "loss": 1.5104, "step": 4000 }, { "epoch": 0.4229957805907173, "grad_norm": 0.7140218019485474, "learning_rate": 0.0015, "loss": 1.4954, "step": 4010 }, { "epoch": 0.4240506329113924, "grad_norm": 0.5532450079917908, "learning_rate": 0.0015, "loss": 1.4969, "step": 4020 }, { "epoch": 0.42510548523206754, "grad_norm": 0.5548282265663147, "learning_rate": 0.0015, "loss": 1.4868, "step": 4030 }, { "epoch": 0.42616033755274263, "grad_norm": 0.5832592844963074, "learning_rate": 0.0015, "loss": 1.4982, "step": 4040 }, { "epoch": 0.4272151898734177, "grad_norm": 0.5230129957199097, "learning_rate": 0.0015, "loss": 1.4993, "step": 4050 }, { "epoch": 0.4282700421940928, "grad_norm": 0.8107106685638428, "learning_rate": 0.0015, "loss": 1.5049, "step": 4060 }, { "epoch": 0.4293248945147679, "grad_norm": 0.7404172420501709, "learning_rate": 0.0015, "loss": 1.4998, "step": 4070 }, { "epoch": 0.43037974683544306, "grad_norm": 0.6545730233192444, "learning_rate": 0.0015, "loss": 1.506, "step": 4080 }, { "epoch": 0.43143459915611815, "grad_norm": 0.8511345386505127, 
"learning_rate": 0.0015, "loss": 1.5044, "step": 4090 }, { "epoch": 0.43248945147679324, "grad_norm": 0.7161096334457397, "learning_rate": 0.0015, "loss": 1.4939, "step": 4100 }, { "epoch": 0.43354430379746833, "grad_norm": 0.7610675692558289, "learning_rate": 0.0015, "loss": 1.489, "step": 4110 }, { "epoch": 0.4345991561181435, "grad_norm": 0.7774899005889893, "learning_rate": 0.0015, "loss": 1.4954, "step": 4120 }, { "epoch": 0.4356540084388186, "grad_norm": 0.7286903858184814, "learning_rate": 0.0015, "loss": 1.4849, "step": 4130 }, { "epoch": 0.43670886075949367, "grad_norm": 0.5786338448524475, "learning_rate": 0.0015, "loss": 1.4891, "step": 4140 }, { "epoch": 0.43776371308016876, "grad_norm": 0.6982702016830444, "learning_rate": 0.0015, "loss": 1.4962, "step": 4150 }, { "epoch": 0.4388185654008439, "grad_norm": 0.6745831370353699, "learning_rate": 0.0015, "loss": 1.4873, "step": 4160 }, { "epoch": 0.439873417721519, "grad_norm": 0.5979346632957458, "learning_rate": 0.0015, "loss": 1.4905, "step": 4170 }, { "epoch": 0.4409282700421941, "grad_norm": 0.522210419178009, "learning_rate": 0.0015, "loss": 1.4871, "step": 4180 }, { "epoch": 0.4419831223628692, "grad_norm": 0.6977109313011169, "learning_rate": 0.0015, "loss": 1.4918, "step": 4190 }, { "epoch": 0.4430379746835443, "grad_norm": 0.7419961094856262, "learning_rate": 0.0015, "loss": 1.4962, "step": 4200 }, { "epoch": 0.4440928270042194, "grad_norm": 0.5456410050392151, "learning_rate": 0.0015, "loss": 1.4819, "step": 4210 }, { "epoch": 0.4451476793248945, "grad_norm": 0.5922154188156128, "learning_rate": 0.0015, "loss": 1.4933, "step": 4220 }, { "epoch": 0.4462025316455696, "grad_norm": 0.8679487705230713, "learning_rate": 0.0015, "loss": 1.4899, "step": 4230 }, { "epoch": 0.4472573839662447, "grad_norm": 0.574184238910675, "learning_rate": 0.0015, "loss": 1.4861, "step": 4240 }, { "epoch": 0.44831223628691985, "grad_norm": 0.6575837135314941, "learning_rate": 0.0015, "loss": 1.4904, "step": 4250 }, { 
"epoch": 0.44936708860759494, "grad_norm": 0.8064940571784973, "learning_rate": 0.0015, "loss": 1.4969, "step": 4260 }, { "epoch": 0.45042194092827004, "grad_norm": 0.5215798020362854, "learning_rate": 0.0015, "loss": 1.4875, "step": 4270 }, { "epoch": 0.45147679324894513, "grad_norm": 0.7631582617759705, "learning_rate": 0.0015, "loss": 1.4789, "step": 4280 }, { "epoch": 0.4525316455696203, "grad_norm": 0.720287561416626, "learning_rate": 0.0015, "loss": 1.4913, "step": 4290 }, { "epoch": 0.45358649789029537, "grad_norm": 0.9794846177101135, "learning_rate": 0.0015, "loss": 1.4893, "step": 4300 }, { "epoch": 0.45464135021097046, "grad_norm": 0.6552988886833191, "learning_rate": 0.0015, "loss": 1.4904, "step": 4310 }, { "epoch": 0.45569620253164556, "grad_norm": 0.8640376329421997, "learning_rate": 0.0015, "loss": 1.4821, "step": 4320 }, { "epoch": 0.45675105485232065, "grad_norm": 0.5411359667778015, "learning_rate": 0.0015, "loss": 1.4917, "step": 4330 }, { "epoch": 0.4578059071729958, "grad_norm": 0.7179622650146484, "learning_rate": 0.0015, "loss": 1.4809, "step": 4340 }, { "epoch": 0.4588607594936709, "grad_norm": 1.0754817724227905, "learning_rate": 0.0015, "loss": 1.4849, "step": 4350 }, { "epoch": 0.459915611814346, "grad_norm": 0.6185563206672668, "learning_rate": 0.0015, "loss": 1.4987, "step": 4360 }, { "epoch": 0.4609704641350211, "grad_norm": 0.5712226033210754, "learning_rate": 0.0015, "loss": 1.4738, "step": 4370 }, { "epoch": 0.4620253164556962, "grad_norm": 0.7067052125930786, "learning_rate": 0.0015, "loss": 1.4837, "step": 4380 }, { "epoch": 0.4630801687763713, "grad_norm": 0.5491397380828857, "learning_rate": 0.0015, "loss": 1.4791, "step": 4390 }, { "epoch": 0.4641350210970464, "grad_norm": 0.558575451374054, "learning_rate": 0.0015, "loss": 1.4818, "step": 4400 }, { "epoch": 0.4651898734177215, "grad_norm": 0.6383283138275146, "learning_rate": 0.0015, "loss": 1.5, "step": 4410 }, { "epoch": 0.46624472573839665, "grad_norm": 0.6881840825080872, 
"learning_rate": 0.0015, "loss": 1.4799, "step": 4420 }, { "epoch": 0.46729957805907174, "grad_norm": 0.7439998984336853, "learning_rate": 0.0015, "loss": 1.4844, "step": 4430 }, { "epoch": 0.46835443037974683, "grad_norm": 0.755620539188385, "learning_rate": 0.0015, "loss": 1.4788, "step": 4440 }, { "epoch": 0.4694092827004219, "grad_norm": 0.5531437397003174, "learning_rate": 0.0015, "loss": 1.4896, "step": 4450 }, { "epoch": 0.4704641350210971, "grad_norm": 0.5616311430931091, "learning_rate": 0.0015, "loss": 1.4752, "step": 4460 }, { "epoch": 0.47151898734177217, "grad_norm": 0.6229474544525146, "learning_rate": 0.0015, "loss": 1.4838, "step": 4470 }, { "epoch": 0.47257383966244726, "grad_norm": 0.7021309733390808, "learning_rate": 0.0015, "loss": 1.4911, "step": 4480 }, { "epoch": 0.47362869198312235, "grad_norm": 0.7292089462280273, "learning_rate": 0.0015, "loss": 1.4769, "step": 4490 }, { "epoch": 0.47468354430379744, "grad_norm": 0.6191797256469727, "learning_rate": 0.0015, "loss": 1.4763, "step": 4500 }, { "epoch": 0.4757383966244726, "grad_norm": 0.5511068105697632, "learning_rate": 0.0015, "loss": 1.4833, "step": 4510 }, { "epoch": 0.4767932489451477, "grad_norm": 0.6581600904464722, "learning_rate": 0.0015, "loss": 1.464, "step": 4520 }, { "epoch": 0.4778481012658228, "grad_norm": 0.6747670769691467, "learning_rate": 0.0015, "loss": 1.4838, "step": 4530 }, { "epoch": 0.47890295358649787, "grad_norm": 0.6507686376571655, "learning_rate": 0.0015, "loss": 1.4845, "step": 4540 }, { "epoch": 0.479957805907173, "grad_norm": 0.607556164264679, "learning_rate": 0.0015, "loss": 1.4824, "step": 4550 }, { "epoch": 0.4810126582278481, "grad_norm": 0.6154383420944214, "learning_rate": 0.0015, "loss": 1.4718, "step": 4560 }, { "epoch": 0.4820675105485232, "grad_norm": 0.6015588641166687, "learning_rate": 0.0015, "loss": 1.4541, "step": 4570 }, { "epoch": 0.4831223628691983, "grad_norm": 0.6513546705245972, "learning_rate": 0.0015, "loss": 1.4762, "step": 4580 }, { 
"epoch": 0.48417721518987344, "grad_norm": 0.6503914594650269, "learning_rate": 0.0015, "loss": 1.4657, "step": 4590 }, { "epoch": 0.48523206751054854, "grad_norm": 0.5230239629745483, "learning_rate": 0.0015, "loss": 1.4827, "step": 4600 }, { "epoch": 0.48628691983122363, "grad_norm": 1.0571868419647217, "learning_rate": 0.0015, "loss": 1.4849, "step": 4610 }, { "epoch": 0.4873417721518987, "grad_norm": 0.7897891998291016, "learning_rate": 0.0015, "loss": 1.462, "step": 4620 }, { "epoch": 0.4883966244725738, "grad_norm": 0.569659948348999, "learning_rate": 0.0015, "loss": 1.4813, "step": 4630 }, { "epoch": 0.48945147679324896, "grad_norm": 0.6289198994636536, "learning_rate": 0.0015, "loss": 1.4831, "step": 4640 }, { "epoch": 0.49050632911392406, "grad_norm": 0.6410478353500366, "learning_rate": 0.0015, "loss": 1.4835, "step": 4650 }, { "epoch": 0.49156118143459915, "grad_norm": 0.5901730060577393, "learning_rate": 0.0015, "loss": 1.4699, "step": 4660 }, { "epoch": 0.49261603375527424, "grad_norm": 0.8757472634315491, "learning_rate": 0.0015, "loss": 1.4705, "step": 4670 }, { "epoch": 0.4936708860759494, "grad_norm": 0.6517742276191711, "learning_rate": 0.0015, "loss": 1.4671, "step": 4680 }, { "epoch": 0.4947257383966245, "grad_norm": 0.973685622215271, "learning_rate": 0.0015, "loss": 1.4637, "step": 4690 }, { "epoch": 0.4957805907172996, "grad_norm": 0.6177852749824524, "learning_rate": 0.0015, "loss": 1.4731, "step": 4700 }, { "epoch": 0.49683544303797467, "grad_norm": 0.501401960849762, "learning_rate": 0.0015, "loss": 1.4743, "step": 4710 }, { "epoch": 0.4978902953586498, "grad_norm": 0.8149515986442566, "learning_rate": 0.0015, "loss": 1.4726, "step": 4720 }, { "epoch": 0.4989451476793249, "grad_norm": 0.6071144938468933, "learning_rate": 0.0015, "loss": 1.4676, "step": 4730 }, { "epoch": 0.5, "grad_norm": 0.7226423025131226, "learning_rate": 0.0015, "loss": 1.4671, "step": 4740 }, { "epoch": 0.5010548523206751, "grad_norm": 0.6534337401390076, 
"learning_rate": 0.0015, "loss": 1.4874, "step": 4750 }, { "epoch": 0.5021097046413502, "grad_norm": 0.5539942383766174, "learning_rate": 0.0015, "loss": 1.4672, "step": 4760 }, { "epoch": 0.5031645569620253, "grad_norm": 0.6125468611717224, "learning_rate": 0.0015, "loss": 1.4676, "step": 4770 }, { "epoch": 0.5042194092827004, "grad_norm": 0.6089324355125427, "learning_rate": 0.0015, "loss": 1.4772, "step": 4780 }, { "epoch": 0.5052742616033755, "grad_norm": 0.6028252840042114, "learning_rate": 0.0015, "loss": 1.4603, "step": 4790 }, { "epoch": 0.5063291139240507, "grad_norm": 0.6291878819465637, "learning_rate": 0.0015, "loss": 1.4662, "step": 4800 }, { "epoch": 0.5073839662447257, "grad_norm": 0.5844051241874695, "learning_rate": 0.0015, "loss": 1.4694, "step": 4810 }, { "epoch": 0.5084388185654009, "grad_norm": 0.6730778813362122, "learning_rate": 0.0015, "loss": 1.4857, "step": 4820 }, { "epoch": 0.509493670886076, "grad_norm": 0.7433723211288452, "learning_rate": 0.0015, "loss": 1.4687, "step": 4830 }, { "epoch": 0.510548523206751, "grad_norm": 0.6143871545791626, "learning_rate": 0.0015, "loss": 1.463, "step": 4840 }, { "epoch": 0.5116033755274262, "grad_norm": 0.514320433139801, "learning_rate": 0.0015, "loss": 1.4734, "step": 4850 }, { "epoch": 0.5126582278481012, "grad_norm": 0.5280232429504395, "learning_rate": 0.0015, "loss": 1.4622, "step": 4860 }, { "epoch": 0.5137130801687764, "grad_norm": 0.9742357134819031, "learning_rate": 0.0015, "loss": 1.4572, "step": 4870 }, { "epoch": 0.5147679324894515, "grad_norm": 0.6315768361091614, "learning_rate": 0.0015, "loss": 1.4778, "step": 4880 }, { "epoch": 0.5158227848101266, "grad_norm": 0.6527271270751953, "learning_rate": 0.0015, "loss": 1.4685, "step": 4890 }, { "epoch": 0.5168776371308017, "grad_norm": 0.6024441719055176, "learning_rate": 0.0015, "loss": 1.4738, "step": 4900 }, { "epoch": 0.5179324894514767, "grad_norm": 0.68658447265625, "learning_rate": 0.0015, "loss": 1.4693, "step": 4910 }, { "epoch": 
0.5189873417721519, "grad_norm": 0.5601794719696045, "learning_rate": 0.0015, "loss": 1.4581, "step": 4920 }, { "epoch": 0.520042194092827, "grad_norm": 0.5640733242034912, "learning_rate": 0.0015, "loss": 1.4681, "step": 4930 }, { "epoch": 0.5210970464135021, "grad_norm": 0.5403666496276855, "learning_rate": 0.0015, "loss": 1.4651, "step": 4940 }, { "epoch": 0.5221518987341772, "grad_norm": 0.6531521677970886, "learning_rate": 0.0015, "loss": 1.474, "step": 4950 }, { "epoch": 0.5232067510548524, "grad_norm": 0.6115059852600098, "learning_rate": 0.0015, "loss": 1.4564, "step": 4960 }, { "epoch": 0.5242616033755274, "grad_norm": 0.8114550709724426, "learning_rate": 0.0015, "loss": 1.462, "step": 4970 }, { "epoch": 0.5253164556962026, "grad_norm": 0.6079219579696655, "learning_rate": 0.0015, "loss": 1.4546, "step": 4980 }, { "epoch": 0.5263713080168776, "grad_norm": 0.578087568283081, "learning_rate": 0.0015, "loss": 1.459, "step": 4990 }, { "epoch": 0.5274261603375527, "grad_norm": 0.672719419002533, "learning_rate": 0.0015, "loss": 1.4712, "step": 5000 }, { "epoch": 0.5284810126582279, "grad_norm": 0.5922591686248779, "learning_rate": 0.0015, "loss": 1.4642, "step": 5010 }, { "epoch": 0.5295358649789029, "grad_norm": 0.5584644079208374, "learning_rate": 0.0015, "loss": 1.4551, "step": 5020 }, { "epoch": 0.5305907172995781, "grad_norm": 0.5503262281417847, "learning_rate": 0.0015, "loss": 1.4439, "step": 5030 }, { "epoch": 0.5316455696202531, "grad_norm": 0.5341739654541016, "learning_rate": 0.0015, "loss": 1.4708, "step": 5040 }, { "epoch": 0.5327004219409283, "grad_norm": 0.5860607624053955, "learning_rate": 0.0015, "loss": 1.4585, "step": 5050 }, { "epoch": 0.5337552742616034, "grad_norm": 0.7299894690513611, "learning_rate": 0.0015, "loss": 1.4713, "step": 5060 }, { "epoch": 0.5348101265822784, "grad_norm": 0.8018727898597717, "learning_rate": 0.0015, "loss": 1.4584, "step": 5070 }, { "epoch": 0.5358649789029536, "grad_norm": 0.5662050843238831, "learning_rate": 
0.0015, "loss": 1.4733, "step": 5080 }, { "epoch": 0.5369198312236287, "grad_norm": 0.5539529919624329, "learning_rate": 0.0015, "loss": 1.4636, "step": 5090 }, { "epoch": 0.5379746835443038, "grad_norm": 0.6735826134681702, "learning_rate": 0.0015, "loss": 1.4452, "step": 5100 }, { "epoch": 0.5390295358649789, "grad_norm": 0.5980088710784912, "learning_rate": 0.0015, "loss": 1.4619, "step": 5110 }, { "epoch": 0.540084388185654, "grad_norm": 0.6005414724349976, "learning_rate": 0.0015, "loss": 1.4603, "step": 5120 }, { "epoch": 0.5411392405063291, "grad_norm": 0.5914118885993958, "learning_rate": 0.0015, "loss": 1.4553, "step": 5130 }, { "epoch": 0.5421940928270043, "grad_norm": 1.06390380859375, "learning_rate": 0.0015, "loss": 1.4698, "step": 5140 }, { "epoch": 0.5432489451476793, "grad_norm": 0.6869582533836365, "learning_rate": 0.0015, "loss": 1.4669, "step": 5150 }, { "epoch": 0.5443037974683544, "grad_norm": 0.559586226940155, "learning_rate": 0.0015, "loss": 1.4606, "step": 5160 }, { "epoch": 0.5453586497890295, "grad_norm": 0.5051333904266357, "learning_rate": 0.0015, "loss": 1.4599, "step": 5170 }, { "epoch": 0.5464135021097046, "grad_norm": 0.7952111959457397, "learning_rate": 0.0015, "loss": 1.4712, "step": 5180 }, { "epoch": 0.5474683544303798, "grad_norm": 0.8145313262939453, "learning_rate": 0.0015, "loss": 1.4487, "step": 5190 }, { "epoch": 0.5485232067510548, "grad_norm": 0.5636731386184692, "learning_rate": 0.0015, "loss": 1.464, "step": 5200 }, { "epoch": 0.54957805907173, "grad_norm": 0.6466506719589233, "learning_rate": 0.0015, "loss": 1.4512, "step": 5210 }, { "epoch": 0.5506329113924051, "grad_norm": 0.6597478985786438, "learning_rate": 0.0015, "loss": 1.4599, "step": 5220 }, { "epoch": 0.5516877637130801, "grad_norm": 0.6992036700248718, "learning_rate": 0.0015, "loss": 1.4528, "step": 5230 }, { "epoch": 0.5527426160337553, "grad_norm": 0.5877173542976379, "learning_rate": 0.0015, "loss": 1.4613, "step": 5240 }, { "epoch": 0.5537974683544303, 
"grad_norm": 0.8450697064399719, "learning_rate": 0.0015, "loss": 1.4529, "step": 5250 }, { "epoch": 0.5548523206751055, "grad_norm": 0.5091837048530579, "learning_rate": 0.0015, "loss": 1.4591, "step": 5260 }, { "epoch": 0.5559071729957806, "grad_norm": 0.7349790334701538, "learning_rate": 0.0015, "loss": 1.4653, "step": 5270 }, { "epoch": 0.5569620253164557, "grad_norm": 1.0718812942504883, "learning_rate": 0.0015, "loss": 1.4561, "step": 5280 }, { "epoch": 0.5580168776371308, "grad_norm": 0.5732921957969666, "learning_rate": 0.0015, "loss": 1.4563, "step": 5290 }, { "epoch": 0.5590717299578059, "grad_norm": 0.5573410987854004, "learning_rate": 0.0015, "loss": 1.4617, "step": 5300 }, { "epoch": 0.560126582278481, "grad_norm": 0.6892499923706055, "learning_rate": 0.0015, "loss": 1.4465, "step": 5310 }, { "epoch": 0.5611814345991561, "grad_norm": 0.5653085112571716, "learning_rate": 0.0015, "loss": 1.4477, "step": 5320 }, { "epoch": 0.5622362869198312, "grad_norm": 0.8573117256164551, "learning_rate": 0.0015, "loss": 1.4575, "step": 5330 }, { "epoch": 0.5632911392405063, "grad_norm": 0.5884286165237427, "learning_rate": 0.0015, "loss": 1.4494, "step": 5340 }, { "epoch": 0.5643459915611815, "grad_norm": 0.5683361887931824, "learning_rate": 0.0015, "loss": 1.4454, "step": 5350 }, { "epoch": 0.5654008438818565, "grad_norm": 0.8135273456573486, "learning_rate": 0.0015, "loss": 1.4528, "step": 5360 }, { "epoch": 0.5664556962025317, "grad_norm": 0.5589624643325806, "learning_rate": 0.0015, "loss": 1.4534, "step": 5370 }, { "epoch": 0.5675105485232067, "grad_norm": 0.5421573519706726, "learning_rate": 0.0015, "loss": 1.4472, "step": 5380 }, { "epoch": 0.5685654008438819, "grad_norm": 0.6375049352645874, "learning_rate": 0.0015, "loss": 1.4455, "step": 5390 }, { "epoch": 0.569620253164557, "grad_norm": 0.5435382723808289, "learning_rate": 0.0015, "loss": 1.4458, "step": 5400 }, { "epoch": 0.570675105485232, "grad_norm": 0.5885555148124695, "learning_rate": 0.0015, "loss": 
1.451, "step": 5410 }, { "epoch": 0.5717299578059072, "grad_norm": 0.7123895287513733, "learning_rate": 0.0015, "loss": 1.4447, "step": 5420 }, { "epoch": 0.5727848101265823, "grad_norm": 0.7274113297462463, "learning_rate": 0.0015, "loss": 1.4421, "step": 5430 }, { "epoch": 0.5738396624472574, "grad_norm": 0.8285456299781799, "learning_rate": 0.0015, "loss": 1.441, "step": 5440 }, { "epoch": 0.5748945147679325, "grad_norm": 0.7984604239463806, "learning_rate": 0.0015, "loss": 1.4515, "step": 5450 }, { "epoch": 0.5759493670886076, "grad_norm": 0.6042565107345581, "learning_rate": 0.0015, "loss": 1.4512, "step": 5460 }, { "epoch": 0.5770042194092827, "grad_norm": 0.6467163562774658, "learning_rate": 0.0015, "loss": 1.4507, "step": 5470 }, { "epoch": 0.5780590717299579, "grad_norm": 0.7072213292121887, "learning_rate": 0.0015, "loss": 1.4507, "step": 5480 }, { "epoch": 0.5791139240506329, "grad_norm": 0.5475664734840393, "learning_rate": 0.0015, "loss": 1.4571, "step": 5490 }, { "epoch": 0.580168776371308, "grad_norm": 0.6362132430076599, "learning_rate": 0.0015, "loss": 1.4446, "step": 5500 }, { "epoch": 0.5812236286919831, "grad_norm": 1.106300711631775, "learning_rate": 0.0015, "loss": 1.4537, "step": 5510 }, { "epoch": 0.5822784810126582, "grad_norm": 0.5137573480606079, "learning_rate": 0.0015, "loss": 1.4442, "step": 5520 }, { "epoch": 0.5833333333333334, "grad_norm": 0.6358821988105774, "learning_rate": 0.0015, "loss": 1.4389, "step": 5530 }, { "epoch": 0.5843881856540084, "grad_norm": 0.5367628335952759, "learning_rate": 0.0015, "loss": 1.4381, "step": 5540 }, { "epoch": 0.5854430379746836, "grad_norm": 0.6821813583374023, "learning_rate": 0.0015, "loss": 1.454, "step": 5550 }, { "epoch": 0.5864978902953587, "grad_norm": 0.5292353630065918, "learning_rate": 0.0015, "loss": 1.4606, "step": 5560 }, { "epoch": 0.5875527426160337, "grad_norm": 0.4966440796852112, "learning_rate": 0.0015, "loss": 1.4436, "step": 5570 }, { "epoch": 0.5886075949367089, "grad_norm": 
0.5308849215507507, "learning_rate": 0.0015, "loss": 1.4429, "step": 5580 }, { "epoch": 0.5896624472573839, "grad_norm": 0.6342505216598511, "learning_rate": 0.0015, "loss": 1.4468, "step": 5590 }, { "epoch": 0.5907172995780591, "grad_norm": 0.5645837783813477, "learning_rate": 0.0015, "loss": 1.4409, "step": 5600 }, { "epoch": 0.5917721518987342, "grad_norm": 0.637165904045105, "learning_rate": 0.0015, "loss": 1.4491, "step": 5610 }, { "epoch": 0.5928270042194093, "grad_norm": 0.6109039783477783, "learning_rate": 0.0015, "loss": 1.4531, "step": 5620 }, { "epoch": 0.5938818565400844, "grad_norm": 0.6971760392189026, "learning_rate": 0.0015, "loss": 1.447, "step": 5630 }, { "epoch": 0.5949367088607594, "grad_norm": 0.6155142188072205, "learning_rate": 0.0015, "loss": 1.4584, "step": 5640 }, { "epoch": 0.5959915611814346, "grad_norm": 0.7192047238349915, "learning_rate": 0.0015, "loss": 1.454, "step": 5650 }, { "epoch": 0.5970464135021097, "grad_norm": 1.393180251121521, "learning_rate": 0.0015, "loss": 1.4544, "step": 5660 }, { "epoch": 0.5981012658227848, "grad_norm": 1.2101657390594482, "learning_rate": 0.0015, "loss": 1.4603, "step": 5670 }, { "epoch": 0.5991561181434599, "grad_norm": 0.9806029200553894, "learning_rate": 0.0015, "loss": 1.4542, "step": 5680 }, { "epoch": 0.6002109704641351, "grad_norm": 1.0299819707870483, "learning_rate": 0.0015, "loss": 1.4311, "step": 5690 }, { "epoch": 0.6012658227848101, "grad_norm": 0.563425600528717, "learning_rate": 0.0015, "loss": 1.4381, "step": 5700 }, { "epoch": 0.6023206751054853, "grad_norm": 0.5351040959358215, "learning_rate": 0.0015, "loss": 1.4414, "step": 5710 }, { "epoch": 0.6033755274261603, "grad_norm": 0.5208850502967834, "learning_rate": 0.0015, "loss": 1.4461, "step": 5720 }, { "epoch": 0.6044303797468354, "grad_norm": 0.5404925346374512, "learning_rate": 0.0015, "loss": 1.4455, "step": 5730 }, { "epoch": 0.6054852320675106, "grad_norm": 0.5373325943946838, "learning_rate": 0.0015, "loss": 1.4394, "step": 
5740 }, { "epoch": 0.6065400843881856, "grad_norm": 0.6910935044288635, "learning_rate": 0.0015, "loss": 1.4179, "step": 5750 }, { "epoch": 0.6075949367088608, "grad_norm": 0.5569452047348022, "learning_rate": 0.0015, "loss": 1.4294, "step": 5760 }, { "epoch": 0.6086497890295358, "grad_norm": 0.7494074702262878, "learning_rate": 0.0015, "loss": 1.4305, "step": 5770 }, { "epoch": 0.609704641350211, "grad_norm": 0.5173893570899963, "learning_rate": 0.0015, "loss": 1.438, "step": 5780 }, { "epoch": 0.6107594936708861, "grad_norm": 0.6150003671646118, "learning_rate": 0.0015, "loss": 1.4346, "step": 5790 }, { "epoch": 0.6118143459915611, "grad_norm": 0.541309118270874, "learning_rate": 0.0015, "loss": 1.4406, "step": 5800 }, { "epoch": 0.6128691983122363, "grad_norm": 0.523231565952301, "learning_rate": 0.0015, "loss": 1.4462, "step": 5810 }, { "epoch": 0.6139240506329114, "grad_norm": 0.5977320671081543, "learning_rate": 0.0015, "loss": 1.4574, "step": 5820 }, { "epoch": 0.6149789029535865, "grad_norm": 0.5601499080657959, "learning_rate": 0.0015, "loss": 1.4371, "step": 5830 }, { "epoch": 0.6160337552742616, "grad_norm": 0.731503963470459, "learning_rate": 0.0015, "loss": 1.4526, "step": 5840 }, { "epoch": 0.6170886075949367, "grad_norm": 0.599065363407135, "learning_rate": 0.0015, "loss": 1.4543, "step": 5850 }, { "epoch": 0.6181434599156118, "grad_norm": 0.5344615578651428, "learning_rate": 0.0015, "loss": 1.4414, "step": 5860 }, { "epoch": 0.619198312236287, "grad_norm": 0.5567651987075806, "learning_rate": 0.0015, "loss": 1.4368, "step": 5870 }, { "epoch": 0.620253164556962, "grad_norm": 0.681366503238678, "learning_rate": 0.0015, "loss": 1.4456, "step": 5880 }, { "epoch": 0.6213080168776371, "grad_norm": 0.5168014168739319, "learning_rate": 0.0015, "loss": 1.437, "step": 5890 }, { "epoch": 0.6223628691983122, "grad_norm": 0.6140822768211365, "learning_rate": 0.0015, "loss": 1.4383, "step": 5900 }, { "epoch": 0.6234177215189873, "grad_norm": 0.6157610416412354, 
"learning_rate": 0.0015, "loss": 1.4337, "step": 5910 }, { "epoch": 0.6244725738396625, "grad_norm": 0.5748449563980103, "learning_rate": 0.0015, "loss": 1.4274, "step": 5920 }, { "epoch": 0.6255274261603375, "grad_norm": 1.1245657205581665, "learning_rate": 0.0015, "loss": 1.4369, "step": 5930 }, { "epoch": 0.6265822784810127, "grad_norm": 0.6548138856887817, "learning_rate": 0.0015, "loss": 1.4441, "step": 5940 }, { "epoch": 0.6276371308016878, "grad_norm": 0.5102262496948242, "learning_rate": 0.0015, "loss": 1.4316, "step": 5950 }, { "epoch": 0.6286919831223629, "grad_norm": 0.5147098898887634, "learning_rate": 0.0015, "loss": 1.438, "step": 5960 }, { "epoch": 0.629746835443038, "grad_norm": 0.5202301740646362, "learning_rate": 0.0015, "loss": 1.4394, "step": 5970 }, { "epoch": 0.630801687763713, "grad_norm": 0.536027729511261, "learning_rate": 0.0015, "loss": 1.4223, "step": 5980 }, { "epoch": 0.6318565400843882, "grad_norm": 1.0913490056991577, "learning_rate": 0.0015, "loss": 1.4456, "step": 5990 }, { "epoch": 0.6329113924050633, "grad_norm": 0.5582550168037415, "learning_rate": 0.0015, "loss": 1.4539, "step": 6000 }, { "epoch": 0.6339662447257384, "grad_norm": 0.5896278023719788, "learning_rate": 0.0015, "loss": 1.4339, "step": 6010 }, { "epoch": 0.6350210970464135, "grad_norm": 0.9601753354072571, "learning_rate": 0.0015, "loss": 1.4385, "step": 6020 }, { "epoch": 0.6360759493670886, "grad_norm": 0.634880006313324, "learning_rate": 0.0015, "loss": 1.4206, "step": 6030 }, { "epoch": 0.6371308016877637, "grad_norm": 0.5148939490318298, "learning_rate": 0.0015, "loss": 1.4398, "step": 6040 }, { "epoch": 0.6381856540084389, "grad_norm": 0.5344108939170837, "learning_rate": 0.0015, "loss": 1.4243, "step": 6050 }, { "epoch": 0.6392405063291139, "grad_norm": 0.6601157188415527, "learning_rate": 0.0015, "loss": 1.4424, "step": 6060 }, { "epoch": 0.640295358649789, "grad_norm": 0.7422851324081421, "learning_rate": 0.0015, "loss": 1.4334, "step": 6070 }, { "epoch": 
0.6413502109704642, "grad_norm": 0.5776787400245667, "learning_rate": 0.0015, "loss": 1.4334, "step": 6080 }, { "epoch": 0.6424050632911392, "grad_norm": 0.5289221405982971, "learning_rate": 0.0015, "loss": 1.4328, "step": 6090 }, { "epoch": 0.6434599156118144, "grad_norm": 0.8894216418266296, "learning_rate": 0.0015, "loss": 1.4276, "step": 6100 }, { "epoch": 0.6445147679324894, "grad_norm": 0.5304521918296814, "learning_rate": 0.0015, "loss": 1.4398, "step": 6110 }, { "epoch": 0.6455696202531646, "grad_norm": 0.6131283044815063, "learning_rate": 0.0015, "loss": 1.4323, "step": 6120 }, { "epoch": 0.6466244725738397, "grad_norm": 0.7603117823600769, "learning_rate": 0.0015, "loss": 1.4328, "step": 6130 }, { "epoch": 0.6476793248945147, "grad_norm": null, "learning_rate": 0.0015, "loss": 1.4344, "step": 6140 }, { "epoch": 0.6487341772151899, "grad_norm": 0.5572797060012817, "learning_rate": 0.0015, "loss": 1.4415, "step": 6150 }, { "epoch": 0.6497890295358649, "grad_norm": 0.6372722387313843, "learning_rate": 0.0015, "loss": 1.4372, "step": 6160 }, { "epoch": 0.6508438818565401, "grad_norm": 0.5281897187232971, "learning_rate": 0.001487560447745699, "loss": 1.4152, "step": 6170 }, { "epoch": 0.6518987341772152, "grad_norm": 0.6531746983528137, "learning_rate": 0.0014670566859713624, "loss": 1.4363, "step": 6180 }, { "epoch": 0.6529535864978903, "grad_norm": 0.5641706585884094, "learning_rate": 0.0014468355374162303, "loss": 1.4428, "step": 6190 }, { "epoch": 0.6540084388185654, "grad_norm": 0.6546826362609863, "learning_rate": 0.0014268931066862504, "loss": 1.4312, "step": 6200 }, { "epoch": 0.6550632911392406, "grad_norm": 0.5389937162399292, "learning_rate": 0.0014072255520794614, "loss": 1.4232, "step": 6210 }, { "epoch": 0.6561181434599156, "grad_norm": 0.5590718388557434, "learning_rate": 0.0013878290848459301, "loss": 1.431, "step": 6220 }, { "epoch": 0.6571729957805907, "grad_norm": 0.7184352874755859, "learning_rate": 0.0013686999684578874, "loss": 
1.4193, "step": 6230 }, { "epoch": 0.6582278481012658, "grad_norm": 0.5333797335624695, "learning_rate": 0.001349834517889925, "loss": 1.4206, "step": 6240 }, { "epoch": 0.6592827004219409, "grad_norm": 0.48487091064453125, "learning_rate": 0.001331229098909114, "loss": 1.4279, "step": 6250 }, { "epoch": 0.6603375527426161, "grad_norm": 0.5883477926254272, "learning_rate": 0.0013128801273749075, "loss": 1.4172, "step": 6260 }, { "epoch": 0.6613924050632911, "grad_norm": 0.5650054216384888, "learning_rate": 0.0012947840685486932, "loss": 1.4268, "step": 6270 }, { "epoch": 0.6624472573839663, "grad_norm": 0.6160814166069031, "learning_rate": 0.0012769374364128628, "loss": 1.4233, "step": 6280 }, { "epoch": 0.6635021097046413, "grad_norm": 0.5125714540481567, "learning_rate": 0.0012593367929992667, "loss": 1.417, "step": 6290 }, { "epoch": 0.6645569620253164, "grad_norm": 0.6019501090049744, "learning_rate": 0.0012419787477269257, "loss": 1.4113, "step": 6300 }, { "epoch": 0.6656118143459916, "grad_norm": 0.5925447344779968, "learning_rate": 0.0012248599567488698, "loss": 1.4096, "step": 6310 }, { "epoch": 0.6666666666666666, "grad_norm": 0.47715866565704346, "learning_rate": 0.0012079771223079822, "loss": 1.4019, "step": 6320 }, { "epoch": 0.6677215189873418, "grad_norm": 0.5875175595283508, "learning_rate": 0.0011913269921017202, "loss": 1.4112, "step": 6330 }, { "epoch": 0.6687763713080169, "grad_norm": 0.7462964057922363, "learning_rate": 0.0011749063586555919, "loss": 1.4049, "step": 6340 }, { "epoch": 0.669831223628692, "grad_norm": 0.6075565218925476, "learning_rate": 0.001158712058705271, "loss": 1.41, "step": 6350 }, { "epoch": 0.6708860759493671, "grad_norm": 0.8765844106674194, "learning_rate": 0.0011427409725872262, "loss": 1.4071, "step": 6360 }, { "epoch": 0.6719409282700421, "grad_norm": 0.7356165051460266, "learning_rate": 0.00112699002363775, "loss": 1.387, "step": 6370 }, { "epoch": 0.6729957805907173, "grad_norm": 0.47462791204452515, 
"learning_rate": 0.0011114561776002726, "loss": 1.389, "step": 6380 }, { "epoch": 0.6740506329113924, "grad_norm": 0.9150704741477966, "learning_rate": 0.001096136442040843, "loss": 1.4043, "step": 6390 }, { "epoch": 0.6751054852320675, "grad_norm": 0.7336153388023376, "learning_rate": 0.001081027865771668, "loss": 1.4043, "step": 6400 }, { "epoch": 0.6761603375527426, "grad_norm": 0.5432242155075073, "learning_rate": 0.0010661275382825958, "loss": 1.3978, "step": 6410 }, { "epoch": 0.6772151898734177, "grad_norm": 0.5393098592758179, "learning_rate": 0.0010514325891804379, "loss": 1.3866, "step": 6420 }, { "epoch": 0.6782700421940928, "grad_norm": 0.4978395700454712, "learning_rate": 0.0010369401876360166, "loss": 1.3949, "step": 6430 }, { "epoch": 0.679324894514768, "grad_norm": 0.47768574953079224, "learning_rate": 0.001022647541838836, "loss": 1.38, "step": 6440 }, { "epoch": 0.680379746835443, "grad_norm": 0.6149685978889465, "learning_rate": 0.0010085518984592678, "loss": 1.3817, "step": 6450 }, { "epoch": 0.6814345991561181, "grad_norm": 0.5218510031700134, "learning_rate": 0.0009946505421181513, "loss": 1.3933, "step": 6460 }, { "epoch": 0.6824894514767933, "grad_norm": 0.5168776512145996, "learning_rate": 0.0009809407948637044, "loss": 1.3879, "step": 6470 }, { "epoch": 0.6835443037974683, "grad_norm": 0.6201766729354858, "learning_rate": 0.0009674200156556436, "loss": 1.3931, "step": 6480 }, { "epoch": 0.6845991561181435, "grad_norm": 0.5909963846206665, "learning_rate": 0.0009540855998564147, "loss": 1.389, "step": 6490 }, { "epoch": 0.6856540084388185, "grad_norm": 0.5744777917861938, "learning_rate": 0.000940934978729437, "loss": 1.4006, "step": 6500 }, { "epoch": 0.6867088607594937, "grad_norm": 0.4588823914527893, "learning_rate": 0.0009279656189442628, "loss": 1.4011, "step": 6510 }, { "epoch": 0.6877637130801688, "grad_norm": 0.4933320879936218, "learning_rate": 0.0009151750220885573, "loss": 1.3857, "step": 6520 }, { "epoch": 0.6888185654008439, 
"grad_norm": 0.5646079778671265, "learning_rate": 0.0009025607241868057, "loss": 1.3734, "step": 6530 }, { "epoch": 0.689873417721519, "grad_norm": 0.6859690546989441, "learning_rate": 0.0008901202952256545, "loss": 1.3858, "step": 6540 }, { "epoch": 0.6909282700421941, "grad_norm": 0.5678068399429321, "learning_rate": 0.0008778513386857928, "loss": 1.3776, "step": 6550 }, { "epoch": 0.6919831223628692, "grad_norm": 0.5483217835426331, "learning_rate": 0.0008657514910802905, "loss": 1.3772, "step": 6560 }, { "epoch": 0.6930379746835443, "grad_norm": 0.4671037495136261, "learning_rate": 0.0008538184214992943, "loss": 1.3731, "step": 6570 }, { "epoch": 0.6940928270042194, "grad_norm": 0.5548893809318542, "learning_rate": 0.0008420498311610049, "loss": 1.3706, "step": 6580 }, { "epoch": 0.6951476793248945, "grad_norm": 0.5803453922271729, "learning_rate": 0.0008304434529688382, "loss": 1.3719, "step": 6590 }, { "epoch": 0.6962025316455697, "grad_norm": 0.5223375558853149, "learning_rate": 0.0008189970510746938, "loss": 1.3799, "step": 6600 }, { "epoch": 0.6972573839662447, "grad_norm": 0.5233261585235596, "learning_rate": 0.0008077084204482425, "loss": 1.3666, "step": 6610 }, { "epoch": 0.6983122362869199, "grad_norm": 0.530835747718811, "learning_rate": 0.0007965753864521494, "loss": 1.3752, "step": 6620 }, { "epoch": 0.6993670886075949, "grad_norm": 0.4769565463066101, "learning_rate": 0.0007855958044231527, "loss": 1.3673, "step": 6630 }, { "epoch": 0.70042194092827, "grad_norm": 0.6102154850959778, "learning_rate": 0.000774767559258917, "loss": 1.3712, "step": 6640 }, { "epoch": 0.7014767932489452, "grad_norm": 0.5593146085739136, "learning_rate": 0.0007640885650105806, "loss": 1.3645, "step": 6650 }, { "epoch": 0.7025316455696202, "grad_norm": 0.594312846660614, "learning_rate": 0.0007535567644809191, "loss": 1.3636, "step": 6660 }, { "epoch": 0.7035864978902954, "grad_norm": 0.5139537453651428, "learning_rate": 0.0007431701288280478, "loss": 1.3638, "step": 6670 
}, { "epoch": 0.7046413502109705, "grad_norm": 0.46587347984313965, "learning_rate": 0.0007329266571745864, "loss": 1.3649, "step": 6680 }, { "epoch": 0.7056962025316456, "grad_norm": 0.5560813546180725, "learning_rate": 0.0007228243762222109, "loss": 1.3593, "step": 6690 }, { "epoch": 0.7067510548523207, "grad_norm": 0.4940660297870636, "learning_rate": 0.0007128613398715179, "loss": 1.3623, "step": 6700 }, { "epoch": 0.7078059071729957, "grad_norm": 0.49038827419281006, "learning_rate": 0.0007030356288471288, "loss": 1.3619, "step": 6710 }, { "epoch": 0.7088607594936709, "grad_norm": 0.6032112240791321, "learning_rate": 0.0006933453503279619, "loss": 1.3517, "step": 6720 }, { "epoch": 0.709915611814346, "grad_norm": 0.5079308152198792, "learning_rate": 0.0006837886375825994, "loss": 1.3612, "step": 6730 }, { "epoch": 0.7109704641350211, "grad_norm": 0.5179148316383362, "learning_rate": 0.0006743636496096813, "loss": 1.3622, "step": 6740 }, { "epoch": 0.7120253164556962, "grad_norm": 0.4907872974872589, "learning_rate": 0.0006650685707832559, "loss": 1.3621, "step": 6750 }, { "epoch": 0.7130801687763713, "grad_norm": 0.6771067976951599, "learning_rate": 0.0006559016105030176, "loss": 1.3606, "step": 6760 }, { "epoch": 0.7141350210970464, "grad_norm": 0.6246157288551331, "learning_rate": 0.000646861002849367, "loss": 1.3595, "step": 6770 }, { "epoch": 0.7151898734177216, "grad_norm": 0.4946833550930023, "learning_rate": 0.0006379450062432248, "loss": 1.3566, "step": 6780 }, { "epoch": 0.7162447257383966, "grad_norm": 0.503964900970459, "learning_rate": 0.0006291519031105347, "loss": 1.3611, "step": 6790 }, { "epoch": 0.7172995780590717, "grad_norm": 0.48558634519577026, "learning_rate": 0.00062047999955139, "loss": 1.3469, "step": 6800 }, { "epoch": 0.7183544303797469, "grad_norm": 0.6461641788482666, "learning_rate": 0.000611927625013722, "loss": 1.3491, "step": 6810 }, { "epoch": 0.7194092827004219, "grad_norm": 0.5477721095085144, "learning_rate": 
0.0006034931319714858, "loss": 1.353, "step": 6820 }, { "epoch": 0.7204641350210971, "grad_norm": 0.6451165080070496, "learning_rate": 0.0005951748956072806, "loss": 1.3514, "step": 6830 }, { "epoch": 0.7215189873417721, "grad_norm": 0.7312910556793213, "learning_rate": 0.0005869713134993463, "loss": 1.3396, "step": 6840 }, { "epoch": 0.7225738396624473, "grad_norm": 0.4609370529651642, "learning_rate": 0.0005788808053128734, "loss": 1.3474, "step": 6850 }, { "epoch": 0.7236286919831224, "grad_norm": 0.5799684524536133, "learning_rate": 0.0005709018124955674, "loss": 1.342, "step": 6860 }, { "epoch": 0.7246835443037974, "grad_norm": 0.5226260423660278, "learning_rate": 0.0005630327979774111, "loss": 1.3405, "step": 6870 }, { "epoch": 0.7257383966244726, "grad_norm": 0.6010406017303467, "learning_rate": 0.0005552722458745627, "loss": 1.348, "step": 6880 }, { "epoch": 0.7267932489451476, "grad_norm": 0.5826901197433472, "learning_rate": 0.0005476186611973374, "loss": 1.3383, "step": 6890 }, { "epoch": 0.7278481012658228, "grad_norm": 0.5549695491790771, "learning_rate": 0.000540070569562213, "loss": 1.3397, "step": 6900 }, { "epoch": 0.7289029535864979, "grad_norm": 0.5249995589256287, "learning_rate": 0.0005326265169078048, "loss": 1.3595, "step": 6910 }, { "epoch": 0.729957805907173, "grad_norm": 0.565153181552887, "learning_rate": 0.0005252850692147567, "loss": 1.3401, "step": 6920 }, { "epoch": 0.7310126582278481, "grad_norm": 0.48969659209251404, "learning_rate": 0.0005180448122294913, "loss": 1.3475, "step": 6930 }, { "epoch": 0.7320675105485233, "grad_norm": 0.5739888548851013, "learning_rate": 0.0005109043511917693, "loss": 1.3471, "step": 6940 }, { "epoch": 0.7331223628691983, "grad_norm": 0.5284473896026611, "learning_rate": 0.0005038623105660032, "loss": 1.3465, "step": 6950 }, { "epoch": 0.7341772151898734, "grad_norm": 0.4692670404911041, "learning_rate": 0.0004969173337762747, "loss": 1.3364, "step": 6960 }, { "epoch": 0.7352320675105485, "grad_norm": 
0.4796403646469116, "learning_rate": 0.0004900680829450042, "loss": 1.33, "step": 6970 }, { "epoch": 0.7362869198312236, "grad_norm": 0.48727184534072876, "learning_rate": 0.0004833132386352233, "loss": 1.336, "step": 6980 }, { "epoch": 0.7373417721518988, "grad_norm": 0.4695288836956024, "learning_rate": 0.00047665149959639813, "loss": 1.334, "step": 6990 }, { "epoch": 0.7383966244725738, "grad_norm": 0.5014991164207458, "learning_rate": 0.0004700815825137577, "loss": 1.3422, "step": 7000 }, { "epoch": 0.739451476793249, "grad_norm": 0.44504332542419434, "learning_rate": 0.00046360222176107584, "loss": 1.3159, "step": 7010 }, { "epoch": 0.740506329113924, "grad_norm": 0.6361677646636963, "learning_rate": 0.0004572121691568625, "loss": 1.331, "step": 7020 }, { "epoch": 0.7415611814345991, "grad_norm": 0.5153830647468567, "learning_rate": 0.00045091019372391354, "loss": 1.3341, "step": 7030 }, { "epoch": 0.7426160337552743, "grad_norm": 0.5851036906242371, "learning_rate": 0.0004446950814521764, "loss": 1.3415, "step": 7040 }, { "epoch": 0.7436708860759493, "grad_norm": 0.5900114178657532, "learning_rate": 0.0004385656350648835, "loss": 1.328, "step": 7050 }, { "epoch": 0.7447257383966245, "grad_norm": 0.5923007726669312, "learning_rate": 0.00043252067378790946, "loss": 1.3333, "step": 7060 }, { "epoch": 0.7457805907172996, "grad_norm": 0.4834819734096527, "learning_rate": 0.00042655903312230673, "loss": 1.3377, "step": 7070 }, { "epoch": 0.7468354430379747, "grad_norm": 0.499101847410202, "learning_rate": 0.0004206795646199778, "loss": 1.3337, "step": 7080 }, { "epoch": 0.7478902953586498, "grad_norm": 0.5846497416496277, "learning_rate": 0.0004148811356624379, "loss": 1.3341, "step": 7090 }, { "epoch": 0.7489451476793249, "grad_norm": 0.471291720867157, "learning_rate": 0.0004091626292426282, "loss": 1.3285, "step": 7100 }, { "epoch": 0.75, "grad_norm": 0.4688068628311157, "learning_rate": 0.0004035229437497357, "loss": 1.3382, "step": 7110 }, { "epoch": 
0.7510548523206751, "grad_norm": 0.5641908645629883, "learning_rate": 0.00039796099275697986, "loss": 1.3303, "step": 7120 }, { "epoch": 0.7521097046413502, "grad_norm": 0.4740554392337799, "learning_rate": 0.0003924757048123232, "loss": 1.3412, "step": 7130 }, { "epoch": 0.7531645569620253, "grad_norm": 0.4787723422050476, "learning_rate": 0.0003870660232320675, "loss": 1.3223, "step": 7140 }, { "epoch": 0.7542194092827004, "grad_norm": 0.5571549534797668, "learning_rate": 0.000381730905897295, "loss": 1.321, "step": 7150 }, { "epoch": 0.7552742616033755, "grad_norm": 0.4965110421180725, "learning_rate": 0.0003764693250531141, "loss": 1.3276, "step": 7160 }, { "epoch": 0.7563291139240507, "grad_norm": 0.7557862401008606, "learning_rate": 0.0003712802671106742, "loss": 1.3378, "step": 7170 }, { "epoch": 0.7573839662447257, "grad_norm": 0.5894241333007812, "learning_rate": 0.0003661627324519073, "loss": 1.3146, "step": 7180 }, { "epoch": 0.7584388185654009, "grad_norm": 0.49034640192985535, "learning_rate": 0.0003611157352369628, "loss": 1.3182, "step": 7190 }, { "epoch": 0.759493670886076, "grad_norm": 0.48033463954925537, "learning_rate": 0.00035613830321429534, "loss": 1.322, "step": 7200 }, { "epoch": 0.760548523206751, "grad_norm": 0.5243462324142456, "learning_rate": 0.00035122947753337037, "loss": 1.3255, "step": 7210 }, { "epoch": 0.7616033755274262, "grad_norm": 0.5140429139137268, "learning_rate": 0.0003463883125599521, "loss": 1.3377, "step": 7220 }, { "epoch": 0.7626582278481012, "grad_norm": 0.5377463102340698, "learning_rate": 0.00034161387569393647, "loss": 1.3328, "step": 7230 }, { "epoch": 0.7637130801687764, "grad_norm": 0.5283532738685608, "learning_rate": 0.00033690524718969593, "loss": 1.3296, "step": 7240 }, { "epoch": 0.7647679324894515, "grad_norm": 0.4901208281517029, "learning_rate": 0.0003322615199788993, "loss": 1.3233, "step": 7250 }, { "epoch": 0.7658227848101266, "grad_norm": 0.4646488130092621, "learning_rate": 0.00032768179949577516, 
"loss": 1.321, "step": 7260 }, { "epoch": 0.7668776371308017, "grad_norm": 0.49830129742622375, "learning_rate": 0.0003231652035047826, "loss": 1.3192, "step": 7270 }, { "epoch": 0.7679324894514767, "grad_norm": 0.6454711556434631, "learning_rate": 0.000318710861930658, "loss": 1.3227, "step": 7280 }, { "epoch": 0.7689873417721519, "grad_norm": 0.4911431074142456, "learning_rate": 0.0003143179166908038, "loss": 1.3326, "step": 7290 }, { "epoch": 0.770042194092827, "grad_norm": 0.5197914242744446, "learning_rate": 0.00030998552152998834, "loss": 1.3362, "step": 7300 }, { "epoch": 0.7710970464135021, "grad_norm": 0.4570811688899994, "learning_rate": 0.00030571284185732276, "loss": 1.3172, "step": 7310 }, { "epoch": 0.7721518987341772, "grad_norm": 0.5702477693557739, "learning_rate": 0.0003014990545854864, "loss": 1.3117, "step": 7320 }, { "epoch": 0.7732067510548524, "grad_norm": 0.5021108984947205, "learning_rate": 0.0002973433479721675, "loss": 1.3169, "step": 7330 }, { "epoch": 0.7742616033755274, "grad_norm": 0.5911805629730225, "learning_rate": 0.00029324492146368906, "loss": 1.3139, "step": 7340 }, { "epoch": 0.7753164556962026, "grad_norm": 0.4756425619125366, "learning_rate": 0.00028920298554079113, "loss": 1.3101, "step": 7350 }, { "epoch": 0.7763713080168776, "grad_norm": 0.4732058048248291, "learning_rate": 0.00028521676156653756, "loss": 1.3195, "step": 7360 }, { "epoch": 0.7774261603375527, "grad_norm": 0.5115228891372681, "learning_rate": 0.00028128548163632006, "loss": 1.3191, "step": 7370 }, { "epoch": 0.7784810126582279, "grad_norm": 0.5534390807151794, "learning_rate": 0.0002774083884299292, "loss": 1.3064, "step": 7380 }, { "epoch": 0.7795358649789029, "grad_norm": 0.46912050247192383, "learning_rate": 0.0002735847350656645, "loss": 1.3158, "step": 7390 }, { "epoch": 0.7805907172995781, "grad_norm": 0.5076785683631897, "learning_rate": 0.0002698137849564556, "loss": 1.3241, "step": 7400 }, { "epoch": 0.7816455696202531, "grad_norm": 
0.5120282173156738, "learning_rate": 0.0002660948116679665, "loss": 1.3036, "step": 7410 }, { "epoch": 0.7827004219409283, "grad_norm": 0.5615637898445129, "learning_rate": 0.00026242709877865493, "loss": 1.309, "step": 7420 }, { "epoch": 0.7837552742616034, "grad_norm": 0.4717916250228882, "learning_rate": 0.00025880993974176204, "loss": 1.2957, "step": 7430 }, { "epoch": 0.7848101265822784, "grad_norm": 0.47996142506599426, "learning_rate": 0.0002552426377492028, "loss": 1.2992, "step": 7440 }, { "epoch": 0.7858649789029536, "grad_norm": 0.45766481757164, "learning_rate": 0.0002517245055973337, "loss": 1.303, "step": 7450 }, { "epoch": 0.7869198312236287, "grad_norm": 0.482280433177948, "learning_rate": 0.00024825486555456975, "loss": 1.3137, "step": 7460 }, { "epoch": 0.7879746835443038, "grad_norm": 0.5143918395042419, "learning_rate": 0.00024483304923082663, "loss": 1.3052, "step": 7470 }, { "epoch": 0.7890295358649789, "grad_norm": 0.513719916343689, "learning_rate": 0.0002414583974487624, "loss": 1.3109, "step": 7480 }, { "epoch": 0.790084388185654, "grad_norm": 0.45668476819992065, "learning_rate": 0.00023813026011679372, "loss": 1.3182, "step": 7490 }, { "epoch": 0.7911392405063291, "grad_norm": 0.49208977818489075, "learning_rate": 0.0002348479961038625, "loss": 1.313, "step": 7500 }, { "epoch": 0.7921940928270043, "grad_norm": 0.46777087450027466, "learning_rate": 0.00023161097311592867, "loss": 1.3178, "step": 7510 }, { "epoch": 0.7932489451476793, "grad_norm": 0.4836950898170471, "learning_rate": 0.00022841856757416538, "loss": 1.303, "step": 7520 }, { "epoch": 0.7943037974683544, "grad_norm": 0.46816742420196533, "learning_rate": 0.0002252701644948328, "loss": 1.3028, "step": 7530 }, { "epoch": 0.7953586497890295, "grad_norm": 0.5724418759346008, "learning_rate": 0.00022216515737080817, "loss": 1.3023, "step": 7540 }, { "epoch": 0.7964135021097046, "grad_norm": 0.4637795388698578, "learning_rate": 0.00021910294805474833, "loss": 1.2987, "step": 7550 
}, { "epoch": 0.7974683544303798, "grad_norm": 0.5904604196548462, "learning_rate": 0.0002160829466438629, "loss": 1.3005, "step": 7560 }, { "epoch": 0.7985232067510548, "grad_norm": 0.4693807065486908, "learning_rate": 0.00021310457136627562, "loss": 1.3132, "step": 7570 }, { "epoch": 0.79957805907173, "grad_norm": 0.47196266055107117, "learning_rate": 0.00021016724846895213, "loss": 1.3096, "step": 7580 }, { "epoch": 0.8006329113924051, "grad_norm": 0.4710361957550049, "learning_rate": 0.00020727041210717235, "loss": 1.3005, "step": 7590 }, { "epoch": 0.8016877637130801, "grad_norm": 0.4702761471271515, "learning_rate": 0.00020441350423552624, "loss": 1.3018, "step": 7600 }, { "epoch": 0.8027426160337553, "grad_norm": 0.5067238211631775, "learning_rate": 0.00020159597450041257, "loss": 1.3151, "step": 7610 }, { "epoch": 0.8037974683544303, "grad_norm": 0.4618867039680481, "learning_rate": 0.00019881728013401842, "loss": 1.2896, "step": 7620 }, { "epoch": 0.8048523206751055, "grad_norm": 0.534237265586853, "learning_rate": 0.00019607688584976116, "loss": 1.3143, "step": 7630 }, { "epoch": 0.8059071729957806, "grad_norm": 0.5107449293136597, "learning_rate": 0.00019337426373917076, "loss": 1.3021, "step": 7640 }, { "epoch": 0.8069620253164557, "grad_norm": 0.47250908613204956, "learning_rate": 0.00019070889317019375, "loss": 1.3203, "step": 7650 }, { "epoch": 0.8080168776371308, "grad_norm": 0.463882178068161, "learning_rate": 0.00018808026068689883, "loss": 1.2975, "step": 7660 }, { "epoch": 0.8090717299578059, "grad_norm": 0.4829825460910797, "learning_rate": 0.00018548785991056508, "loss": 1.3031, "step": 7670 }, { "epoch": 0.810126582278481, "grad_norm": 0.5524773001670837, "learning_rate": 0.00018293119144213328, "loss": 1.3172, "step": 7680 }, { "epoch": 0.8111814345991561, "grad_norm": 0.6024357676506042, "learning_rate": 0.00018040976276600176, "loss": 1.2894, "step": 7690 }, { "epoch": 0.8122362869198312, "grad_norm": 0.5167730450630188, "learning_rate": 
0.00017792308815514854, "loss": 1.2996, "step": 7700 }, { "epoch": 0.8132911392405063, "grad_norm": 0.537597119808197, "learning_rate": 0.00017547068857756104, "loss": 1.2992, "step": 7710 }, { "epoch": 0.8143459915611815, "grad_norm": 0.47882741689682007, "learning_rate": 0.00017305209160395547, "loss": 1.3024, "step": 7720 }, { "epoch": 0.8154008438818565, "grad_norm": 0.5075975060462952, "learning_rate": 0.00017066683131676825, "loss": 1.3086, "step": 7730 }, { "epoch": 0.8164556962025317, "grad_norm": 0.45599648356437683, "learning_rate": 0.00016831444822040207, "loss": 1.3073, "step": 7740 }, { "epoch": 0.8175105485232067, "grad_norm": 0.4692286252975464, "learning_rate": 0.00016599448915270845, "loss": 1.2932, "step": 7750 }, { "epoch": 0.8185654008438819, "grad_norm": 0.5363489985466003, "learning_rate": 0.000163706507197691, "loss": 1.294, "step": 7760 }, { "epoch": 0.819620253164557, "grad_norm": 0.46856990456581116, "learning_rate": 0.0001614500615994117, "loss": 1.302, "step": 7770 }, { "epoch": 0.820675105485232, "grad_norm": 0.529060959815979, "learning_rate": 0.00015922471767708377, "loss": 1.2952, "step": 7780 }, { "epoch": 0.8217299578059072, "grad_norm": 0.6456242799758911, "learning_rate": 0.00015703004674133498, "loss": 1.3112, "step": 7790 }, { "epoch": 0.8227848101265823, "grad_norm": 0.4637324810028076, "learning_rate": 0.00015486562601162512, "loss": 1.2997, "step": 7800 }, { "epoch": 0.8238396624472574, "grad_norm": 0.523307204246521, "learning_rate": 0.0001527310385348017, "loss": 1.2974, "step": 7810 }, { "epoch": 0.8248945147679325, "grad_norm": 0.45750248432159424, "learning_rate": 0.00015062587310477813, "loss": 1.304, "step": 7820 }, { "epoch": 0.8259493670886076, "grad_norm": 0.463380366563797, "learning_rate": 0.00014854972418331948, "loss": 1.2948, "step": 7830 }, { "epoch": 0.8270042194092827, "grad_norm": 0.48942434787750244, "learning_rate": 0.00014650219182191934, "loss": 1.2825, "step": 7840 }, { "epoch": 0.8280590717299579, 
"grad_norm": 0.525461733341217, "learning_rate": 0.00014448288158475423, "loss": 1.2923, "step": 7850 }, { "epoch": 0.8291139240506329, "grad_norm": 0.45308318734169006, "learning_rate": 0.0001424914044726995, "loss": 1.2776, "step": 7860 }, { "epoch": 0.830168776371308, "grad_norm": 0.6268229484558105, "learning_rate": 0.0001405273768483926, "loss": 1.291, "step": 7870 }, { "epoch": 0.8312236286919831, "grad_norm": 0.496448278427124, "learning_rate": 0.0001385904203623296, "loss": 1.3135, "step": 7880 }, { "epoch": 0.8322784810126582, "grad_norm": 0.47262054681777954, "learning_rate": 0.00013668016187997964, "loss": 1.3109, "step": 7890 }, { "epoch": 0.8333333333333334, "grad_norm": 0.4877476692199707, "learning_rate": 0.0001347962334099052, "loss": 1.2887, "step": 7900 }, { "epoch": 0.8343881856540084, "grad_norm": 0.5657548904418945, "learning_rate": 0.00013293827203287141, "loss": 1.2999, "step": 7910 }, { "epoch": 0.8354430379746836, "grad_norm": 0.5200432538986206, "learning_rate": 0.00013110591983193424, "loss": 1.296, "step": 7920 }, { "epoch": 0.8364978902953587, "grad_norm": 0.4625248908996582, "learning_rate": 0.00012929882382349103, "loss": 1.3053, "step": 7930 }, { "epoch": 0.8375527426160337, "grad_norm": 0.5191882252693176, "learning_rate": 0.0001275166358892821, "loss": 1.3006, "step": 7940 }, { "epoch": 0.8386075949367089, "grad_norm": 0.49986517429351807, "learning_rate": 0.00012575901270932944, "loss": 1.3017, "step": 7950 }, { "epoch": 0.8396624472573839, "grad_norm": 0.4724440276622772, "learning_rate": 0.00012402561569579935, "loss": 1.287, "step": 7960 }, { "epoch": 0.8407172995780591, "grad_norm": 0.4518606662750244, "learning_rate": 0.00012231611092777743, "loss": 1.2891, "step": 7970 }, { "epoch": 0.8417721518987342, "grad_norm": 0.46727433800697327, "learning_rate": 0.00012063016908694192, "loss": 1.2874, "step": 7980 }, { "epoch": 0.8428270042194093, "grad_norm": 0.5409188866615295, "learning_rate": 0.00011896746539412405, "loss": 
1.3011, "step": 7990 }, { "epoch": 0.8438818565400844, "grad_norm": 0.5100900530815125, "learning_rate": 0.00011732767954674264, "loss": 1.2931, "step": 8000 }, { "epoch": 0.8449367088607594, "grad_norm": 0.49695006012916565, "learning_rate": 0.00011571049565710122, "loss": 1.3001, "step": 8010 }, { "epoch": 0.8459915611814346, "grad_norm": 0.5343964695930481, "learning_rate": 0.00011411560219153552, "loss": 1.2943, "step": 8020 }, { "epoch": 0.8470464135021097, "grad_norm": 0.4976387023925781, "learning_rate": 0.0001125426919103997, "loss": 1.2914, "step": 8030 }, { "epoch": 0.8481012658227848, "grad_norm": 0.47220513224601746, "learning_rate": 0.00011099146180887992, "loss": 1.3017, "step": 8040 }, { "epoch": 0.8491561181434599, "grad_norm": 0.465561181306839, "learning_rate": 0.0001094616130586235, "loss": 1.284, "step": 8050 }, { "epoch": 0.8502109704641351, "grad_norm": 0.49773499369621277, "learning_rate": 0.00010795285095017282, "loss": 1.2936, "step": 8060 }, { "epoch": 0.8512658227848101, "grad_norm": 0.4696478545665741, "learning_rate": 0.00010646488483619263, "loss": 1.2975, "step": 8070 }, { "epoch": 0.8523206751054853, "grad_norm": 0.4973534643650055, "learning_rate": 0.00010499742807547978, "loss": 1.2995, "step": 8080 }, { "epoch": 0.8533755274261603, "grad_norm": 0.5054043531417847, "learning_rate": 0.0001035501979777448, "loss": 1.2895, "step": 8090 }, { "epoch": 0.8544303797468354, "grad_norm": 0.5566696524620056, "learning_rate": 0.00010212291574915464, "loss": 1.2813, "step": 8100 }, { "epoch": 0.8554852320675106, "grad_norm": 0.5192328691482544, "learning_rate": 0.00010071530643862575, "loss": 1.299, "step": 8110 }, { "epoch": 0.8565400843881856, "grad_norm": 0.4706285893917084, "learning_rate": 9.932709888485788e-05, "loss": 1.2908, "step": 8120 }, { "epoch": 0.8575949367088608, "grad_norm": 0.5187116861343384, "learning_rate": 9.79580256640974e-05, "loss": 1.2899, "step": 8130 }, { "epoch": 0.8586497890295358, "grad_norm": 
0.46268996596336365, "learning_rate": 9.660782303862107e-05, "loss": 1.2889, "step": 8140 }, { "epoch": 0.859704641350211, "grad_norm": 0.4860462546348572, "learning_rate": 9.527623090592962e-05, "loss": 1.282, "step": 8150 }, { "epoch": 0.8607594936708861, "grad_norm": 0.49108442664146423, "learning_rate": 9.396299274864176e-05, "loss": 1.296, "step": 8160 }, { "epoch": 0.8618143459915611, "grad_norm": 0.5382658839225769, "learning_rate": 9.266785558507876e-05, "loss": 1.2887, "step": 8170 }, { "epoch": 0.8628691983122363, "grad_norm": 0.5346512198448181, "learning_rate": 9.139056992053017e-05, "loss": 1.2945, "step": 8180 }, { "epoch": 0.8639240506329114, "grad_norm": 0.5191641449928284, "learning_rate": 9.01308896991912e-05, "loss": 1.286, "step": 8190 }, { "epoch": 0.8649789029535865, "grad_norm": 0.46642035245895386, "learning_rate": 8.88885722567627e-05, "loss": 1.2987, "step": 8200 }, { "epoch": 0.8660337552742616, "grad_norm": 0.4673319160938263, "learning_rate": 8.766337827370438e-05, "loss": 1.2916, "step": 8210 }, { "epoch": 0.8670886075949367, "grad_norm": 0.4642171263694763, "learning_rate": 8.645507172913238e-05, "loss": 1.2993, "step": 8220 }, { "epoch": 0.8681434599156118, "grad_norm": 0.47717443108558655, "learning_rate": 8.52634198553523e-05, "loss": 1.2958, "step": 8230 }, { "epoch": 0.869198312236287, "grad_norm": 0.4526664912700653, "learning_rate": 8.408819309301891e-05, "loss": 1.2922, "step": 8240 }, { "epoch": 0.870253164556962, "grad_norm": 0.4961718022823334, "learning_rate": 8.292916504691398e-05, "loss": 1.3044, "step": 8250 }, { "epoch": 0.8713080168776371, "grad_norm": 0.4467519521713257, "learning_rate": 8.178611244233354e-05, "loss": 1.2958, "step": 8260 }, { "epoch": 0.8723628691983122, "grad_norm": 0.4841761887073517, "learning_rate": 8.065881508207636e-05, "loss": 1.293, "step": 8270 }, { "epoch": 0.8734177215189873, "grad_norm": 0.4580075442790985, "learning_rate": 7.954705580402525e-05, "loss": 1.2756, "step": 8280 }, { 
"epoch": 0.8744725738396625, "grad_norm": 0.5612879395484924, "learning_rate": 7.845062043931299e-05, "loss": 1.2776, "step": 8290 }, { "epoch": 0.8755274261603375, "grad_norm": 0.4572225511074066, "learning_rate": 7.736929777106499e-05, "loss": 1.3018, "step": 8300 }, { "epoch": 0.8765822784810127, "grad_norm": 0.46267932653427124, "learning_rate": 7.630287949371051e-05, "loss": 1.2814, "step": 8310 }, { "epoch": 0.8776371308016878, "grad_norm": 0.44602736830711365, "learning_rate": 7.525116017285479e-05, "loss": 1.3011, "step": 8320 }, { "epoch": 0.8786919831223629, "grad_norm": 0.4606595039367676, "learning_rate": 7.421393720570416e-05, "loss": 1.2942, "step": 8330 }, { "epoch": 0.879746835443038, "grad_norm": 0.45960938930511475, "learning_rate": 7.319101078203692e-05, "loss": 1.2803, "step": 8340 }, { "epoch": 0.880801687763713, "grad_norm": 0.49523669481277466, "learning_rate": 7.218218384571176e-05, "loss": 1.283, "step": 8350 }, { "epoch": 0.8818565400843882, "grad_norm": 0.45749959349632263, "learning_rate": 7.118726205670702e-05, "loss": 1.2813, "step": 8360 }, { "epoch": 0.8829113924050633, "grad_norm": 0.4498620629310608, "learning_rate": 7.020605375368314e-05, "loss": 1.2887, "step": 8370 }, { "epoch": 0.8839662447257384, "grad_norm": 0.4596899449825287, "learning_rate": 6.923836991706108e-05, "loss": 1.2957, "step": 8380 }, { "epoch": 0.8850210970464135, "grad_norm": 0.4731309115886688, "learning_rate": 6.828402413260965e-05, "loss": 1.2858, "step": 8390 }, { "epoch": 0.8860759493670886, "grad_norm": 0.50437992811203, "learning_rate": 6.73428325555347e-05, "loss": 1.2861, "step": 8400 }, { "epoch": 0.8871308016877637, "grad_norm": 0.5009710788726807, "learning_rate": 6.641461387506347e-05, "loss": 1.3037, "step": 8410 }, { "epoch": 0.8881856540084389, "grad_norm": 0.4667570888996124, "learning_rate": 6.549918927951679e-05, "loss": 1.2925, "step": 8420 }, { "epoch": 0.8892405063291139, "grad_norm": 0.46593964099884033, "learning_rate": 
6.459638242186298e-05, "loss": 1.2886, "step": 8430 }, { "epoch": 0.890295358649789, "grad_norm": 0.5209925770759583, "learning_rate": 6.370601938574637e-05, "loss": 1.2809, "step": 8440 }, { "epoch": 0.8913502109704642, "grad_norm": 0.5734021067619324, "learning_rate": 6.282792865198421e-05, "loss": 1.2907, "step": 8450 }, { "epoch": 0.8924050632911392, "grad_norm": 0.5093693733215332, "learning_rate": 6.196194106552512e-05, "loss": 1.2863, "step": 8460 }, { "epoch": 0.8934599156118144, "grad_norm": 0.46313154697418213, "learning_rate": 6.110788980286329e-05, "loss": 1.2886, "step": 8470 }, { "epoch": 0.8945147679324894, "grad_norm": 0.5093756318092346, "learning_rate": 6.026561033990159e-05, "loss": 1.2821, "step": 8480 }, { "epoch": 0.8955696202531646, "grad_norm": 0.49674439430236816, "learning_rate": 5.943494042025771e-05, "loss": 1.2866, "step": 8490 }, { "epoch": 0.8966244725738397, "grad_norm": 0.49339917302131653, "learning_rate": 5.8615720024007174e-05, "loss": 1.2775, "step": 8500 }, { "epoch": 0.8976793248945147, "grad_norm": 0.46326667070388794, "learning_rate": 5.780779133685717e-05, "loss": 1.2904, "step": 8510 }, { "epoch": 0.8987341772151899, "grad_norm": 0.4716956913471222, "learning_rate": 5.701099871974525e-05, "loss": 1.2855, "step": 8520 }, { "epoch": 0.8997890295358649, "grad_norm": 0.46486932039260864, "learning_rate": 5.6225188678857095e-05, "loss": 1.2982, "step": 8530 }, { "epoch": 0.9008438818565401, "grad_norm": 0.45782825350761414, "learning_rate": 5.545020983605749e-05, "loss": 1.2956, "step": 8540 }, { "epoch": 0.9018987341772152, "grad_norm": 0.46856826543807983, "learning_rate": 5.4685912899728965e-05, "loss": 1.2851, "step": 8550 }, { "epoch": 0.9029535864978903, "grad_norm": 0.4586803913116455, "learning_rate": 5.39321506360123e-05, "loss": 1.2774, "step": 8560 }, { "epoch": 0.9040084388185654, "grad_norm": 0.5621423125267029, "learning_rate": 5.318877784044342e-05, "loss": 1.2984, "step": 8570 }, { "epoch": 0.9050632911392406, 
"grad_norm": 0.5149664878845215, "learning_rate": 5.245565130998124e-05, "loss": 1.2885, "step": 8580 }, { "epoch": 0.9061181434599156, "grad_norm": 0.5729714632034302, "learning_rate": 5.173262981542119e-05, "loss": 1.2936, "step": 8590 }, { "epoch": 0.9071729957805907, "grad_norm": 0.4599650204181671, "learning_rate": 5.101957407418877e-05, "loss": 1.2832, "step": 8600 }, { "epoch": 0.9082278481012658, "grad_norm": 0.5617707371711731, "learning_rate": 5.0316346723508287e-05, "loss": 1.2872, "step": 8610 }, { "epoch": 0.9092827004219409, "grad_norm": 0.5002422332763672, "learning_rate": 4.962281229394129e-05, "loss": 1.2828, "step": 8620 }, { "epoch": 0.9103375527426161, "grad_norm": 0.478383868932724, "learning_rate": 4.893883718328984e-05, "loss": 1.2912, "step": 8630 }, { "epoch": 0.9113924050632911, "grad_norm": 0.5419098734855652, "learning_rate": 4.8264289630859386e-05, "loss": 1.2732, "step": 8640 }, { "epoch": 0.9124472573839663, "grad_norm": 0.4829871952533722, "learning_rate": 4.759903969207646e-05, "loss": 1.2783, "step": 8650 }, { "epoch": 0.9135021097046413, "grad_norm": 0.4976823627948761, "learning_rate": 4.694295921345623e-05, "loss": 1.2804, "step": 8660 }, { "epoch": 0.9145569620253164, "grad_norm": 0.48330676555633545, "learning_rate": 4.629592180791501e-05, "loss": 1.2887, "step": 8670 }, { "epoch": 0.9156118143459916, "grad_norm": 0.43191611766815186, "learning_rate": 4.565780283042316e-05, "loss": 1.2961, "step": 8680 }, { "epoch": 0.9166666666666666, "grad_norm": 0.4623429775238037, "learning_rate": 4.502847935399348e-05, "loss": 1.2887, "step": 8690 }, { "epoch": 0.9177215189873418, "grad_norm": 0.49294161796569824, "learning_rate": 4.440783014600059e-05, "loss": 1.2865, "step": 8700 }, { "epoch": 0.9187763713080169, "grad_norm": 0.4549841582775116, "learning_rate": 4.3795735644826776e-05, "loss": 1.2851, "step": 8710 }, { "epoch": 0.919831223628692, "grad_norm": 0.4817006587982178, "learning_rate": 4.319207793682965e-05, "loss": 1.2789, 
"step": 8720 }, { "epoch": 0.9208860759493671, "grad_norm": 0.4689021408557892, "learning_rate": 4.259674073362732e-05, "loss": 1.2934, "step": 8730 }, { "epoch": 0.9219409282700421, "grad_norm": 0.49034053087234497, "learning_rate": 4.200960934969664e-05, "loss": 1.2804, "step": 8740 }, { "epoch": 0.9229957805907173, "grad_norm": 0.4780392646789551, "learning_rate": 4.143057068028024e-05, "loss": 1.2799, "step": 8750 }, { "epoch": 0.9240506329113924, "grad_norm": 0.5309856534004211, "learning_rate": 4.0859513179598096e-05, "loss": 1.2662, "step": 8760 }, { "epoch": 0.9251054852320675, "grad_norm": 0.45172807574272156, "learning_rate": 4.02963268393593e-05, "loss": 1.2796, "step": 8770 }, { "epoch": 0.9261603375527426, "grad_norm": 0.4595884084701538, "learning_rate": 3.974090316757029e-05, "loss": 1.2822, "step": 8780 }, { "epoch": 0.9272151898734177, "grad_norm": 0.4415097236633301, "learning_rate": 3.919313516763478e-05, "loss": 1.2811, "step": 8790 }, { "epoch": 0.9282700421940928, "grad_norm": 0.4719064235687256, "learning_rate": 3.8652917317742106e-05, "loss": 1.2778, "step": 8800 }, { "epoch": 0.929324894514768, "grad_norm": 0.44753560423851013, "learning_rate": 3.812014555053955e-05, "loss": 1.2965, "step": 8810 }, { "epoch": 0.930379746835443, "grad_norm": 0.45936092734336853, "learning_rate": 3.759471723308477e-05, "loss": 1.2829, "step": 8820 }, { "epoch": 0.9314345991561181, "grad_norm": 0.4726845622062683, "learning_rate": 3.707653114707471e-05, "loss": 1.2988, "step": 8830 }, { "epoch": 0.9324894514767933, "grad_norm": 0.44957664608955383, "learning_rate": 3.6565487469346904e-05, "loss": 1.284, "step": 8840 }, { "epoch": 0.9335443037974683, "grad_norm": 0.5235864520072937, "learning_rate": 3.606148775264958e-05, "loss": 1.2835, "step": 8850 }, { "epoch": 0.9345991561181435, "grad_norm": 0.5054678320884705, "learning_rate": 3.5564434906676834e-05, "loss": 1.2825, "step": 8860 }, { "epoch": 0.9356540084388185, "grad_norm": 0.4912639260292053, 
"learning_rate": 3.507423317936521e-05, "loss": 1.2856, "step": 8870 }, { "epoch": 0.9367088607594937, "grad_norm": 0.4780506193637848, "learning_rate": 3.4590788138448004e-05, "loss": 1.292, "step": 8880 }, { "epoch": 0.9377637130801688, "grad_norm": 0.4532028138637543, "learning_rate": 3.411400665326393e-05, "loss": 1.2881, "step": 8890 }, { "epoch": 0.9388185654008439, "grad_norm": 0.46156346797943115, "learning_rate": 3.364379687681642e-05, "loss": 1.2791, "step": 8900 }, { "epoch": 0.939873417721519, "grad_norm": 0.5188421010971069, "learning_rate": 3.31800682280803e-05, "loss": 1.2781, "step": 8910 }, { "epoch": 0.9409282700421941, "grad_norm": 0.457661896944046, "learning_rate": 3.272273137455225e-05, "loss": 1.2867, "step": 8920 }, { "epoch": 0.9419831223628692, "grad_norm": 0.46978896856307983, "learning_rate": 3.227169821504187e-05, "loss": 1.2823, "step": 8930 }, { "epoch": 0.9430379746835443, "grad_norm": 0.4533792734146118, "learning_rate": 3.182688186269985e-05, "loss": 1.2842, "step": 8940 }, { "epoch": 0.9440928270042194, "grad_norm": 0.4478567838668823, "learning_rate": 3.138819662828018e-05, "loss": 1.2831, "step": 8950 }, { "epoch": 0.9451476793248945, "grad_norm": 0.48361706733703613, "learning_rate": 3.095555800363297e-05, "loss": 1.2866, "step": 8960 }, { "epoch": 0.9462025316455697, "grad_norm": 0.5194976925849915, "learning_rate": 3.052888264542484e-05, "loss": 1.285, "step": 8970 }, { "epoch": 0.9472573839662447, "grad_norm": 0.593368411064148, "learning_rate": 3.0108088359083675e-05, "loss": 1.2823, "step": 8980 }, { "epoch": 0.9483122362869199, "grad_norm": 0.470674067735672, "learning_rate": 2.9693094082964775e-05, "loss": 1.2863, "step": 8990 }, { "epoch": 0.9493670886075949, "grad_norm": 0.45366424322128296, "learning_rate": 2.928381987273507e-05, "loss": 1.2808, "step": 9000 }, { "epoch": 0.95042194092827, "grad_norm": 0.5190941095352173, "learning_rate": 2.8880186885972716e-05, "loss": 1.2803, "step": 9010 }, { "epoch": 
0.9514767932489452, "grad_norm": 0.4770827293395996, "learning_rate": 2.8482117366978935e-05, "loss": 1.2788, "step": 9020 }, { "epoch": 0.9525316455696202, "grad_norm": 0.5441578030586243, "learning_rate": 2.808953463179918e-05, "loss": 1.3002, "step": 9030 }, { "epoch": 0.9535864978902954, "grad_norm": 0.4609360694885254, "learning_rate": 2.770236305345076e-05, "loss": 1.2742, "step": 9040 }, { "epoch": 0.9546413502109705, "grad_norm": 0.47006136178970337, "learning_rate": 2.732052804735409e-05, "loss": 1.2855, "step": 9050 }, { "epoch": 0.9556962025316456, "grad_norm": 0.4748915135860443, "learning_rate": 2.6943956056964773e-05, "loss": 1.2858, "step": 9060 }, { "epoch": 0.9567510548523207, "grad_norm": 0.46649369597435, "learning_rate": 2.6572574539603643e-05, "loss": 1.2875, "step": 9070 }, { "epoch": 0.9578059071729957, "grad_norm": 0.44580161571502686, "learning_rate": 2.6206311952482224e-05, "loss": 1.2886, "step": 9080 }, { "epoch": 0.9588607594936709, "grad_norm": 0.4662837088108063, "learning_rate": 2.584509773892073e-05, "loss": 1.2885, "step": 9090 }, { "epoch": 0.959915611814346, "grad_norm": 0.4643775522708893, "learning_rate": 2.5488862314756066e-05, "loss": 1.2682, "step": 9100 }, { "epoch": 0.9609704641350211, "grad_norm": 0.44580066204071045, "learning_rate": 2.513753705493713e-05, "loss": 1.2811, "step": 9110 }, { "epoch": 0.9620253164556962, "grad_norm": 0.47444525361061096, "learning_rate": 2.4791054280304972e-05, "loss": 1.2887, "step": 9120 }, { "epoch": 0.9630801687763713, "grad_norm": 0.48020312190055847, "learning_rate": 2.4449347244555043e-05, "loss": 1.2739, "step": 9130 }, { "epoch": 0.9641350210970464, "grad_norm": 0.531441330909729, "learning_rate": 2.4112350121379255e-05, "loss": 1.2936, "step": 9140 }, { "epoch": 0.9651898734177216, "grad_norm": 0.5161461234092712, "learning_rate": 2.3779997991785207e-05, "loss": 1.2806, "step": 9150 }, { "epoch": 0.9662447257383966, "grad_norm": 0.4328412115573883, "learning_rate": 
2.3452226831590232e-05, "loss": 1.2885, "step": 9160 }, { "epoch": 0.9672995780590717, "grad_norm": 0.45244327187538147, "learning_rate": 2.3128973499087785e-05, "loss": 1.2857, "step": 9170 }, { "epoch": 0.9683544303797469, "grad_norm": 0.46364492177963257, "learning_rate": 2.2810175722883866e-05, "loss": 1.2832, "step": 9180 }, { "epoch": 0.9694092827004219, "grad_norm": 0.45162323117256165, "learning_rate": 2.2495772089901067e-05, "loss": 1.2791, "step": 9190 }, { "epoch": 0.9704641350210971, "grad_norm": 0.4856041967868805, "learning_rate": 2.218570203354799e-05, "loss": 1.2836, "step": 9200 }, { "epoch": 0.9715189873417721, "grad_norm": 0.4696802496910095, "learning_rate": 2.187990582205175e-05, "loss": 1.2824, "step": 9210 }, { "epoch": 0.9725738396624473, "grad_norm": 0.4935973882675171, "learning_rate": 2.157832454695122e-05, "loss": 1.2809, "step": 9220 }, { "epoch": 0.9736286919831224, "grad_norm": 0.4482496380805969, "learning_rate": 2.1280900111748943e-05, "loss": 1.2713, "step": 9230 }, { "epoch": 0.9746835443037974, "grad_norm": 0.4511680006980896, "learning_rate": 2.0987575220719476e-05, "loss": 1.265, "step": 9240 }, { "epoch": 0.9757383966244726, "grad_norm": 0.45410269498825073, "learning_rate": 2.069829336787193e-05, "loss": 1.2768, "step": 9250 }, { "epoch": 0.9767932489451476, "grad_norm": 0.455393522977829, "learning_rate": 2.0412998826064695e-05, "loss": 1.2747, "step": 9260 }, { "epoch": 0.9778481012658228, "grad_norm": 0.48198139667510986, "learning_rate": 2.0131636636270178e-05, "loss": 1.2863, "step": 9270 }, { "epoch": 0.9789029535864979, "grad_norm": 0.4690043330192566, "learning_rate": 1.9854152596987523e-05, "loss": 1.2736, "step": 9280 }, { "epoch": 0.979957805907173, "grad_norm": 0.46250012516975403, "learning_rate": 1.9580493253801253e-05, "loss": 1.2801, "step": 9290 }, { "epoch": 0.9810126582278481, "grad_norm": 0.45912715792655945, "learning_rate": 1.9310605889083842e-05, "loss": 1.2798, "step": 9300 }, { "epoch": 
0.9820675105485233, "grad_norm": 0.5150806307792664, "learning_rate": 1.904443851184018e-05, "loss": 1.28, "step": 9310 }, { "epoch": 0.9831223628691983, "grad_norm": 0.4904801845550537, "learning_rate": 1.87819398476921e-05, "loss": 1.2755, "step": 9320 }, { "epoch": 0.9841772151898734, "grad_norm": 0.4826321601867676, "learning_rate": 1.8523059329000848e-05, "loss": 1.2878, "step": 9330 }, { "epoch": 0.9852320675105485, "grad_norm": 0.47633230686187744, "learning_rate": 1.826774708512579e-05, "loss": 1.2845, "step": 9340 }, { "epoch": 0.9862869198312236, "grad_norm": 0.4657934010028839, "learning_rate": 1.8015953932817347e-05, "loss": 1.2959, "step": 9350 }, { "epoch": 0.9873417721518988, "grad_norm": 0.4584238827228546, "learning_rate": 1.7767631366742332e-05, "loss": 1.2879, "step": 9360 }, { "epoch": 0.9883966244725738, "grad_norm": 0.48173826932907104, "learning_rate": 1.7522731550139926e-05, "loss": 1.2852, "step": 9370 }, { "epoch": 0.989451476793249, "grad_norm": 0.46752113103866577, "learning_rate": 1.728120730560641e-05, "loss": 1.2714, "step": 9380 }, { "epoch": 0.990506329113924, "grad_norm": 0.48856937885284424, "learning_rate": 1.704301210600693e-05, "loss": 1.2767, "step": 9390 }, { "epoch": 0.9915611814345991, "grad_norm": 0.49191349744796753, "learning_rate": 1.6808100065512536e-05, "loss": 1.2902, "step": 9400 }, { "epoch": 0.9926160337552743, "grad_norm": 0.49473485350608826, "learning_rate": 1.657642593076074e-05, "loss": 1.2649, "step": 9410 }, { "epoch": 0.9936708860759493, "grad_norm": 0.4490512013435364, "learning_rate": 1.634794507213793e-05, "loss": 1.2829, "step": 9420 }, { "epoch": 0.9947257383966245, "grad_norm": 0.45531249046325684, "learning_rate": 1.6122613475181976e-05, "loss": 1.2841, "step": 9430 }, { "epoch": 0.9957805907172996, "grad_norm": 0.4522455334663391, "learning_rate": 1.590038773210323e-05, "loss": 1.2796, "step": 9440 }, { "epoch": 0.9968354430379747, "grad_norm": 0.4630568027496338, "learning_rate": 
1.568122503342252e-05, "loss": 1.2891, "step": 9450 }, { "epoch": 0.9978902953586498, "grad_norm": 0.4376571774482727, "learning_rate": 1.5465083159724344e-05, "loss": 1.2732, "step": 9460 }, { "epoch": 0.9989451476793249, "grad_norm": 0.4659065902233124, "learning_rate": 1.5251920473523708e-05, "loss": 1.2874, "step": 9470 }, { "epoch": 1.0, "grad_norm": 1.316353678703308, "learning_rate": 1.5041695911245136e-05, "loss": 1.276, "step": 9480 } ], "logging_steps": 10, "max_steps": 9480, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.832308198648013e+16, "train_batch_size": 1024, "trial_name": null, "trial_params": null }