{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 300000, "global_step": 1088730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013777520597393293, "grad_norm": 11.65067195892334, "learning_rate": 4.849375459221161e-07, "loss": 1.4102, "step": 100 }, { "epoch": 0.0027555041194786585, "grad_norm": 13.264008522033691, "learning_rate": 9.900808229243204e-07, "loss": 1.4591, "step": 200 }, { "epoch": 0.004133256179217988, "grad_norm": 8.67760181427002, "learning_rate": 1.4952240999265246e-06, "loss": 1.2864, "step": 300 }, { "epoch": 0.005511008238957317, "grad_norm": 16.742992401123047, "learning_rate": 1.9953159441587073e-06, "loss": 1.3362, "step": 400 }, { "epoch": 0.006888760298696646, "grad_norm": 24.729005813598633, "learning_rate": 2.500459221160911e-06, "loss": 1.2823, "step": 500 }, { "epoch": 0.008266512358435976, "grad_norm": 9.72551155090332, "learning_rate": 3.0056024981631153e-06, "loss": 1.2514, "step": 600 }, { "epoch": 0.009644264418175305, "grad_norm": 5.669283866882324, "learning_rate": 3.5107457751653195e-06, "loss": 1.2883, "step": 700 }, { "epoch": 0.011022016477914634, "grad_norm": 34.766544342041016, "learning_rate": 4.015889052167524e-06, "loss": 1.1993, "step": 800 }, { "epoch": 0.012399768537653963, "grad_norm": 7.9280829429626465, "learning_rate": 4.521032329169728e-06, "loss": 1.0601, "step": 900 }, { "epoch": 0.013777520597393293, "grad_norm": 5.170280456542969, "learning_rate": 5.0261756061719325e-06, "loss": 1.1682, "step": 1000 }, { "epoch": 0.015155272657132622, "grad_norm": 12.251971244812012, "learning_rate": 5.531318883174137e-06, "loss": 1.0818, "step": 1100 }, { "epoch": 0.01653302471687195, "grad_norm": 6.913139820098877, "learning_rate": 6.036462160176341e-06, "loss": 1.1873, "step": 1200 }, { "epoch": 0.017910776776611282, "grad_norm": 4.224829196929932, "learning_rate": 6.5416054371785455e-06, "loss": 1.1132, 
"step": 1300 }, { "epoch": 0.01928852883635061, "grad_norm": 3.3719537258148193, "learning_rate": 7.046748714180749e-06, "loss": 1.0681, "step": 1400 }, { "epoch": 0.02066628089608994, "grad_norm": 9.445250511169434, "learning_rate": 7.551891991182953e-06, "loss": 1.0711, "step": 1500 }, { "epoch": 0.022044032955829268, "grad_norm": 21.53278160095215, "learning_rate": 8.057035268185158e-06, "loss": 1.1757, "step": 1600 }, { "epoch": 0.0234217850155686, "grad_norm": 19.54088592529297, "learning_rate": 8.562178545187362e-06, "loss": 1.0576, "step": 1700 }, { "epoch": 0.024799537075307927, "grad_norm": 7.3780927658081055, "learning_rate": 9.067321822189567e-06, "loss": 1.036, "step": 1800 }, { "epoch": 0.026177289135047258, "grad_norm": 9.797306060791016, "learning_rate": 9.57246509919177e-06, "loss": 1.1004, "step": 1900 }, { "epoch": 0.027555041194786585, "grad_norm": 7.63425350189209, "learning_rate": 1.0077608376193976e-05, "loss": 1.0462, "step": 2000 }, { "epoch": 0.028932793254525916, "grad_norm": 6.704458236694336, "learning_rate": 1.0582751653196178e-05, "loss": 1.037, "step": 2100 }, { "epoch": 0.030310545314265244, "grad_norm": 6.635415077209473, "learning_rate": 1.1087894930198384e-05, "loss": 0.9806, "step": 2200 }, { "epoch": 0.031688297374004575, "grad_norm": 4.560552597045898, "learning_rate": 1.1593038207200588e-05, "loss": 0.8893, "step": 2300 }, { "epoch": 0.0330660494337439, "grad_norm": 5.239875316619873, "learning_rate": 1.2098181484202793e-05, "loss": 1.0697, "step": 2400 }, { "epoch": 0.03444380149348323, "grad_norm": 4.281900882720947, "learning_rate": 1.2603324761204997e-05, "loss": 0.9615, "step": 2500 }, { "epoch": 0.035821553553222564, "grad_norm": 11.624236106872559, "learning_rate": 1.31084680382072e-05, "loss": 1.0059, "step": 2600 }, { "epoch": 0.03719930561296189, "grad_norm": 28.93486785888672, "learning_rate": 1.3608559882439383e-05, "loss": 0.8404, "step": 2700 }, { "epoch": 0.03857705767270122, "grad_norm": 11.4780912399292, 
"learning_rate": 1.4113703159441589e-05, "loss": 0.9094, "step": 2800 }, { "epoch": 0.03995480973244055, "grad_norm": 9.215027809143066, "learning_rate": 1.4618846436443793e-05, "loss": 0.8866, "step": 2900 }, { "epoch": 0.04133256179217988, "grad_norm": 29.185121536254883, "learning_rate": 1.5123989713445997e-05, "loss": 0.9744, "step": 3000 }, { "epoch": 0.04271031385191921, "grad_norm": 5.132340908050537, "learning_rate": 1.56291329904482e-05, "loss": 1.0274, "step": 3100 }, { "epoch": 0.044088065911658536, "grad_norm": 7.962031364440918, "learning_rate": 1.6134276267450406e-05, "loss": 0.8822, "step": 3200 }, { "epoch": 0.04546581797139787, "grad_norm": 6.370624542236328, "learning_rate": 1.663941954445261e-05, "loss": 1.0308, "step": 3300 }, { "epoch": 0.0468435700311372, "grad_norm": 4.936957836151123, "learning_rate": 1.7144562821454813e-05, "loss": 0.927, "step": 3400 }, { "epoch": 0.048221322090876526, "grad_norm": 24.894546508789062, "learning_rate": 1.7649706098457017e-05, "loss": 0.9564, "step": 3500 }, { "epoch": 0.04959907415061585, "grad_norm": 9.514641761779785, "learning_rate": 1.815484937545922e-05, "loss": 0.9474, "step": 3600 }, { "epoch": 0.05097682621035519, "grad_norm": 5.476971626281738, "learning_rate": 1.8659992652461428e-05, "loss": 0.9496, "step": 3700 }, { "epoch": 0.052354578270094515, "grad_norm": 15.0453462600708, "learning_rate": 1.9165135929463632e-05, "loss": 0.9339, "step": 3800 }, { "epoch": 0.05373233032983384, "grad_norm": 14.468217849731445, "learning_rate": 1.9670279206465836e-05, "loss": 0.9348, "step": 3900 }, { "epoch": 0.05511008238957317, "grad_norm": 7.205018520355225, "learning_rate": 2.0175422483468036e-05, "loss": 0.912, "step": 4000 }, { "epoch": 0.056487834449312505, "grad_norm": 4.877631187438965, "learning_rate": 2.0680565760470243e-05, "loss": 0.9864, "step": 4100 }, { "epoch": 0.05786558650905183, "grad_norm": 8.84345531463623, "learning_rate": 2.1185709037472447e-05, "loss": 1.0205, "step": 4200 }, { "epoch": 
0.05924333856879116, "grad_norm": 23.226093292236328, "learning_rate": 2.169085231447465e-05, "loss": 0.937, "step": 4300 }, { "epoch": 0.06062109062853049, "grad_norm": 12.72083568572998, "learning_rate": 2.2195995591476855e-05, "loss": 0.9369, "step": 4400 }, { "epoch": 0.06199884268826982, "grad_norm": 17.244569778442383, "learning_rate": 2.270113886847906e-05, "loss": 0.952, "step": 4500 }, { "epoch": 0.06337659474800915, "grad_norm": 6.532393932342529, "learning_rate": 2.3206282145481265e-05, "loss": 0.9171, "step": 4600 }, { "epoch": 0.06475434680774848, "grad_norm": 10.168941497802734, "learning_rate": 2.371142542248347e-05, "loss": 1.0035, "step": 4700 }, { "epoch": 0.0661320988674878, "grad_norm": 11.163745880126953, "learning_rate": 2.4211517266715652e-05, "loss": 0.8717, "step": 4800 }, { "epoch": 0.06750985092722714, "grad_norm": 8.413740158081055, "learning_rate": 2.4716660543717856e-05, "loss": 1.0384, "step": 4900 }, { "epoch": 0.06888760298696646, "grad_norm": 45.454017639160156, "learning_rate": 2.521675238795004e-05, "loss": 0.9217, "step": 5000 }, { "epoch": 0.0702653550467058, "grad_norm": 11.021474838256836, "learning_rate": 2.572189566495224e-05, "loss": 0.9638, "step": 5100 }, { "epoch": 0.07164310710644513, "grad_norm": 4.591658592224121, "learning_rate": 2.6227038941954446e-05, "loss": 1.068, "step": 5200 }, { "epoch": 0.07302085916618445, "grad_norm": 6.153427600860596, "learning_rate": 2.673218221895665e-05, "loss": 0.9792, "step": 5300 }, { "epoch": 0.07439861122592378, "grad_norm": 5.186223030090332, "learning_rate": 2.7237325495958854e-05, "loss": 0.9601, "step": 5400 }, { "epoch": 0.07577636328566312, "grad_norm": 11.95683765411377, "learning_rate": 2.7742468772961058e-05, "loss": 0.9659, "step": 5500 }, { "epoch": 0.07715411534540244, "grad_norm": 45.106727600097656, "learning_rate": 2.824761204996326e-05, "loss": 0.9401, "step": 5600 }, { "epoch": 0.07853186740514177, "grad_norm": 42.417320251464844, "learning_rate": 
2.8752755326965465e-05, "loss": 1.1097, "step": 5700 }, { "epoch": 0.0799096194648811, "grad_norm": 9.149765014648438, "learning_rate": 2.9257898603967676e-05, "loss": 1.0048, "step": 5800 }, { "epoch": 0.08128737152462043, "grad_norm": 10.777480125427246, "learning_rate": 2.976304188096988e-05, "loss": 0.9749, "step": 5900 }, { "epoch": 0.08266512358435976, "grad_norm": 60.3282470703125, "learning_rate": 3.0268185157972083e-05, "loss": 0.9522, "step": 6000 }, { "epoch": 0.08404287564409908, "grad_norm": 6.608532428741455, "learning_rate": 3.0773328434974284e-05, "loss": 1.0985, "step": 6100 }, { "epoch": 0.08542062770383842, "grad_norm": 11.488170623779297, "learning_rate": 3.127847171197649e-05, "loss": 1.1379, "step": 6200 }, { "epoch": 0.08679837976357775, "grad_norm": 12.194034576416016, "learning_rate": 3.178361498897869e-05, "loss": 0.9478, "step": 6300 }, { "epoch": 0.08817613182331707, "grad_norm": 19.647018432617188, "learning_rate": 3.2288758265980895e-05, "loss": 1.0554, "step": 6400 }, { "epoch": 0.0895538838830564, "grad_norm": 8.758164405822754, "learning_rate": 3.278885011021308e-05, "loss": 1.0042, "step": 6500 }, { "epoch": 0.09093163594279574, "grad_norm": 5.215201377868652, "learning_rate": 3.329399338721529e-05, "loss": 0.9614, "step": 6600 }, { "epoch": 0.09230938800253506, "grad_norm": 6.266079902648926, "learning_rate": 3.379913666421749e-05, "loss": 1.0105, "step": 6700 }, { "epoch": 0.0936871400622744, "grad_norm": 16.860300064086914, "learning_rate": 3.4304279941219696e-05, "loss": 1.0777, "step": 6800 }, { "epoch": 0.09506489212201372, "grad_norm": 19.434036254882812, "learning_rate": 3.48094232182219e-05, "loss": 1.0434, "step": 6900 }, { "epoch": 0.09644264418175305, "grad_norm": 4.839750289916992, "learning_rate": 3.5314566495224104e-05, "loss": 0.9474, "step": 7000 }, { "epoch": 0.09782039624149239, "grad_norm": 10.522396087646484, "learning_rate": 3.581970977222631e-05, "loss": 1.1231, "step": 7100 }, { "epoch": 0.0991981483012317, 
"grad_norm": 160.55545043945312, "learning_rate": 3.6324853049228504e-05, "loss": 1.0285, "step": 7200 }, { "epoch": 0.10057590036097104, "grad_norm": 9.294129371643066, "learning_rate": 3.682999632623071e-05, "loss": 1.005, "step": 7300 }, { "epoch": 0.10195365242071038, "grad_norm": 20.62299156188965, "learning_rate": 3.733513960323292e-05, "loss": 0.9804, "step": 7400 }, { "epoch": 0.1033314044804497, "grad_norm": 13.265617370605469, "learning_rate": 3.784028288023512e-05, "loss": 1.1047, "step": 7500 }, { "epoch": 0.10470915654018903, "grad_norm": 22.576231002807617, "learning_rate": 3.8345426157237326e-05, "loss": 0.9803, "step": 7600 }, { "epoch": 0.10608690859992835, "grad_norm": 27.096935272216797, "learning_rate": 3.885056943423953e-05, "loss": 0.9725, "step": 7700 }, { "epoch": 0.10746466065966769, "grad_norm": 41.65309143066406, "learning_rate": 3.9355712711241734e-05, "loss": 0.8945, "step": 7800 }, { "epoch": 0.10884241271940702, "grad_norm": 3.681422472000122, "learning_rate": 3.986085598824394e-05, "loss": 0.9033, "step": 7900 }, { "epoch": 0.11022016477914634, "grad_norm": 4.928839683532715, "learning_rate": 4.036599926524614e-05, "loss": 1.0033, "step": 8000 }, { "epoch": 0.11159791683888567, "grad_norm": 11.218937873840332, "learning_rate": 4.0871142542248345e-05, "loss": 1.039, "step": 8100 }, { "epoch": 0.11297566889862501, "grad_norm": 8.559927940368652, "learning_rate": 4.1376285819250556e-05, "loss": 1.1383, "step": 8200 }, { "epoch": 0.11435342095836433, "grad_norm": 7.633176326751709, "learning_rate": 4.188142909625276e-05, "loss": 1.1427, "step": 8300 }, { "epoch": 0.11573117301810366, "grad_norm": 8.33764934539795, "learning_rate": 4.238657237325496e-05, "loss": 1.0857, "step": 8400 }, { "epoch": 0.11710892507784298, "grad_norm": 10.000191688537598, "learning_rate": 4.289171565025717e-05, "loss": 1.084, "step": 8500 }, { "epoch": 0.11848667713758232, "grad_norm": 18.05430030822754, "learning_rate": 4.339180749448935e-05, "loss": 1.1712, 
"step": 8600 }, { "epoch": 0.11986442919732165, "grad_norm": 12.45681381225586, "learning_rate": 4.3896950771491554e-05, "loss": 1.0295, "step": 8700 }, { "epoch": 0.12124218125706097, "grad_norm": 9.36514663696289, "learning_rate": 4.440209404849376e-05, "loss": 1.2121, "step": 8800 }, { "epoch": 0.12261993331680031, "grad_norm": 9.039582252502441, "learning_rate": 4.4907237325495955e-05, "loss": 1.1665, "step": 8900 }, { "epoch": 0.12399768537653964, "grad_norm": 16.782058715820312, "learning_rate": 4.5412380602498165e-05, "loss": 1.1426, "step": 9000 }, { "epoch": 0.12537543743627896, "grad_norm": 17.21622085571289, "learning_rate": 4.591752387950037e-05, "loss": 1.1173, "step": 9100 }, { "epoch": 0.1267531894960183, "grad_norm": 6.7519307136535645, "learning_rate": 4.642266715650257e-05, "loss": 1.1164, "step": 9200 }, { "epoch": 0.12813094155575763, "grad_norm": 7.507974624633789, "learning_rate": 4.6927810433504776e-05, "loss": 1.1192, "step": 9300 }, { "epoch": 0.12950869361549697, "grad_norm": 4.70835018157959, "learning_rate": 4.743295371050698e-05, "loss": 1.0741, "step": 9400 }, { "epoch": 0.13088644567523627, "grad_norm": 8.935495376586914, "learning_rate": 4.7938096987509184e-05, "loss": 1.069, "step": 9500 }, { "epoch": 0.1322641977349756, "grad_norm": 7.942813873291016, "learning_rate": 4.844324026451139e-05, "loss": 1.1075, "step": 9600 }, { "epoch": 0.13364194979471494, "grad_norm": 8.648787498474121, "learning_rate": 4.894838354151359e-05, "loss": 1.0331, "step": 9700 }, { "epoch": 0.13501970185445428, "grad_norm": 37.376991271972656, "learning_rate": 4.94535268185158e-05, "loss": 1.1188, "step": 9800 }, { "epoch": 0.1363974539141936, "grad_norm": 8.2828369140625, "learning_rate": 4.9958670095518006e-05, "loss": 1.1222, "step": 9900 }, { "epoch": 0.13777520597393292, "grad_norm": 7.8098225593566895, "learning_rate": 5.046381337252021e-05, "loss": 1.0751, "step": 10000 }, { "epoch": 0.13915295803367225, "grad_norm": 174.5991668701172, 
"learning_rate": 5.0968956649522413e-05, "loss": 1.2117, "step": 10100 }, { "epoch": 0.1405307100934116, "grad_norm": 9.941773414611816, "learning_rate": 5.147409992652462e-05, "loss": 1.1293, "step": 10200 }, { "epoch": 0.14190846215315092, "grad_norm": 53.449195861816406, "learning_rate": 5.197924320352682e-05, "loss": 1.1948, "step": 10300 }, { "epoch": 0.14328621421289026, "grad_norm": 9.17073917388916, "learning_rate": 5.2484386480529025e-05, "loss": 1.105, "step": 10400 }, { "epoch": 0.1446639662726296, "grad_norm": 13.831076622009277, "learning_rate": 5.298952975753123e-05, "loss": 1.2271, "step": 10500 }, { "epoch": 0.1460417183323689, "grad_norm": 10.086932182312012, "learning_rate": 5.349467303453344e-05, "loss": 1.1864, "step": 10600 }, { "epoch": 0.14741947039210823, "grad_norm": 4.469985008239746, "learning_rate": 5.399981631153564e-05, "loss": 1.2172, "step": 10700 }, { "epoch": 0.14879722245184757, "grad_norm": 9.413055419921875, "learning_rate": 5.450495958853784e-05, "loss": 1.1877, "step": 10800 }, { "epoch": 0.1501749745115869, "grad_norm": 26.23496437072754, "learning_rate": 5.499999999988319e-05, "loss": 1.2923, "step": 10900 }, { "epoch": 0.15155272657132624, "grad_norm": 7.072531223297119, "learning_rate": 5.499999880838769e-05, "loss": 1.1533, "step": 11000 }, { "epoch": 0.15293047863106554, "grad_norm": 26.590896606445312, "learning_rate": 5.499999528062659e-05, "loss": 1.1943, "step": 11100 }, { "epoch": 0.15430823069080488, "grad_norm": 15.353813171386719, "learning_rate": 5.499998941660022e-05, "loss": 1.1975, "step": 11200 }, { "epoch": 0.1556859827505442, "grad_norm": 10.522014617919922, "learning_rate": 5.499998121630905e-05, "loss": 1.2506, "step": 11300 }, { "epoch": 0.15706373481028355, "grad_norm": 127.85610961914062, "learning_rate": 5.499997067975379e-05, "loss": 1.2472, "step": 11400 }, { "epoch": 0.15844148687002288, "grad_norm": 16.62019157409668, "learning_rate": 5.4999957806935333e-05, "loss": 1.2199, "step": 11500 }, { 
"epoch": 0.1598192389297622, "grad_norm": 30.395166397094727, "learning_rate": 5.499994259785477e-05, "loss": 1.3408, "step": 11600 }, { "epoch": 0.16119699098950152, "grad_norm": 9.531847953796387, "learning_rate": 5.49999250525134e-05, "loss": 1.2979, "step": 11700 }, { "epoch": 0.16257474304924086, "grad_norm": 13.844141960144043, "learning_rate": 5.49999051709127e-05, "loss": 1.1795, "step": 11800 }, { "epoch": 0.1639524951089802, "grad_norm": 10.223821640014648, "learning_rate": 5.4999882953054366e-05, "loss": 1.2488, "step": 11900 }, { "epoch": 0.16533024716871952, "grad_norm": 4.547140121459961, "learning_rate": 5.4999858398940294e-05, "loss": 1.2205, "step": 12000 }, { "epoch": 0.16670799922845886, "grad_norm": 21.750076293945312, "learning_rate": 5.4999831508572554e-05, "loss": 1.1902, "step": 12100 }, { "epoch": 0.16808575128819817, "grad_norm": 12.58484935760498, "learning_rate": 5.499980228195345e-05, "loss": 1.192, "step": 12200 }, { "epoch": 0.1694635033479375, "grad_norm": 19.202035903930664, "learning_rate": 5.499977071908545e-05, "loss": 1.1948, "step": 12300 }, { "epoch": 0.17084125540767683, "grad_norm": 24.988405227661133, "learning_rate": 5.4999736819971234e-05, "loss": 1.2013, "step": 12400 }, { "epoch": 0.17221900746741617, "grad_norm": 12.41451358795166, "learning_rate": 5.499970058461369e-05, "loss": 1.2893, "step": 12500 }, { "epoch": 0.1735967595271555, "grad_norm": 10.051517486572266, "learning_rate": 5.49996620130159e-05, "loss": 1.2852, "step": 12600 }, { "epoch": 0.1749745115868948, "grad_norm": 7.056366443634033, "learning_rate": 5.499962110518112e-05, "loss": 1.2215, "step": 12700 }, { "epoch": 0.17635226364663414, "grad_norm": 15.381092071533203, "learning_rate": 5.4999577861112856e-05, "loss": 1.3229, "step": 12800 }, { "epoch": 0.17773001570637348, "grad_norm": 9.423325538635254, "learning_rate": 5.4999532280814754e-05, "loss": 1.1513, "step": 12900 }, { "epoch": 0.1791077677661128, "grad_norm": 9.337259292602539, 
"learning_rate": 5.4999484364290714e-05, "loss": 1.1022, "step": 13000 }, { "epoch": 0.18048551982585215, "grad_norm": 8.747264862060547, "learning_rate": 5.499943411154478e-05, "loss": 1.2086, "step": 13100 }, { "epoch": 0.18186327188559148, "grad_norm": 74.09689331054688, "learning_rate": 5.4999381522581234e-05, "loss": 1.1169, "step": 13200 }, { "epoch": 0.1832410239453308, "grad_norm": 14.921662330627441, "learning_rate": 5.4999326597404536e-05, "loss": 1.1987, "step": 13300 }, { "epoch": 0.18461877600507012, "grad_norm": 6.766313076019287, "learning_rate": 5.499926933601937e-05, "loss": 1.2041, "step": 13400 }, { "epoch": 0.18599652806480946, "grad_norm": 15.904729843139648, "learning_rate": 5.4999209738430575e-05, "loss": 1.1398, "step": 13500 }, { "epoch": 0.1873742801245488, "grad_norm": 4.456528663635254, "learning_rate": 5.499914780464323e-05, "loss": 1.2177, "step": 13600 }, { "epoch": 0.18875203218428813, "grad_norm": 137.29202270507812, "learning_rate": 5.49990835346626e-05, "loss": 1.1807, "step": 13700 }, { "epoch": 0.19012978424402743, "grad_norm": 4.208902359008789, "learning_rate": 5.499901692849414e-05, "loss": 1.2225, "step": 13800 }, { "epoch": 0.19150753630376677, "grad_norm": 28.881311416625977, "learning_rate": 5.49989479861435e-05, "loss": 1.2106, "step": 13900 }, { "epoch": 0.1928852883635061, "grad_norm": 26.017873764038086, "learning_rate": 5.4998876707616556e-05, "loss": 1.2853, "step": 14000 }, { "epoch": 0.19426304042324544, "grad_norm": 40.50442886352539, "learning_rate": 5.4998803092919346e-05, "loss": 1.2832, "step": 14100 }, { "epoch": 0.19564079248298477, "grad_norm": 23.924936294555664, "learning_rate": 5.499872714205813e-05, "loss": 1.1663, "step": 14200 }, { "epoch": 0.19701854454272408, "grad_norm": 15.067869186401367, "learning_rate": 5.499864964947351e-05, "loss": 1.4564, "step": 14300 }, { "epoch": 0.1983962966024634, "grad_norm": 17.385555267333984, "learning_rate": 5.499856904966531e-05, "loss": 1.3213, "step": 14400 }, 
{ "epoch": 0.19977404866220275, "grad_norm": 7.320497035980225, "learning_rate": 5.499848611371299e-05, "loss": 1.2966, "step": 14500 }, { "epoch": 0.20115180072194208, "grad_norm": 52.53131866455078, "learning_rate": 5.49984008416236e-05, "loss": 1.2964, "step": 14600 }, { "epoch": 0.20252955278168142, "grad_norm": 16.048524856567383, "learning_rate": 5.499831323340437e-05, "loss": 1.3015, "step": 14700 }, { "epoch": 0.20390730484142075, "grad_norm": 16.255613327026367, "learning_rate": 5.4998223289062754e-05, "loss": 1.3644, "step": 14800 }, { "epoch": 0.20528505690116006, "grad_norm": 8.616049766540527, "learning_rate": 5.4998131008606386e-05, "loss": 1.327, "step": 14900 }, { "epoch": 0.2066628089608994, "grad_norm": 51.175228118896484, "learning_rate": 5.499803639204311e-05, "loss": 1.2566, "step": 15000 }, { "epoch": 0.20804056102063873, "grad_norm": 7.27580451965332, "learning_rate": 5.499793943938097e-05, "loss": 1.3868, "step": 15100 }, { "epoch": 0.20941831308037806, "grad_norm": 7.9837517738342285, "learning_rate": 5.4997840150628194e-05, "loss": 1.2163, "step": 15200 }, { "epoch": 0.2107960651401174, "grad_norm": 13.115901947021484, "learning_rate": 5.4997738525793216e-05, "loss": 1.1793, "step": 15300 }, { "epoch": 0.2121738171998567, "grad_norm": 11.005683898925781, "learning_rate": 5.4997634564884676e-05, "loss": 1.2811, "step": 15400 }, { "epoch": 0.21355156925959604, "grad_norm": 12.336491584777832, "learning_rate": 5.4997528267911405e-05, "loss": 1.2793, "step": 15500 }, { "epoch": 0.21492932131933537, "grad_norm": 9.77892780303955, "learning_rate": 5.499741963488242e-05, "loss": 1.3607, "step": 15600 }, { "epoch": 0.2163070733790747, "grad_norm": 9.016510963439941, "learning_rate": 5.499730866580697e-05, "loss": 1.2084, "step": 15700 }, { "epoch": 0.21768482543881404, "grad_norm": 19.17759895324707, "learning_rate": 5.4997195360694475e-05, "loss": 1.2585, "step": 15800 }, { "epoch": 0.21906257749855337, "grad_norm": 12.264784812927246, 
"learning_rate": 5.499707971955455e-05, "loss": 1.2381, "step": 15900 }, { "epoch": 0.22044032955829268, "grad_norm": 14.009553909301758, "learning_rate": 5.499696174239703e-05, "loss": 1.2221, "step": 16000 }, { "epoch": 0.22181808161803201, "grad_norm": 5.409763813018799, "learning_rate": 5.499684142923195e-05, "loss": 1.2755, "step": 16100 }, { "epoch": 0.22319583367777135, "grad_norm": 14.818997383117676, "learning_rate": 5.499671878006951e-05, "loss": 1.3131, "step": 16200 }, { "epoch": 0.22457358573751068, "grad_norm": 18.775480270385742, "learning_rate": 5.4996593794920135e-05, "loss": 1.111, "step": 16300 }, { "epoch": 0.22595133779725002, "grad_norm": 2.511378526687622, "learning_rate": 5.499646647379445e-05, "loss": 1.1362, "step": 16400 }, { "epoch": 0.22732908985698932, "grad_norm": 8.04574203491211, "learning_rate": 5.4996336816703265e-05, "loss": 1.1849, "step": 16500 }, { "epoch": 0.22870684191672866, "grad_norm": 6.202666282653809, "learning_rate": 5.4996204823657594e-05, "loss": 1.2502, "step": 16600 }, { "epoch": 0.230084593976468, "grad_norm": 20.663745880126953, "learning_rate": 5.4996070494668656e-05, "loss": 1.1894, "step": 16700 }, { "epoch": 0.23146234603620733, "grad_norm": 10.979053497314453, "learning_rate": 5.499593382974786e-05, "loss": 1.1144, "step": 16800 }, { "epoch": 0.23284009809594666, "grad_norm": 28.4576358795166, "learning_rate": 5.499579482890682e-05, "loss": 1.1706, "step": 16900 }, { "epoch": 0.23421785015568597, "grad_norm": 24.407394409179688, "learning_rate": 5.499565349215733e-05, "loss": 1.3014, "step": 17000 }, { "epoch": 0.2355956022154253, "grad_norm": 6.59324836730957, "learning_rate": 5.499550981951142e-05, "loss": 1.2647, "step": 17100 }, { "epoch": 0.23697335427516464, "grad_norm": 19.240928649902344, "learning_rate": 5.4995363810981284e-05, "loss": 1.0861, "step": 17200 }, { "epoch": 0.23835110633490397, "grad_norm": 19.197452545166016, "learning_rate": 5.499521546657932e-05, "loss": 1.1999, "step": 17300 }, { 
"epoch": 0.2397288583946433, "grad_norm": 58.96784210205078, "learning_rate": 5.4995064786318154e-05, "loss": 1.2032, "step": 17400 }, { "epoch": 0.24110661045438264, "grad_norm": 9.024454116821289, "learning_rate": 5.4994911770210557e-05, "loss": 1.3062, "step": 17500 }, { "epoch": 0.24248436251412195, "grad_norm": 19.052547454833984, "learning_rate": 5.4994757983351285e-05, "loss": 1.2405, "step": 17600 }, { "epoch": 0.24386211457386128, "grad_norm": 13.851653099060059, "learning_rate": 5.4994600318948186e-05, "loss": 1.3275, "step": 17700 }, { "epoch": 0.24523986663360062, "grad_norm": 27.99448013305664, "learning_rate": 5.499444031873814e-05, "loss": 1.1994, "step": 17800 }, { "epoch": 0.24661761869333995, "grad_norm": 12.64948844909668, "learning_rate": 5.4994277982734713e-05, "loss": 1.1713, "step": 17900 }, { "epoch": 0.2479953707530793, "grad_norm": 8.357621192932129, "learning_rate": 5.499411331095172e-05, "loss": 1.2516, "step": 18000 }, { "epoch": 0.2493731228128186, "grad_norm": 14.239299774169922, "learning_rate": 5.499394630340314e-05, "loss": 1.2017, "step": 18100 }, { "epoch": 0.2507508748725579, "grad_norm": 11.225935935974121, "learning_rate": 5.499377696010317e-05, "loss": 1.1938, "step": 18200 }, { "epoch": 0.25212862693229726, "grad_norm": 17.634973526000977, "learning_rate": 5.499360528106618e-05, "loss": 1.254, "step": 18300 }, { "epoch": 0.2535063789920366, "grad_norm": 10.021209716796875, "learning_rate": 5.499343126630677e-05, "loss": 1.1603, "step": 18400 }, { "epoch": 0.25488413105177593, "grad_norm": 26.679080963134766, "learning_rate": 5.499325491583972e-05, "loss": 1.1749, "step": 18500 }, { "epoch": 0.25626188311151527, "grad_norm": 14.252337455749512, "learning_rate": 5.4993076229680005e-05, "loss": 1.1853, "step": 18600 }, { "epoch": 0.2576396351712546, "grad_norm": 14.54755973815918, "learning_rate": 5.499289520784281e-05, "loss": 1.2063, "step": 18700 }, { "epoch": 0.25901738723099393, "grad_norm": 10.576868057250977, 
"learning_rate": 5.499271185034352e-05, "loss": 1.197, "step": 18800 }, { "epoch": 0.2603951392907332, "grad_norm": 8.283714294433594, "learning_rate": 5.499252615719771e-05, "loss": 1.2528, "step": 18900 }, { "epoch": 0.26177289135047255, "grad_norm": 9.045394897460938, "learning_rate": 5.499233812842115e-05, "loss": 1.1031, "step": 19000 }, { "epoch": 0.2631506434102119, "grad_norm": 9.431145668029785, "learning_rate": 5.4992147764029804e-05, "loss": 1.113, "step": 19100 }, { "epoch": 0.2645283954699512, "grad_norm": 12.061200141906738, "learning_rate": 5.499195506403987e-05, "loss": 1.2396, "step": 19200 }, { "epoch": 0.26590614752969055, "grad_norm": 11.876815795898438, "learning_rate": 5.49917600284677e-05, "loss": 1.2091, "step": 19300 }, { "epoch": 0.2672838995894299, "grad_norm": 15.724605560302734, "learning_rate": 5.499156464260224e-05, "loss": 1.2188, "step": 19400 }, { "epoch": 0.2686616516491692, "grad_norm": 25.08930015563965, "learning_rate": 5.499136495927092e-05, "loss": 1.1141, "step": 19500 }, { "epoch": 0.27003940370890855, "grad_norm": 8.645094871520996, "learning_rate": 5.499116294040751e-05, "loss": 1.146, "step": 19600 }, { "epoch": 0.2714171557686479, "grad_norm": 32.26475143432617, "learning_rate": 5.499095858602915e-05, "loss": 1.2498, "step": 19700 }, { "epoch": 0.2727949078283872, "grad_norm": 7.541633129119873, "learning_rate": 5.499075189615322e-05, "loss": 1.1223, "step": 19800 }, { "epoch": 0.27417265988812656, "grad_norm": 12.944062232971191, "learning_rate": 5.4990542870797286e-05, "loss": 1.1919, "step": 19900 }, { "epoch": 0.27555041194786584, "grad_norm": 18.79230499267578, "learning_rate": 5.499033150997908e-05, "loss": 1.2171, "step": 20000 }, { "epoch": 0.27692816400760517, "grad_norm": 5.1296772956848145, "learning_rate": 5.499011781371659e-05, "loss": 1.2126, "step": 20100 }, { "epoch": 0.2783059160673445, "grad_norm": 13.036163330078125, "learning_rate": 5.4989901782027935e-05, "loss": 1.2454, "step": 20200 }, { "epoch": 
0.27968366812708384, "grad_norm": 14.533465385437012, "learning_rate": 5.498968341493149e-05, "loss": 1.2029, "step": 20300 }, { "epoch": 0.2810614201868232, "grad_norm": 11.073671340942383, "learning_rate": 5.4989462712445804e-05, "loss": 1.1959, "step": 20400 }, { "epoch": 0.2824391722465625, "grad_norm": 11.641937255859375, "learning_rate": 5.4989239674589635e-05, "loss": 1.2327, "step": 20500 }, { "epoch": 0.28381692430630184, "grad_norm": 9.438309669494629, "learning_rate": 5.4989014301381915e-05, "loss": 1.3567, "step": 20600 }, { "epoch": 0.2851946763660412, "grad_norm": 14.73698902130127, "learning_rate": 5.4988786592841795e-05, "loss": 1.1945, "step": 20700 }, { "epoch": 0.2865724284257805, "grad_norm": 27.582157135009766, "learning_rate": 5.498855654898862e-05, "loss": 1.3459, "step": 20800 }, { "epoch": 0.28795018048551985, "grad_norm": 12.265806198120117, "learning_rate": 5.498832416984193e-05, "loss": 1.2672, "step": 20900 }, { "epoch": 0.2893279325452592, "grad_norm": 14.940210342407227, "learning_rate": 5.498808945542149e-05, "loss": 1.2263, "step": 21000 }, { "epoch": 0.29070568460499846, "grad_norm": 8.485940933227539, "learning_rate": 5.49878547878034e-05, "loss": 1.2284, "step": 21100 }, { "epoch": 0.2920834366647378, "grad_norm": 48.134517669677734, "learning_rate": 5.498761542624767e-05, "loss": 1.1747, "step": 21200 }, { "epoch": 0.29346118872447713, "grad_norm": 10.468015670776367, "learning_rate": 5.498737372947838e-05, "loss": 1.2562, "step": 21300 }, { "epoch": 0.29483894078421646, "grad_norm": 6.899281978607178, "learning_rate": 5.4987129697516074e-05, "loss": 1.2914, "step": 21400 }, { "epoch": 0.2962166928439558, "grad_norm": 28.62447166442871, "learning_rate": 5.498688333038148e-05, "loss": 1.1887, "step": 21500 }, { "epoch": 0.29759444490369513, "grad_norm": 15.736666679382324, "learning_rate": 5.4986634628095516e-05, "loss": 1.2271, "step": 21600 }, { "epoch": 0.29897219696343447, "grad_norm": 15.980098724365234, "learning_rate": 
5.498638359067933e-05, "loss": 1.2464, "step": 21700 }, { "epoch": 0.3003499490231738, "grad_norm": 20.236129760742188, "learning_rate": 5.498613021815423e-05, "loss": 1.2531, "step": 21800 }, { "epoch": 0.30172770108291314, "grad_norm": 16.427574157714844, "learning_rate": 5.498587451054176e-05, "loss": 1.1831, "step": 21900 }, { "epoch": 0.30310545314265247, "grad_norm": 10.426117897033691, "learning_rate": 5.4985616467863624e-05, "loss": 1.1913, "step": 22000 }, { "epoch": 0.3044832052023918, "grad_norm": 11.521761894226074, "learning_rate": 5.498535609014175e-05, "loss": 1.0736, "step": 22100 }, { "epoch": 0.3058609572621311, "grad_norm": 6.655721664428711, "learning_rate": 5.498509337739827e-05, "loss": 1.2219, "step": 22200 }, { "epoch": 0.3072387093218704, "grad_norm": 19.005769729614258, "learning_rate": 5.49848283296555e-05, "loss": 1.2403, "step": 22300 }, { "epoch": 0.30861646138160975, "grad_norm": 10.534347534179688, "learning_rate": 5.4984560946935936e-05, "loss": 1.2538, "step": 22400 }, { "epoch": 0.3099942134413491, "grad_norm": 20.09412956237793, "learning_rate": 5.498429122926232e-05, "loss": 1.2081, "step": 22500 }, { "epoch": 0.3113719655010884, "grad_norm": 16.096044540405273, "learning_rate": 5.498401917665756e-05, "loss": 1.1684, "step": 22600 }, { "epoch": 0.31274971756082776, "grad_norm": 20.909530639648438, "learning_rate": 5.498374478914475e-05, "loss": 1.1313, "step": 22700 }, { "epoch": 0.3141274696205671, "grad_norm": 6.890002250671387, "learning_rate": 5.4983468066747225e-05, "loss": 1.0835, "step": 22800 }, { "epoch": 0.3155052216803064, "grad_norm": 16.735523223876953, "learning_rate": 5.498318900948848e-05, "loss": 1.2713, "step": 22900 }, { "epoch": 0.31688297374004576, "grad_norm": 16.891698837280273, "learning_rate": 5.498290761739222e-05, "loss": 1.1909, "step": 23000 }, { "epoch": 0.3182607257997851, "grad_norm": 13.88641357421875, "learning_rate": 5.498262389048237e-05, "loss": 1.2079, "step": 23100 }, { "epoch": 
0.3196384778595244, "grad_norm": 7.386116981506348, "learning_rate": 5.498233782878301e-05, "loss": 1.2487, "step": 23200 }, { "epoch": 0.3210162299192637, "grad_norm": 13.648170471191406, "learning_rate": 5.498204943231846e-05, "loss": 1.1771, "step": 23300 }, { "epoch": 0.32239398197900304, "grad_norm": 24.235118865966797, "learning_rate": 5.49817587011132e-05, "loss": 1.0437, "step": 23400 }, { "epoch": 0.3237717340387424, "grad_norm": 16.021926879882812, "learning_rate": 5.498146563519196e-05, "loss": 1.2237, "step": 23500 }, { "epoch": 0.3251494860984817, "grad_norm": 28.705358505249023, "learning_rate": 5.498117023457961e-05, "loss": 1.272, "step": 23600 }, { "epoch": 0.32652723815822104, "grad_norm": 12.784554481506348, "learning_rate": 5.4980872499301254e-05, "loss": 1.0994, "step": 23700 }, { "epoch": 0.3279049902179604, "grad_norm": 9.825348854064941, "learning_rate": 5.4980572429382194e-05, "loss": 1.255, "step": 23800 }, { "epoch": 0.3292827422776997, "grad_norm": 12.11705493927002, "learning_rate": 5.498027002484791e-05, "loss": 1.2695, "step": 23900 }, { "epoch": 0.33066049433743905, "grad_norm": 8.544857025146484, "learning_rate": 5.4979965285724105e-05, "loss": 1.1801, "step": 24000 }, { "epoch": 0.3320382463971784, "grad_norm": 6.105419158935547, "learning_rate": 5.4979658212036656e-05, "loss": 1.1384, "step": 24100 }, { "epoch": 0.3334159984569177, "grad_norm": 7.332189083099365, "learning_rate": 5.497934880381166e-05, "loss": 1.2623, "step": 24200 }, { "epoch": 0.334793750516657, "grad_norm": 7.7589545249938965, "learning_rate": 5.497903706107541e-05, "loss": 1.1905, "step": 24300 }, { "epoch": 0.33617150257639633, "grad_norm": 12.13595962524414, "learning_rate": 5.4978722983854365e-05, "loss": 1.104, "step": 24400 }, { "epoch": 0.33754925463613566, "grad_norm": 67.7665786743164, "learning_rate": 5.4978406572175227e-05, "loss": 1.0483, "step": 24500 }, { "epoch": 0.338927006695875, "grad_norm": 15.1935396194458, "learning_rate": 
5.4978087826064876e-05, "loss": 1.2435, "step": 24600 }, { "epoch": 0.34030475875561433, "grad_norm": 9.363604545593262, "learning_rate": 5.497776674555038e-05, "loss": 1.1054, "step": 24700 }, { "epoch": 0.34168251081535367, "grad_norm": 9.143170356750488, "learning_rate": 5.497744333065903e-05, "loss": 1.2164, "step": 24800 }, { "epoch": 0.343060262875093, "grad_norm": 9.108330726623535, "learning_rate": 5.4977117581418295e-05, "loss": 1.1459, "step": 24900 }, { "epoch": 0.34443801493483234, "grad_norm": 6.112168788909912, "learning_rate": 5.497678949785585e-05, "loss": 1.124, "step": 25000 }, { "epoch": 0.34581576699457167, "grad_norm": 12.69692325592041, "learning_rate": 5.497645907999956e-05, "loss": 1.1198, "step": 25100 }, { "epoch": 0.347193519054311, "grad_norm": 10.93918514251709, "learning_rate": 5.4976126327877504e-05, "loss": 1.1381, "step": 25200 }, { "epoch": 0.34857127111405034, "grad_norm": 54.03297805786133, "learning_rate": 5.497579124151796e-05, "loss": 1.15, "step": 25300 }, { "epoch": 0.3499490231737896, "grad_norm": 7.5128631591796875, "learning_rate": 5.4975453820949375e-05, "loss": 1.1485, "step": 25400 }, { "epoch": 0.35132677523352895, "grad_norm": 11.75341510772705, "learning_rate": 5.497511406620042e-05, "loss": 1.1393, "step": 25500 }, { "epoch": 0.3527045272932683, "grad_norm": 6.413788318634033, "learning_rate": 5.4974771977299975e-05, "loss": 1.0141, "step": 25600 }, { "epoch": 0.3540822793530076, "grad_norm": 7.513718605041504, "learning_rate": 5.497442755427709e-05, "loss": 1.1473, "step": 25700 }, { "epoch": 0.35546003141274696, "grad_norm": 4.155311584472656, "learning_rate": 5.497408079716102e-05, "loss": 1.0039, "step": 25800 }, { "epoch": 0.3568377834724863, "grad_norm": 10.284714698791504, "learning_rate": 5.4973731705981236e-05, "loss": 1.1851, "step": 25900 }, { "epoch": 0.3582155355322256, "grad_norm": 32.3874626159668, "learning_rate": 5.497338028076738e-05, "loss": 1.0983, "step": 26000 }, { "epoch": 
0.35959328759196496, "grad_norm": 13.360755920410156, "learning_rate": 5.4973026521549324e-05, "loss": 1.1441, "step": 26100 }, { "epoch": 0.3609710396517043, "grad_norm": 26.66537094116211, "learning_rate": 5.497267042835711e-05, "loss": 1.1707, "step": 26200 }, { "epoch": 0.36234879171144363, "grad_norm": 14.240584373474121, "learning_rate": 5.497231200122099e-05, "loss": 1.0371, "step": 26300 }, { "epoch": 0.36372654377118296, "grad_norm": 28.889888763427734, "learning_rate": 5.497195124017142e-05, "loss": 1.1521, "step": 26400 }, { "epoch": 0.36510429583092224, "grad_norm": 10.489887237548828, "learning_rate": 5.497158814523906e-05, "loss": 1.1477, "step": 26500 }, { "epoch": 0.3664820478906616, "grad_norm": 31.80312156677246, "learning_rate": 5.497122271645473e-05, "loss": 1.2026, "step": 26600 }, { "epoch": 0.3678597999504009, "grad_norm": 11.199711799621582, "learning_rate": 5.497085495384949e-05, "loss": 1.1356, "step": 26700 }, { "epoch": 0.36923755201014025, "grad_norm": 8.920809745788574, "learning_rate": 5.4970484857454584e-05, "loss": 1.0267, "step": 26800 }, { "epoch": 0.3706153040698796, "grad_norm": 28.67486572265625, "learning_rate": 5.4970112427301454e-05, "loss": 1.1357, "step": 26900 }, { "epoch": 0.3719930561296189, "grad_norm": 50.55158233642578, "learning_rate": 5.496973766342173e-05, "loss": 1.1476, "step": 27000 }, { "epoch": 0.37337080818935825, "grad_norm": 11.69437026977539, "learning_rate": 5.496936056584726e-05, "loss": 1.185, "step": 27100 }, { "epoch": 0.3747485602490976, "grad_norm": 20.794321060180664, "learning_rate": 5.496898113461007e-05, "loss": 1.1766, "step": 27200 }, { "epoch": 0.3761263123088369, "grad_norm": 5.544865131378174, "learning_rate": 5.496859936974242e-05, "loss": 1.0123, "step": 27300 }, { "epoch": 0.37750406436857625, "grad_norm": 12.653202056884766, "learning_rate": 5.4968215271276716e-05, "loss": 1.1932, "step": 27400 }, { "epoch": 0.3788818164283156, "grad_norm": 3.6544315814971924, "learning_rate": 
5.49678288392456e-05, "loss": 1.213, "step": 27500 }, { "epoch": 0.38025956848805487, "grad_norm": 36.82583236694336, "learning_rate": 5.496744007368189e-05, "loss": 1.1204, "step": 27600 }, { "epoch": 0.3816373205477942, "grad_norm": 9.208050727844238, "learning_rate": 5.4967052897159984e-05, "loss": 1.156, "step": 27700 }, { "epoch": 0.38301507260753354, "grad_norm": 4.28313684463501, "learning_rate": 5.496665948796489e-05, "loss": 1.1157, "step": 27800 }, { "epoch": 0.38439282466727287, "grad_norm": 17.668710708618164, "learning_rate": 5.4966263745336553e-05, "loss": 1.1063, "step": 27900 }, { "epoch": 0.3857705767270122, "grad_norm": 7.139005661010742, "learning_rate": 5.49658656693086e-05, "loss": 1.1658, "step": 28000 }, { "epoch": 0.38714832878675154, "grad_norm": 5.367567539215088, "learning_rate": 5.496546525991484e-05, "loss": 1.2344, "step": 28100 }, { "epoch": 0.3885260808464909, "grad_norm": 10.696671485900879, "learning_rate": 5.49650625171893e-05, "loss": 1.2177, "step": 28200 }, { "epoch": 0.3899038329062302, "grad_norm": 12.0829439163208, "learning_rate": 5.496466150347612e-05, "loss": 1.2213, "step": 28300 }, { "epoch": 0.39128158496596954, "grad_norm": 3.8467326164245605, "learning_rate": 5.496425411752232e-05, "loss": 1.137, "step": 28400 }, { "epoch": 0.3926593370257089, "grad_norm": 18.072893142700195, "learning_rate": 5.49638443983396e-05, "loss": 1.256, "step": 28500 }, { "epoch": 0.39403708908544816, "grad_norm": 26.420886993408203, "learning_rate": 5.4963432345962805e-05, "loss": 1.2116, "step": 28600 }, { "epoch": 0.3954148411451875, "grad_norm": 7.811822891235352, "learning_rate": 5.496301796042694e-05, "loss": 1.1855, "step": 28700 }, { "epoch": 0.3967925932049268, "grad_norm": 25.796764373779297, "learning_rate": 5.4962601241767195e-05, "loss": 1.2127, "step": 28800 }, { "epoch": 0.39817034526466616, "grad_norm": 24.476287841796875, "learning_rate": 5.496218219001897e-05, "loss": 1.1299, "step": 28900 }, { "epoch": 0.3995480973244055, 
"grad_norm": 10.935700416564941, "learning_rate": 5.4961760805217875e-05, "loss": 1.2926, "step": 29000 }, { "epoch": 0.40092584938414483, "grad_norm": 12.129424095153809, "learning_rate": 5.496133708739971e-05, "loss": 1.1798, "step": 29100 }, { "epoch": 0.40230360144388416, "grad_norm": 5.727356433868408, "learning_rate": 5.496091103660047e-05, "loss": 1.2193, "step": 29200 }, { "epoch": 0.4036813535036235, "grad_norm": 4.554442405700684, "learning_rate": 5.496048265285634e-05, "loss": 1.1673, "step": 29300 }, { "epoch": 0.40505910556336283, "grad_norm": 10.333436965942383, "learning_rate": 5.4960051936203726e-05, "loss": 1.117, "step": 29400 }, { "epoch": 0.40643685762310217, "grad_norm": 14.174511909484863, "learning_rate": 5.495961888667921e-05, "loss": 1.2083, "step": 29500 }, { "epoch": 0.4078146096828415, "grad_norm": 11.463114738464355, "learning_rate": 5.4959183504319596e-05, "loss": 1.1945, "step": 29600 }, { "epoch": 0.4091923617425808, "grad_norm": 3.3590152263641357, "learning_rate": 5.495874578916187e-05, "loss": 1.247, "step": 29700 }, { "epoch": 0.4105701138023201, "grad_norm": 39.54381561279297, "learning_rate": 5.495830574124319e-05, "loss": 1.2505, "step": 29800 }, { "epoch": 0.41194786586205945, "grad_norm": 6.801312446594238, "learning_rate": 5.495786336060098e-05, "loss": 1.1564, "step": 29900 }, { "epoch": 0.4133256179217988, "grad_norm": 3.9892210960388184, "learning_rate": 5.495741864727279e-05, "loss": 1.0421, "step": 30000 }, { "epoch": 0.4147033699815381, "grad_norm": 20.140783309936523, "learning_rate": 5.495697160129642e-05, "loss": 1.1145, "step": 30100 }, { "epoch": 0.41608112204127745, "grad_norm": 24.042156219482422, "learning_rate": 5.4956522222709846e-05, "loss": 1.2111, "step": 30200 }, { "epoch": 0.4174588741010168, "grad_norm": 7.5537004470825195, "learning_rate": 5.495607051155124e-05, "loss": 1.1163, "step": 30300 }, { "epoch": 0.4188366261607561, "grad_norm": 8.79468059539795, "learning_rate": 5.4955616467858984e-05, 
"loss": 1.1754, "step": 30400 }, { "epoch": 0.42021437822049545, "grad_norm": 21.197681427001953, "learning_rate": 5.495516009167164e-05, "loss": 1.0062, "step": 30500 }, { "epoch": 0.4215921302802348, "grad_norm": 12.178730010986328, "learning_rate": 5.495470138302799e-05, "loss": 1.1227, "step": 30600 }, { "epoch": 0.4229698823399741, "grad_norm": 15.641883850097656, "learning_rate": 5.4954240341967e-05, "loss": 1.1938, "step": 30700 }, { "epoch": 0.4243476343997134, "grad_norm": 24.71154022216797, "learning_rate": 5.4953776968527846e-05, "loss": 1.0528, "step": 30800 }, { "epoch": 0.42572538645945274, "grad_norm": 11.14073371887207, "learning_rate": 5.495331126274987e-05, "loss": 1.1865, "step": 30900 }, { "epoch": 0.42710313851919207, "grad_norm": 7.5139994621276855, "learning_rate": 5.495284322467267e-05, "loss": 1.0983, "step": 31000 }, { "epoch": 0.4284808905789314, "grad_norm": 13.63111400604248, "learning_rate": 5.495237285433598e-05, "loss": 1.2065, "step": 31100 }, { "epoch": 0.42985864263867074, "grad_norm": 31.978715896606445, "learning_rate": 5.495190015177977e-05, "loss": 1.1782, "step": 31200 }, { "epoch": 0.4312363946984101, "grad_norm": 9.212420463562012, "learning_rate": 5.4951425117044204e-05, "loss": 1.1952, "step": 31300 }, { "epoch": 0.4326141467581494, "grad_norm": 5.0600690841674805, "learning_rate": 5.4950947750169625e-05, "loss": 1.1572, "step": 31400 }, { "epoch": 0.43399189881788874, "grad_norm": 38.48628616333008, "learning_rate": 5.49504680511966e-05, "loss": 1.1557, "step": 31500 }, { "epoch": 0.4353696508776281, "grad_norm": 25.28558349609375, "learning_rate": 5.494999085201974e-05, "loss": 1.1951, "step": 31600 }, { "epoch": 0.4367474029373674, "grad_norm": 9.937192916870117, "learning_rate": 5.4949506512292245e-05, "loss": 1.1394, "step": 31700 }, { "epoch": 0.43812515499710675, "grad_norm": 17.897592544555664, "learning_rate": 5.494901984058873e-05, "loss": 1.1675, "step": 31800 }, { "epoch": 0.439502907056846, "grad_norm": 
28.568443298339844, "learning_rate": 5.494853083695056e-05, "loss": 1.2684, "step": 31900 }, { "epoch": 0.44088065911658536, "grad_norm": 40.10667419433594, "learning_rate": 5.494803950141926e-05, "loss": 1.1271, "step": 32000 }, { "epoch": 0.4422584111763247, "grad_norm": 16.480844497680664, "learning_rate": 5.4947545834036594e-05, "loss": 1.1347, "step": 32100 }, { "epoch": 0.44363616323606403, "grad_norm": 8.691411018371582, "learning_rate": 5.494704983484448e-05, "loss": 1.1556, "step": 32200 }, { "epoch": 0.44501391529580336, "grad_norm": 9.516257286071777, "learning_rate": 5.494655150388506e-05, "loss": 1.2484, "step": 32300 }, { "epoch": 0.4463916673555427, "grad_norm": 21.820451736450195, "learning_rate": 5.494605084120069e-05, "loss": 1.1766, "step": 32400 }, { "epoch": 0.44776941941528203, "grad_norm": 53.97123718261719, "learning_rate": 5.494555288831923e-05, "loss": 1.2531, "step": 32500 }, { "epoch": 0.44914717147502137, "grad_norm": 10.5194673538208, "learning_rate": 5.49450475856289e-05, "loss": 1.2607, "step": 32600 }, { "epoch": 0.4505249235347607, "grad_norm": 28.1883602142334, "learning_rate": 5.494453995134138e-05, "loss": 1.1606, "step": 32700 }, { "epoch": 0.45190267559450004, "grad_norm": 16.315658569335938, "learning_rate": 5.494402998549977e-05, "loss": 1.1847, "step": 32800 }, { "epoch": 0.45328042765423937, "grad_norm": 10.581954002380371, "learning_rate": 5.494351768814742e-05, "loss": 1.1807, "step": 32900 }, { "epoch": 0.45465817971397865, "grad_norm": 10.171825408935547, "learning_rate": 5.494300305932784e-05, "loss": 1.1268, "step": 33000 }, { "epoch": 0.456035931773718, "grad_norm": 25.388198852539062, "learning_rate": 5.4942486099084755e-05, "loss": 1.1395, "step": 33100 }, { "epoch": 0.4574136838334573, "grad_norm": 6.69132661819458, "learning_rate": 5.4941966807462086e-05, "loss": 1.1545, "step": 33200 }, { "epoch": 0.45879143589319665, "grad_norm": 10.449405670166016, "learning_rate": 5.4941445184503934e-05, "loss": 1.2649, 
"step": 33300 }, { "epoch": 0.460169187952936, "grad_norm": 9.717305183410645, "learning_rate": 5.494092123025462e-05, "loss": 1.1029, "step": 33400 }, { "epoch": 0.4615469400126753, "grad_norm": 9.180707931518555, "learning_rate": 5.494039494475868e-05, "loss": 1.2296, "step": 33500 }, { "epoch": 0.46292469207241466, "grad_norm": 9.62405014038086, "learning_rate": 5.493987162576708e-05, "loss": 1.1546, "step": 33600 }, { "epoch": 0.464302444132154, "grad_norm": 17.93250846862793, "learning_rate": 5.4939340701223503e-05, "loss": 1.3056, "step": 33700 }, { "epoch": 0.4656801961918933, "grad_norm": 8.866469383239746, "learning_rate": 5.493880744556757e-05, "loss": 1.2404, "step": 33800 }, { "epoch": 0.46705794825163266, "grad_norm": 10.755928039550781, "learning_rate": 5.493827185884457e-05, "loss": 1.1766, "step": 33900 }, { "epoch": 0.46843570031137194, "grad_norm": 48.711204528808594, "learning_rate": 5.49377339411e-05, "loss": 1.0883, "step": 34000 }, { "epoch": 0.4698134523711113, "grad_norm": 10.83607292175293, "learning_rate": 5.493719369237956e-05, "loss": 1.1657, "step": 34100 }, { "epoch": 0.4711912044308506, "grad_norm": 202.4723358154297, "learning_rate": 5.4936651112729146e-05, "loss": 1.1673, "step": 34200 }, { "epoch": 0.47256895649058994, "grad_norm": 23.345539093017578, "learning_rate": 5.493610620219485e-05, "loss": 1.2544, "step": 34300 }, { "epoch": 0.4739467085503293, "grad_norm": 17.14194107055664, "learning_rate": 5.4935558960822975e-05, "loss": 1.2183, "step": 34400 }, { "epoch": 0.4753244606100686, "grad_norm": 154.0811004638672, "learning_rate": 5.493500938866002e-05, "loss": 1.1882, "step": 34500 }, { "epoch": 0.47670221266980795, "grad_norm": 29.115861892700195, "learning_rate": 5.493445748575265e-05, "loss": 1.1198, "step": 34600 }, { "epoch": 0.4780799647295473, "grad_norm": 20.269804000854492, "learning_rate": 5.493390325214776e-05, "loss": 1.2218, "step": 34700 }, { "epoch": 0.4794577167892866, "grad_norm": 16.553653717041016, 
"learning_rate": 5.4933346687892435e-05, "loss": 1.1487, "step": 34800 }, { "epoch": 0.48083546884902595, "grad_norm": 25.207561492919922, "learning_rate": 5.493278779303397e-05, "loss": 1.1777, "step": 34900 }, { "epoch": 0.4822132209087653, "grad_norm": 38.11124038696289, "learning_rate": 5.493222656761983e-05, "loss": 1.1786, "step": 35000 }, { "epoch": 0.48359097296850456, "grad_norm": 146.48446655273438, "learning_rate": 5.49316630116977e-05, "loss": 1.2362, "step": 35100 }, { "epoch": 0.4849687250282439, "grad_norm": 21.451509475708008, "learning_rate": 5.493109712531546e-05, "loss": 1.1021, "step": 35200 }, { "epoch": 0.48634647708798323, "grad_norm": 181.49642944335938, "learning_rate": 5.4930528908521185e-05, "loss": 1.2539, "step": 35300 }, { "epoch": 0.48772422914772257, "grad_norm": 26.78451156616211, "learning_rate": 5.4929958361363143e-05, "loss": 1.3365, "step": 35400 }, { "epoch": 0.4891019812074619, "grad_norm": 11.870569229125977, "learning_rate": 5.492938548388981e-05, "loss": 1.3135, "step": 35500 }, { "epoch": 0.49047973326720123, "grad_norm": 11.67177963256836, "learning_rate": 5.492881027614985e-05, "loss": 1.1676, "step": 35600 }, { "epoch": 0.49185748532694057, "grad_norm": 43.119224548339844, "learning_rate": 5.4928232738192135e-05, "loss": 1.3202, "step": 35700 }, { "epoch": 0.4932352373866799, "grad_norm": 26.77092170715332, "learning_rate": 5.4927652870065726e-05, "loss": 1.3075, "step": 35800 }, { "epoch": 0.49461298944641924, "grad_norm": 12.316910743713379, "learning_rate": 5.492707067181988e-05, "loss": 1.352, "step": 35900 }, { "epoch": 0.4959907415061586, "grad_norm": 61.07495880126953, "learning_rate": 5.492648614350407e-05, "loss": 1.1952, "step": 36000 }, { "epoch": 0.4973684935658979, "grad_norm": 49.89175033569336, "learning_rate": 5.492589928516795e-05, "loss": 1.1014, "step": 36100 }, { "epoch": 0.4987462456256372, "grad_norm": 8.198811531066895, "learning_rate": 5.492531009686138e-05, "loss": 1.349, "step": 36200 }, { 
"epoch": 0.5001239976853765, "grad_norm": 24.358966827392578, "learning_rate": 5.4924718578634405e-05, "loss": 1.2439, "step": 36300 }, { "epoch": 0.5015017497451159, "grad_norm": 15.124456405639648, "learning_rate": 5.4924124730537284e-05, "loss": 1.1741, "step": 36400 }, { "epoch": 0.5028795018048552, "grad_norm": 40.977848052978516, "learning_rate": 5.492352855262046e-05, "loss": 1.2399, "step": 36500 }, { "epoch": 0.5042572538645945, "grad_norm": 16.233501434326172, "learning_rate": 5.49229300449346e-05, "loss": 1.1527, "step": 36600 }, { "epoch": 0.5056350059243339, "grad_norm": 24.221960067749023, "learning_rate": 5.4922329207530526e-05, "loss": 1.311, "step": 36700 }, { "epoch": 0.5070127579840732, "grad_norm": 10.371733665466309, "learning_rate": 5.4921726040459305e-05, "loss": 1.2262, "step": 36800 }, { "epoch": 0.5083905100438125, "grad_norm": 13.55928897857666, "learning_rate": 5.492112054377216e-05, "loss": 1.2579, "step": 36900 }, { "epoch": 0.5097682621035519, "grad_norm": 19.904464721679688, "learning_rate": 5.492051271752054e-05, "loss": 1.1302, "step": 37000 }, { "epoch": 0.5111460141632912, "grad_norm": 8.938076972961426, "learning_rate": 5.491990256175609e-05, "loss": 1.2907, "step": 37100 }, { "epoch": 0.5125237662230305, "grad_norm": 7.375090599060059, "learning_rate": 5.491929007653063e-05, "loss": 1.2576, "step": 37200 }, { "epoch": 0.5139015182827699, "grad_norm": 19.95588493347168, "learning_rate": 5.491868142157295e-05, "loss": 1.2289, "step": 37300 }, { "epoch": 0.5152792703425092, "grad_norm": 38.733131408691406, "learning_rate": 5.4918064300875095e-05, "loss": 1.2287, "step": 37400 }, { "epoch": 0.5166570224022485, "grad_norm": 23.4934024810791, "learning_rate": 5.491744485087241e-05, "loss": 1.1797, "step": 37500 }, { "epoch": 0.5180347744619879, "grad_norm": 15.991085052490234, "learning_rate": 5.4916823071617515e-05, "loss": 1.2026, "step": 37600 }, { "epoch": 0.5194125265217272, "grad_norm": 6.564786434173584, "learning_rate": 
5.491619896316324e-05, "loss": 1.2287, "step": 37700 }, { "epoch": 0.5207902785814664, "grad_norm": 21.138957977294922, "learning_rate": 5.49155725255626e-05, "loss": 1.2666, "step": 37800 }, { "epoch": 0.5221680306412058, "grad_norm": 5.338771820068359, "learning_rate": 5.491494375886882e-05, "loss": 1.0798, "step": 37900 }, { "epoch": 0.5235457827009451, "grad_norm": 17.468963623046875, "learning_rate": 5.4914312663135315e-05, "loss": 1.1097, "step": 38000 }, { "epoch": 0.5249235347606844, "grad_norm": 5.89479398727417, "learning_rate": 5.491367923841569e-05, "loss": 1.0782, "step": 38100 }, { "epoch": 0.5263012868204238, "grad_norm": 8.392021179199219, "learning_rate": 5.491304348476377e-05, "loss": 1.1417, "step": 38200 }, { "epoch": 0.5276790388801631, "grad_norm": 9.740779876708984, "learning_rate": 5.491240540223357e-05, "loss": 1.2198, "step": 38300 }, { "epoch": 0.5290567909399024, "grad_norm": 8.02989387512207, "learning_rate": 5.491176499087928e-05, "loss": 1.2369, "step": 38400 }, { "epoch": 0.5304345429996418, "grad_norm": 14.354741096496582, "learning_rate": 5.491112225075532e-05, "loss": 1.1563, "step": 38500 }, { "epoch": 0.5318122950593811, "grad_norm": 13.512933731079102, "learning_rate": 5.491047718191629e-05, "loss": 1.0641, "step": 38600 }, { "epoch": 0.5331900471191204, "grad_norm": 8.201644897460938, "learning_rate": 5.4909829784417e-05, "loss": 1.1771, "step": 38700 }, { "epoch": 0.5345677991788598, "grad_norm": 12.34885025024414, "learning_rate": 5.490918005831243e-05, "loss": 1.2186, "step": 38800 }, { "epoch": 0.5359455512385991, "grad_norm": 12.697105407714844, "learning_rate": 5.4908528003657794e-05, "loss": 1.0416, "step": 38900 }, { "epoch": 0.5373233032983384, "grad_norm": 20.74173355102539, "learning_rate": 5.490787362050848e-05, "loss": 1.2125, "step": 39000 }, { "epoch": 0.5387010553580778, "grad_norm": 5.475405216217041, "learning_rate": 5.490721690892009e-05, "loss": 1.1598, "step": 39100 }, { "epoch": 0.5400788074178171, 
"grad_norm": 15.600400924682617, "learning_rate": 5.490655786894841e-05, "loss": 1.1072, "step": 39200 }, { "epoch": 0.5414565594775564, "grad_norm": 13.139054298400879, "learning_rate": 5.4905896500649416e-05, "loss": 1.1552, "step": 39300 }, { "epoch": 0.5428343115372958, "grad_norm": 62.545928955078125, "learning_rate": 5.490523280407932e-05, "loss": 1.0561, "step": 39400 }, { "epoch": 0.5442120635970351, "grad_norm": 6.346467971801758, "learning_rate": 5.4904566779294486e-05, "loss": 1.1642, "step": 39500 }, { "epoch": 0.5455898156567744, "grad_norm": 15.561434745788574, "learning_rate": 5.490389842635151e-05, "loss": 1.106, "step": 39600 }, { "epoch": 0.5469675677165138, "grad_norm": 12.556934356689453, "learning_rate": 5.490322774530716e-05, "loss": 1.0369, "step": 39700 }, { "epoch": 0.5483453197762531, "grad_norm": 13.121975898742676, "learning_rate": 5.490255473621842e-05, "loss": 1.1628, "step": 39800 }, { "epoch": 0.5497230718359924, "grad_norm": 7.027225017547607, "learning_rate": 5.490187939914246e-05, "loss": 1.1277, "step": 39900 }, { "epoch": 0.5511008238957317, "grad_norm": 12.51138973236084, "learning_rate": 5.490120173413667e-05, "loss": 1.1055, "step": 40000 }, { "epoch": 0.552478575955471, "grad_norm": 15.838208198547363, "learning_rate": 5.4900521741258595e-05, "loss": 1.1405, "step": 40100 }, { "epoch": 0.5538563280152103, "grad_norm": 63.764068603515625, "learning_rate": 5.4899839420566027e-05, "loss": 1.2225, "step": 40200 }, { "epoch": 0.5552340800749497, "grad_norm": 24.84697151184082, "learning_rate": 5.489915477211693e-05, "loss": 1.0902, "step": 40300 }, { "epoch": 0.556611832134689, "grad_norm": 18.65909194946289, "learning_rate": 5.489846779596945e-05, "loss": 1.2615, "step": 40400 }, { "epoch": 0.5579895841944283, "grad_norm": 13.023543357849121, "learning_rate": 5.489777849218196e-05, "loss": 1.2782, "step": 40500 }, { "epoch": 0.5593673362541677, "grad_norm": 10.606678009033203, "learning_rate": 5.489708686081303e-05, "loss": 
1.174, "step": 40600 }, { "epoch": 0.560745088313907, "grad_norm": 21.74370574951172, "learning_rate": 5.4896392901921404e-05, "loss": 1.1712, "step": 40700 }, { "epoch": 0.5621228403736463, "grad_norm": 36.1141242980957, "learning_rate": 5.489569661556604e-05, "loss": 1.1973, "step": 40800 }, { "epoch": 0.5635005924333857, "grad_norm": 20.26376724243164, "learning_rate": 5.4894998001806094e-05, "loss": 1.4648, "step": 40900 }, { "epoch": 0.564878344493125, "grad_norm": 15.052135467529297, "learning_rate": 5.4894297060700914e-05, "loss": 1.2466, "step": 41000 }, { "epoch": 0.5662560965528644, "grad_norm": 104.23808288574219, "learning_rate": 5.489359379231006e-05, "loss": 1.1266, "step": 41100 }, { "epoch": 0.5676338486126037, "grad_norm": 25.820280075073242, "learning_rate": 5.489288819669326e-05, "loss": 1.2133, "step": 41200 }, { "epoch": 0.569011600672343, "grad_norm": 12.302903175354004, "learning_rate": 5.4892180273910467e-05, "loss": 1.2712, "step": 41300 }, { "epoch": 0.5703893527320824, "grad_norm": 11.3978910446167, "learning_rate": 5.489147002402182e-05, "loss": 1.3285, "step": 41400 }, { "epoch": 0.5717671047918217, "grad_norm": 20.72574234008789, "learning_rate": 5.489075744708767e-05, "loss": 1.1829, "step": 41500 }, { "epoch": 0.573144856851561, "grad_norm": 8.017884254455566, "learning_rate": 5.489004254316854e-05, "loss": 1.2739, "step": 41600 }, { "epoch": 0.5745226089113004, "grad_norm": 11.902440071105957, "learning_rate": 5.488932531232517e-05, "loss": 1.3857, "step": 41700 }, { "epoch": 0.5759003609710397, "grad_norm": 10.239564895629883, "learning_rate": 5.488860575461849e-05, "loss": 1.1585, "step": 41800 }, { "epoch": 0.577278113030779, "grad_norm": 20.104263305664062, "learning_rate": 5.488788387010963e-05, "loss": 1.2889, "step": 41900 }, { "epoch": 0.5786558650905184, "grad_norm": 24.809194564819336, "learning_rate": 5.4887166912489586e-05, "loss": 1.2151, "step": 42000 }, { "epoch": 0.5800336171502576, "grad_norm": 10.520858764648438, 
"learning_rate": 5.488644039782705e-05, "loss": 1.2757, "step": 42100 }, { "epoch": 0.5814113692099969, "grad_norm": 10.631114959716797, "learning_rate": 5.488571155654628e-05, "loss": 1.1857, "step": 42200 }, { "epoch": 0.5827891212697363, "grad_norm": 18.827821731567383, "learning_rate": 5.488498038870921e-05, "loss": 1.2018, "step": 42300 }, { "epoch": 0.5841668733294756, "grad_norm": 5.8350019454956055, "learning_rate": 5.488424689437796e-05, "loss": 1.1799, "step": 42400 }, { "epoch": 0.5855446253892149, "grad_norm": 23.99330711364746, "learning_rate": 5.488351107361484e-05, "loss": 1.2866, "step": 42500 }, { "epoch": 0.5869223774489543, "grad_norm": 122.87985229492188, "learning_rate": 5.488277292648236e-05, "loss": 1.1655, "step": 42600 }, { "epoch": 0.5883001295086936, "grad_norm": 24.69963264465332, "learning_rate": 5.488203245304323e-05, "loss": 1.2314, "step": 42700 }, { "epoch": 0.5896778815684329, "grad_norm": 103.8561782836914, "learning_rate": 5.4881289653360364e-05, "loss": 1.244, "step": 42800 }, { "epoch": 0.5910556336281723, "grad_norm": 9.050652503967285, "learning_rate": 5.4880544527496854e-05, "loss": 1.3281, "step": 42900 }, { "epoch": 0.5924333856879116, "grad_norm": 28.411632537841797, "learning_rate": 5.487979707551601e-05, "loss": 1.238, "step": 43000 }, { "epoch": 0.5938111377476509, "grad_norm": 90.37954711914062, "learning_rate": 5.487904729748133e-05, "loss": 1.2811, "step": 43100 }, { "epoch": 0.5951888898073903, "grad_norm": 6.304271221160889, "learning_rate": 5.487829519345651e-05, "loss": 1.2682, "step": 43200 }, { "epoch": 0.5965666418671296, "grad_norm": 7.4491143226623535, "learning_rate": 5.487754076350545e-05, "loss": 1.1171, "step": 43300 }, { "epoch": 0.5979443939268689, "grad_norm": 11.623374938964844, "learning_rate": 5.487678400769224e-05, "loss": 1.323, "step": 43400 }, { "epoch": 0.5993221459866083, "grad_norm": 17.040042877197266, "learning_rate": 5.4876024926081166e-05, "loss": 1.2878, "step": 43500 }, { "epoch": 
0.6006998980463476, "grad_norm": 37.76469421386719, "learning_rate": 5.4875263518736724e-05, "loss": 1.2728, "step": 43600 }, { "epoch": 0.6020776501060869, "grad_norm": 5.725869655609131, "learning_rate": 5.4874499785723586e-05, "loss": 1.3025, "step": 43700 }, { "epoch": 0.6034554021658263, "grad_norm": 17.790145874023438, "learning_rate": 5.4873733727106655e-05, "loss": 1.2354, "step": 43800 }, { "epoch": 0.6048331542255656, "grad_norm": 37.19287872314453, "learning_rate": 5.4872965342950995e-05, "loss": 1.1685, "step": 43900 }, { "epoch": 0.6062109062853049, "grad_norm": 19.467134475708008, "learning_rate": 5.4872194633321896e-05, "loss": 1.2465, "step": 44000 }, { "epoch": 0.6075886583450443, "grad_norm": 26.84002113342285, "learning_rate": 5.4871421598284824e-05, "loss": 1.2126, "step": 44100 }, { "epoch": 0.6089664104047836, "grad_norm": 40.50870895385742, "learning_rate": 5.4870646237905455e-05, "loss": 1.3323, "step": 44200 }, { "epoch": 0.6103441624645228, "grad_norm": 16.28245735168457, "learning_rate": 5.4869868552249666e-05, "loss": 1.2629, "step": 44300 }, { "epoch": 0.6117219145242622, "grad_norm": 14.023327827453613, "learning_rate": 5.4869088541383514e-05, "loss": 1.2761, "step": 44400 }, { "epoch": 0.6130996665840015, "grad_norm": 7.924612998962402, "learning_rate": 5.486830620537327e-05, "loss": 1.189, "step": 44500 }, { "epoch": 0.6144774186437408, "grad_norm": 29.270492553710938, "learning_rate": 5.48675215442854e-05, "loss": 1.1727, "step": 44600 }, { "epoch": 0.6158551707034802, "grad_norm": 12.770021438598633, "learning_rate": 5.486673455818657e-05, "loss": 1.2345, "step": 44700 }, { "epoch": 0.6172329227632195, "grad_norm": 18.894485473632812, "learning_rate": 5.486594524714362e-05, "loss": 1.0552, "step": 44800 }, { "epoch": 0.6186106748229588, "grad_norm": 34.18074035644531, "learning_rate": 5.4865153611223626e-05, "loss": 1.2627, "step": 44900 }, { "epoch": 0.6199884268826982, "grad_norm": 11.412156105041504, "learning_rate": 
5.4864359650493836e-05, "loss": 1.2822, "step": 45000 }, { "epoch": 0.6213661789424375, "grad_norm": 25.027217864990234, "learning_rate": 5.4863563365021686e-05, "loss": 1.1529, "step": 45100 }, { "epoch": 0.6227439310021768, "grad_norm": 9.224932670593262, "learning_rate": 5.4862764754874855e-05, "loss": 1.1406, "step": 45200 }, { "epoch": 0.6241216830619162, "grad_norm": 2.8139286041259766, "learning_rate": 5.486196382012116e-05, "loss": 1.3154, "step": 45300 }, { "epoch": 0.6254994351216555, "grad_norm": 6.805177211761475, "learning_rate": 5.486116056082866e-05, "loss": 1.0983, "step": 45400 }, { "epoch": 0.6268771871813948, "grad_norm": 8.907033920288086, "learning_rate": 5.4860354977065584e-05, "loss": 1.1609, "step": 45500 }, { "epoch": 0.6282549392411342, "grad_norm": 19.110136032104492, "learning_rate": 5.4859547068900385e-05, "loss": 1.2155, "step": 45600 }, { "epoch": 0.6296326913008735, "grad_norm": 5.0881524085998535, "learning_rate": 5.485873683640169e-05, "loss": 1.2043, "step": 45700 }, { "epoch": 0.6310104433606128, "grad_norm": 44.61912155151367, "learning_rate": 5.4857924279638333e-05, "loss": 1.1461, "step": 45800 }, { "epoch": 0.6323881954203522, "grad_norm": 12.612244606018066, "learning_rate": 5.485710939867935e-05, "loss": 1.1641, "step": 45900 }, { "epoch": 0.6337659474800915, "grad_norm": 40.20648193359375, "learning_rate": 5.485629219359396e-05, "loss": 1.131, "step": 46000 }, { "epoch": 0.6351436995398309, "grad_norm": 8.210180282592773, "learning_rate": 5.48554726644516e-05, "loss": 1.1889, "step": 46100 }, { "epoch": 0.6365214515995702, "grad_norm": 5.954362392425537, "learning_rate": 5.485465081132189e-05, "loss": 1.1033, "step": 46200 }, { "epoch": 0.6378992036593095, "grad_norm": 19.641450881958008, "learning_rate": 5.485382663427464e-05, "loss": 1.0969, "step": 46300 }, { "epoch": 0.6392769557190487, "grad_norm": 26.514488220214844, "learning_rate": 5.485300013337988e-05, "loss": 1.1515, "step": 46400 }, { "epoch": 
0.6406547077787881, "grad_norm": 11.927842140197754, "learning_rate": 5.485217130870782e-05, "loss": 1.1535, "step": 46500 }, { "epoch": 0.6420324598385274, "grad_norm": 19.73988151550293, "learning_rate": 5.485134016032888e-05, "loss": 1.1087, "step": 46600 }, { "epoch": 0.6434102118982667, "grad_norm": 5.729079246520996, "learning_rate": 5.485050668831366e-05, "loss": 1.0969, "step": 46700 }, { "epoch": 0.6447879639580061, "grad_norm": 27.262556076049805, "learning_rate": 5.484967089273297e-05, "loss": 1.189, "step": 46800 }, { "epoch": 0.6461657160177454, "grad_norm": 9.671031951904297, "learning_rate": 5.484883277365783e-05, "loss": 1.2077, "step": 46900 }, { "epoch": 0.6475434680774848, "grad_norm": 11.681415557861328, "learning_rate": 5.4847992331159415e-05, "loss": 1.1066, "step": 47000 }, { "epoch": 0.6489212201372241, "grad_norm": 11.195589065551758, "learning_rate": 5.4847149565309145e-05, "loss": 1.116, "step": 47100 }, { "epoch": 0.6502989721969634, "grad_norm": 24.623706817626953, "learning_rate": 5.484630447617862e-05, "loss": 1.0836, "step": 47200 }, { "epoch": 0.6516767242567028, "grad_norm": 5.540821075439453, "learning_rate": 5.484545706383961e-05, "loss": 1.1583, "step": 47300 }, { "epoch": 0.6530544763164421, "grad_norm": 11.08587646484375, "learning_rate": 5.484460732836414e-05, "loss": 1.176, "step": 47400 }, { "epoch": 0.6544322283761814, "grad_norm": 18.468021392822266, "learning_rate": 5.484375526982438e-05, "loss": 1.1749, "step": 47500 }, { "epoch": 0.6558099804359208, "grad_norm": 8.209395408630371, "learning_rate": 5.484290088829272e-05, "loss": 1.2228, "step": 47600 }, { "epoch": 0.6571877324956601, "grad_norm": 28.284866333007812, "learning_rate": 5.484204418384174e-05, "loss": 1.1728, "step": 47700 }, { "epoch": 0.6585654845553994, "grad_norm": 15.217370986938477, "learning_rate": 5.484118515654422e-05, "loss": 1.2026, "step": 47800 }, { "epoch": 0.6599432366151388, "grad_norm": 28.61541748046875, "learning_rate": 
5.484032380647316e-05, "loss": 1.2036, "step": 47900 }, { "epoch": 0.6613209886748781, "grad_norm": 17.485252380371094, "learning_rate": 5.483946013370172e-05, "loss": 1.2027, "step": 48000 }, { "epoch": 0.6626987407346174, "grad_norm": 67.5372543334961, "learning_rate": 5.483859413830326e-05, "loss": 1.2129, "step": 48100 }, { "epoch": 0.6640764927943568, "grad_norm": 8.771037101745605, "learning_rate": 5.483772582035137e-05, "loss": 1.2718, "step": 48200 }, { "epoch": 0.6654542448540961, "grad_norm": 9.3125638961792, "learning_rate": 5.483685517991982e-05, "loss": 1.1783, "step": 48300 }, { "epoch": 0.6668319969138354, "grad_norm": 13.458366394042969, "learning_rate": 5.48359909582066e-05, "loss": 1.2295, "step": 48400 }, { "epoch": 0.6682097489735748, "grad_norm": 59.08677291870117, "learning_rate": 5.4835115696260744e-05, "loss": 1.2055, "step": 48500 }, { "epoch": 0.669587501033314, "grad_norm": 24.833065032958984, "learning_rate": 5.483423811205697e-05, "loss": 1.116, "step": 48600 }, { "epoch": 0.6709652530930533, "grad_norm": 22.202251434326172, "learning_rate": 5.4833358205669826e-05, "loss": 1.2302, "step": 48700 }, { "epoch": 0.6723430051527927, "grad_norm": 15.729240417480469, "learning_rate": 5.483247597717407e-05, "loss": 1.0977, "step": 48800 }, { "epoch": 0.673720757212532, "grad_norm": 142.8865966796875, "learning_rate": 5.483159142664464e-05, "loss": 1.2123, "step": 48900 }, { "epoch": 0.6750985092722713, "grad_norm": 6.221757411956787, "learning_rate": 5.4830704554156704e-05, "loss": 1.1948, "step": 49000 }, { "epoch": 0.6764762613320107, "grad_norm": 87.30204010009766, "learning_rate": 5.482981535978559e-05, "loss": 1.3066, "step": 49100 }, { "epoch": 0.67785401339175, "grad_norm": 19.206348419189453, "learning_rate": 5.4828932770261324e-05, "loss": 1.2929, "step": 49200 }, { "epoch": 0.6792317654514893, "grad_norm": 14.013370513916016, "learning_rate": 5.4828038955567624e-05, "loss": 1.2378, "step": 49300 }, { "epoch": 0.6806095175112287, 
"grad_norm": 22.980525970458984, "learning_rate": 5.4827142819217196e-05, "loss": 1.1694, "step": 49400 }, { "epoch": 0.681987269570968, "grad_norm": 10.68824577331543, "learning_rate": 5.482624436128619e-05, "loss": 1.1398, "step": 49500 }, { "epoch": 0.6833650216307073, "grad_norm": 20.506732940673828, "learning_rate": 5.482534358185092e-05, "loss": 1.2095, "step": 49600 }, { "epoch": 0.6847427736904467, "grad_norm": 11.502998352050781, "learning_rate": 5.482444048098792e-05, "loss": 1.0841, "step": 49700 }, { "epoch": 0.686120525750186, "grad_norm": 10.389013290405273, "learning_rate": 5.482353505877391e-05, "loss": 1.1964, "step": 49800 }, { "epoch": 0.6874982778099253, "grad_norm": 8.436223030090332, "learning_rate": 5.4822627315285815e-05, "loss": 1.1851, "step": 49900 }, { "epoch": 0.6888760298696647, "grad_norm": 9.904741287231445, "learning_rate": 5.4821717250600746e-05, "loss": 1.1134, "step": 50000 }, { "epoch": 0.690253781929404, "grad_norm": 45.23259353637695, "learning_rate": 5.482080486479602e-05, "loss": 1.1171, "step": 50100 }, { "epoch": 0.6916315339891433, "grad_norm": 22.682371139526367, "learning_rate": 5.481989015794914e-05, "loss": 1.1765, "step": 50200 }, { "epoch": 0.6930092860488827, "grad_norm": 9.918152809143066, "learning_rate": 5.481897313013783e-05, "loss": 1.1686, "step": 50300 }, { "epoch": 0.694387038108622, "grad_norm": 7.5822553634643555, "learning_rate": 5.481805378143999e-05, "loss": 1.1215, "step": 50400 }, { "epoch": 0.6957647901683613, "grad_norm": 52.38286209106445, "learning_rate": 5.4817132111933725e-05, "loss": 1.1647, "step": 50500 }, { "epoch": 0.6971425422281007, "grad_norm": 11.827544212341309, "learning_rate": 5.4816208121697324e-05, "loss": 1.1653, "step": 50600 }, { "epoch": 0.69852029428784, "grad_norm": 15.004902839660645, "learning_rate": 5.4815281810809304e-05, "loss": 1.2503, "step": 50700 }, { "epoch": 0.6998980463475792, "grad_norm": 41.13508605957031, "learning_rate": 5.4814353179348344e-05, "loss": 
1.2125, "step": 50800 }, { "epoch": 0.7012757984073186, "grad_norm": 105.07270812988281, "learning_rate": 5.481342222739335e-05, "loss": 1.4533, "step": 50900 }, { "epoch": 0.7026535504670579, "grad_norm": 44.4683723449707, "learning_rate": 5.481249829923289e-05, "loss": 1.4762, "step": 51000 }, { "epoch": 0.7040313025267972, "grad_norm": 247.8203125, "learning_rate": 5.481156272973023e-05, "loss": 1.2595, "step": 51100 }, { "epoch": 0.7054090545865366, "grad_norm": 21.079259872436523, "learning_rate": 5.48106248399706e-05, "loss": 1.4128, "step": 51200 }, { "epoch": 0.7067868066462759, "grad_norm": 8.392303466796875, "learning_rate": 5.4809684630033665e-05, "loss": 1.2437, "step": 51300 }, { "epoch": 0.7081645587060152, "grad_norm": 7.86804723739624, "learning_rate": 5.480874209999932e-05, "loss": 1.3367, "step": 51400 }, { "epoch": 0.7095423107657546, "grad_norm": 13.094411849975586, "learning_rate": 5.4807797249947604e-05, "loss": 1.218, "step": 51500 }, { "epoch": 0.7109200628254939, "grad_norm": 8.20193099975586, "learning_rate": 5.480685007995881e-05, "loss": 1.3033, "step": 51600 }, { "epoch": 0.7122978148852332, "grad_norm": 46.50385665893555, "learning_rate": 5.4805900590113404e-05, "loss": 1.3322, "step": 51700 }, { "epoch": 0.7136755669449726, "grad_norm": 126.6577377319336, "learning_rate": 5.4804948780492044e-05, "loss": 1.3084, "step": 51800 }, { "epoch": 0.7150533190047119, "grad_norm": 25.30513572692871, "learning_rate": 5.4803994651175595e-05, "loss": 1.3466, "step": 51900 }, { "epoch": 0.7164310710644513, "grad_norm": 11.659921646118164, "learning_rate": 5.4803038202245116e-05, "loss": 1.1219, "step": 52000 }, { "epoch": 0.7178088231241906, "grad_norm": 16.904481887817383, "learning_rate": 5.480207943378186e-05, "loss": 1.0886, "step": 52100 }, { "epoch": 0.7191865751839299, "grad_norm": 8.303357124328613, "learning_rate": 5.480111834586728e-05, "loss": 1.2044, "step": 52200 }, { "epoch": 0.7205643272436693, "grad_norm": 26.6663875579834, 
"learning_rate": 5.480015493858302e-05, "loss": 1.1989, "step": 52300 }, { "epoch": 0.7219420793034086, "grad_norm": 16.30890464782715, "learning_rate": 5.479918921201093e-05, "loss": 1.1712, "step": 52400 }, { "epoch": 0.7233198313631479, "grad_norm": 6.086356163024902, "learning_rate": 5.479822116623306e-05, "loss": 1.1054, "step": 52500 }, { "epoch": 0.7246975834228873, "grad_norm": 8.788448333740234, "learning_rate": 5.4797250801331645e-05, "loss": 1.1336, "step": 52600 }, { "epoch": 0.7260753354826266, "grad_norm": 16.002470016479492, "learning_rate": 5.4796278117389126e-05, "loss": 1.2275, "step": 52700 }, { "epoch": 0.7274530875423659, "grad_norm": 6.798962116241455, "learning_rate": 5.4795303114488126e-05, "loss": 1.2138, "step": 52800 }, { "epoch": 0.7288308396021052, "grad_norm": 11.686549186706543, "learning_rate": 5.479432579271149e-05, "loss": 1.2924, "step": 52900 }, { "epoch": 0.7302085916618445, "grad_norm": 86.74701690673828, "learning_rate": 5.479334615214224e-05, "loss": 1.2223, "step": 53000 }, { "epoch": 0.7315863437215838, "grad_norm": 19.557451248168945, "learning_rate": 5.4792364192863604e-05, "loss": 1.292, "step": 53100 }, { "epoch": 0.7329640957813232, "grad_norm": 14.568532943725586, "learning_rate": 5.4791379914959e-05, "loss": 1.0606, "step": 53200 }, { "epoch": 0.7343418478410625, "grad_norm": 477.8893737792969, "learning_rate": 5.479039331851205e-05, "loss": 1.2518, "step": 53300 }, { "epoch": 0.7357195999008018, "grad_norm": 22.306926727294922, "learning_rate": 5.4789404403606576e-05, "loss": 1.2761, "step": 53400 }, { "epoch": 0.7370973519605412, "grad_norm": 24.619558334350586, "learning_rate": 5.4788413170326583e-05, "loss": 1.1627, "step": 53500 }, { "epoch": 0.7384751040202805, "grad_norm": 25.820980072021484, "learning_rate": 5.478741961875628e-05, "loss": 1.2325, "step": 53600 }, { "epoch": 0.7398528560800198, "grad_norm": 6.452317237854004, "learning_rate": 5.4786423748980085e-05, "loss": 1.2066, "step": 53700 }, { "epoch": 
0.7412306081397592, "grad_norm": 9.134746551513672, "learning_rate": 5.4785425561082594e-05, "loss": 1.2001, "step": 53800 }, { "epoch": 0.7426083601994985, "grad_norm": 15.809443473815918, "learning_rate": 5.478442505514861e-05, "loss": 1.2269, "step": 53900 }, { "epoch": 0.7439861122592378, "grad_norm": 7.069437503814697, "learning_rate": 5.478342223126313e-05, "loss": 1.1525, "step": 54000 }, { "epoch": 0.7453638643189772, "grad_norm": 9.375837326049805, "learning_rate": 5.478241708951136e-05, "loss": 1.279, "step": 54100 }, { "epoch": 0.7467416163787165, "grad_norm": 7.450246334075928, "learning_rate": 5.478140962997867e-05, "loss": 1.0976, "step": 54200 }, { "epoch": 0.7481193684384558, "grad_norm": 6.898708820343018, "learning_rate": 5.478039985275067e-05, "loss": 1.1062, "step": 54300 }, { "epoch": 0.7494971204981952, "grad_norm": 40.28762435913086, "learning_rate": 5.477938775791313e-05, "loss": 1.1483, "step": 54400 }, { "epoch": 0.7508748725579345, "grad_norm": 14.639007568359375, "learning_rate": 5.477837334555205e-05, "loss": 1.2125, "step": 54500 }, { "epoch": 0.7522526246176738, "grad_norm": 39.45486068725586, "learning_rate": 5.47773667945226e-05, "loss": 1.1797, "step": 54600 }, { "epoch": 0.7536303766774132, "grad_norm": 14.289118766784668, "learning_rate": 5.477634777054623e-05, "loss": 1.2033, "step": 54700 }, { "epoch": 0.7550081287371525, "grad_norm": 20.10711669921875, "learning_rate": 5.477532642930456e-05, "loss": 1.1648, "step": 54800 }, { "epoch": 0.7563858807968918, "grad_norm": 18.584131240844727, "learning_rate": 5.477430277088439e-05, "loss": 1.1935, "step": 54900 }, { "epoch": 0.7577636328566312, "grad_norm": 16.353309631347656, "learning_rate": 5.477327679537266e-05, "loss": 1.1627, "step": 55000 }, { "epoch": 0.7591413849163704, "grad_norm": 5.6780524253845215, "learning_rate": 5.477224850285653e-05, "loss": 1.1773, "step": 55100 }, { "epoch": 0.7605191369761097, "grad_norm": 29.311391830444336, "learning_rate": 
5.477121789342337e-05, "loss": 1.1408, "step": 55200 }, { "epoch": 0.7618968890358491, "grad_norm": 23.528329849243164, "learning_rate": 5.477018496716073e-05, "loss": 1.2482, "step": 55300 }, { "epoch": 0.7632746410955884, "grad_norm": 37.14783477783203, "learning_rate": 5.476914972415636e-05, "loss": 1.1922, "step": 55400 }, { "epoch": 0.7646523931553277, "grad_norm": 21.471261978149414, "learning_rate": 5.476811216449822e-05, "loss": 1.1817, "step": 55500 }, { "epoch": 0.7660301452150671, "grad_norm": 8.953851699829102, "learning_rate": 5.4767072288274446e-05, "loss": 1.2108, "step": 55600 }, { "epoch": 0.7674078972748064, "grad_norm": 2.856712579727173, "learning_rate": 5.476603009557338e-05, "loss": 1.1368, "step": 55700 }, { "epoch": 0.7687856493345457, "grad_norm": 7.848891735076904, "learning_rate": 5.4764985586483574e-05, "loss": 1.2068, "step": 55800 }, { "epoch": 0.7701634013942851, "grad_norm": 10.449261665344238, "learning_rate": 5.476393876109375e-05, "loss": 1.1533, "step": 55900 }, { "epoch": 0.7715411534540244, "grad_norm": 15.309361457824707, "learning_rate": 5.476288961949285e-05, "loss": 1.1055, "step": 56000 }, { "epoch": 0.7729189055137637, "grad_norm": 18.263992309570312, "learning_rate": 5.4761838161769997e-05, "loss": 1.2973, "step": 56100 }, { "epoch": 0.7742966575735031, "grad_norm": 8.282423973083496, "learning_rate": 5.4760784388014526e-05, "loss": 1.0724, "step": 56200 }, { "epoch": 0.7756744096332424, "grad_norm": 7.8377366065979, "learning_rate": 5.475972829831595e-05, "loss": 1.0374, "step": 56300 }, { "epoch": 0.7770521616929817, "grad_norm": 40.84781265258789, "learning_rate": 5.4758669892764e-05, "loss": 1.1089, "step": 56400 }, { "epoch": 0.7784299137527211, "grad_norm": 6.531658172607422, "learning_rate": 5.4757609171448586e-05, "loss": 1.0716, "step": 56500 }, { "epoch": 0.7798076658124604, "grad_norm": 12.462308883666992, "learning_rate": 5.475654613445982e-05, "loss": 1.054, "step": 56600 }, { "epoch": 0.7811854178721997, 
"grad_norm": 8.333971977233887, "learning_rate": 5.475548078188803e-05, "loss": 1.0387, "step": 56700 }, { "epoch": 0.7825631699319391, "grad_norm": 10.69863510131836, "learning_rate": 5.47544131138237e-05, "loss": 1.0798, "step": 56800 }, { "epoch": 0.7839409219916784, "grad_norm": 10.501168251037598, "learning_rate": 5.475334313035754e-05, "loss": 1.2019, "step": 56900 }, { "epoch": 0.7853186740514178, "grad_norm": 7.4806318283081055, "learning_rate": 5.475227083158045e-05, "loss": 1.1566, "step": 57000 }, { "epoch": 0.7866964261111571, "grad_norm": 9.43379020690918, "learning_rate": 5.475119621758355e-05, "loss": 1.0975, "step": 57100 }, { "epoch": 0.7880741781708963, "grad_norm": 22.25967025756836, "learning_rate": 5.4750119288458095e-05, "loss": 1.0693, "step": 57200 }, { "epoch": 0.7894519302306356, "grad_norm": 45.955928802490234, "learning_rate": 5.4749040044295605e-05, "loss": 1.054, "step": 57300 }, { "epoch": 0.790829682290375, "grad_norm": 16.152667999267578, "learning_rate": 5.474795848518776e-05, "loss": 1.1842, "step": 57400 }, { "epoch": 0.7922074343501143, "grad_norm": 4.7594075202941895, "learning_rate": 5.474687461122644e-05, "loss": 1.1633, "step": 57500 }, { "epoch": 0.7935851864098536, "grad_norm": 8.323111534118652, "learning_rate": 5.474578842250373e-05, "loss": 1.108, "step": 57600 }, { "epoch": 0.794962938469593, "grad_norm": 30.226247787475586, "learning_rate": 5.4744699919111895e-05, "loss": 1.1067, "step": 57700 }, { "epoch": 0.7963406905293323, "grad_norm": 10.015289306640625, "learning_rate": 5.474360910114343e-05, "loss": 1.161, "step": 57800 }, { "epoch": 0.7977184425890717, "grad_norm": 12.430333137512207, "learning_rate": 5.474251596869099e-05, "loss": 1.0728, "step": 57900 }, { "epoch": 0.799096194648811, "grad_norm": 16.917774200439453, "learning_rate": 5.4741420521847457e-05, "loss": 1.3061, "step": 58000 }, { "epoch": 0.8004739467085503, "grad_norm": 23.206104278564453, "learning_rate": 5.474032276070587e-05, "loss": 1.1408, 
"step": 58100 }, { "epoch": 0.8018516987682897, "grad_norm": 2.4593045711517334, "learning_rate": 5.473922268535951e-05, "loss": 1.0693, "step": 58200 }, { "epoch": 0.803229450828029, "grad_norm": 9.281790733337402, "learning_rate": 5.4738120295901824e-05, "loss": 1.0999, "step": 58300 }, { "epoch": 0.8046072028877683, "grad_norm": 94.02510833740234, "learning_rate": 5.4737015592426474e-05, "loss": 1.1631, "step": 58400 }, { "epoch": 0.8059849549475077, "grad_norm": 11.41101360321045, "learning_rate": 5.4735908575027304e-05, "loss": 1.1112, "step": 58500 }, { "epoch": 0.807362707007247, "grad_norm": 26.403072357177734, "learning_rate": 5.4734799243798356e-05, "loss": 1.079, "step": 58600 }, { "epoch": 0.8087404590669863, "grad_norm": 10.233325958251953, "learning_rate": 5.4733687598833895e-05, "loss": 1.0384, "step": 58700 }, { "epoch": 0.8101182111267257, "grad_norm": 1124.02490234375, "learning_rate": 5.473257364022833e-05, "loss": 1.0471, "step": 58800 }, { "epoch": 0.811495963186465, "grad_norm": 90.05939483642578, "learning_rate": 5.473145736807632e-05, "loss": 1.1022, "step": 58900 }, { "epoch": 0.8128737152462043, "grad_norm": 11.457798957824707, "learning_rate": 5.4730338782472696e-05, "loss": 1.0113, "step": 59000 }, { "epoch": 0.8142514673059437, "grad_norm": 7.373476505279541, "learning_rate": 5.472921788351248e-05, "loss": 1.1842, "step": 59100 }, { "epoch": 0.815629219365683, "grad_norm": 42.67975997924805, "learning_rate": 5.47280946712909e-05, "loss": 1.0965, "step": 59200 }, { "epoch": 0.8170069714254223, "grad_norm": 8.14940357208252, "learning_rate": 5.472696914590338e-05, "loss": 1.1904, "step": 59300 }, { "epoch": 0.8183847234851616, "grad_norm": 15.231611251831055, "learning_rate": 5.472584130744554e-05, "loss": 1.083, "step": 59400 }, { "epoch": 0.8197624755449009, "grad_norm": 16.02197265625, "learning_rate": 5.4724711156013194e-05, "loss": 1.2139, "step": 59500 }, { "epoch": 0.8211402276046402, "grad_norm": 14.748459815979004, 
"learning_rate": 5.472357869170236e-05, "loss": 1.051, "step": 59600 }, { "epoch": 0.8225179796643796, "grad_norm": 4.968407154083252, "learning_rate": 5.472244391460923e-05, "loss": 1.1052, "step": 59700 }, { "epoch": 0.8238957317241189, "grad_norm": 15.957460403442383, "learning_rate": 5.4721306824830225e-05, "loss": 1.1636, "step": 59800 }, { "epoch": 0.8252734837838582, "grad_norm": 16.082277297973633, "learning_rate": 5.472016742246194e-05, "loss": 1.0048, "step": 59900 }, { "epoch": 0.8266512358435976, "grad_norm": 16.96127700805664, "learning_rate": 5.471902570760118e-05, "loss": 1.1646, "step": 60000 }, { "epoch": 0.8280289879033369, "grad_norm": 10.596650123596191, "learning_rate": 5.4717881680344935e-05, "loss": 1.2358, "step": 60100 }, { "epoch": 0.8294067399630762, "grad_norm": 19.039791107177734, "learning_rate": 5.471674681563149e-05, "loss": 1.2627, "step": 60200 }, { "epoch": 0.8307844920228156, "grad_norm": 16.778432846069336, "learning_rate": 5.471559818699756e-05, "loss": 1.1974, "step": 60300 }, { "epoch": 0.8321622440825549, "grad_norm": 495.1744689941406, "learning_rate": 5.471444724625934e-05, "loss": 1.1441, "step": 60400 }, { "epoch": 0.8335399961422942, "grad_norm": 18.594533920288086, "learning_rate": 5.47132939935146e-05, "loss": 1.2501, "step": 60500 }, { "epoch": 0.8349177482020336, "grad_norm": 15.665253639221191, "learning_rate": 5.471213842886131e-05, "loss": 1.1846, "step": 60600 }, { "epoch": 0.8362955002617729, "grad_norm": 8.79216480255127, "learning_rate": 5.4710980552397636e-05, "loss": 1.1777, "step": 60700 }, { "epoch": 0.8376732523215122, "grad_norm": 80.83506774902344, "learning_rate": 5.470982036422196e-05, "loss": 1.0193, "step": 60800 }, { "epoch": 0.8390510043812516, "grad_norm": 69.26873016357422, "learning_rate": 5.4708657864432836e-05, "loss": 1.2296, "step": 60900 }, { "epoch": 0.8404287564409909, "grad_norm": 23.37373924255371, "learning_rate": 5.4707493053129036e-05, "loss": 1.0475, "step": 61000 }, { "epoch": 
0.8418065085007302, "grad_norm": 43.963623046875, "learning_rate": 5.470632593040951e-05, "loss": 1.1772, "step": 61100 }, { "epoch": 0.8431842605604696, "grad_norm": 17.73714828491211, "learning_rate": 5.4705156496373395e-05, "loss": 1.1627, "step": 61200 }, { "epoch": 0.8445620126202089, "grad_norm": 27.438568115234375, "learning_rate": 5.470398475112007e-05, "loss": 1.2035, "step": 61300 }, { "epoch": 0.8459397646799482, "grad_norm": 100.108154296875, "learning_rate": 5.470281069474906e-05, "loss": 1.0764, "step": 61400 }, { "epoch": 0.8473175167396875, "grad_norm": 27.32681655883789, "learning_rate": 5.4701634327360105e-05, "loss": 1.1767, "step": 61500 }, { "epoch": 0.8486952687994268, "grad_norm": 18.406723022460938, "learning_rate": 5.4700455649053164e-05, "loss": 1.0829, "step": 61600 }, { "epoch": 0.8500730208591661, "grad_norm": 14.237214088439941, "learning_rate": 5.469927465992835e-05, "loss": 1.0697, "step": 61700 }, { "epoch": 0.8514507729189055, "grad_norm": 17.61935043334961, "learning_rate": 5.4698091360086004e-05, "loss": 1.1711, "step": 61800 }, { "epoch": 0.8528285249786448, "grad_norm": 11.328566551208496, "learning_rate": 5.469690574962666e-05, "loss": 1.1212, "step": 61900 }, { "epoch": 0.8542062770383841, "grad_norm": 21.75775718688965, "learning_rate": 5.469571782865103e-05, "loss": 1.1068, "step": 62000 }, { "epoch": 0.8555840290981235, "grad_norm": 8.026041984558105, "learning_rate": 5.469452759726004e-05, "loss": 1.1298, "step": 62100 }, { "epoch": 0.8569617811578628, "grad_norm": 48.023170471191406, "learning_rate": 5.469333505555479e-05, "loss": 1.1877, "step": 62200 }, { "epoch": 0.8583395332176021, "grad_norm": 25.07050895690918, "learning_rate": 5.469214020363662e-05, "loss": 1.1046, "step": 62300 }, { "epoch": 0.8597172852773415, "grad_norm": 6.722031116485596, "learning_rate": 5.4690943041607023e-05, "loss": 1.0693, "step": 62400 }, { "epoch": 0.8610950373370808, "grad_norm": 15.410558700561523, "learning_rate": 
5.468974356956771e-05, "loss": 1.2216, "step": 62500 }, { "epoch": 0.8624727893968201, "grad_norm": 19.42689323425293, "learning_rate": 5.468854178762058e-05, "loss": 1.1332, "step": 62600 }, { "epoch": 0.8638505414565595, "grad_norm": 24.845836639404297, "learning_rate": 5.468733769586772e-05, "loss": 1.2004, "step": 62700 }, { "epoch": 0.8652282935162988, "grad_norm": 17.53343963623047, "learning_rate": 5.4686131294411444e-05, "loss": 1.22, "step": 62800 }, { "epoch": 0.8666060455760382, "grad_norm": 6.367466449737549, "learning_rate": 5.468492258335422e-05, "loss": 1.0237, "step": 62900 }, { "epoch": 0.8679837976357775, "grad_norm": 11.674467086791992, "learning_rate": 5.468371156279876e-05, "loss": 1.1647, "step": 63000 }, { "epoch": 0.8693615496955168, "grad_norm": 8.33002758026123, "learning_rate": 5.468249823284792e-05, "loss": 1.1642, "step": 63100 }, { "epoch": 0.8707393017552562, "grad_norm": 11.837431907653809, "learning_rate": 5.4681282593604794e-05, "loss": 1.0929, "step": 63200 }, { "epoch": 0.8721170538149955, "grad_norm": 15.618926048278809, "learning_rate": 5.4680064645172656e-05, "loss": 1.1287, "step": 63300 }, { "epoch": 0.8734948058747348, "grad_norm": 11.067187309265137, "learning_rate": 5.467884438765497e-05, "loss": 1.3, "step": 63400 }, { "epoch": 0.8748725579344742, "grad_norm": 7.3953537940979, "learning_rate": 5.467762182115541e-05, "loss": 1.1744, "step": 63500 }, { "epoch": 0.8762503099942135, "grad_norm": 14.800235748291016, "learning_rate": 5.467639694577783e-05, "loss": 1.176, "step": 63600 }, { "epoch": 0.8776280620539527, "grad_norm": 24.887876510620117, "learning_rate": 5.46751697616263e-05, "loss": 1.127, "step": 63700 }, { "epoch": 0.879005814113692, "grad_norm": 67.39167785644531, "learning_rate": 5.4673940268805074e-05, "loss": 1.1131, "step": 63800 }, { "epoch": 0.8803835661734314, "grad_norm": 12.841975212097168, "learning_rate": 5.46727084674186e-05, "loss": 1.1908, "step": 63900 }, { "epoch": 0.8817613182331707, 
"grad_norm": 8.85670280456543, "learning_rate": 5.4671474357571525e-05, "loss": 1.0991, "step": 64000 }, { "epoch": 0.8831390702929101, "grad_norm": 6.850118160247803, "learning_rate": 5.46702379393687e-05, "loss": 1.0663, "step": 64100 }, { "epoch": 0.8845168223526494, "grad_norm": 22.98515510559082, "learning_rate": 5.466899921291516e-05, "loss": 1.059, "step": 64200 }, { "epoch": 0.8858945744123887, "grad_norm": 11.451286315917969, "learning_rate": 5.466775817831614e-05, "loss": 1.1818, "step": 64300 }, { "epoch": 0.8872723264721281, "grad_norm": 7.088813781738281, "learning_rate": 5.4666514835677075e-05, "loss": 1.1725, "step": 64400 }, { "epoch": 0.8886500785318674, "grad_norm": 11.199012756347656, "learning_rate": 5.466526918510358e-05, "loss": 1.1438, "step": 64500 }, { "epoch": 0.8900278305916067, "grad_norm": 9.510293960571289, "learning_rate": 5.4664033717708927e-05, "loss": 1.1066, "step": 64600 }, { "epoch": 0.8914055826513461, "grad_norm": 10.846967697143555, "learning_rate": 5.4662783474660975e-05, "loss": 1.2442, "step": 64700 }, { "epoch": 0.8927833347110854, "grad_norm": 12.702670097351074, "learning_rate": 5.4661530923995595e-05, "loss": 1.023, "step": 64800 }, { "epoch": 0.8941610867708247, "grad_norm": 5.255120754241943, "learning_rate": 5.4660276065819214e-05, "loss": 1.1045, "step": 64900 }, { "epoch": 0.8955388388305641, "grad_norm": 27.506940841674805, "learning_rate": 5.465901890023843e-05, "loss": 1.0864, "step": 65000 }, { "epoch": 0.8969165908903034, "grad_norm": 7.346675395965576, "learning_rate": 5.465775942736004e-05, "loss": 1.0771, "step": 65100 }, { "epoch": 0.8982943429500427, "grad_norm": 12.305924415588379, "learning_rate": 5.465649764729106e-05, "loss": 1.0462, "step": 65200 }, { "epoch": 0.8996720950097821, "grad_norm": 16.626296997070312, "learning_rate": 5.4655233560138655e-05, "loss": 1.164, "step": 65300 }, { "epoch": 0.9010498470695214, "grad_norm": 11.740323066711426, "learning_rate": 5.465396716601025e-05, "loss": 
1.1718, "step": 65400 }, { "epoch": 0.9024275991292607, "grad_norm": 13.916251182556152, "learning_rate": 5.465269846501341e-05, "loss": 1.141, "step": 65500 }, { "epoch": 0.9038053511890001, "grad_norm": 8.259647369384766, "learning_rate": 5.465142745725592e-05, "loss": 1.1684, "step": 65600 }, { "epoch": 0.9051831032487394, "grad_norm": 5.774639129638672, "learning_rate": 5.465015414284577e-05, "loss": 1.1246, "step": 65700 }, { "epoch": 0.9065608553084787, "grad_norm": 5.18399715423584, "learning_rate": 5.464887852189112e-05, "loss": 1.2111, "step": 65800 }, { "epoch": 0.907938607368218, "grad_norm": 14.119433403015137, "learning_rate": 5.464760059450035e-05, "loss": 1.0581, "step": 65900 }, { "epoch": 0.9093163594279573, "grad_norm": 9.2739896774292, "learning_rate": 5.4646320360782015e-05, "loss": 1.0555, "step": 66000 }, { "epoch": 0.9106941114876966, "grad_norm": 68.081787109375, "learning_rate": 5.4645037820844895e-05, "loss": 1.073, "step": 66100 }, { "epoch": 0.912071863547436, "grad_norm": 155.14369201660156, "learning_rate": 5.464375297479794e-05, "loss": 1.0979, "step": 66200 }, { "epoch": 0.9134496156071753, "grad_norm": 12.8380708694458, "learning_rate": 5.4642465822750296e-05, "loss": 1.1742, "step": 66300 }, { "epoch": 0.9148273676669146, "grad_norm": 29.779661178588867, "learning_rate": 5.464117636481132e-05, "loss": 1.1311, "step": 66400 }, { "epoch": 0.916205119726654, "grad_norm": 10.672758102416992, "learning_rate": 5.463988460109056e-05, "loss": 1.0626, "step": 66500 }, { "epoch": 0.9175828717863933, "grad_norm": 23.137615203857422, "learning_rate": 5.463859053169776e-05, "loss": 1.162, "step": 66600 }, { "epoch": 0.9189606238461326, "grad_norm": 21.966482162475586, "learning_rate": 5.4637294156742854e-05, "loss": 1.213, "step": 66700 }, { "epoch": 0.920338375905872, "grad_norm": 23.99468231201172, "learning_rate": 5.463600847455167e-05, "loss": 1.262, "step": 66800 }, { "epoch": 0.9217161279656113, "grad_norm": 6.36064338684082, 
"learning_rate": 5.4634707511856026e-05, "loss": 1.2435, "step": 66900 }, { "epoch": 0.9230938800253506, "grad_norm": 103.19817352294922, "learning_rate": 5.4633404243928144e-05, "loss": 1.1248, "step": 67000 }, { "epoch": 0.92447163208509, "grad_norm": 97.3226089477539, "learning_rate": 5.463209867087877e-05, "loss": 1.3189, "step": 67100 }, { "epoch": 0.9258493841448293, "grad_norm": 13.455120086669922, "learning_rate": 5.463079079281879e-05, "loss": 1.1353, "step": 67200 }, { "epoch": 0.9272271362045686, "grad_norm": 9.061980247497559, "learning_rate": 5.462948060985935e-05, "loss": 1.2046, "step": 67300 }, { "epoch": 0.928604888264308, "grad_norm": 25.758026123046875, "learning_rate": 5.462816812211173e-05, "loss": 1.1738, "step": 67400 }, { "epoch": 0.9299826403240473, "grad_norm": 5.642591953277588, "learning_rate": 5.4626853329687444e-05, "loss": 1.1939, "step": 67500 }, { "epoch": 0.9313603923837867, "grad_norm": 10.177616119384766, "learning_rate": 5.4625536232698185e-05, "loss": 1.1388, "step": 67600 }, { "epoch": 0.932738144443526, "grad_norm": 30.894012451171875, "learning_rate": 5.4624216831255856e-05, "loss": 1.0834, "step": 67700 }, { "epoch": 0.9341158965032653, "grad_norm": 7.21911096572876, "learning_rate": 5.462289512547254e-05, "loss": 1.1439, "step": 67800 }, { "epoch": 0.9354936485630047, "grad_norm": 15.170910835266113, "learning_rate": 5.462157111546052e-05, "loss": 1.1791, "step": 67900 }, { "epoch": 0.9368714006227439, "grad_norm": 19.506067276000977, "learning_rate": 5.462024480133228e-05, "loss": 1.263, "step": 68000 }, { "epoch": 0.9382491526824832, "grad_norm": 48.98276901245117, "learning_rate": 5.461891618320049e-05, "loss": 1.1762, "step": 68100 }, { "epoch": 0.9396269047422225, "grad_norm": 6.124650478363037, "learning_rate": 5.4617585261178045e-05, "loss": 1.0375, "step": 68200 }, { "epoch": 0.9410046568019619, "grad_norm": 11.622027397155762, "learning_rate": 5.461625203537799e-05, "loss": 1.1505, "step": 68300 }, { "epoch": 
0.9423824088617012, "grad_norm": 9.864316940307617, "learning_rate": 5.4614916505913604e-05, "loss": 1.0624, "step": 68400 }, { "epoch": 0.9437601609214405, "grad_norm": 28.526643753051758, "learning_rate": 5.4613578672898343e-05, "loss": 1.1675, "step": 68500 }, { "epoch": 0.9451379129811799, "grad_norm": 9.211615562438965, "learning_rate": 5.461223853644585e-05, "loss": 1.1768, "step": 68600 }, { "epoch": 0.9465156650409192, "grad_norm": 17.06532096862793, "learning_rate": 5.461089609667e-05, "loss": 1.0767, "step": 68700 }, { "epoch": 0.9478934171006586, "grad_norm": 5.959560871124268, "learning_rate": 5.460955135368483e-05, "loss": 1.1172, "step": 68800 }, { "epoch": 0.9492711691603979, "grad_norm": 12.894081115722656, "learning_rate": 5.460820430760457e-05, "loss": 1.1053, "step": 68900 }, { "epoch": 0.9506489212201372, "grad_norm": 61.03858184814453, "learning_rate": 5.460685495854367e-05, "loss": 1.1451, "step": 69000 }, { "epoch": 0.9520266732798766, "grad_norm": 55.478179931640625, "learning_rate": 5.460550330661677e-05, "loss": 1.1774, "step": 69100 }, { "epoch": 0.9534044253396159, "grad_norm": 120.8309555053711, "learning_rate": 5.46041493519387e-05, "loss": 1.0696, "step": 69200 }, { "epoch": 0.9547821773993552, "grad_norm": 15.431912422180176, "learning_rate": 5.460279309462447e-05, "loss": 1.1694, "step": 69300 }, { "epoch": 0.9561599294590946, "grad_norm": 13.145162582397461, "learning_rate": 5.460143453478931e-05, "loss": 1.1647, "step": 69400 }, { "epoch": 0.9575376815188339, "grad_norm": 9.228443145751953, "learning_rate": 5.460007367254863e-05, "loss": 1.1053, "step": 69500 }, { "epoch": 0.9589154335785732, "grad_norm": 15.133450508117676, "learning_rate": 5.459871050801806e-05, "loss": 1.0844, "step": 69600 }, { "epoch": 0.9602931856383126, "grad_norm": 39.60796356201172, "learning_rate": 5.459734504131339e-05, "loss": 1.1495, "step": 69700 }, { "epoch": 0.9616709376980519, "grad_norm": 195.38800048828125, "learning_rate": 
5.4595977272550626e-05, "loss": 1.0921, "step": 69800 }, { "epoch": 0.9630486897577912, "grad_norm": 10.106812477111816, "learning_rate": 5.459460720184598e-05, "loss": 1.1482, "step": 69900 }, { "epoch": 0.9644264418175306, "grad_norm": 19.906232833862305, "learning_rate": 5.4593234829315834e-05, "loss": 1.1633, "step": 70000 }, { "epoch": 0.9658041938772699, "grad_norm": 14.408281326293945, "learning_rate": 5.459186015507678e-05, "loss": 1.249, "step": 70100 }, { "epoch": 0.9671819459370091, "grad_norm": 6.585309982299805, "learning_rate": 5.459048317924561e-05, "loss": 1.1562, "step": 70200 }, { "epoch": 0.9685596979967485, "grad_norm": 16.187843322753906, "learning_rate": 5.458910390193929e-05, "loss": 1.0716, "step": 70300 }, { "epoch": 0.9699374500564878, "grad_norm": 25.06880760192871, "learning_rate": 5.458772232327501e-05, "loss": 1.1231, "step": 70400 }, { "epoch": 0.9713152021162271, "grad_norm": 9.305314064025879, "learning_rate": 5.458633844337015e-05, "loss": 1.1219, "step": 70500 }, { "epoch": 0.9726929541759665, "grad_norm": 49.83774948120117, "learning_rate": 5.458495226234225e-05, "loss": 1.044, "step": 70600 }, { "epoch": 0.9740707062357058, "grad_norm": 10.125755310058594, "learning_rate": 5.458357767651902e-05, "loss": 1.0559, "step": 70700 }, { "epoch": 0.9754484582954451, "grad_norm": 30.343183517456055, "learning_rate": 5.458218691660685e-05, "loss": 1.1131, "step": 70800 }, { "epoch": 0.9768262103551845, "grad_norm": 9.028037071228027, "learning_rate": 5.4580793855924345e-05, "loss": 1.0412, "step": 70900 }, { "epoch": 0.9782039624149238, "grad_norm": 3.4295129776000977, "learning_rate": 5.457939849458987e-05, "loss": 0.9659, "step": 71000 }, { "epoch": 0.9795817144746631, "grad_norm": 9.905821800231934, "learning_rate": 5.457800083272196e-05, "loss": 1.0396, "step": 71100 }, { "epoch": 0.9809594665344025, "grad_norm": 161.8009033203125, "learning_rate": 5.4576600870439336e-05, "loss": 1.0404, "step": 71200 }, { "epoch": 0.9823372185941418, 
"grad_norm": 8.669951438903809, "learning_rate": 5.4575212641872804e-05, "loss": 1.1563, "step": 71300 }, { "epoch": 0.9837149706538811, "grad_norm": 37.04841613769531, "learning_rate": 5.4573808102118954e-05, "loss": 1.0112, "step": 71400 }, { "epoch": 0.9850927227136205, "grad_norm": 16.401836395263672, "learning_rate": 5.45724012623066e-05, "loss": 1.1272, "step": 71500 }, { "epoch": 0.9864704747733598, "grad_norm": 12.509251594543457, "learning_rate": 5.4570992122555254e-05, "loss": 1.1502, "step": 71600 }, { "epoch": 0.9878482268330991, "grad_norm": 6.93540096282959, "learning_rate": 5.456958068298463e-05, "loss": 1.0207, "step": 71700 }, { "epoch": 0.9892259788928385, "grad_norm": 19.594242095947266, "learning_rate": 5.4568166943714654e-05, "loss": 1.0321, "step": 71800 }, { "epoch": 0.9906037309525778, "grad_norm": 18.886825561523438, "learning_rate": 5.45667509048654e-05, "loss": 1.0534, "step": 71900 }, { "epoch": 0.9919814830123171, "grad_norm": 8.338887214660645, "learning_rate": 5.45653325665572e-05, "loss": 1.092, "step": 72000 }, { "epoch": 0.9933592350720565, "grad_norm": 8.154275894165039, "learning_rate": 5.4563911928910524e-05, "loss": 1.0882, "step": 72100 }, { "epoch": 0.9947369871317958, "grad_norm": 13.929986000061035, "learning_rate": 5.456248899204607e-05, "loss": 0.9251, "step": 72200 }, { "epoch": 0.996114739191535, "grad_norm": 46.77289581298828, "learning_rate": 5.456106375608472e-05, "loss": 1.0503, "step": 72300 }, { "epoch": 0.9974924912512744, "grad_norm": 19.588340759277344, "learning_rate": 5.455963622114758e-05, "loss": 1.1006, "step": 72400 }, { "epoch": 0.9988702433110137, "grad_norm": 12.388969421386719, "learning_rate": 5.455820638735589e-05, "loss": 1.0414, "step": 72500 }, { "epoch": 1.000247995370753, "grad_norm": 15.652881622314453, "learning_rate": 5.4556774254831144e-05, "loss": 1.0677, "step": 72600 }, { "epoch": 1.0016257474304924, "grad_norm": 32.35802459716797, "learning_rate": 5.4555339823695003e-05, "loss": 1.0572, 
"step": 72700 }, { "epoch": 1.0030034994902317, "grad_norm": 7.339658260345459, "learning_rate": 5.4553903094069325e-05, "loss": 1.0814, "step": 72800 }, { "epoch": 1.004381251549971, "grad_norm": 16.973472595214844, "learning_rate": 5.455246406607618e-05, "loss": 1.1038, "step": 72900 }, { "epoch": 1.0057590036097104, "grad_norm": 20.264204025268555, "learning_rate": 5.455102273983781e-05, "loss": 1.0272, "step": 73000 }, { "epoch": 1.0071367556694497, "grad_norm": 7.178906440734863, "learning_rate": 5.4549579115476665e-05, "loss": 1.0812, "step": 73100 }, { "epoch": 1.008514507729189, "grad_norm": 13.878907203674316, "learning_rate": 5.45481331931154e-05, "loss": 1.1016, "step": 73200 }, { "epoch": 1.0098922597889284, "grad_norm": 5.6693925857543945, "learning_rate": 5.454668497287682e-05, "loss": 1.0893, "step": 73300 }, { "epoch": 1.0112700118486677, "grad_norm": 9.715073585510254, "learning_rate": 5.4545234454884e-05, "loss": 1.0805, "step": 73400 }, { "epoch": 1.012647763908407, "grad_norm": 13.02507209777832, "learning_rate": 5.454378163926014e-05, "loss": 1.0722, "step": 73500 }, { "epoch": 1.0140255159681464, "grad_norm": 9.488529205322266, "learning_rate": 5.4542326526128687e-05, "loss": 1.1949, "step": 73600 }, { "epoch": 1.0154032680278857, "grad_norm": 31.917760848999023, "learning_rate": 5.454086911561324e-05, "loss": 1.0524, "step": 73700 }, { "epoch": 1.016781020087625, "grad_norm": 16.788251876831055, "learning_rate": 5.453942401628641e-05, "loss": 1.1226, "step": 73800 }, { "epoch": 1.0181587721473644, "grad_norm": 15.653600692749023, "learning_rate": 5.4537962034345385e-05, "loss": 1.0383, "step": 73900 }, { "epoch": 1.0195365242071037, "grad_norm": 14.568146705627441, "learning_rate": 5.4536497755391145e-05, "loss": 1.1341, "step": 74000 }, { "epoch": 1.020914276266843, "grad_norm": 22.769643783569336, "learning_rate": 5.453503117954811e-05, "loss": 1.0483, "step": 74100 }, { "epoch": 1.0222920283265824, "grad_norm": 16.251747131347656, 
"learning_rate": 5.4533562306940866e-05, "loss": 1.0303, "step": 74200 }, { "epoch": 1.0236697803863217, "grad_norm": 7.284687042236328, "learning_rate": 5.4532091137694206e-05, "loss": 0.9862, "step": 74300 }, { "epoch": 1.025047532446061, "grad_norm": 13.01443099975586, "learning_rate": 5.4530617671933106e-05, "loss": 1.1108, "step": 74400 }, { "epoch": 1.0264252845058004, "grad_norm": 20.628028869628906, "learning_rate": 5.4529141909782745e-05, "loss": 1.098, "step": 74500 }, { "epoch": 1.0278030365655397, "grad_norm": 104.16590118408203, "learning_rate": 5.4527663851368504e-05, "loss": 1.1839, "step": 74600 }, { "epoch": 1.029180788625279, "grad_norm": 28.211145401000977, "learning_rate": 5.452618349681594e-05, "loss": 1.0293, "step": 74700 }, { "epoch": 1.0305585406850184, "grad_norm": 11.115974426269531, "learning_rate": 5.452470084625083e-05, "loss": 1.1017, "step": 74800 }, { "epoch": 1.0319362927447577, "grad_norm": 16.250064849853516, "learning_rate": 5.452321589979911e-05, "loss": 1.1867, "step": 74900 }, { "epoch": 1.033314044804497, "grad_norm": 6.32594633102417, "learning_rate": 5.452172865758696e-05, "loss": 1.0984, "step": 75000 }, { "epoch": 1.0346917968642364, "grad_norm": 9.784749984741211, "learning_rate": 5.4520239119740725e-05, "loss": 1.1673, "step": 75100 }, { "epoch": 1.0360695489239757, "grad_norm": 7.5926408767700195, "learning_rate": 5.4518747286386934e-05, "loss": 1.0425, "step": 75200 }, { "epoch": 1.037447300983715, "grad_norm": 30.351991653442383, "learning_rate": 5.451725315765233e-05, "loss": 1.1489, "step": 75300 }, { "epoch": 1.0388250530434544, "grad_norm": 32.387943267822266, "learning_rate": 5.4515756733663866e-05, "loss": 1.2019, "step": 75400 }, { "epoch": 1.0402028051031937, "grad_norm": 7.632412433624268, "learning_rate": 5.4514258014548644e-05, "loss": 1.1611, "step": 75500 }, { "epoch": 1.0415805571629329, "grad_norm": 12.371882438659668, "learning_rate": 5.4512757000434e-05, "loss": 1.1791, "step": 75600 }, { "epoch": 
1.0429583092226722, "grad_norm": 59.52131271362305, "learning_rate": 5.4511253691447454e-05, "loss": 1.0491, "step": 75700 }, { "epoch": 1.0443360612824115, "grad_norm": 13.746917724609375, "learning_rate": 5.4509748087716715e-05, "loss": 1.1139, "step": 75800 }, { "epoch": 1.0457138133421509, "grad_norm": 4.954247951507568, "learning_rate": 5.45082401893697e-05, "loss": 1.1633, "step": 75900 }, { "epoch": 1.0470915654018902, "grad_norm": 65.24946594238281, "learning_rate": 5.450672999653451e-05, "loss": 1.1437, "step": 76000 }, { "epoch": 1.0484693174616295, "grad_norm": 29.954261779785156, "learning_rate": 5.450521750933944e-05, "loss": 1.118, "step": 76100 }, { "epoch": 1.0498470695213689, "grad_norm": 23.084720611572266, "learning_rate": 5.450370272791298e-05, "loss": 1.1746, "step": 76200 }, { "epoch": 1.0512248215811082, "grad_norm": 14.409061431884766, "learning_rate": 5.45022008344945e-05, "loss": 1.0738, "step": 76300 }, { "epoch": 1.0526025736408475, "grad_norm": 20.48155975341797, "learning_rate": 5.450068148793063e-05, "loss": 1.1904, "step": 76400 }, { "epoch": 1.0539803257005869, "grad_norm": 3.173302173614502, "learning_rate": 5.449917507527895e-05, "loss": 1.1493, "step": 76500 }, { "epoch": 1.0553580777603262, "grad_norm": 10.985121726989746, "learning_rate": 5.4497651164088826e-05, "loss": 1.0501, "step": 76600 }, { "epoch": 1.0567358298200655, "grad_norm": 14.65538215637207, "learning_rate": 5.449612495931011e-05, "loss": 1.133, "step": 76700 }, { "epoch": 1.0581135818798049, "grad_norm": 32.66143798828125, "learning_rate": 5.449459646107248e-05, "loss": 1.1614, "step": 76800 }, { "epoch": 1.0594913339395442, "grad_norm": 11.89787483215332, "learning_rate": 5.449306566950577e-05, "loss": 1.0989, "step": 76900 }, { "epoch": 1.0608690859992835, "grad_norm": 5.535636901855469, "learning_rate": 5.449153258474003e-05, "loss": 1.0709, "step": 77000 }, { "epoch": 1.0622468380590229, "grad_norm": 3.966078758239746, "learning_rate": 
5.4489997206905524e-05, "loss": 1.088, "step": 77100 }, { "epoch": 1.0636245901187622, "grad_norm": 33.74238204956055, "learning_rate": 5.448845953613267e-05, "loss": 1.1627, "step": 77200 }, { "epoch": 1.0650023421785015, "grad_norm": 23.126930236816406, "learning_rate": 5.4486919572552104e-05, "loss": 1.124, "step": 77300 }, { "epoch": 1.0663800942382409, "grad_norm": 9.946000099182129, "learning_rate": 5.448537731629465e-05, "loss": 1.0372, "step": 77400 }, { "epoch": 1.0677578462979802, "grad_norm": 43.816619873046875, "learning_rate": 5.4483832767491345e-05, "loss": 1.0756, "step": 77500 }, { "epoch": 1.0691355983577195, "grad_norm": 11.190947532653809, "learning_rate": 5.44822859262734e-05, "loss": 1.1891, "step": 77600 }, { "epoch": 1.0705133504174589, "grad_norm": 15.756780624389648, "learning_rate": 5.448073679277221e-05, "loss": 1.1597, "step": 77700 }, { "epoch": 1.0718911024771982, "grad_norm": 11.828129768371582, "learning_rate": 5.447918536711941e-05, "loss": 1.0724, "step": 77800 }, { "epoch": 1.0732688545369375, "grad_norm": 6.720120429992676, "learning_rate": 5.44776316494468e-05, "loss": 1.0642, "step": 77900 }, { "epoch": 1.0746466065966769, "grad_norm": 50.856388092041016, "learning_rate": 5.447607563988635e-05, "loss": 1.1431, "step": 78000 }, { "epoch": 1.0760243586564162, "grad_norm": 13.411588668823242, "learning_rate": 5.447451733857026e-05, "loss": 1.1438, "step": 78100 }, { "epoch": 1.0774021107161555, "grad_norm": 12.107073783874512, "learning_rate": 5.447295674563093e-05, "loss": 1.1408, "step": 78200 }, { "epoch": 1.0787798627758949, "grad_norm": 22.65484619140625, "learning_rate": 5.4471393861200924e-05, "loss": 1.0825, "step": 78300 }, { "epoch": 1.0801576148356342, "grad_norm": 9.944424629211426, "learning_rate": 5.446982868541304e-05, "loss": 1.1244, "step": 78400 }, { "epoch": 1.0815353668953736, "grad_norm": 11.047139167785645, "learning_rate": 5.446826121840022e-05, "loss": 0.9826, "step": 78500 }, { "epoch": 1.0829131189551129, 
"grad_norm": 11.193907737731934, "learning_rate": 5.446669146029564e-05, "loss": 1.0336, "step": 78600 }, { "epoch": 1.0842908710148522, "grad_norm": 5.81455135345459, "learning_rate": 5.4465119411232666e-05, "loss": 1.1661, "step": 78700 }, { "epoch": 1.0856686230745916, "grad_norm": 11.2352294921875, "learning_rate": 5.446354507134484e-05, "loss": 1.0469, "step": 78800 }, { "epoch": 1.0870463751343309, "grad_norm": 12.239124298095703, "learning_rate": 5.446196844076592e-05, "loss": 1.0839, "step": 78900 }, { "epoch": 1.0884241271940702, "grad_norm": 13.329319953918457, "learning_rate": 5.4460389519629845e-05, "loss": 1.2385, "step": 79000 }, { "epoch": 1.0898018792538096, "grad_norm": 11.425338745117188, "learning_rate": 5.445880830807075e-05, "loss": 1.1366, "step": 79100 }, { "epoch": 1.091179631313549, "grad_norm": 45.51327896118164, "learning_rate": 5.445722480622297e-05, "loss": 1.1483, "step": 79200 }, { "epoch": 1.0925573833732882, "grad_norm": 9.737133979797363, "learning_rate": 5.445563901422103e-05, "loss": 1.1978, "step": 79300 }, { "epoch": 1.0939351354330276, "grad_norm": 10.909536361694336, "learning_rate": 5.4454050932199644e-05, "loss": 1.183, "step": 79400 }, { "epoch": 1.095312887492767, "grad_norm": 9.269868850708008, "learning_rate": 5.445246056029374e-05, "loss": 1.1329, "step": 79500 }, { "epoch": 1.0966906395525062, "grad_norm": 11.121707916259766, "learning_rate": 5.445086789863844e-05, "loss": 1.0166, "step": 79600 }, { "epoch": 1.0980683916122453, "grad_norm": 21.100162506103516, "learning_rate": 5.444928890821485e-05, "loss": 1.0112, "step": 79700 }, { "epoch": 1.099446143671985, "grad_norm": 38.795997619628906, "learning_rate": 5.4447691690360935e-05, "loss": 1.0999, "step": 79800 }, { "epoch": 1.100823895731724, "grad_norm": 38.402748107910156, "learning_rate": 5.444609218316274e-05, "loss": 1.2368, "step": 79900 }, { "epoch": 1.1022016477914633, "grad_norm": 8.940460205078125, "learning_rate": 5.444449038675617e-05, "loss": 1.0731, 
"step": 80000 }, { "epoch": 1.1035793998512027, "grad_norm": 13.04713249206543, "learning_rate": 5.444288630127729e-05, "loss": 1.1178, "step": 80100 }, { "epoch": 1.104957151910942, "grad_norm": 75.44645690917969, "learning_rate": 5.444127992686238e-05, "loss": 1.135, "step": 80200 }, { "epoch": 1.1063349039706813, "grad_norm": 43.578285217285156, "learning_rate": 5.4439671263647916e-05, "loss": 1.0547, "step": 80300 }, { "epoch": 1.1077126560304207, "grad_norm": 9.73839282989502, "learning_rate": 5.443806031177055e-05, "loss": 1.074, "step": 80400 }, { "epoch": 1.10909040809016, "grad_norm": 30.30518913269043, "learning_rate": 5.443644707136714e-05, "loss": 1.0595, "step": 80500 }, { "epoch": 1.1104681601498994, "grad_norm": 11.806612968444824, "learning_rate": 5.443483154257475e-05, "loss": 1.0632, "step": 80600 }, { "epoch": 1.1118459122096387, "grad_norm": 12.915070533752441, "learning_rate": 5.4433213725530626e-05, "loss": 1.1425, "step": 80700 }, { "epoch": 1.113223664269378, "grad_norm": 1.4595685005187988, "learning_rate": 5.4431593620372206e-05, "loss": 1.0396, "step": 80800 }, { "epoch": 1.1146014163291174, "grad_norm": 33.75121307373047, "learning_rate": 5.442997122723712e-05, "loss": 1.3027, "step": 80900 }, { "epoch": 1.1159791683888567, "grad_norm": 46.50124740600586, "learning_rate": 5.442834654626321e-05, "loss": 1.109, "step": 81000 }, { "epoch": 1.117356920448596, "grad_norm": 25.134326934814453, "learning_rate": 5.4426719577588504e-05, "loss": 1.0968, "step": 81100 }, { "epoch": 1.1187346725083354, "grad_norm": 20.71164321899414, "learning_rate": 5.44250903213512e-05, "loss": 1.0733, "step": 81200 }, { "epoch": 1.1201124245680747, "grad_norm": 17.346027374267578, "learning_rate": 5.442347510444865e-05, "loss": 1.2123, "step": 81300 }, { "epoch": 1.121490176627814, "grad_norm": 12.854451179504395, "learning_rate": 5.4421841296373786e-05, "loss": 1.0808, "step": 81400 }, { "epoch": 1.1228679286875534, "grad_norm": 33.05289840698242, 
"learning_rate": 5.4420205201150774e-05, "loss": 1.1255, "step": 81500 }, { "epoch": 1.1242456807472927, "grad_norm": 29.11379051208496, "learning_rate": 5.4418566818918604e-05, "loss": 1.129, "step": 81600 }, { "epoch": 1.125623432807032, "grad_norm": 8.390328407287598, "learning_rate": 5.441692614981648e-05, "loss": 1.131, "step": 81700 }, { "epoch": 1.1270011848667714, "grad_norm": 6.617385387420654, "learning_rate": 5.4415283193983766e-05, "loss": 1.0623, "step": 81800 }, { "epoch": 1.1283789369265107, "grad_norm": 27.417757034301758, "learning_rate": 5.441363795156004e-05, "loss": 1.0916, "step": 81900 }, { "epoch": 1.12975668898625, "grad_norm": 57.721649169921875, "learning_rate": 5.441199042268509e-05, "loss": 1.0992, "step": 82000 }, { "epoch": 1.1311344410459894, "grad_norm": 14.383347511291504, "learning_rate": 5.441034060749888e-05, "loss": 1.1283, "step": 82100 }, { "epoch": 1.1325121931057287, "grad_norm": 11.432974815368652, "learning_rate": 5.440868850614155e-05, "loss": 1.0373, "step": 82200 }, { "epoch": 1.133889945165468, "grad_norm": 33.512874603271484, "learning_rate": 5.4407034118753466e-05, "loss": 1.0787, "step": 82300 }, { "epoch": 1.1352676972252074, "grad_norm": 8.812397003173828, "learning_rate": 5.440537744547519e-05, "loss": 1.2184, "step": 82400 }, { "epoch": 1.1366454492849467, "grad_norm": 4.684133052825928, "learning_rate": 5.440371848644745e-05, "loss": 0.9759, "step": 82500 }, { "epoch": 1.138023201344686, "grad_norm": 8.109193801879883, "learning_rate": 5.440205724181118e-05, "loss": 1.0988, "step": 82600 }, { "epoch": 1.1394009534044254, "grad_norm": 53.559410095214844, "learning_rate": 5.440039371170752e-05, "loss": 1.1205, "step": 82700 }, { "epoch": 1.1407787054641647, "grad_norm": 10.38835620880127, "learning_rate": 5.439872789627779e-05, "loss": 1.1365, "step": 82800 }, { "epoch": 1.142156457523904, "grad_norm": 18.772029876708984, "learning_rate": 5.439705979566352e-05, "loss": 1.0443, "step": 82900 }, { "epoch": 
1.1435342095836434, "grad_norm": 13.300806045532227, "learning_rate": 5.439538941000641e-05, "loss": 1.1483, "step": 83000 }, { "epoch": 1.1449119616433827, "grad_norm": 10.32981014251709, "learning_rate": 5.439371673944837e-05, "loss": 1.2567, "step": 83100 }, { "epoch": 1.146289713703122, "grad_norm": 22.647167205810547, "learning_rate": 5.439204178413151e-05, "loss": 1.1153, "step": 83200 }, { "epoch": 1.1476674657628614, "grad_norm": 14.755345344543457, "learning_rate": 5.439036454419812e-05, "loss": 1.126, "step": 83300 }, { "epoch": 1.1490452178226007, "grad_norm": 38.85015106201172, "learning_rate": 5.43886850197907e-05, "loss": 1.164, "step": 83400 }, { "epoch": 1.15042296988234, "grad_norm": 6.0228376388549805, "learning_rate": 5.4387003211051914e-05, "loss": 1.1508, "step": 83500 }, { "epoch": 1.1518007219420794, "grad_norm": 6.189916610717773, "learning_rate": 5.4385319118124655e-05, "loss": 1.114, "step": 83600 }, { "epoch": 1.1531784740018187, "grad_norm": 12.52712345123291, "learning_rate": 5.4383632741152e-05, "loss": 1.0507, "step": 83700 }, { "epoch": 1.154556226061558, "grad_norm": 34.678489685058594, "learning_rate": 5.4381944080277204e-05, "loss": 1.1446, "step": 83800 }, { "epoch": 1.1559339781212974, "grad_norm": 19.96841049194336, "learning_rate": 5.438025313564373e-05, "loss": 1.0987, "step": 83900 }, { "epoch": 1.1573117301810365, "grad_norm": 11.498269081115723, "learning_rate": 5.4378559907395235e-05, "loss": 1.0812, "step": 84000 }, { "epoch": 1.158689482240776, "grad_norm": 17.331729888916016, "learning_rate": 5.4376864395675564e-05, "loss": 1.124, "step": 84100 }, { "epoch": 1.1600672343005152, "grad_norm": 12.576549530029297, "learning_rate": 5.437516660062876e-05, "loss": 1.032, "step": 84200 }, { "epoch": 1.1614449863602547, "grad_norm": 32.467193603515625, "learning_rate": 5.437346652239906e-05, "loss": 1.1427, "step": 84300 }, { "epoch": 1.1628227384199938, "grad_norm": 8.65752124786377, "learning_rate": 5.43717641611309e-05, 
"loss": 1.0757, "step": 84400 }, { "epoch": 1.1642004904797332, "grad_norm": 27.366947174072266, "learning_rate": 5.43700595169689e-05, "loss": 1.2517, "step": 84500 }, { "epoch": 1.1655782425394725, "grad_norm": 15.15655517578125, "learning_rate": 5.436835259005788e-05, "loss": 1.1006, "step": 84600 }, { "epoch": 1.1669559945992118, "grad_norm": 9.536809921264648, "learning_rate": 5.4366643380542846e-05, "loss": 1.095, "step": 84700 }, { "epoch": 1.1683337466589512, "grad_norm": 9.374640464782715, "learning_rate": 5.436493188856901e-05, "loss": 1.0874, "step": 84800 }, { "epoch": 1.1697114987186905, "grad_norm": 4.268041133880615, "learning_rate": 5.4363218114281764e-05, "loss": 1.0426, "step": 84900 }, { "epoch": 1.1710892507784298, "grad_norm": 11.414450645446777, "learning_rate": 5.436150205782671e-05, "loss": 1.0633, "step": 85000 }, { "epoch": 1.1724670028381692, "grad_norm": 18.067049026489258, "learning_rate": 5.4359783719349637e-05, "loss": 1.0085, "step": 85100 }, { "epoch": 1.1738447548979085, "grad_norm": 8.926839828491211, "learning_rate": 5.4358063098996524e-05, "loss": 1.1346, "step": 85200 }, { "epoch": 1.1752225069576479, "grad_norm": 8.551411628723145, "learning_rate": 5.435634019691354e-05, "loss": 1.1359, "step": 85300 }, { "epoch": 1.1766002590173872, "grad_norm": 10.83679485321045, "learning_rate": 5.4354615013247064e-05, "loss": 1.1378, "step": 85400 }, { "epoch": 1.1779780110771265, "grad_norm": 8.291827201843262, "learning_rate": 5.435288754814366e-05, "loss": 1.0827, "step": 85500 }, { "epoch": 1.1793557631368659, "grad_norm": 5.762343883514404, "learning_rate": 5.4351157801750076e-05, "loss": 1.118, "step": 85600 }, { "epoch": 1.1807335151966052, "grad_norm": 15.36035442352295, "learning_rate": 5.434942577421326e-05, "loss": 1.1872, "step": 85700 }, { "epoch": 1.1821112672563445, "grad_norm": 189.766845703125, "learning_rate": 5.434770882005615e-05, "loss": 1.1369, "step": 85800 }, { "epoch": 1.1834890193160839, "grad_norm": 
19.740503311157227, "learning_rate": 5.4345972253482275e-05, "loss": 1.0792, "step": 85900 }, { "epoch": 1.1848667713758232, "grad_norm": 31.87593650817871, "learning_rate": 5.434423340620572e-05, "loss": 1.1086, "step": 86000 }, { "epoch": 1.1862445234355625, "grad_norm": 31.415952682495117, "learning_rate": 5.4342492278374195e-05, "loss": 1.1566, "step": 86100 }, { "epoch": 1.1876222754953019, "grad_norm": 20.956693649291992, "learning_rate": 5.4340748870135626e-05, "loss": 1.1708, "step": 86200 }, { "epoch": 1.1890000275550412, "grad_norm": 16.08473777770996, "learning_rate": 5.433900318163812e-05, "loss": 1.1599, "step": 86300 }, { "epoch": 1.1903777796147805, "grad_norm": 7.2160139083862305, "learning_rate": 5.433725521303e-05, "loss": 1.1283, "step": 86400 }, { "epoch": 1.1917555316745199, "grad_norm": 7.467589855194092, "learning_rate": 5.433550496445974e-05, "loss": 1.1068, "step": 86500 }, { "epoch": 1.1931332837342592, "grad_norm": 16.656461715698242, "learning_rate": 5.4333752436076055e-05, "loss": 1.1579, "step": 86600 }, { "epoch": 1.1945110357939985, "grad_norm": 4.586458206176758, "learning_rate": 5.4331997628027814e-05, "loss": 1.1274, "step": 86700 }, { "epoch": 1.1958887878537379, "grad_norm": 21.88697052001953, "learning_rate": 5.43302405404641e-05, "loss": 1.055, "step": 86800 }, { "epoch": 1.1972665399134772, "grad_norm": 44.077423095703125, "learning_rate": 5.4328481173534196e-05, "loss": 1.1449, "step": 86900 }, { "epoch": 1.1986442919732165, "grad_norm": 13.202244758605957, "learning_rate": 5.432671952738755e-05, "loss": 1.2522, "step": 87000 }, { "epoch": 1.2000220440329559, "grad_norm": 12.632771492004395, "learning_rate": 5.4324955602173845e-05, "loss": 1.0926, "step": 87100 }, { "epoch": 1.2013997960926952, "grad_norm": 97.10979461669922, "learning_rate": 5.4323189398042925e-05, "loss": 1.2086, "step": 87200 }, { "epoch": 1.2027775481524345, "grad_norm": 1011.980224609375, "learning_rate": 5.4321420915144845e-05, "loss": 1.2412, "step": 
87300 }, { "epoch": 1.2041553002121739, "grad_norm": 169.3117218017578, "learning_rate": 5.431965015362983e-05, "loss": 1.2365, "step": 87400 }, { "epoch": 1.2055330522719132, "grad_norm": 9.306952476501465, "learning_rate": 5.431787711364833e-05, "loss": 1.1619, "step": 87500 }, { "epoch": 1.2069108043316525, "grad_norm": 11.595381736755371, "learning_rate": 5.431610179535097e-05, "loss": 1.2372, "step": 87600 }, { "epoch": 1.2082885563913919, "grad_norm": 9.741960525512695, "learning_rate": 5.4314324198888576e-05, "loss": 1.1783, "step": 87700 }, { "epoch": 1.2096663084511312, "grad_norm": 38.73411178588867, "learning_rate": 5.4312544324412154e-05, "loss": 1.1611, "step": 87800 }, { "epoch": 1.2110440605108705, "grad_norm": 76.61911010742188, "learning_rate": 5.431076217207292e-05, "loss": 1.1557, "step": 87900 }, { "epoch": 1.2124218125706099, "grad_norm": 106.42626190185547, "learning_rate": 5.430897774202228e-05, "loss": 1.1955, "step": 88000 }, { "epoch": 1.2137995646303492, "grad_norm": 41.87074279785156, "learning_rate": 5.430719103441183e-05, "loss": 1.152, "step": 88100 }, { "epoch": 1.2151773166900885, "grad_norm": 79.47062683105469, "learning_rate": 5.430540204939335e-05, "loss": 1.1882, "step": 88200 }, { "epoch": 1.2165550687498277, "grad_norm": 14.044783592224121, "learning_rate": 5.430361078711884e-05, "loss": 1.2085, "step": 88300 }, { "epoch": 1.2179328208095672, "grad_norm": 101.65431213378906, "learning_rate": 5.4301817247740456e-05, "loss": 1.0778, "step": 88400 }, { "epoch": 1.2193105728693063, "grad_norm": 20.04885482788086, "learning_rate": 5.430002143141059e-05, "loss": 1.0642, "step": 88500 }, { "epoch": 1.2206883249290459, "grad_norm": 18.162139892578125, "learning_rate": 5.4298241330482725e-05, "loss": 1.1004, "step": 88600 }, { "epoch": 1.222066076988785, "grad_norm": 90.41080474853516, "learning_rate": 5.429644098347346e-05, "loss": 1.1673, "step": 88700 }, { "epoch": 1.2234438290485243, "grad_norm": 23.455533981323242, 
"learning_rate": 5.429463835996944e-05, "loss": 1.1221, "step": 88800 }, { "epoch": 1.2248215811082637, "grad_norm": 12.024797439575195, "learning_rate": 5.4292833460123815e-05, "loss": 1.0866, "step": 88900 }, { "epoch": 1.226199333168003, "grad_norm": 118.55679321289062, "learning_rate": 5.429102628408991e-05, "loss": 1.225, "step": 89000 }, { "epoch": 1.2275770852277423, "grad_norm": 32.787933349609375, "learning_rate": 5.4289216832021266e-05, "loss": 1.3286, "step": 89100 }, { "epoch": 1.2289548372874817, "grad_norm": 20.325054168701172, "learning_rate": 5.4287405104071596e-05, "loss": 1.261, "step": 89200 }, { "epoch": 1.230332589347221, "grad_norm": 12.644694328308105, "learning_rate": 5.428559110039483e-05, "loss": 1.0944, "step": 89300 }, { "epoch": 1.2317103414069603, "grad_norm": 19.038776397705078, "learning_rate": 5.428377482114505e-05, "loss": 1.0673, "step": 89400 }, { "epoch": 1.2330880934666997, "grad_norm": 28.572973251342773, "learning_rate": 5.4281956266476585e-05, "loss": 1.1157, "step": 89500 }, { "epoch": 1.234465845526439, "grad_norm": 205.89178466796875, "learning_rate": 5.428013543654391e-05, "loss": 1.1564, "step": 89600 }, { "epoch": 1.2358435975861783, "grad_norm": 5.926337242126465, "learning_rate": 5.427831233150173e-05, "loss": 1.0738, "step": 89700 }, { "epoch": 1.2372213496459177, "grad_norm": 11.620429039001465, "learning_rate": 5.427648695150492e-05, "loss": 1.1264, "step": 89800 }, { "epoch": 1.238599101705657, "grad_norm": 14.232254028320312, "learning_rate": 5.427465929670856e-05, "loss": 1.2858, "step": 89900 }, { "epoch": 1.2399768537653963, "grad_norm": 20.881052017211914, "learning_rate": 5.427282936726791e-05, "loss": 1.2236, "step": 90000 }, { "epoch": 1.2413546058251357, "grad_norm": 6.499723434448242, "learning_rate": 5.427099716333844e-05, "loss": 1.1673, "step": 90100 }, { "epoch": 1.242732357884875, "grad_norm": 88.66902923583984, "learning_rate": 5.426916268507579e-05, "loss": 1.367, "step": 90200 }, { "epoch": 
1.2441101099446144, "grad_norm": 46.79194259643555, "learning_rate": 5.426732593263583e-05, "loss": 1.212, "step": 90300 }, { "epoch": 1.2454878620043537, "grad_norm": 11.282959938049316, "learning_rate": 5.426548690617459e-05, "loss": 1.1415, "step": 90400 }, { "epoch": 1.246865614064093, "grad_norm": 46.6762809753418, "learning_rate": 5.426364560584831e-05, "loss": 1.1784, "step": 90500 }, { "epoch": 1.2482433661238324, "grad_norm": 78.02066802978516, "learning_rate": 5.42618020318134e-05, "loss": 1.1381, "step": 90600 }, { "epoch": 1.2496211181835717, "grad_norm": 6.296343803405762, "learning_rate": 5.425995618422651e-05, "loss": 1.1683, "step": 90700 }, { "epoch": 1.250998870243311, "grad_norm": 7.056164741516113, "learning_rate": 5.425810806324442e-05, "loss": 1.1351, "step": 90800 }, { "epoch": 1.2523766223030504, "grad_norm": 24.29969596862793, "learning_rate": 5.4256257669024176e-05, "loss": 1.3496, "step": 90900 }, { "epoch": 1.2537543743627897, "grad_norm": 224.34046936035156, "learning_rate": 5.4254405001722954e-05, "loss": 1.1915, "step": 91000 }, { "epoch": 1.255132126422529, "grad_norm": 18.948619842529297, "learning_rate": 5.425255006149815e-05, "loss": 1.115, "step": 91100 }, { "epoch": 1.2565098784822684, "grad_norm": 15.805986404418945, "learning_rate": 5.425069284850735e-05, "loss": 1.2344, "step": 91200 }, { "epoch": 1.2578876305420077, "grad_norm": 6.287398338317871, "learning_rate": 5.4248833362908344e-05, "loss": 1.1521, "step": 91300 }, { "epoch": 1.259265382601747, "grad_norm": 10.883261680603027, "learning_rate": 5.424697160485909e-05, "loss": 1.0913, "step": 91400 }, { "epoch": 1.2606431346614864, "grad_norm": 310.649658203125, "learning_rate": 5.424512622606851e-05, "loss": 1.1438, "step": 91500 }, { "epoch": 1.2620208867212257, "grad_norm": 12.606843948364258, "learning_rate": 5.424325994631402e-05, "loss": 1.1751, "step": 91600 }, { "epoch": 1.263398638780965, "grad_norm": 18.939218521118164, "learning_rate": 5.424139139458279e-05, 
"loss": 1.2275, "step": 91700 }, { "epoch": 1.2647763908407044, "grad_norm": 199.7487030029297, "learning_rate": 5.423952057103354e-05, "loss": 1.237, "step": 91800 }, { "epoch": 1.2661541429004437, "grad_norm": 16.321306228637695, "learning_rate": 5.423764747582522e-05, "loss": 1.2226, "step": 91900 }, { "epoch": 1.267531894960183, "grad_norm": 88.80003356933594, "learning_rate": 5.4235772109116976e-05, "loss": 1.2745, "step": 92000 }, { "epoch": 1.2689096470199224, "grad_norm": 171.31634521484375, "learning_rate": 5.4233894471068096e-05, "loss": 1.2717, "step": 92100 }, { "epoch": 1.2702873990796617, "grad_norm": 40.57966613769531, "learning_rate": 5.423201456183811e-05, "loss": 1.2254, "step": 92200 }, { "epoch": 1.271665151139401, "grad_norm": 35.70359420776367, "learning_rate": 5.423013238158673e-05, "loss": 1.3632, "step": 92300 }, { "epoch": 1.2730429031991402, "grad_norm": 89.7231674194336, "learning_rate": 5.422824793047386e-05, "loss": 1.8249, "step": 92400 }, { "epoch": 1.2744206552588797, "grad_norm": 204.28256225585938, "learning_rate": 5.422636120865958e-05, "loss": 1.2999, "step": 92500 }, { "epoch": 1.2757984073186188, "grad_norm": 51.34372329711914, "learning_rate": 5.422447221630418e-05, "loss": 1.4106, "step": 92600 }, { "epoch": 1.2771761593783584, "grad_norm": 15.5244140625, "learning_rate": 5.4222580953568154e-05, "loss": 1.4799, "step": 92700 }, { "epoch": 1.2785539114380975, "grad_norm": 94.4416275024414, "learning_rate": 5.422068742061216e-05, "loss": 1.2477, "step": 92800 }, { "epoch": 1.279931663497837, "grad_norm": 47.11005401611328, "learning_rate": 5.421879161759708e-05, "loss": 1.3798, "step": 92900 }, { "epoch": 1.2813094155575762, "grad_norm": 95.08572387695312, "learning_rate": 5.421689354468394e-05, "loss": 1.2238, "step": 93000 }, { "epoch": 1.2826871676173157, "grad_norm": 36.19763946533203, "learning_rate": 5.421499320203402e-05, "loss": 1.341, "step": 93100 }, { "epoch": 1.2840649196770548, "grad_norm": 31.36441993713379, 
"learning_rate": 5.421309058980876e-05, "loss": 1.3659, "step": 93200 }, { "epoch": 1.2854426717367944, "grad_norm": 28.856643676757812, "learning_rate": 5.4211185708169776e-05, "loss": 1.4084, "step": 93300 }, { "epoch": 1.2868204237965335, "grad_norm": 12.829009056091309, "learning_rate": 5.4209278557278916e-05, "loss": 1.3829, "step": 93400 }, { "epoch": 1.2881981758562728, "grad_norm": 41.47861099243164, "learning_rate": 5.420736913729821e-05, "loss": 1.2971, "step": 93500 }, { "epoch": 1.2895759279160122, "grad_norm": 20.384492874145508, "learning_rate": 5.420545744838985e-05, "loss": 1.295, "step": 93600 }, { "epoch": 1.2909536799757515, "grad_norm": 41.33100128173828, "learning_rate": 5.420354349071626e-05, "loss": 1.2833, "step": 93700 }, { "epoch": 1.2923314320354908, "grad_norm": 23.125564575195312, "learning_rate": 5.420164643793185e-05, "loss": 1.3758, "step": 93800 }, { "epoch": 1.2937091840952302, "grad_norm": 21.431570053100586, "learning_rate": 5.419972796589938e-05, "loss": 1.2996, "step": 93900 }, { "epoch": 1.2950869361549695, "grad_norm": 11.668604850769043, "learning_rate": 5.419780722558842e-05, "loss": 1.2024, "step": 94000 }, { "epoch": 1.2964646882147088, "grad_norm": 39.09562683105469, "learning_rate": 5.4195884217162155e-05, "loss": 1.2559, "step": 94100 }, { "epoch": 1.2978424402744482, "grad_norm": 19.265756607055664, "learning_rate": 5.4193958940783945e-05, "loss": 1.3205, "step": 94200 }, { "epoch": 1.2992201923341875, "grad_norm": 16.87660026550293, "learning_rate": 5.419203139661737e-05, "loss": 1.1221, "step": 94300 }, { "epoch": 1.3005979443939268, "grad_norm": 12.028328895568848, "learning_rate": 5.4190101584826164e-05, "loss": 1.0838, "step": 94400 }, { "epoch": 1.3019756964536662, "grad_norm": 47.374629974365234, "learning_rate": 5.418816950557428e-05, "loss": 1.1656, "step": 94500 }, { "epoch": 1.3033534485134055, "grad_norm": 17.502593994140625, "learning_rate": 5.4186235159025864e-05, "loss": 1.2574, "step": 94600 }, { 
"epoch": 1.3047312005731448, "grad_norm": 116.42330169677734, "learning_rate": 5.418429854534524e-05, "loss": 1.1281, "step": 94700 }, { "epoch": 1.3061089526328842, "grad_norm": 11.536840438842773, "learning_rate": 5.418235966469695e-05, "loss": 1.0518, "step": 94800 }, { "epoch": 1.3074867046926235, "grad_norm": 22.08839225769043, "learning_rate": 5.4180418517245694e-05, "loss": 1.1579, "step": 94900 }, { "epoch": 1.3088644567523628, "grad_norm": 18.14686393737793, "learning_rate": 5.417847510315639e-05, "loss": 1.212, "step": 95000 }, { "epoch": 1.3102422088121022, "grad_norm": 21.127187728881836, "learning_rate": 5.4176529422594134e-05, "loss": 1.172, "step": 95100 }, { "epoch": 1.3116199608718415, "grad_norm": 10.276876449584961, "learning_rate": 5.417458147572423e-05, "loss": 1.1477, "step": 95200 }, { "epoch": 1.3129977129315809, "grad_norm": 40.1536979675293, "learning_rate": 5.417263126271217e-05, "loss": 1.1976, "step": 95300 }, { "epoch": 1.3143754649913202, "grad_norm": 16.13709259033203, "learning_rate": 5.417067878372362e-05, "loss": 1.2112, "step": 95400 }, { "epoch": 1.3157532170510595, "grad_norm": 31.721723556518555, "learning_rate": 5.4168724038924465e-05, "loss": 1.1679, "step": 95500 }, { "epoch": 1.3171309691107989, "grad_norm": 11.788978576660156, "learning_rate": 5.416676702848076e-05, "loss": 1.2025, "step": 95600 }, { "epoch": 1.3185087211705382, "grad_norm": 16.537700653076172, "learning_rate": 5.416480775255878e-05, "loss": 1.2318, "step": 95700 }, { "epoch": 1.3198864732302775, "grad_norm": 20.435949325561523, "learning_rate": 5.4162846211324964e-05, "loss": 1.1599, "step": 95800 }, { "epoch": 1.3212642252900169, "grad_norm": 14.62575912475586, "learning_rate": 5.416088240494595e-05, "loss": 1.1141, "step": 95900 }, { "epoch": 1.3226419773497562, "grad_norm": 15.326416015625, "learning_rate": 5.415891633358858e-05, "loss": 1.1135, "step": 96000 }, { "epoch": 1.3240197294094955, "grad_norm": 5.411820888519287, "learning_rate": 
5.415694799741989e-05, "loss": 1.0651, "step": 96100 }, { "epoch": 1.3253974814692349, "grad_norm": 11.630475997924805, "learning_rate": 5.415497739660708e-05, "loss": 1.0951, "step": 96200 }, { "epoch": 1.3267752335289742, "grad_norm": 22.017620086669922, "learning_rate": 5.415300453131758e-05, "loss": 1.1332, "step": 96300 }, { "epoch": 1.3281529855887135, "grad_norm": 15.014047622680664, "learning_rate": 5.4151029401718985e-05, "loss": 1.2671, "step": 96400 }, { "epoch": 1.3295307376484529, "grad_norm": 9.514591217041016, "learning_rate": 5.4149052007979106e-05, "loss": 1.023, "step": 96500 }, { "epoch": 1.3309084897081922, "grad_norm": 9.969331741333008, "learning_rate": 5.414707235026592e-05, "loss": 1.0165, "step": 96600 }, { "epoch": 1.3322862417679313, "grad_norm": 6.215322017669678, "learning_rate": 5.414509042874761e-05, "loss": 1.0, "step": 96700 }, { "epoch": 1.3336639938276709, "grad_norm": 21.648183822631836, "learning_rate": 5.414310624359255e-05, "loss": 1.1317, "step": 96800 }, { "epoch": 1.33504174588741, "grad_norm": 21.194229125976562, "learning_rate": 5.4141119794969316e-05, "loss": 1.1609, "step": 96900 }, { "epoch": 1.3364194979471495, "grad_norm": 10.29611587524414, "learning_rate": 5.413913108304666e-05, "loss": 1.1667, "step": 97000 }, { "epoch": 1.3377972500068886, "grad_norm": 24.378509521484375, "learning_rate": 5.4137140107993526e-05, "loss": 1.0474, "step": 97100 }, { "epoch": 1.3391750020666282, "grad_norm": 12.182476997375488, "learning_rate": 5.413514686997907e-05, "loss": 1.0979, "step": 97200 }, { "epoch": 1.3405527541263673, "grad_norm": 17.44414710998535, "learning_rate": 5.413315136917262e-05, "loss": 1.1522, "step": 97300 }, { "epoch": 1.3419305061861069, "grad_norm": 14.256574630737305, "learning_rate": 5.413115360574371e-05, "loss": 1.0445, "step": 97400 }, { "epoch": 1.343308258245846, "grad_norm": 26.928070068359375, "learning_rate": 5.412915357986206e-05, "loss": 1.143, "step": 97500 }, { "epoch": 1.3446860103055855, 
"grad_norm": 7.874545574188232, "learning_rate": 5.412715129169757e-05, "loss": 1.1181, "step": 97600 }, { "epoch": 1.3460637623653247, "grad_norm": 6.059437274932861, "learning_rate": 5.412514674142036e-05, "loss": 1.1641, "step": 97700 }, { "epoch": 1.347441514425064, "grad_norm": 24.248310089111328, "learning_rate": 5.4123139929200724e-05, "loss": 1.1107, "step": 97800 }, { "epoch": 1.3488192664848033, "grad_norm": 9.78466510772705, "learning_rate": 5.412113085520915e-05, "loss": 1.0107, "step": 97900 }, { "epoch": 1.3501970185445427, "grad_norm": 38.441551208496094, "learning_rate": 5.4119119519616306e-05, "loss": 1.0825, "step": 98000 }, { "epoch": 1.351574770604282, "grad_norm": 32.95125961303711, "learning_rate": 5.411710592259308e-05, "loss": 1.2035, "step": 98100 }, { "epoch": 1.3529525226640213, "grad_norm": 129.1112823486328, "learning_rate": 5.411509006431054e-05, "loss": 1.076, "step": 98200 }, { "epoch": 1.3543302747237607, "grad_norm": 14.370128631591797, "learning_rate": 5.411307194493993e-05, "loss": 1.1042, "step": 98300 }, { "epoch": 1.3557080267835, "grad_norm": 9.850196838378906, "learning_rate": 5.411105156465271e-05, "loss": 1.0179, "step": 98400 }, { "epoch": 1.3570857788432393, "grad_norm": 3.594674825668335, "learning_rate": 5.4109028923620524e-05, "loss": 1.0863, "step": 98500 }, { "epoch": 1.3584635309029787, "grad_norm": 9.291257858276367, "learning_rate": 5.410700402201519e-05, "loss": 1.0532, "step": 98600 }, { "epoch": 1.359841282962718, "grad_norm": 10.405354499816895, "learning_rate": 5.4104976860008745e-05, "loss": 1.108, "step": 98700 }, { "epoch": 1.3612190350224573, "grad_norm": 7.508618354797363, "learning_rate": 5.410294743777341e-05, "loss": 1.0318, "step": 98800 }, { "epoch": 1.3625967870821967, "grad_norm": 9.89979362487793, "learning_rate": 5.410091575548159e-05, "loss": 1.0753, "step": 98900 }, { "epoch": 1.363974539141936, "grad_norm": 17.785009384155273, "learning_rate": 5.4098881813305884e-05, "loss": 1.2069, "step": 
99000 }, { "epoch": 1.3653522912016753, "grad_norm": 19.928552627563477, "learning_rate": 5.40968456114191e-05, "loss": 1.1077, "step": 99100 }, { "epoch": 1.3667300432614147, "grad_norm": 32.78633499145508, "learning_rate": 5.40948071499942e-05, "loss": 1.0913, "step": 99200 }, { "epoch": 1.368107795321154, "grad_norm": 36.34523010253906, "learning_rate": 5.409276642920438e-05, "loss": 1.1546, "step": 99300 }, { "epoch": 1.3694855473808933, "grad_norm": 12.677918434143066, "learning_rate": 5.4090723449223e-05, "loss": 1.1183, "step": 99400 }, { "epoch": 1.3708632994406327, "grad_norm": 10.551412582397461, "learning_rate": 5.408867821022363e-05, "loss": 1.0943, "step": 99500 }, { "epoch": 1.372241051500372, "grad_norm": 15.108549118041992, "learning_rate": 5.408663071238001e-05, "loss": 1.0646, "step": 99600 }, { "epoch": 1.3736188035601113, "grad_norm": 125.39273834228516, "learning_rate": 5.408458095586611e-05, "loss": 1.0462, "step": 99700 }, { "epoch": 1.3749965556198507, "grad_norm": 15.085184097290039, "learning_rate": 5.408252894085605e-05, "loss": 0.9886, "step": 99800 }, { "epoch": 1.37637430767959, "grad_norm": 5.552258014678955, "learning_rate": 5.408047466752415e-05, "loss": 1.1143, "step": 99900 }, { "epoch": 1.3777520597393293, "grad_norm": 12.217641830444336, "learning_rate": 5.407841813604495e-05, "loss": 0.9727, "step": 100000 }, { "epoch": 1.3791298117990687, "grad_norm": 6.6453423500061035, "learning_rate": 5.4076359346593153e-05, "loss": 0.974, "step": 100100 }, { "epoch": 1.380507563858808, "grad_norm": 8.828533172607422, "learning_rate": 5.407431892099168e-05, "loss": 0.9244, "step": 100200 }, { "epoch": 1.3818853159185474, "grad_norm": 4.5843329429626465, "learning_rate": 5.407225563869496e-05, "loss": 0.9709, "step": 100300 }, { "epoch": 1.3832630679782867, "grad_norm": 8.428098678588867, "learning_rate": 5.407019009894918e-05, "loss": 1.1502, "step": 100400 }, { "epoch": 1.384640820038026, "grad_norm": 6.951627254486084, "learning_rate": 
5.406812230192981e-05, "loss": 1.0878, "step": 100500 }, { "epoch": 1.3860185720977654, "grad_norm": 14.449235916137695, "learning_rate": 5.406605224781254e-05, "loss": 1.0733, "step": 100600 }, { "epoch": 1.3873963241575047, "grad_norm": 34.672786712646484, "learning_rate": 5.406397993677322e-05, "loss": 0.9911, "step": 100700 }, { "epoch": 1.388774076217244, "grad_norm": 7.907532215118408, "learning_rate": 5.406190536898789e-05, "loss": 0.977, "step": 100800 }, { "epoch": 1.3901518282769834, "grad_norm": 12.782571792602539, "learning_rate": 5.4059828544632824e-05, "loss": 1.2255, "step": 100900 }, { "epoch": 1.3915295803367225, "grad_norm": 18.650449752807617, "learning_rate": 5.405774946388445e-05, "loss": 1.0604, "step": 101000 }, { "epoch": 1.392907332396462, "grad_norm": 27.865568161010742, "learning_rate": 5.4055668126919373e-05, "loss": 1.0506, "step": 101100 }, { "epoch": 1.3942850844562011, "grad_norm": 7.016862869262695, "learning_rate": 5.405358453391444e-05, "loss": 1.0837, "step": 101200 }, { "epoch": 1.3956628365159407, "grad_norm": 47.0599250793457, "learning_rate": 5.4051498685046655e-05, "loss": 1.0757, "step": 101300 }, { "epoch": 1.3970405885756798, "grad_norm": 47.66703414916992, "learning_rate": 5.404941058049321e-05, "loss": 1.1705, "step": 101400 }, { "epoch": 1.3984183406354194, "grad_norm": 28.557613372802734, "learning_rate": 5.4047320220431524e-05, "loss": 1.1288, "step": 101500 }, { "epoch": 1.3997960926951585, "grad_norm": 60.851383209228516, "learning_rate": 5.404524854235639e-05, "loss": 1.1702, "step": 101600 }, { "epoch": 1.401173844754898, "grad_norm": 87.85972595214844, "learning_rate": 5.404315369436178e-05, "loss": 1.1407, "step": 101700 }, { "epoch": 1.4025515968146371, "grad_norm": 12.193360328674316, "learning_rate": 5.404105659139048e-05, "loss": 1.1541, "step": 101800 }, { "epoch": 1.4039293488743767, "grad_norm": 58.03166961669922, "learning_rate": 5.4038957233620636e-05, "loss": 1.2748, "step": 101900 }, { "epoch": 
1.4053071009341158, "grad_norm": 9.301190376281738, "learning_rate": 5.403685562123061e-05, "loss": 1.1759, "step": 102000 }, { "epoch": 1.4066848529938552, "grad_norm": 21.133102416992188, "learning_rate": 5.403475175439893e-05, "loss": 1.0426, "step": 102100 }, { "epoch": 1.4080626050535945, "grad_norm": 20.334646224975586, "learning_rate": 5.403264563330434e-05, "loss": 1.1278, "step": 102200 }, { "epoch": 1.4094403571133338, "grad_norm": 12.950143814086914, "learning_rate": 5.403053725812576e-05, "loss": 1.1, "step": 102300 }, { "epoch": 1.4108181091730732, "grad_norm": 49.374385833740234, "learning_rate": 5.402842662904231e-05, "loss": 0.9982, "step": 102400 }, { "epoch": 1.4121958612328125, "grad_norm": 12.126702308654785, "learning_rate": 5.40263137462333e-05, "loss": 1.1325, "step": 102500 }, { "epoch": 1.4135736132925518, "grad_norm": 30.65694236755371, "learning_rate": 5.4024198609878235e-05, "loss": 1.1715, "step": 102600 }, { "epoch": 1.4149513653522912, "grad_norm": 39.14908218383789, "learning_rate": 5.40220812201568e-05, "loss": 1.1905, "step": 102700 }, { "epoch": 1.4163291174120305, "grad_norm": 6.459589958190918, "learning_rate": 5.4019961577248875e-05, "loss": 1.0743, "step": 102800 }, { "epoch": 1.4177068694717698, "grad_norm": 7.655505657196045, "learning_rate": 5.401783968133454e-05, "loss": 1.1819, "step": 102900 }, { "epoch": 1.4190846215315092, "grad_norm": 36.74433898925781, "learning_rate": 5.4015715532594056e-05, "loss": 1.0713, "step": 103000 }, { "epoch": 1.4204623735912485, "grad_norm": 8.84647274017334, "learning_rate": 5.4013589131207894e-05, "loss": 1.1303, "step": 103100 }, { "epoch": 1.4218401256509878, "grad_norm": 6.3132853507995605, "learning_rate": 5.401146047735668e-05, "loss": 0.9917, "step": 103200 }, { "epoch": 1.4232178777107272, "grad_norm": 27.791242599487305, "learning_rate": 5.4009329571221284e-05, "loss": 1.1319, "step": 103300 }, { "epoch": 1.4245956297704665, "grad_norm": 28.155214309692383, "learning_rate": 
5.400719641298271e-05, "loss": 1.0323, "step": 103400 }, { "epoch": 1.4259733818302058, "grad_norm": 24.31360626220703, "learning_rate": 5.400506100282219e-05, "loss": 1.034, "step": 103500 }, { "epoch": 1.4273511338899452, "grad_norm": 8.539220809936523, "learning_rate": 5.400292334092114e-05, "loss": 1.0076, "step": 103600 }, { "epoch": 1.4287288859496845, "grad_norm": 8.166019439697266, "learning_rate": 5.400078342746116e-05, "loss": 1.0668, "step": 103700 }, { "epoch": 1.4301066380094238, "grad_norm": 12.023422241210938, "learning_rate": 5.3998641262624057e-05, "loss": 1.0229, "step": 103800 }, { "epoch": 1.4314843900691632, "grad_norm": 7.248012065887451, "learning_rate": 5.3996496846591805e-05, "loss": 1.0255, "step": 103900 }, { "epoch": 1.4328621421289025, "grad_norm": 135.61102294921875, "learning_rate": 5.399435017954659e-05, "loss": 1.0787, "step": 104000 }, { "epoch": 1.4342398941886418, "grad_norm": 15.248514175415039, "learning_rate": 5.3992201261670796e-05, "loss": 1.067, "step": 104100 }, { "epoch": 1.4356176462483812, "grad_norm": 7.450089931488037, "learning_rate": 5.3990050093146966e-05, "loss": 1.0073, "step": 104200 }, { "epoch": 1.4369953983081205, "grad_norm": 17.849102020263672, "learning_rate": 5.398789667415786e-05, "loss": 1.0851, "step": 104300 }, { "epoch": 1.4383731503678598, "grad_norm": 25.645145416259766, "learning_rate": 5.398574100488642e-05, "loss": 1.0042, "step": 104400 }, { "epoch": 1.4397509024275992, "grad_norm": 16.04149055480957, "learning_rate": 5.398358308551577e-05, "loss": 1.0395, "step": 104500 }, { "epoch": 1.4411286544873385, "grad_norm": 48.494625091552734, "learning_rate": 5.398142291622926e-05, "loss": 1.1061, "step": 104600 }, { "epoch": 1.4425064065470778, "grad_norm": 8.755069732666016, "learning_rate": 5.39792604972104e-05, "loss": 1.1055, "step": 104700 }, { "epoch": 1.4438841586068172, "grad_norm": 12.382411003112793, "learning_rate": 5.397709582864288e-05, "loss": 1.0642, "step": 104800 }, { "epoch": 
1.4452619106665565, "grad_norm": 23.622568130493164, "learning_rate": 5.3974928910710615e-05, "loss": 1.0832, "step": 104900 }, { "epoch": 1.4466396627262958, "grad_norm": 5.427385330200195, "learning_rate": 5.3972759743597696e-05, "loss": 1.0343, "step": 105000 }, { "epoch": 1.4480174147860352, "grad_norm": 10.036494255065918, "learning_rate": 5.397058832748841e-05, "loss": 1.049, "step": 105100 }, { "epoch": 1.4493951668457745, "grad_norm": 37.23356628417969, "learning_rate": 5.396841466256722e-05, "loss": 1.0519, "step": 105200 }, { "epoch": 1.4507729189055136, "grad_norm": 10.193114280700684, "learning_rate": 5.396626051928437e-05, "loss": 1.1681, "step": 105300 }, { "epoch": 1.4521506709652532, "grad_norm": 12.274484634399414, "learning_rate": 5.3964082379777066e-05, "loss": 1.0629, "step": 105400 }, { "epoch": 1.4535284230249923, "grad_norm": 29.097864151000977, "learning_rate": 5.396190199201058e-05, "loss": 0.9886, "step": 105500 }, { "epoch": 1.4549061750847319, "grad_norm": 35.82841110229492, "learning_rate": 5.3959719356170134e-05, "loss": 1.0094, "step": 105600 }, { "epoch": 1.456283927144471, "grad_norm": 18.119590759277344, "learning_rate": 5.395753447244117e-05, "loss": 1.1827, "step": 105700 }, { "epoch": 1.4576616792042105, "grad_norm": 32.812923431396484, "learning_rate": 5.395534734100929e-05, "loss": 1.0668, "step": 105800 }, { "epoch": 1.4590394312639496, "grad_norm": 8.903825759887695, "learning_rate": 5.395315796206031e-05, "loss": 1.0225, "step": 105900 }, { "epoch": 1.4604171833236892, "grad_norm": 28.307601928710938, "learning_rate": 5.3950988263166706e-05, "loss": 0.9488, "step": 106000 }, { "epoch": 1.4617949353834283, "grad_norm": 5.0161542892456055, "learning_rate": 5.3948794412212234e-05, "loss": 0.9748, "step": 106100 }, { "epoch": 1.4631726874431679, "grad_norm": 32.3056755065918, "learning_rate": 5.394659831429737e-05, "loss": 1.0922, "step": 106200 }, { "epoch": 1.464550439502907, "grad_norm": 20.302715301513672, "learning_rate": 
5.3944399969608686e-05, "loss": 1.1104, "step": 106300 }, { "epoch": 1.4659281915626463, "grad_norm": 15.857366561889648, "learning_rate": 5.3942199378332935e-05, "loss": 1.0989, "step": 106400 }, { "epoch": 1.4673059436223856, "grad_norm": 22.21146583557129, "learning_rate": 5.3939996540657055e-05, "loss": 1.1167, "step": 106500 }, { "epoch": 1.468683695682125, "grad_norm": 8.890013694763184, "learning_rate": 5.393779145676821e-05, "loss": 1.1702, "step": 106600 }, { "epoch": 1.4700614477418643, "grad_norm": 32.30009841918945, "learning_rate": 5.393558412685373e-05, "loss": 1.0053, "step": 106700 }, { "epoch": 1.4714391998016036, "grad_norm": 40.002044677734375, "learning_rate": 5.393337455110113e-05, "loss": 1.0999, "step": 106800 }, { "epoch": 1.472816951861343, "grad_norm": 13.027634620666504, "learning_rate": 5.393116272969814e-05, "loss": 1.1368, "step": 106900 }, { "epoch": 1.4741947039210823, "grad_norm": 10.368160247802734, "learning_rate": 5.3928948662832645e-05, "loss": 1.0435, "step": 107000 }, { "epoch": 1.4755724559808217, "grad_norm": 3.22397518157959, "learning_rate": 5.3926732350692756e-05, "loss": 1.0691, "step": 107100 }, { "epoch": 1.476950208040561, "grad_norm": 4.171754837036133, "learning_rate": 5.392451379346676e-05, "loss": 1.0992, "step": 107200 }, { "epoch": 1.4783279601003003, "grad_norm": 7.594644069671631, "learning_rate": 5.392229299134312e-05, "loss": 1.0613, "step": 107300 }, { "epoch": 1.4797057121600397, "grad_norm": 69.45877838134766, "learning_rate": 5.392006994451051e-05, "loss": 0.9404, "step": 107400 }, { "epoch": 1.481083464219779, "grad_norm": 10.095816612243652, "learning_rate": 5.3917844653157806e-05, "loss": 1.0178, "step": 107500 }, { "epoch": 1.4824612162795183, "grad_norm": 14.364834785461426, "learning_rate": 5.391561711747404e-05, "loss": 1.0506, "step": 107600 }, { "epoch": 1.4838389683392577, "grad_norm": 9.645843505859375, "learning_rate": 5.3913387337648464e-05, "loss": 0.9973, "step": 107700 }, { "epoch": 
1.485216720398997, "grad_norm": 10.550060272216797, "learning_rate": 5.39111553138705e-05, "loss": 1.0046, "step": 107800 }, { "epoch": 1.4865944724587363, "grad_norm": 29.86020851135254, "learning_rate": 5.3908921046329774e-05, "loss": 1.0615, "step": 107900 }, { "epoch": 1.4879722245184757, "grad_norm": 5.713249206542969, "learning_rate": 5.39066845352161e-05, "loss": 1.0428, "step": 108000 }, { "epoch": 1.489349976578215, "grad_norm": 15.704887390136719, "learning_rate": 5.390444578071948e-05, "loss": 1.1417, "step": 108100 }, { "epoch": 1.4907277286379543, "grad_norm": 59.428550720214844, "learning_rate": 5.3902204783030106e-05, "loss": 1.002, "step": 108200 }, { "epoch": 1.4921054806976937, "grad_norm": 49.97745895385742, "learning_rate": 5.389996154233835e-05, "loss": 0.9987, "step": 108300 }, { "epoch": 1.493483232757433, "grad_norm": 5.066510200500488, "learning_rate": 5.3897716058834815e-05, "loss": 1.059, "step": 108400 }, { "epoch": 1.4948609848171723, "grad_norm": 9.55388355255127, "learning_rate": 5.3895468332710244e-05, "loss": 1.0086, "step": 108500 }, { "epoch": 1.4962387368769117, "grad_norm": 12.000092506408691, "learning_rate": 5.38932183641556e-05, "loss": 1.0447, "step": 108600 }, { "epoch": 1.497616488936651, "grad_norm": 19.03859519958496, "learning_rate": 5.3890966153362034e-05, "loss": 1.028, "step": 108700 }, { "epoch": 1.4989942409963903, "grad_norm": 22.942298889160156, "learning_rate": 5.388871170052088e-05, "loss": 1.0599, "step": 108800 }, { "epoch": 1.5003719930561297, "grad_norm": 8.335474014282227, "learning_rate": 5.388647758386719e-05, "loss": 1.0734, "step": 108900 }, { "epoch": 1.5017497451158688, "grad_norm": 6.595418453216553, "learning_rate": 5.388421866992131e-05, "loss": 1.0177, "step": 109000 }, { "epoch": 1.5031274971756083, "grad_norm": 20.5299129486084, "learning_rate": 5.3881957514501086e-05, "loss": 0.971, "step": 109100 }, { "epoch": 1.5045052492353475, "grad_norm": 25.569503784179688, "learning_rate": 
5.3879694117798595e-05, "loss": 1.1343, "step": 109200 }, { "epoch": 1.505883001295087, "grad_norm": 12.386091232299805, "learning_rate": 5.387742848000614e-05, "loss": 0.9886, "step": 109300 }, { "epoch": 1.5072607533548261, "grad_norm": 17.859819412231445, "learning_rate": 5.3875160601316176e-05, "loss": 0.9962, "step": 109400 }, { "epoch": 1.5086385054145657, "grad_norm": 12.250004768371582, "learning_rate": 5.387289048192139e-05, "loss": 1.1542, "step": 109500 }, { "epoch": 1.5100162574743048, "grad_norm": 13.733154296875, "learning_rate": 5.387061812201464e-05, "loss": 0.9375, "step": 109600 }, { "epoch": 1.5113940095340443, "grad_norm": 57.36831283569336, "learning_rate": 5.386834352178896e-05, "loss": 1.0113, "step": 109700 }, { "epoch": 1.5127717615937835, "grad_norm": 17.72985076904297, "learning_rate": 5.386606668143761e-05, "loss": 1.0345, "step": 109800 }, { "epoch": 1.514149513653523, "grad_norm": 34.852169036865234, "learning_rate": 5.3863787601153996e-05, "loss": 1.0197, "step": 109900 }, { "epoch": 1.5155272657132621, "grad_norm": 45.96980667114258, "learning_rate": 5.386150628113176e-05, "loss": 1.0378, "step": 110000 }, { "epoch": 1.5169050177730017, "grad_norm": 15.316779136657715, "learning_rate": 5.3859222721564696e-05, "loss": 1.0835, "step": 110100 }, { "epoch": 1.5182827698327408, "grad_norm": 14.019207000732422, "learning_rate": 5.38569369226468e-05, "loss": 1.0603, "step": 110200 }, { "epoch": 1.5196605218924804, "grad_norm": 4.631565570831299, "learning_rate": 5.3854648884572284e-05, "loss": 1.0236, "step": 110300 }, { "epoch": 1.5210382739522195, "grad_norm": 4.182553768157959, "learning_rate": 5.385235860753551e-05, "loss": 0.9696, "step": 110400 }, { "epoch": 1.522416026011959, "grad_norm": 14.243215560913086, "learning_rate": 5.385006609173106e-05, "loss": 1.0056, "step": 110500 }, { "epoch": 1.5237937780716981, "grad_norm": 8.640573501586914, "learning_rate": 5.384777133735368e-05, "loss": 1.0065, "step": 110600 }, { "epoch": 
1.5251715301314377, "grad_norm": 13.582250595092773, "learning_rate": 5.384547434459834e-05, "loss": 1.0522, "step": 110700 }, { "epoch": 1.5265492821911768, "grad_norm": 17.925390243530273, "learning_rate": 5.3843175113660164e-05, "loss": 0.9387, "step": 110800 }, { "epoch": 1.5279270342509164, "grad_norm": 42.459449768066406, "learning_rate": 5.38408736447345e-05, "loss": 0.9725, "step": 110900 }, { "epoch": 1.5293047863106555, "grad_norm": 6.976260662078857, "learning_rate": 5.3838569938016854e-05, "loss": 1.063, "step": 111000 }, { "epoch": 1.530682538370395, "grad_norm": 59.334529876708984, "learning_rate": 5.383626399370295e-05, "loss": 1.0706, "step": 111100 }, { "epoch": 1.5320602904301341, "grad_norm": 7.620677471160889, "learning_rate": 5.383395581198867e-05, "loss": 1.109, "step": 111200 }, { "epoch": 1.5334380424898735, "grad_norm": 37.53221893310547, "learning_rate": 5.3831645393070136e-05, "loss": 1.0702, "step": 111300 }, { "epoch": 1.5348157945496128, "grad_norm": 35.887611389160156, "learning_rate": 5.3829332737143606e-05, "loss": 1.0681, "step": 111400 }, { "epoch": 1.5361935466093521, "grad_norm": 16.638322830200195, "learning_rate": 5.3827017844405564e-05, "loss": 1.1362, "step": 111500 }, { "epoch": 1.5375712986690915, "grad_norm": 20.41179656982422, "learning_rate": 5.3824700715052664e-05, "loss": 0.9979, "step": 111600 }, { "epoch": 1.5389490507288308, "grad_norm": 17.379352569580078, "learning_rate": 5.382238134928177e-05, "loss": 1.0898, "step": 111700 }, { "epoch": 1.5403268027885701, "grad_norm": 33.538482666015625, "learning_rate": 5.38200597472899e-05, "loss": 1.1033, "step": 111800 }, { "epoch": 1.5417045548483095, "grad_norm": 13.266926765441895, "learning_rate": 5.381773590927432e-05, "loss": 0.9791, "step": 111900 }, { "epoch": 1.5430823069080488, "grad_norm": 12.860810279846191, "learning_rate": 5.381540983543243e-05, "loss": 1.088, "step": 112000 }, { "epoch": 1.5444600589677882, "grad_norm": 5.510438919067383, "learning_rate": 
5.381308152596184e-05, "loss": 1.0915, "step": 112100 }, { "epoch": 1.5458378110275275, "grad_norm": 17.27724838256836, "learning_rate": 5.381075098106036e-05, "loss": 1.0701, "step": 112200 }, { "epoch": 1.5472155630872668, "grad_norm": 7.033637046813965, "learning_rate": 5.3808418200925974e-05, "loss": 1.0984, "step": 112300 }, { "epoch": 1.5485933151470062, "grad_norm": 10.704571723937988, "learning_rate": 5.380608318575688e-05, "loss": 0.9909, "step": 112400 }, { "epoch": 1.5499710672067455, "grad_norm": 9.773233413696289, "learning_rate": 5.3803745935751426e-05, "loss": 0.9809, "step": 112500 }, { "epoch": 1.5513488192664848, "grad_norm": 16.5738468170166, "learning_rate": 5.38014064511082e-05, "loss": 1.1881, "step": 112600 }, { "epoch": 1.5527265713262242, "grad_norm": 8.641104698181152, "learning_rate": 5.379906473202593e-05, "loss": 1.1011, "step": 112700 }, { "epoch": 1.5541043233859635, "grad_norm": 7.702586650848389, "learning_rate": 5.379674422929562e-05, "loss": 0.8744, "step": 112800 }, { "epoch": 1.5554820754457028, "grad_norm": 44.07160568237305, "learning_rate": 5.379439806427172e-05, "loss": 1.03, "step": 112900 }, { "epoch": 1.5568598275054422, "grad_norm": 34.84183883666992, "learning_rate": 5.379204966540418e-05, "loss": 0.9633, "step": 113000 }, { "epoch": 1.5582375795651815, "grad_norm": 84.15872955322266, "learning_rate": 5.3789699032892514e-05, "loss": 1.0958, "step": 113100 }, { "epoch": 1.5596153316249208, "grad_norm": 6.767889022827148, "learning_rate": 5.378734616693641e-05, "loss": 1.0156, "step": 113200 }, { "epoch": 1.56099308368466, "grad_norm": 12.367714881896973, "learning_rate": 5.378499106773578e-05, "loss": 1.0273, "step": 113300 }, { "epoch": 1.5623708357443995, "grad_norm": 12.43026351928711, "learning_rate": 5.378263373549067e-05, "loss": 1.0319, "step": 113400 }, { "epoch": 1.5637485878041386, "grad_norm": 19.776586532592773, "learning_rate": 5.3780274170401365e-05, "loss": 1.0497, "step": 113500 }, { "epoch": 
1.5651263398638782, "grad_norm": 30.242477416992188, "learning_rate": 5.377791237266833e-05, "loss": 1.0893, "step": 113600 }, { "epoch": 1.5665040919236173, "grad_norm": 13.9710693359375, "learning_rate": 5.3775548342492194e-05, "loss": 1.0857, "step": 113700 }, { "epoch": 1.5678818439833568, "grad_norm": 28.976524353027344, "learning_rate": 5.37731820800738e-05, "loss": 1.0386, "step": 113800 }, { "epoch": 1.569259596043096, "grad_norm": 5.827281475067139, "learning_rate": 5.377081358561418e-05, "loss": 1.1341, "step": 113900 }, { "epoch": 1.5706373481028355, "grad_norm": 4.12724494934082, "learning_rate": 5.3768442859314545e-05, "loss": 1.0815, "step": 114000 }, { "epoch": 1.5720151001625746, "grad_norm": 24.242382049560547, "learning_rate": 5.37660699013763e-05, "loss": 1.1044, "step": 114100 }, { "epoch": 1.5733928522223142, "grad_norm": 73.92936706542969, "learning_rate": 5.376369471200104e-05, "loss": 1.0414, "step": 114200 }, { "epoch": 1.5747706042820533, "grad_norm": 13.798851013183594, "learning_rate": 5.3761317291390545e-05, "loss": 1.2288, "step": 114300 }, { "epoch": 1.5761483563417928, "grad_norm": 23.716012954711914, "learning_rate": 5.3758937639746806e-05, "loss": 1.119, "step": 114400 }, { "epoch": 1.577526108401532, "grad_norm": 12.037857055664062, "learning_rate": 5.375655575727197e-05, "loss": 0.998, "step": 114500 }, { "epoch": 1.5789038604612715, "grad_norm": 8.871610641479492, "learning_rate": 5.375417164416839e-05, "loss": 1.0477, "step": 114600 }, { "epoch": 1.5802816125210106, "grad_norm": 18.3157958984375, "learning_rate": 5.375178530063862e-05, "loss": 1.0899, "step": 114700 }, { "epoch": 1.5816593645807502, "grad_norm": 24.479957580566406, "learning_rate": 5.374939672688538e-05, "loss": 1.0893, "step": 114800 }, { "epoch": 1.5830371166404893, "grad_norm": 15.456332206726074, "learning_rate": 5.3747005923111596e-05, "loss": 1.1074, "step": 114900 }, { "epoch": 1.5844148687002289, "grad_norm": 43.567195892333984, "learning_rate": 
5.3744612889520384e-05, "loss": 1.1602, "step": 115000 }, { "epoch": 1.585792620759968, "grad_norm": 2.886906623840332, "learning_rate": 5.374221762631504e-05, "loss": 1.1854, "step": 115100 }, { "epoch": 1.5871703728197075, "grad_norm": 8.213479995727539, "learning_rate": 5.373982013369905e-05, "loss": 1.1184, "step": 115200 }, { "epoch": 1.5885481248794466, "grad_norm": 26.33953094482422, "learning_rate": 5.3737420411876106e-05, "loss": 1.0642, "step": 115300 }, { "epoch": 1.5899258769391862, "grad_norm": 11.88366985321045, "learning_rate": 5.373501846105007e-05, "loss": 1.0355, "step": 115400 }, { "epoch": 1.5913036289989253, "grad_norm": 10.156526565551758, "learning_rate": 5.373261428142499e-05, "loss": 1.0554, "step": 115500 }, { "epoch": 1.5926813810586646, "grad_norm": 12.289494514465332, "learning_rate": 5.373020787320512e-05, "loss": 1.1031, "step": 115600 }, { "epoch": 1.594059133118404, "grad_norm": 3.1095759868621826, "learning_rate": 5.3727799236594904e-05, "loss": 1.0181, "step": 115700 }, { "epoch": 1.5954368851781433, "grad_norm": 21.690570831298828, "learning_rate": 5.372538837179896e-05, "loss": 1.0868, "step": 115800 }, { "epoch": 1.5968146372378826, "grad_norm": 8.749502182006836, "learning_rate": 5.372297527902211e-05, "loss": 1.1044, "step": 115900 }, { "epoch": 1.598192389297622, "grad_norm": 9.71464729309082, "learning_rate": 5.3720559958469344e-05, "loss": 0.9406, "step": 116000 }, { "epoch": 1.5995701413573613, "grad_norm": 58.29253005981445, "learning_rate": 5.3718142410345875e-05, "loss": 1.0254, "step": 116100 }, { "epoch": 1.6009478934171006, "grad_norm": 14.844298362731934, "learning_rate": 5.3715722634857074e-05, "loss": 1.0255, "step": 116200 }, { "epoch": 1.60232564547684, "grad_norm": 22.95594024658203, "learning_rate": 5.371330063220852e-05, "loss": 1.096, "step": 116300 }, { "epoch": 1.6037033975365793, "grad_norm": 25.867507934570312, "learning_rate": 5.371087640260597e-05, "loss": 1.0485, "step": 116400 }, { "epoch": 
1.6050811495963186, "grad_norm": 11.754130363464355, "learning_rate": 5.370844994625537e-05, "loss": 1.1064, "step": 116500 }, { "epoch": 1.606458901656058, "grad_norm": 11.510786056518555, "learning_rate": 5.3706021263362867e-05, "loss": 1.0876, "step": 116600 }, { "epoch": 1.6078366537157973, "grad_norm": 16.26561737060547, "learning_rate": 5.370359035413479e-05, "loss": 1.1881, "step": 116700 }, { "epoch": 1.6092144057755366, "grad_norm": 36.43238067626953, "learning_rate": 5.3701157218777656e-05, "loss": 1.0849, "step": 116800 }, { "epoch": 1.610592157835276, "grad_norm": 10.017860412597656, "learning_rate": 5.3698721857498174e-05, "loss": 0.9248, "step": 116900 }, { "epoch": 1.6119699098950153, "grad_norm": 10.625978469848633, "learning_rate": 5.369628427050324e-05, "loss": 0.9446, "step": 117000 }, { "epoch": 1.6133476619547547, "grad_norm": 38.3299560546875, "learning_rate": 5.369384445799993e-05, "loss": 1.1157, "step": 117100 }, { "epoch": 1.614725414014494, "grad_norm": 75.82661437988281, "learning_rate": 5.369142685158813e-05, "loss": 1.1497, "step": 117200 }, { "epoch": 1.6161031660742333, "grad_norm": 25.32996368408203, "learning_rate": 5.3688982610940014e-05, "loss": 1.0587, "step": 117300 }, { "epoch": 1.6174809181339727, "grad_norm": 36.150203704833984, "learning_rate": 5.3686536145403844e-05, "loss": 1.1681, "step": 117400 }, { "epoch": 1.618858670193712, "grad_norm": 26.849124908447266, "learning_rate": 5.368408745518745e-05, "loss": 1.1352, "step": 117500 }, { "epoch": 1.620236422253451, "grad_norm": 23.102895736694336, "learning_rate": 5.3681636540498876e-05, "loss": 1.0638, "step": 117600 }, { "epoch": 1.6216141743131907, "grad_norm": 12.821172714233398, "learning_rate": 5.367918340154633e-05, "loss": 1.0358, "step": 117700 }, { "epoch": 1.6229919263729298, "grad_norm": 26.357751846313477, "learning_rate": 5.367672803853823e-05, "loss": 1.0402, "step": 117800 }, { "epoch": 1.6243696784326693, "grad_norm": 52.07672119140625, "learning_rate": 
5.367427045168315e-05, "loss": 0.9626, "step": 117900 }, { "epoch": 1.6257474304924084, "grad_norm": 5.345720291137695, "learning_rate": 5.36718106411899e-05, "loss": 0.9762, "step": 118000 }, { "epoch": 1.627125182552148, "grad_norm": 18.012269973754883, "learning_rate": 5.366934860726744e-05, "loss": 0.9929, "step": 118100 }, { "epoch": 1.628502934611887, "grad_norm": 61.259674072265625, "learning_rate": 5.366688435012493e-05, "loss": 0.9906, "step": 118200 }, { "epoch": 1.6298806866716267, "grad_norm": 4.80804967880249, "learning_rate": 5.3664417869971725e-05, "loss": 1.0891, "step": 118300 }, { "epoch": 1.6312584387313658, "grad_norm": 80.92395782470703, "learning_rate": 5.366194916701737e-05, "loss": 1.0689, "step": 118400 }, { "epoch": 1.6326361907911053, "grad_norm": 11.650620460510254, "learning_rate": 5.3659478241471594e-05, "loss": 1.1024, "step": 118500 }, { "epoch": 1.6340139428508444, "grad_norm": 22.641496658325195, "learning_rate": 5.36570050935443e-05, "loss": 0.9952, "step": 118600 }, { "epoch": 1.635391694910584, "grad_norm": 6.600434303283691, "learning_rate": 5.365452972344561e-05, "loss": 0.9908, "step": 118700 }, { "epoch": 1.6367694469703231, "grad_norm": 20.471500396728516, "learning_rate": 5.365205213138582e-05, "loss": 0.9861, "step": 118800 }, { "epoch": 1.6381471990300627, "grad_norm": 2.438000440597534, "learning_rate": 5.3649572317575405e-05, "loss": 0.9061, "step": 118900 }, { "epoch": 1.6395249510898018, "grad_norm": 13.06838321685791, "learning_rate": 5.364709028222504e-05, "loss": 0.9864, "step": 119000 }, { "epoch": 1.6409027031495413, "grad_norm": 24.539138793945312, "learning_rate": 5.364460602554557e-05, "loss": 1.1174, "step": 119100 }, { "epoch": 1.6422804552092805, "grad_norm": 114.63029479980469, "learning_rate": 5.364211954774809e-05, "loss": 1.1937, "step": 119200 }, { "epoch": 1.64365820726902, "grad_norm": 13.547866821289062, "learning_rate": 5.36396308490438e-05, "loss": 1.0725, "step": 119300 }, { "epoch": 
1.6450359593287591, "grad_norm": 24.412593841552734, "learning_rate": 5.363713992964415e-05, "loss": 1.0889, "step": 119400 }, { "epoch": 1.6464137113884987, "grad_norm": 31.08279800415039, "learning_rate": 5.3634646789760736e-05, "loss": 1.1982, "step": 119500 }, { "epoch": 1.6477914634482378, "grad_norm": 27.614606857299805, "learning_rate": 5.363217639419657e-05, "loss": 1.1285, "step": 119600 }, { "epoch": 1.6491692155079773, "grad_norm": 27.000633239746094, "learning_rate": 5.3629678836180814e-05, "loss": 1.0992, "step": 119700 }, { "epoch": 1.6505469675677165, "grad_norm": 25.138166427612305, "learning_rate": 5.362717905831515e-05, "loss": 1.2039, "step": 119800 }, { "epoch": 1.6519247196274558, "grad_norm": 12.283367156982422, "learning_rate": 5.362467706081196e-05, "loss": 1.1113, "step": 119900 }, { "epoch": 1.6533024716871951, "grad_norm": 51.160743713378906, "learning_rate": 5.362217284388381e-05, "loss": 1.105, "step": 120000 }, { "epoch": 1.6546802237469345, "grad_norm": 13.717453956604004, "learning_rate": 5.3619666407743424e-05, "loss": 1.0964, "step": 120100 }, { "epoch": 1.6560579758066738, "grad_norm": 141.6107940673828, "learning_rate": 5.361715775260376e-05, "loss": 0.9988, "step": 120200 }, { "epoch": 1.6574357278664131, "grad_norm": 11.551924705505371, "learning_rate": 5.361464687867792e-05, "loss": 1.1018, "step": 120300 }, { "epoch": 1.6588134799261525, "grad_norm": 21.031797409057617, "learning_rate": 5.3612133786179225e-05, "loss": 1.0463, "step": 120400 }, { "epoch": 1.6601912319858918, "grad_norm": 8.770426750183105, "learning_rate": 5.360961847532118e-05, "loss": 1.0918, "step": 120500 }, { "epoch": 1.6615689840456311, "grad_norm": 22.356077194213867, "learning_rate": 5.360710094631748e-05, "loss": 1.0765, "step": 120600 }, { "epoch": 1.6629467361053705, "grad_norm": 18.905723571777344, "learning_rate": 5.360458119938198e-05, "loss": 1.1508, "step": 120700 }, { "epoch": 1.6643244881651098, "grad_norm": 361.44158935546875, 
"learning_rate": 5.360205923472876e-05, "loss": 1.005, "step": 120800 }, { "epoch": 1.6657022402248491, "grad_norm": 5.470854759216309, "learning_rate": 5.359953505257207e-05, "loss": 1.0896, "step": 120900 }, { "epoch": 1.6670799922845885, "grad_norm": 11.167045593261719, "learning_rate": 5.3597008653126354e-05, "loss": 0.9672, "step": 121000 }, { "epoch": 1.6684577443443278, "grad_norm": 4.910240650177002, "learning_rate": 5.3594480036606245e-05, "loss": 1.0398, "step": 121100 }, { "epoch": 1.6698354964040671, "grad_norm": 6.964974403381348, "learning_rate": 5.359194920322655e-05, "loss": 1.0586, "step": 121200 }, { "epoch": 1.6712132484638065, "grad_norm": 10.561227798461914, "learning_rate": 5.358941615320229e-05, "loss": 1.0649, "step": 121300 }, { "epoch": 1.6725910005235458, "grad_norm": 15.112284660339355, "learning_rate": 5.358688088674866e-05, "loss": 0.9604, "step": 121400 }, { "epoch": 1.6739687525832851, "grad_norm": 11.338651657104492, "learning_rate": 5.358434340408103e-05, "loss": 1.0518, "step": 121500 }, { "epoch": 1.6753465046430245, "grad_norm": 18.34324073791504, "learning_rate": 5.3581803705414985e-05, "loss": 0.9173, "step": 121600 }, { "epoch": 1.6767242567027638, "grad_norm": 18.65089988708496, "learning_rate": 5.357926179096629e-05, "loss": 1.0138, "step": 121700 }, { "epoch": 1.6781020087625032, "grad_norm": 10.005807876586914, "learning_rate": 5.357671766095088e-05, "loss": 1.0448, "step": 121800 }, { "epoch": 1.6794797608222425, "grad_norm": 17.847454071044922, "learning_rate": 5.3574171315584886e-05, "loss": 1.1733, "step": 121900 }, { "epoch": 1.6808575128819818, "grad_norm": 11.443880081176758, "learning_rate": 5.357164825165385e-05, "loss": 1.0959, "step": 122000 }, { "epoch": 1.682235264941721, "grad_norm": 6.307702541351318, "learning_rate": 5.3569097498383995e-05, "loss": 1.219, "step": 122100 }, { "epoch": 1.6836130170014605, "grad_norm": 27.30145835876465, "learning_rate": 5.356654453041093e-05, "loss": 1.1133, "step": 122200 
}, { "epoch": 1.6849907690611996, "grad_norm": 69.27091979980469, "learning_rate": 5.3564014910737146e-05, "loss": 1.1829, "step": 122300 }, { "epoch": 1.6863685211209392, "grad_norm": 87.97235870361328, "learning_rate": 5.3561457536150146e-05, "loss": 1.2058, "step": 122400 }, { "epoch": 1.6877462731806783, "grad_norm": 11.573745727539062, "learning_rate": 5.3558897947508997e-05, "loss": 1.0475, "step": 122500 }, { "epoch": 1.6891240252404178, "grad_norm": 137.28189086914062, "learning_rate": 5.3556336145031156e-05, "loss": 1.1262, "step": 122600 }, { "epoch": 1.690501777300157, "grad_norm": 11.57826042175293, "learning_rate": 5.3553772128934256e-05, "loss": 1.2299, "step": 122700 }, { "epoch": 1.6918795293598965, "grad_norm": 18.035369873046875, "learning_rate": 5.355120589943612e-05, "loss": 1.1575, "step": 122800 }, { "epoch": 1.6932572814196356, "grad_norm": 27.564599990844727, "learning_rate": 5.354863745675477e-05, "loss": 1.1184, "step": 122900 }, { "epoch": 1.6946350334793752, "grad_norm": 62.29301452636719, "learning_rate": 5.35460668011084e-05, "loss": 0.9375, "step": 123000 }, { "epoch": 1.6960127855391143, "grad_norm": 14.21219253540039, "learning_rate": 5.3543493932715406e-05, "loss": 1.122, "step": 123100 }, { "epoch": 1.6973905375988538, "grad_norm": 31.510623931884766, "learning_rate": 5.354091885179437e-05, "loss": 1.1473, "step": 123200 }, { "epoch": 1.698768289658593, "grad_norm": 22.348297119140625, "learning_rate": 5.3538341558564047e-05, "loss": 1.0537, "step": 123300 }, { "epoch": 1.7001460417183325, "grad_norm": 41.6342658996582, "learning_rate": 5.35357620532434e-05, "loss": 1.1125, "step": 123400 }, { "epoch": 1.7015237937780716, "grad_norm": 89.36843872070312, "learning_rate": 5.353318033605157e-05, "loss": 1.2083, "step": 123500 }, { "epoch": 1.7029015458378112, "grad_norm": 17.638071060180664, "learning_rate": 5.3530596407207885e-05, "loss": 1.0446, "step": 123600 }, { "epoch": 1.7042792978975503, "grad_norm": 15.610889434814453, 
"learning_rate": 5.3528010266931856e-05, "loss": 1.1434, "step": 123700 }, { "epoch": 1.7056570499572898, "grad_norm": 7.199135780334473, "learning_rate": 5.35254219154432e-05, "loss": 1.0682, "step": 123800 }, { "epoch": 1.707034802017029, "grad_norm": 5.713176727294922, "learning_rate": 5.3522831352961814e-05, "loss": 1.0126, "step": 123900 }, { "epoch": 1.7084125540767685, "grad_norm": 17.724994659423828, "learning_rate": 5.3520238579707764e-05, "loss": 1.0586, "step": 124000 }, { "epoch": 1.7097903061365076, "grad_norm": 28.14649772644043, "learning_rate": 5.3517643595901334e-05, "loss": 0.9933, "step": 124100 }, { "epoch": 1.711168058196247, "grad_norm": 6.085136413574219, "learning_rate": 5.351504640176297e-05, "loss": 1.0703, "step": 124200 }, { "epoch": 1.7125458102559863, "grad_norm": 26.125911712646484, "learning_rate": 5.351244699751333e-05, "loss": 1.1096, "step": 124300 }, { "epoch": 1.7139235623157256, "grad_norm": 16.116865158081055, "learning_rate": 5.350984538337323e-05, "loss": 1.1115, "step": 124400 }, { "epoch": 1.715301314375465, "grad_norm": 6.875175952911377, "learning_rate": 5.350724155956371e-05, "loss": 1.0288, "step": 124500 }, { "epoch": 1.7166790664352043, "grad_norm": 16.344148635864258, "learning_rate": 5.350463552630595e-05, "loss": 1.0952, "step": 124600 }, { "epoch": 1.7180568184949436, "grad_norm": 33.94709014892578, "learning_rate": 5.350202728382138e-05, "loss": 1.1, "step": 124700 }, { "epoch": 1.719434570554683, "grad_norm": 9.75704288482666, "learning_rate": 5.349941683233156e-05, "loss": 1.0786, "step": 124800 }, { "epoch": 1.7208123226144223, "grad_norm": 2.9066145420074463, "learning_rate": 5.349680417205827e-05, "loss": 1.0042, "step": 124900 }, { "epoch": 1.7221900746741616, "grad_norm": 5.697661399841309, "learning_rate": 5.349418930322347e-05, "loss": 1.0052, "step": 125000 }, { "epoch": 1.723567826733901, "grad_norm": 4.006572723388672, "learning_rate": 5.3491572226049287e-05, "loss": 0.9651, "step": 125100 }, { 
"epoch": 1.7249455787936403, "grad_norm": 16.873310089111328, "learning_rate": 5.348895294075809e-05, "loss": 0.9622, "step": 125200 }, { "epoch": 1.7263233308533796, "grad_norm": 16.663330078125, "learning_rate": 5.348633144757237e-05, "loss": 0.9845, "step": 125300 }, { "epoch": 1.727701082913119, "grad_norm": 13.301389694213867, "learning_rate": 5.3483707746714854e-05, "loss": 1.0242, "step": 125400 }, { "epoch": 1.7290788349728583, "grad_norm": 7.422813892364502, "learning_rate": 5.348108183840844e-05, "loss": 0.9887, "step": 125500 }, { "epoch": 1.7304565870325976, "grad_norm": 10.25568962097168, "learning_rate": 5.347845372287619e-05, "loss": 1.1022, "step": 125600 }, { "epoch": 1.731834339092337, "grad_norm": 15.564820289611816, "learning_rate": 5.3475823400341405e-05, "loss": 1.0533, "step": 125700 }, { "epoch": 1.7332120911520763, "grad_norm": 14.092329025268555, "learning_rate": 5.347319087102752e-05, "loss": 1.1312, "step": 125800 }, { "epoch": 1.7345898432118156, "grad_norm": 38.11343002319336, "learning_rate": 5.3470556135158204e-05, "loss": 1.0647, "step": 125900 }, { "epoch": 1.735967595271555, "grad_norm": 45.88645553588867, "learning_rate": 5.3467919192957265e-05, "loss": 1.0641, "step": 126000 }, { "epoch": 1.7373453473312943, "grad_norm": 7.421977996826172, "learning_rate": 5.3465280044648756e-05, "loss": 1.0337, "step": 126100 }, { "epoch": 1.7387230993910336, "grad_norm": 6.982689380645752, "learning_rate": 5.3462638690456856e-05, "loss": 1.0006, "step": 126200 }, { "epoch": 1.740100851450773, "grad_norm": 20.65532875061035, "learning_rate": 5.345999513060598e-05, "loss": 1.1318, "step": 126300 }, { "epoch": 1.741478603510512, "grad_norm": 19.071537017822266, "learning_rate": 5.345734936532071e-05, "loss": 1.1514, "step": 126400 }, { "epoch": 1.7428563555702516, "grad_norm": 23.1497802734375, "learning_rate": 5.345470139482581e-05, "loss": 1.0037, "step": 126500 }, { "epoch": 1.7442341076299908, "grad_norm": 13.743980407714844, "learning_rate": 
5.345205121934625e-05, "loss": 0.9893, "step": 126600 }, { "epoch": 1.7456118596897303, "grad_norm": 10.10053825378418, "learning_rate": 5.344939883910716e-05, "loss": 0.9411, "step": 126700 }, { "epoch": 1.7469896117494694, "grad_norm": 27.273420333862305, "learning_rate": 5.344674425433389e-05, "loss": 1.0426, "step": 126800 }, { "epoch": 1.748367363809209, "grad_norm": 8.651666641235352, "learning_rate": 5.344411404405336e-05, "loss": 1.0865, "step": 126900 }, { "epoch": 1.749745115868948, "grad_norm": 12.816594123840332, "learning_rate": 5.344145507292818e-05, "loss": 1.0971, "step": 127000 }, { "epoch": 1.7511228679286877, "grad_norm": 7.724693298339844, "learning_rate": 5.343879389794367e-05, "loss": 0.9634, "step": 127100 }, { "epoch": 1.7525006199884268, "grad_norm": 16.434913635253906, "learning_rate": 5.3436130519325905e-05, "loss": 1.1176, "step": 127200 }, { "epoch": 1.7538783720481663, "grad_norm": 92.48381042480469, "learning_rate": 5.343349160402755e-05, "loss": 1.1346, "step": 127300 }, { "epoch": 1.7552561241079054, "grad_norm": 37.74311065673828, "learning_rate": 5.343082384085298e-05, "loss": 1.0324, "step": 127400 }, { "epoch": 1.756633876167645, "grad_norm": 20.710325241088867, "learning_rate": 5.342815387472226e-05, "loss": 1.0489, "step": 127500 }, { "epoch": 1.758011628227384, "grad_norm": 101.49832916259766, "learning_rate": 5.342548170586223e-05, "loss": 1.157, "step": 127600 }, { "epoch": 1.7593893802871237, "grad_norm": 13.25818920135498, "learning_rate": 5.342280733449989e-05, "loss": 1.2041, "step": 127700 }, { "epoch": 1.7607671323468628, "grad_norm": 9.916483879089355, "learning_rate": 5.3420130760862445e-05, "loss": 1.0011, "step": 127800 }, { "epoch": 1.7621448844066023, "grad_norm": 31.11695098876953, "learning_rate": 5.341745198517729e-05, "loss": 0.9658, "step": 127900 }, { "epoch": 1.7635226364663414, "grad_norm": 67.74144744873047, "learning_rate": 5.3414771007671994e-05, "loss": 0.9779, "step": 128000 }, { "epoch": 
1.764900388526081, "grad_norm": 11.926155090332031, "learning_rate": 5.341208782857433e-05, "loss": 1.1963, "step": 128100 }, { "epoch": 1.7662781405858201, "grad_norm": 54.3149528503418, "learning_rate": 5.3409402448112226e-05, "loss": 1.1097, "step": 128200 }, { "epoch": 1.7676558926455597, "grad_norm": 35.44221878051758, "learning_rate": 5.340671486651384e-05, "loss": 1.1201, "step": 128300 }, { "epoch": 1.7690336447052988, "grad_norm": 16.38726234436035, "learning_rate": 5.340402508400749e-05, "loss": 1.0558, "step": 128400 }, { "epoch": 1.7704113967650381, "grad_norm": 12.363265037536621, "learning_rate": 5.340133310082168e-05, "loss": 1.2323, "step": 128500 }, { "epoch": 1.7717891488247774, "grad_norm": 15.031146049499512, "learning_rate": 5.339863891718511e-05, "loss": 1.1629, "step": 128600 }, { "epoch": 1.7731669008845168, "grad_norm": 24.993499755859375, "learning_rate": 5.3395942533326675e-05, "loss": 1.1863, "step": 128700 }, { "epoch": 1.7745446529442561, "grad_norm": 12.42155647277832, "learning_rate": 5.3393243949475436e-05, "loss": 1.0377, "step": 128800 }, { "epoch": 1.7759224050039955, "grad_norm": 28.53933334350586, "learning_rate": 5.339054316586065e-05, "loss": 1.0024, "step": 128900 }, { "epoch": 1.7773001570637348, "grad_norm": 21.87401008605957, "learning_rate": 5.338784018271177e-05, "loss": 1.0078, "step": 129000 }, { "epoch": 1.7786779091234741, "grad_norm": 10.402198791503906, "learning_rate": 5.338513500025843e-05, "loss": 1.0056, "step": 129100 }, { "epoch": 1.7800556611832135, "grad_norm": 10.986882209777832, "learning_rate": 5.338242761873044e-05, "loss": 1.1049, "step": 129200 }, { "epoch": 1.7814334132429528, "grad_norm": 29.090335845947266, "learning_rate": 5.3379718038357815e-05, "loss": 1.0349, "step": 129300 }, { "epoch": 1.7828111653026921, "grad_norm": 11.125770568847656, "learning_rate": 5.337700625937074e-05, "loss": 0.9962, "step": 129400 }, { "epoch": 1.7841889173624315, "grad_norm": 8.75442886352539, "learning_rate": 
5.3374292281999596e-05, "loss": 1.1801, "step": 129500 }, { "epoch": 1.7855666694221708, "grad_norm": 51.815059661865234, "learning_rate": 5.337157610647495e-05, "loss": 1.1249, "step": 129600 }, { "epoch": 1.7869444214819101, "grad_norm": 41.39480972290039, "learning_rate": 5.3368857733027556e-05, "loss": 1.0997, "step": 129700 }, { "epoch": 1.7883221735416495, "grad_norm": 14.78404426574707, "learning_rate": 5.336613716188836e-05, "loss": 1.0338, "step": 129800 }, { "epoch": 1.7896999256013888, "grad_norm": 15.128174781799316, "learning_rate": 5.3363414393288466e-05, "loss": 1.0623, "step": 129900 }, { "epoch": 1.7910776776611281, "grad_norm": 16.106595993041992, "learning_rate": 5.336068942745922e-05, "loss": 1.087, "step": 130000 }, { "epoch": 1.7924554297208675, "grad_norm": 22.07122039794922, "learning_rate": 5.3357962264632096e-05, "loss": 1.1794, "step": 130100 }, { "epoch": 1.7938331817806068, "grad_norm": 15.329657554626465, "learning_rate": 5.335523290503879e-05, "loss": 1.039, "step": 130200 }, { "epoch": 1.7952109338403461, "grad_norm": 12.507319450378418, "learning_rate": 5.335250134891117e-05, "loss": 1.0841, "step": 130300 }, { "epoch": 1.7965886859000855, "grad_norm": 15.831151962280273, "learning_rate": 5.334976759648131e-05, "loss": 1.1097, "step": 130400 }, { "epoch": 1.7979664379598248, "grad_norm": 10.346972465515137, "learning_rate": 5.3347031647981446e-05, "loss": 1.0386, "step": 130500 }, { "epoch": 1.7993441900195641, "grad_norm": 30.961881637573242, "learning_rate": 5.3344293503644e-05, "loss": 1.1335, "step": 130600 }, { "epoch": 1.8007219420793033, "grad_norm": 19.25282096862793, "learning_rate": 5.33415531637016e-05, "loss": 1.191, "step": 130700 }, { "epoch": 1.8020996941390428, "grad_norm": 15.14389705657959, "learning_rate": 5.333881062838707e-05, "loss": 1.1822, "step": 130800 }, { "epoch": 1.803477446198782, "grad_norm": 68.57266235351562, "learning_rate": 5.333606589793338e-05, "loss": 1.0386, "step": 130900 }, { "epoch": 
1.8048551982585215, "grad_norm": 9.26943302154541, "learning_rate": 5.3333318972573715e-05, "loss": 1.0345, "step": 131000 }, { "epoch": 1.8062329503182606, "grad_norm": 20.253746032714844, "learning_rate": 5.3330569852541435e-05, "loss": 1.0864, "step": 131100 }, { "epoch": 1.8076107023780001, "grad_norm": 8.966611862182617, "learning_rate": 5.332781853807011e-05, "loss": 1.0157, "step": 131200 }, { "epoch": 1.8089884544377393, "grad_norm": 8.893041610717773, "learning_rate": 5.332506502939346e-05, "loss": 0.9388, "step": 131300 }, { "epoch": 1.8103662064974788, "grad_norm": 7.839788913726807, "learning_rate": 5.332230932674542e-05, "loss": 1.0934, "step": 131400 }, { "epoch": 1.811743958557218, "grad_norm": 13.025264739990234, "learning_rate": 5.331955143036009e-05, "loss": 1.0612, "step": 131500 }, { "epoch": 1.8131217106169575, "grad_norm": 15.265260696411133, "learning_rate": 5.331679134047177e-05, "loss": 1.0583, "step": 131600 }, { "epoch": 1.8144994626766966, "grad_norm": 8.136308670043945, "learning_rate": 5.3314029057314956e-05, "loss": 1.0168, "step": 131700 }, { "epoch": 1.8158772147364362, "grad_norm": 21.833972930908203, "learning_rate": 5.33112645811243e-05, "loss": 1.0654, "step": 131800 }, { "epoch": 1.8172549667961753, "grad_norm": 28.135250091552734, "learning_rate": 5.330849791213468e-05, "loss": 1.0604, "step": 131900 }, { "epoch": 1.8186327188559148, "grad_norm": 29.88810157775879, "learning_rate": 5.330572905058112e-05, "loss": 0.9664, "step": 132000 }, { "epoch": 1.820010470915654, "grad_norm": 21.69635581970215, "learning_rate": 5.330295799669886e-05, "loss": 1.0365, "step": 132100 }, { "epoch": 1.8213882229753935, "grad_norm": 9.101287841796875, "learning_rate": 5.3300184750723314e-05, "loss": 0.8906, "step": 132200 }, { "epoch": 1.8227659750351326, "grad_norm": 60.037200927734375, "learning_rate": 5.3297409312890075e-05, "loss": 1.102, "step": 132300 }, { "epoch": 1.8241437270948722, "grad_norm": 5.7496442794799805, "learning_rate": 
5.329463168343494e-05, "loss": 0.9268, "step": 132400 }, { "epoch": 1.8255214791546113, "grad_norm": 18.774789810180664, "learning_rate": 5.3291851862593874e-05, "loss": 1.0282, "step": 132500 }, { "epoch": 1.8268992312143508, "grad_norm": 2.4276559352874756, "learning_rate": 5.328906985060305e-05, "loss": 0.941, "step": 132600 }, { "epoch": 1.82827698327409, "grad_norm": 11.795206069946289, "learning_rate": 5.32862856476988e-05, "loss": 1.0247, "step": 132700 }, { "epoch": 1.8296547353338293, "grad_norm": 12.539592742919922, "learning_rate": 5.328349925411766e-05, "loss": 0.9362, "step": 132800 }, { "epoch": 1.8310324873935686, "grad_norm": 11.351883888244629, "learning_rate": 5.328071067009636e-05, "loss": 1.0445, "step": 132900 }, { "epoch": 1.832410239453308, "grad_norm": 6.156078338623047, "learning_rate": 5.327791989587179e-05, "loss": 0.9723, "step": 133000 }, { "epoch": 1.8337879915130473, "grad_norm": 7.734241008758545, "learning_rate": 5.327512693168104e-05, "loss": 0.9273, "step": 133100 }, { "epoch": 1.8351657435727866, "grad_norm": 12.243995666503906, "learning_rate": 5.32723317777614e-05, "loss": 1.0539, "step": 133200 }, { "epoch": 1.836543495632526, "grad_norm": 12.704156875610352, "learning_rate": 5.326953443435032e-05, "loss": 0.9992, "step": 133300 }, { "epoch": 1.8379212476922653, "grad_norm": 10.600850105285645, "learning_rate": 5.3266734901685454e-05, "loss": 1.1213, "step": 133400 }, { "epoch": 1.8392989997520046, "grad_norm": 19.322465896606445, "learning_rate": 5.326393318000464e-05, "loss": 1.1197, "step": 133500 }, { "epoch": 1.840676751811744, "grad_norm": 16.50214195251465, "learning_rate": 5.32611292695459e-05, "loss": 0.8972, "step": 133600 }, { "epoch": 1.8420545038714833, "grad_norm": 10.501808166503906, "learning_rate": 5.325832317054742e-05, "loss": 1.0425, "step": 133700 }, { "epoch": 1.8434322559312226, "grad_norm": 29.4556941986084, "learning_rate": 5.325551488324762e-05, "loss": 0.9163, "step": 133800 }, { "epoch": 
1.844810007990962, "grad_norm": 4.9216485023498535, "learning_rate": 5.3252704407885074e-05, "loss": 0.9956, "step": 133900 }, { "epoch": 1.8461877600507013, "grad_norm": 12.077922821044922, "learning_rate": 5.324989174469852e-05, "loss": 0.9936, "step": 134000 }, { "epoch": 1.8475655121104406, "grad_norm": 27.104740142822266, "learning_rate": 5.3247105053262415e-05, "loss": 1.1411, "step": 134100 }, { "epoch": 1.84894326417018, "grad_norm": 9.051424026489258, "learning_rate": 5.32442880370172e-05, "loss": 0.9995, "step": 134200 }, { "epoch": 1.8503210162299193, "grad_norm": 7.275717735290527, "learning_rate": 5.324146883366301e-05, "loss": 1.1227, "step": 134300 }, { "epoch": 1.8516987682896586, "grad_norm": 10.74730396270752, "learning_rate": 5.323864744343936e-05, "loss": 0.9496, "step": 134400 }, { "epoch": 1.853076520349398, "grad_norm": 32.46671676635742, "learning_rate": 5.323582386658592e-05, "loss": 0.9779, "step": 134500 }, { "epoch": 1.8544542724091373, "grad_norm": 6.33997917175293, "learning_rate": 5.323299810334257e-05, "loss": 0.948, "step": 134600 }, { "epoch": 1.8558320244688766, "grad_norm": 7.527612209320068, "learning_rate": 5.32301701539494e-05, "loss": 0.9229, "step": 134700 }, { "epoch": 1.857209776528616, "grad_norm": 13.82846450805664, "learning_rate": 5.322734001864664e-05, "loss": 0.9606, "step": 134800 }, { "epoch": 1.8585875285883553, "grad_norm": 28.552188873291016, "learning_rate": 5.322450769767472e-05, "loss": 1.0237, "step": 134900 }, { "epoch": 1.8599652806480944, "grad_norm": 6.158682346343994, "learning_rate": 5.3221673191274255e-05, "loss": 1.0853, "step": 135000 }, { "epoch": 1.861343032707834, "grad_norm": 28.212495803833008, "learning_rate": 5.321883649968608e-05, "loss": 0.9924, "step": 135100 }, { "epoch": 1.862720784767573, "grad_norm": 9.977272987365723, "learning_rate": 5.321599762315116e-05, "loss": 0.9859, "step": 135200 }, { "epoch": 1.8640985368273126, "grad_norm": 8.814349174499512, "learning_rate": 
5.321315656191067e-05, "loss": 0.876, "step": 135300 }, { "epoch": 1.8654762888870517, "grad_norm": 11.563972473144531, "learning_rate": 5.3210313316205995e-05, "loss": 0.9478, "step": 135400 }, { "epoch": 1.8668540409467913, "grad_norm": 6.570127964019775, "learning_rate": 5.320746788627867e-05, "loss": 0.9925, "step": 135500 }, { "epoch": 1.8682317930065304, "grad_norm": 14.85019302368164, "learning_rate": 5.320462027237043e-05, "loss": 0.9663, "step": 135600 }, { "epoch": 1.86960954506627, "grad_norm": 10.365165710449219, "learning_rate": 5.320177047472319e-05, "loss": 1.0398, "step": 135700 }, { "epoch": 1.870987297126009, "grad_norm": 17.870952606201172, "learning_rate": 5.3198918493579054e-05, "loss": 1.001, "step": 135800 }, { "epoch": 1.8723650491857486, "grad_norm": 22.607162475585938, "learning_rate": 5.3196064329180326e-05, "loss": 1.0497, "step": 135900 }, { "epoch": 1.8737428012454878, "grad_norm": 4.769906044006348, "learning_rate": 5.3193207981769465e-05, "loss": 1.0029, "step": 136000 }, { "epoch": 1.8751205533052273, "grad_norm": 22.761455535888672, "learning_rate": 5.319034945158914e-05, "loss": 0.9172, "step": 136100 }, { "epoch": 1.8764983053649664, "grad_norm": 20.56629180908203, "learning_rate": 5.318748873888221e-05, "loss": 0.9593, "step": 136200 }, { "epoch": 1.877876057424706, "grad_norm": 14.103052139282227, "learning_rate": 5.3184654483643094e-05, "loss": 1.12, "step": 136300 }, { "epoch": 1.879253809484445, "grad_norm": 5.662879467010498, "learning_rate": 5.318178942843141e-05, "loss": 1.0458, "step": 136400 }, { "epoch": 1.8806315615441846, "grad_norm": 28.704423904418945, "learning_rate": 5.3178922191420325e-05, "loss": 1.041, "step": 136500 }, { "epoch": 1.8820093136039238, "grad_norm": 12.396733283996582, "learning_rate": 5.3176052772853416e-05, "loss": 0.9739, "step": 136600 }, { "epoch": 1.8833870656636633, "grad_norm": 7.1563615798950195, "learning_rate": 5.317318117297447e-05, "loss": 1.0313, "step": 136700 }, { "epoch": 
1.8847648177234024, "grad_norm": 16.226806640625, "learning_rate": 5.317030739202745e-05, "loss": 1.0649, "step": 136800 }, { "epoch": 1.886142569783142, "grad_norm": 44.95390319824219, "learning_rate": 5.3167431430256484e-05, "loss": 1.0083, "step": 136900 }, { "epoch": 1.887520321842881, "grad_norm": 9.811665534973145, "learning_rate": 5.31645532879059e-05, "loss": 0.9397, "step": 137000 }, { "epoch": 1.8888980739026204, "grad_norm": 8.302355766296387, "learning_rate": 5.3161672965220227e-05, "loss": 1.0824, "step": 137100 }, { "epoch": 1.8902758259623598, "grad_norm": 61.96098327636719, "learning_rate": 5.315879046244414e-05, "loss": 0.9977, "step": 137200 }, { "epoch": 1.891653578022099, "grad_norm": 82.41322326660156, "learning_rate": 5.3155905779822534e-05, "loss": 0.9977, "step": 137300 }, { "epoch": 1.8930313300818384, "grad_norm": 16.206649780273438, "learning_rate": 5.3153018917600475e-05, "loss": 1.0369, "step": 137400 }, { "epoch": 1.8944090821415778, "grad_norm": 12.784920692443848, "learning_rate": 5.315012987602322e-05, "loss": 0.9264, "step": 137500 }, { "epoch": 1.895786834201317, "grad_norm": 41.18064880371094, "learning_rate": 5.3147238655336214e-05, "loss": 1.0255, "step": 137600 }, { "epoch": 1.8971645862610564, "grad_norm": 6.60025691986084, "learning_rate": 5.314434525578507e-05, "loss": 1.0017, "step": 137700 }, { "epoch": 1.8985423383207958, "grad_norm": 25.215187072753906, "learning_rate": 5.3141449677615604e-05, "loss": 1.0696, "step": 137800 }, { "epoch": 1.899920090380535, "grad_norm": 7.089141368865967, "learning_rate": 5.3138551921073804e-05, "loss": 0.9741, "step": 137900 }, { "epoch": 1.9012978424402744, "grad_norm": 12.382874488830566, "learning_rate": 5.313565198640585e-05, "loss": 0.9946, "step": 138000 }, { "epoch": 1.9026755945000138, "grad_norm": 8.173346519470215, "learning_rate": 5.3132749873858116e-05, "loss": 0.9643, "step": 138100 }, { "epoch": 1.9040533465597531, "grad_norm": 8.598787307739258, "learning_rate": 
5.312984558367713e-05, "loss": 0.9642, "step": 138200 }, { "epoch": 1.9054310986194924, "grad_norm": 13.2457914352417, "learning_rate": 5.312693911610966e-05, "loss": 1.0556, "step": 138300 }, { "epoch": 1.9068088506792318, "grad_norm": 9.43970775604248, "learning_rate": 5.312403047140259e-05, "loss": 0.9242, "step": 138400 }, { "epoch": 1.9081866027389711, "grad_norm": 3.202592134475708, "learning_rate": 5.312111964980305e-05, "loss": 1.0047, "step": 138500 }, { "epoch": 1.9095643547987105, "grad_norm": 8.235738754272461, "learning_rate": 5.3118206651558315e-05, "loss": 0.9901, "step": 138600 }, { "epoch": 1.9109421068584498, "grad_norm": 6.716213703155518, "learning_rate": 5.3115291476915855e-05, "loss": 0.943, "step": 138700 }, { "epoch": 1.9123198589181891, "grad_norm": 11.197039604187012, "learning_rate": 5.311237412612335e-05, "loss": 1.0723, "step": 138800 }, { "epoch": 1.9136976109779285, "grad_norm": 22.365943908691406, "learning_rate": 5.3109454599428626e-05, "loss": 1.1182, "step": 138900 }, { "epoch": 1.9150753630376678, "grad_norm": 8.819984436035156, "learning_rate": 5.310653289707971e-05, "loss": 1.058, "step": 139000 }, { "epoch": 1.9164531150974071, "grad_norm": 8.569857597351074, "learning_rate": 5.310360901932483e-05, "loss": 1.074, "step": 139100 }, { "epoch": 1.9178308671571465, "grad_norm": 14.33634090423584, "learning_rate": 5.310068296641237e-05, "loss": 1.1774, "step": 139200 }, { "epoch": 1.9192086192168856, "grad_norm": 4.235088348388672, "learning_rate": 5.309775473859092e-05, "loss": 0.9613, "step": 139300 }, { "epoch": 1.9205863712766251, "grad_norm": 22.20789337158203, "learning_rate": 5.309482433610926e-05, "loss": 0.9929, "step": 139400 }, { "epoch": 1.9219641233363642, "grad_norm": 4.4684343338012695, "learning_rate": 5.3091891759216326e-05, "loss": 0.9585, "step": 139500 }, { "epoch": 1.9233418753961038, "grad_norm": 16.434553146362305, "learning_rate": 5.308895700816125e-05, "loss": 0.9839, "step": 139600 }, { "epoch": 
1.924719627455843, "grad_norm": 35.85634994506836, "learning_rate": 5.30860494632031e-05, "loss": 1.0199, "step": 139700 }, { "epoch": 1.9260973795155825, "grad_norm": 18.60834312438965, "learning_rate": 5.3083139787835095e-05, "loss": 0.9354, "step": 139800 }, { "epoch": 1.9274751315753216, "grad_norm": 9.364298820495605, "learning_rate": 5.308019855925613e-05, "loss": 1.017, "step": 139900 }, { "epoch": 1.9288528836350611, "grad_norm": 38.099571228027344, "learning_rate": 5.3077255157508424e-05, "loss": 0.9935, "step": 140000 }, { "epoch": 1.9302306356948002, "grad_norm": 39.307682037353516, "learning_rate": 5.3074309582842035e-05, "loss": 1.0742, "step": 140100 }, { "epoch": 1.9316083877545398, "grad_norm": 70.52771759033203, "learning_rate": 5.3071361835507216e-05, "loss": 0.9509, "step": 140200 }, { "epoch": 1.932986139814279, "grad_norm": 120.77656555175781, "learning_rate": 5.306841191575439e-05, "loss": 1.0197, "step": 140300 }, { "epoch": 1.9343638918740185, "grad_norm": 51.007225036621094, "learning_rate": 5.3065459823834155e-05, "loss": 1.0303, "step": 140400 }, { "epoch": 1.9357416439337576, "grad_norm": 16.969768524169922, "learning_rate": 5.306250555999732e-05, "loss": 0.952, "step": 140500 }, { "epoch": 1.9371193959934971, "grad_norm": 11.94832706451416, "learning_rate": 5.3059549124494866e-05, "loss": 1.0485, "step": 140600 }, { "epoch": 1.9384971480532363, "grad_norm": 20.453916549682617, "learning_rate": 5.3056590517577946e-05, "loss": 1.0441, "step": 140700 }, { "epoch": 1.9398749001129758, "grad_norm": 24.510923385620117, "learning_rate": 5.3053629739497916e-05, "loss": 0.955, "step": 140800 }, { "epoch": 1.941252652172715, "grad_norm": 21.415592193603516, "learning_rate": 5.3050666790506316e-05, "loss": 1.0676, "step": 140900 }, { "epoch": 1.9426304042324545, "grad_norm": 10.470038414001465, "learning_rate": 5.3047701670854844e-05, "loss": 1.0797, "step": 141000 }, { "epoch": 1.9440081562921936, "grad_norm": 5.278820037841797, "learning_rate": 
5.304473438079542e-05, "loss": 1.0034, "step": 141100 }, { "epoch": 1.9453859083519331, "grad_norm": 9.555508613586426, "learning_rate": 5.304176492058012e-05, "loss": 1.0018, "step": 141200 }, { "epoch": 1.9467636604116723, "grad_norm": 5.737916469573975, "learning_rate": 5.303879329046122e-05, "loss": 1.0342, "step": 141300 }, { "epoch": 1.9481414124714116, "grad_norm": 56.66264724731445, "learning_rate": 5.303581949069118e-05, "loss": 1.1203, "step": 141400 }, { "epoch": 1.949519164531151, "grad_norm": 11.701165199279785, "learning_rate": 5.303284352152263e-05, "loss": 1.1388, "step": 141500 }, { "epoch": 1.9508969165908903, "grad_norm": 16.850122451782227, "learning_rate": 5.30298653832084e-05, "loss": 1.1057, "step": 141600 }, { "epoch": 1.9522746686506296, "grad_norm": 5.651546001434326, "learning_rate": 5.302688507600149e-05, "loss": 1.0818, "step": 141700 }, { "epoch": 1.953652420710369, "grad_norm": 84.90093994140625, "learning_rate": 5.302390260015511e-05, "loss": 1.0933, "step": 141800 }, { "epoch": 1.9550301727701083, "grad_norm": 55.46168518066406, "learning_rate": 5.3020917955922616e-05, "loss": 1.1006, "step": 141900 }, { "epoch": 1.9564079248298476, "grad_norm": 56.97056579589844, "learning_rate": 5.301793114355758e-05, "loss": 1.2585, "step": 142000 }, { "epoch": 1.957785676889587, "grad_norm": 45.9921760559082, "learning_rate": 5.301494216331374e-05, "loss": 1.4076, "step": 142100 }, { "epoch": 1.9591634289493263, "grad_norm": 47.68534851074219, "learning_rate": 5.301198093765263e-05, "loss": 1.289, "step": 142200 }, { "epoch": 1.9605411810090656, "grad_norm": 14.917179107666016, "learning_rate": 5.300898764408561e-05, "loss": 1.1821, "step": 142300 }, { "epoch": 1.961918933068805, "grad_norm": 24.069034576416016, "learning_rate": 5.300602214873286e-05, "loss": 1.2142, "step": 142400 }, { "epoch": 1.9632966851285443, "grad_norm": 19.97752571105957, "learning_rate": 5.300302454284971e-05, "loss": 1.2426, "step": 142500 }, { "epoch": 
1.9646744371882836, "grad_norm": 19.85276985168457, "learning_rate": 5.300002477035414e-05, "loss": 1.1953, "step": 142600 }, { "epoch": 1.966052189248023, "grad_norm": 16.321653366088867, "learning_rate": 5.2997022831501014e-05, "loss": 1.0981, "step": 142700 }, { "epoch": 1.9674299413077623, "grad_norm": 80.24627685546875, "learning_rate": 5.2994018726545354e-05, "loss": 1.161, "step": 142800 }, { "epoch": 1.9688076933675016, "grad_norm": 6.758548259735107, "learning_rate": 5.299101245574238e-05, "loss": 1.1315, "step": 142900 }, { "epoch": 1.970185445427241, "grad_norm": 53.158634185791016, "learning_rate": 5.2988004019347485e-05, "loss": 1.2088, "step": 143000 }, { "epoch": 1.9715631974869803, "grad_norm": 107.23058319091797, "learning_rate": 5.2984993417616255e-05, "loss": 1.2485, "step": 143100 }, { "epoch": 1.9729409495467196, "grad_norm": 9.508332252502441, "learning_rate": 5.298198065080446e-05, "loss": 0.9912, "step": 143200 }, { "epoch": 1.974318701606459, "grad_norm": 51.10231018066406, "learning_rate": 5.2978965719168025e-05, "loss": 1.0465, "step": 143300 }, { "epoch": 1.9756964536661983, "grad_norm": 4.199136257171631, "learning_rate": 5.2975948622963114e-05, "loss": 1.0531, "step": 143400 }, { "epoch": 1.9770742057259376, "grad_norm": 31.836280822753906, "learning_rate": 5.297292936244604e-05, "loss": 1.159, "step": 143500 }, { "epoch": 1.9784519577856767, "grad_norm": 37.550933837890625, "learning_rate": 5.296990793787328e-05, "loss": 1.1227, "step": 143600 }, { "epoch": 1.9798297098454163, "grad_norm": 2.2217354774475098, "learning_rate": 5.2966884349501555e-05, "loss": 1.1316, "step": 143700 }, { "epoch": 1.9812074619051554, "grad_norm": 26.14838218688965, "learning_rate": 5.296385859758771e-05, "loss": 0.9354, "step": 143800 }, { "epoch": 1.982585213964895, "grad_norm": 16.10267448425293, "learning_rate": 5.29608306823888e-05, "loss": 1.1742, "step": 143900 }, { "epoch": 1.983962966024634, "grad_norm": 27.14734649658203, "learning_rate": 
5.2957800604162075e-05, "loss": 0.9897, "step": 144000 }, { "epoch": 1.9853407180843736, "grad_norm": 9.895814895629883, "learning_rate": 5.2954768363164945e-05, "loss": 1.0951, "step": 144100 }, { "epoch": 1.9867184701441127, "grad_norm": 12.091023445129395, "learning_rate": 5.295173395965502e-05, "loss": 1.0686, "step": 144200 }, { "epoch": 1.9880962222038523, "grad_norm": 19.649566650390625, "learning_rate": 5.2948697393890075e-05, "loss": 1.0196, "step": 144300 }, { "epoch": 1.9894739742635914, "grad_norm": 8.195260047912598, "learning_rate": 5.29456586661281e-05, "loss": 1.155, "step": 144400 }, { "epoch": 1.990851726323331, "grad_norm": 167.79872131347656, "learning_rate": 5.294261777662724e-05, "loss": 1.1521, "step": 144500 }, { "epoch": 1.99222947838307, "grad_norm": 9.540665626525879, "learning_rate": 5.293957472564584e-05, "loss": 1.1728, "step": 144600 }, { "epoch": 1.9936072304428096, "grad_norm": 13.336929321289062, "learning_rate": 5.2936529513442414e-05, "loss": 0.9594, "step": 144700 }, { "epoch": 1.9949849825025487, "grad_norm": 5.253993511199951, "learning_rate": 5.293348214027568e-05, "loss": 0.9967, "step": 144800 }, { "epoch": 1.9963627345622883, "grad_norm": 244.40768432617188, "learning_rate": 5.293043260640451e-05, "loss": 1.0451, "step": 144900 }, { "epoch": 1.9977404866220274, "grad_norm": 9.982230186462402, "learning_rate": 5.2927380912088e-05, "loss": 1.0934, "step": 145000 }, { "epoch": 1.999118238681767, "grad_norm": 47.84312057495117, "learning_rate": 5.292432705758539e-05, "loss": 1.19, "step": 145100 }, { "epoch": 2.000495990741506, "grad_norm": 8.944329261779785, "learning_rate": 5.292127104315613e-05, "loss": 1.0664, "step": 145200 }, { "epoch": 2.0018737428012456, "grad_norm": 25.012645721435547, "learning_rate": 5.291821286905984e-05, "loss": 1.089, "step": 145300 }, { "epoch": 2.0032514948609847, "grad_norm": 7.168227195739746, "learning_rate": 5.291515253555632e-05, "loss": 1.0854, "step": 145400 }, { "epoch": 
2.0046292469207243, "grad_norm": 25.42429542541504, "learning_rate": 5.291209004290557e-05, "loss": 1.071, "step": 145500 }, { "epoch": 2.0060069989804634, "grad_norm": 16.521404266357422, "learning_rate": 5.290902539136777e-05, "loss": 1.0406, "step": 145600 }, { "epoch": 2.007384751040203, "grad_norm": 16.918079376220703, "learning_rate": 5.290595858120328e-05, "loss": 1.0292, "step": 145700 }, { "epoch": 2.008762503099942, "grad_norm": 5.9267144203186035, "learning_rate": 5.290288961267261e-05, "loss": 0.9708, "step": 145800 }, { "epoch": 2.0101402551596816, "grad_norm": 16.487794876098633, "learning_rate": 5.2899818486036525e-05, "loss": 1.0435, "step": 145900 }, { "epoch": 2.0115180072194208, "grad_norm": 11.98873233795166, "learning_rate": 5.289674520155591e-05, "loss": 0.9971, "step": 146000 }, { "epoch": 2.0128957592791603, "grad_norm": 14.290670394897461, "learning_rate": 5.289366975949187e-05, "loss": 1.1232, "step": 146100 }, { "epoch": 2.0142735113388994, "grad_norm": 17.280029296875, "learning_rate": 5.2890592160105656e-05, "loss": 0.9318, "step": 146200 }, { "epoch": 2.015651263398639, "grad_norm": 9.275042533874512, "learning_rate": 5.2887512403658746e-05, "loss": 1.021, "step": 146300 }, { "epoch": 2.017029015458378, "grad_norm": 35.25020980834961, "learning_rate": 5.288443049041277e-05, "loss": 1.0714, "step": 146400 }, { "epoch": 2.0184067675181177, "grad_norm": 8.693073272705078, "learning_rate": 5.2881346420629566e-05, "loss": 1.101, "step": 146500 }, { "epoch": 2.0197845195778568, "grad_norm": 32.61711120605469, "learning_rate": 5.2878260194571134e-05, "loss": 1.0787, "step": 146600 }, { "epoch": 2.0211622716375963, "grad_norm": 15.949615478515625, "learning_rate": 5.2875171812499664e-05, "loss": 1.0578, "step": 146700 }, { "epoch": 2.0225400236973354, "grad_norm": 11.64084243774414, "learning_rate": 5.2872081274677524e-05, "loss": 1.0368, "step": 146800 }, { "epoch": 2.0239177757570745, "grad_norm": 29.57013511657715, "learning_rate": 
5.286898858136728e-05, "loss": 1.0844, "step": 146900 }, { "epoch": 2.025295527816814, "grad_norm": 17.013378143310547, "learning_rate": 5.2865893732831675e-05, "loss": 0.997, "step": 147000 }, { "epoch": 2.026673279876553, "grad_norm": 3.7932121753692627, "learning_rate": 5.286279672933362e-05, "loss": 1.1071, "step": 147100 }, { "epoch": 2.0280510319362928, "grad_norm": 3.597313165664673, "learning_rate": 5.285969757113623e-05, "loss": 0.9796, "step": 147200 }, { "epoch": 2.029428783996032, "grad_norm": 3.0874714851379395, "learning_rate": 5.285659625850279e-05, "loss": 1.0383, "step": 147300 }, { "epoch": 2.0308065360557714, "grad_norm": 110.7660140991211, "learning_rate": 5.285349279169677e-05, "loss": 1.0886, "step": 147400 }, { "epoch": 2.0321842881155106, "grad_norm": 34.73955154418945, "learning_rate": 5.2850387170981836e-05, "loss": 0.9866, "step": 147500 }, { "epoch": 2.03356204017525, "grad_norm": 11.254148483276367, "learning_rate": 5.284727939662182e-05, "loss": 1.0124, "step": 147600 }, { "epoch": 2.034939792234989, "grad_norm": 40.052974700927734, "learning_rate": 5.2844169468880746e-05, "loss": 1.0453, "step": 147700 }, { "epoch": 2.0363175442947288, "grad_norm": 13.228699684143066, "learning_rate": 5.2841057388022805e-05, "loss": 1.0064, "step": 147800 }, { "epoch": 2.037695296354468, "grad_norm": 7.61269474029541, "learning_rate": 5.283794315431241e-05, "loss": 1.0142, "step": 147900 }, { "epoch": 2.0390730484142074, "grad_norm": 22.158586502075195, "learning_rate": 5.2834826768014106e-05, "loss": 0.9403, "step": 148000 }, { "epoch": 2.0404508004739466, "grad_norm": 65.23225402832031, "learning_rate": 5.2831708229392656e-05, "loss": 1.0094, "step": 148100 }, { "epoch": 2.041828552533686, "grad_norm": 33.195255279541016, "learning_rate": 5.2828587538713004e-05, "loss": 0.9455, "step": 148200 }, { "epoch": 2.0432063045934252, "grad_norm": 27.402624130249023, "learning_rate": 5.282546469624025e-05, "loss": 0.9618, "step": 148300 }, { "epoch": 
2.044584056653165, "grad_norm": 11.631880760192871, "learning_rate": 5.282233970223971e-05, "loss": 1.0161, "step": 148400 }, { "epoch": 2.045961808712904, "grad_norm": 9.41140079498291, "learning_rate": 5.2819212556976865e-05, "loss": 1.1504, "step": 148500 }, { "epoch": 2.0473395607726435, "grad_norm": 22.59799575805664, "learning_rate": 5.2816083260717376e-05, "loss": 0.8834, "step": 148600 }, { "epoch": 2.0487173128323826, "grad_norm": 212.4950408935547, "learning_rate": 5.281298313884226e-05, "loss": 1.1245, "step": 148700 }, { "epoch": 2.050095064892122, "grad_norm": 13.342792510986328, "learning_rate": 5.280984956289055e-05, "loss": 1.0093, "step": 148800 }, { "epoch": 2.0514728169518612, "grad_norm": 23.59154510498047, "learning_rate": 5.280671383673765e-05, "loss": 0.9521, "step": 148900 }, { "epoch": 2.052850569011601, "grad_norm": 12.94513988494873, "learning_rate": 5.280357596064993e-05, "loss": 1.1052, "step": 149000 }, { "epoch": 2.05422832107134, "grad_norm": 15.56832218170166, "learning_rate": 5.280043593489399e-05, "loss": 1.0303, "step": 149100 }, { "epoch": 2.0556060731310795, "grad_norm": 12.582783699035645, "learning_rate": 5.279729375973658e-05, "loss": 0.8779, "step": 149200 }, { "epoch": 2.0569838251908186, "grad_norm": 10.843000411987305, "learning_rate": 5.279414943544464e-05, "loss": 0.9863, "step": 149300 }, { "epoch": 2.058361577250558, "grad_norm": 13.15982437133789, "learning_rate": 5.27910029622853e-05, "loss": 1.023, "step": 149400 }, { "epoch": 2.0597393293102972, "grad_norm": 10.263442039489746, "learning_rate": 5.278785434052588e-05, "loss": 0.978, "step": 149500 }, { "epoch": 2.061117081370037, "grad_norm": 14.632909774780273, "learning_rate": 5.278470357043385e-05, "loss": 0.9385, "step": 149600 }, { "epoch": 2.062494833429776, "grad_norm": 15.738605499267578, "learning_rate": 5.2781550652276904e-05, "loss": 1.0193, "step": 149700 }, { "epoch": 2.0638725854895155, "grad_norm": 8.371048927307129, "learning_rate": 
5.277839558632289e-05, "loss": 0.919, "step": 149800 }, { "epoch": 2.0652503375492546, "grad_norm": 19.53658103942871, "learning_rate": 5.277523837283985e-05, "loss": 1.0999, "step": 149900 }, { "epoch": 2.066628089608994, "grad_norm": 5.926362991333008, "learning_rate": 5.2772079012095995e-05, "loss": 1.0423, "step": 150000 }, { "epoch": 2.0680058416687332, "grad_norm": 72.1875991821289, "learning_rate": 5.276891750435974e-05, "loss": 1.0406, "step": 150100 }, { "epoch": 2.069383593728473, "grad_norm": 4.526686668395996, "learning_rate": 5.2765753849899666e-05, "loss": 0.8817, "step": 150200 }, { "epoch": 2.070761345788212, "grad_norm": 23.28188705444336, "learning_rate": 5.2762588048984555e-05, "loss": 0.8766, "step": 150300 }, { "epoch": 2.0721390978479515, "grad_norm": 5.578934669494629, "learning_rate": 5.2759420101883335e-05, "loss": 0.8567, "step": 150400 }, { "epoch": 2.0735168499076906, "grad_norm": 23.463237762451172, "learning_rate": 5.275625000886516e-05, "loss": 0.9686, "step": 150500 }, { "epoch": 2.07489460196743, "grad_norm": 10.182479858398438, "learning_rate": 5.2753077770199326e-05, "loss": 0.8766, "step": 150600 }, { "epoch": 2.0762723540271693, "grad_norm": 16.524850845336914, "learning_rate": 5.274993514061452e-05, "loss": 0.9731, "step": 150700 }, { "epoch": 2.077650106086909, "grad_norm": 7.713785171508789, "learning_rate": 5.274675863291182e-05, "loss": 0.9396, "step": 150800 }, { "epoch": 2.079027858146648, "grad_norm": 7.996649265289307, "learning_rate": 5.2743579980367816e-05, "loss": 1.0065, "step": 150900 }, { "epoch": 2.0804056102063875, "grad_norm": 27.859678268432617, "learning_rate": 5.274039918325255e-05, "loss": 1.0318, "step": 151000 }, { "epoch": 2.0817833622661266, "grad_norm": 4.3011064529418945, "learning_rate": 5.273721624183623e-05, "loss": 0.8812, "step": 151100 }, { "epoch": 2.0831611143258657, "grad_norm": 42.48961639404297, "learning_rate": 5.2734031156389284e-05, "loss": 0.9334, "step": 151200 }, { "epoch": 
2.0845388663856053, "grad_norm": 11.868022918701172, "learning_rate": 5.273084392718229e-05, "loss": 0.9865, "step": 151300 }, { "epoch": 2.0859166184453444, "grad_norm": 57.42210006713867, "learning_rate": 5.2727654554486044e-05, "loss": 0.8884, "step": 151400 }, { "epoch": 2.087294370505084, "grad_norm": 14.839951515197754, "learning_rate": 5.272446303857146e-05, "loss": 1.0079, "step": 151500 }, { "epoch": 2.088672122564823, "grad_norm": 12.66772747039795, "learning_rate": 5.27212693797097e-05, "loss": 1.0085, "step": 151600 }, { "epoch": 2.0900498746245626, "grad_norm": 5.242060661315918, "learning_rate": 5.271807357817208e-05, "loss": 1.0945, "step": 151700 }, { "epoch": 2.0914276266843017, "grad_norm": 17.87880516052246, "learning_rate": 5.2714875634230086e-05, "loss": 0.9576, "step": 151800 }, { "epoch": 2.0928053787440413, "grad_norm": 58.844966888427734, "learning_rate": 5.271167554815541e-05, "loss": 0.9756, "step": 151900 }, { "epoch": 2.0941831308037804, "grad_norm": 4.86928653717041, "learning_rate": 5.270847332021991e-05, "loss": 0.9737, "step": 152000 }, { "epoch": 2.09556088286352, "grad_norm": 63.286617279052734, "learning_rate": 5.270526895069564e-05, "loss": 1.0174, "step": 152100 }, { "epoch": 2.096938634923259, "grad_norm": 28.359848022460938, "learning_rate": 5.2702062439854825e-05, "loss": 0.8507, "step": 152200 }, { "epoch": 2.0983163869829986, "grad_norm": 19.392351150512695, "learning_rate": 5.2698853787969885e-05, "loss": 1.0027, "step": 152300 }, { "epoch": 2.0996941390427377, "grad_norm": 2.0332088470458984, "learning_rate": 5.269564299531338e-05, "loss": 0.9885, "step": 152400 }, { "epoch": 2.1010718911024773, "grad_norm": 11.420206069946289, "learning_rate": 5.269243006215811e-05, "loss": 0.9107, "step": 152500 }, { "epoch": 2.1024496431622164, "grad_norm": 9.256026268005371, "learning_rate": 5.268921498877702e-05, "loss": 0.8763, "step": 152600 }, { "epoch": 2.103827395221956, "grad_norm": 4.526012420654297, "learning_rate": 
5.2686029958168464e-05, "loss": 0.9273, "step": 152700 }, { "epoch": 2.105205147281695, "grad_norm": 20.233253479003906, "learning_rate": 5.2682810626550776e-05, "loss": 0.9727, "step": 152800 }, { "epoch": 2.1065828993414346, "grad_norm": 6.938904762268066, "learning_rate": 5.2679589155524485e-05, "loss": 0.9338, "step": 152900 }, { "epoch": 2.1079606514011737, "grad_norm": 50.39978790283203, "learning_rate": 5.267636554536328e-05, "loss": 1.0513, "step": 153000 }, { "epoch": 2.1093384034609133, "grad_norm": 27.855424880981445, "learning_rate": 5.267313979634102e-05, "loss": 1.0309, "step": 153100 }, { "epoch": 2.1107161555206524, "grad_norm": 8.242439270019531, "learning_rate": 5.2669911908731754e-05, "loss": 0.9814, "step": 153200 }, { "epoch": 2.112093907580392, "grad_norm": 7.917934417724609, "learning_rate": 5.26666818828097e-05, "loss": 0.9505, "step": 153300 }, { "epoch": 2.113471659640131, "grad_norm": 5.1177568435668945, "learning_rate": 5.266344971884927e-05, "loss": 0.9524, "step": 153400 }, { "epoch": 2.1148494116998706, "grad_norm": 28.833621978759766, "learning_rate": 5.266021541712505e-05, "loss": 0.9389, "step": 153500 }, { "epoch": 2.1162271637596097, "grad_norm": 2.6027934551239014, "learning_rate": 5.265701135288361e-05, "loss": 0.975, "step": 153600 }, { "epoch": 2.1176049158193493, "grad_norm": 98.79273223876953, "learning_rate": 5.265377279782709e-05, "loss": 0.9552, "step": 153700 }, { "epoch": 2.1189826678790884, "grad_norm": 22.76773452758789, "learning_rate": 5.265053210582888e-05, "loss": 1.0472, "step": 153800 }, { "epoch": 2.120360419938828, "grad_norm": 17.150712966918945, "learning_rate": 5.26472892771643e-05, "loss": 0.9667, "step": 153900 }, { "epoch": 2.121738171998567, "grad_norm": 14.188424110412598, "learning_rate": 5.264404431210883e-05, "loss": 0.9707, "step": 154000 }, { "epoch": 2.1231159240583066, "grad_norm": 4.750355243682861, "learning_rate": 5.264079721093818e-05, "loss": 0.9813, "step": 154100 }, { "epoch": 
2.1244936761180457, "grad_norm": 158.37037658691406, "learning_rate": 5.263754797392817e-05, "loss": 0.9929, "step": 154200 }, { "epoch": 2.1258714281777853, "grad_norm": 20.168994903564453, "learning_rate": 5.2634296601354864e-05, "loss": 0.9156, "step": 154300 }, { "epoch": 2.1272491802375244, "grad_norm": 5.984154224395752, "learning_rate": 5.263104309349448e-05, "loss": 1.0161, "step": 154400 }, { "epoch": 2.128626932297264, "grad_norm": 37.03858947753906, "learning_rate": 5.262778745062341e-05, "loss": 0.9872, "step": 154500 }, { "epoch": 2.130004684357003, "grad_norm": 12.900752067565918, "learning_rate": 5.262452967301824e-05, "loss": 0.9569, "step": 154600 }, { "epoch": 2.1313824364167426, "grad_norm": 11.495844841003418, "learning_rate": 5.2621269760955744e-05, "loss": 0.9399, "step": 154700 }, { "epoch": 2.1327601884764817, "grad_norm": 10.074590682983398, "learning_rate": 5.2618007714712864e-05, "loss": 1.0478, "step": 154800 }, { "epoch": 2.1341379405362213, "grad_norm": 14.428563117980957, "learning_rate": 5.261474353456672e-05, "loss": 0.9832, "step": 154900 }, { "epoch": 2.1355156925959604, "grad_norm": 4.966893672943115, "learning_rate": 5.2611477220794635e-05, "loss": 0.9286, "step": 155000 }, { "epoch": 2.1368934446556995, "grad_norm": 11.962916374206543, "learning_rate": 5.2608208773674084e-05, "loss": 1.0126, "step": 155100 }, { "epoch": 2.138271196715439, "grad_norm": 13.316461563110352, "learning_rate": 5.260493819348275e-05, "loss": 0.9576, "step": 155200 }, { "epoch": 2.1396489487751786, "grad_norm": 7.460152626037598, "learning_rate": 5.260166548049847e-05, "loss": 1.0248, "step": 155300 }, { "epoch": 2.1410267008349178, "grad_norm": 7.871267318725586, "learning_rate": 5.2598390634999296e-05, "loss": 1.0188, "step": 155400 }, { "epoch": 2.142404452894657, "grad_norm": 13.87922477722168, "learning_rate": 5.259511365726343e-05, "loss": 0.985, "step": 155500 }, { "epoch": 2.1437822049543964, "grad_norm": 43.95943832397461, "learning_rate": 
5.259183454756928e-05, "loss": 0.8258, "step": 155600 }, { "epoch": 2.1451599570141355, "grad_norm": 10.786490440368652, "learning_rate": 5.258855330619541e-05, "loss": 0.998, "step": 155700 }, { "epoch": 2.146537709073875, "grad_norm": 24.638206481933594, "learning_rate": 5.258526993342059e-05, "loss": 1.0332, "step": 155800 }, { "epoch": 2.147915461133614, "grad_norm": 9.810965538024902, "learning_rate": 5.258198442952375e-05, "loss": 1.0727, "step": 155900 }, { "epoch": 2.1492932131933538, "grad_norm": 2.26515531539917, "learning_rate": 5.257869679478402e-05, "loss": 0.9942, "step": 156000 }, { "epoch": 2.150670965253093, "grad_norm": 7.039181709289551, "learning_rate": 5.257540702948069e-05, "loss": 0.9363, "step": 156100 }, { "epoch": 2.1520487173128324, "grad_norm": 28.103984832763672, "learning_rate": 5.257211513389324e-05, "loss": 0.8856, "step": 156200 }, { "epoch": 2.1534264693725715, "grad_norm": 7.266382694244385, "learning_rate": 5.2568821108301356e-05, "loss": 0.8907, "step": 156300 }, { "epoch": 2.154804221432311, "grad_norm": 11.471376419067383, "learning_rate": 5.256552495298486e-05, "loss": 0.9448, "step": 156400 }, { "epoch": 2.15618197349205, "grad_norm": 8.607222557067871, "learning_rate": 5.2562226668223785e-05, "loss": 0.8891, "step": 156500 }, { "epoch": 2.1575597255517898, "grad_norm": 9.009541511535645, "learning_rate": 5.255892625429834e-05, "loss": 0.903, "step": 156600 }, { "epoch": 2.158937477611529, "grad_norm": 5.408649444580078, "learning_rate": 5.255562371148891e-05, "loss": 0.9355, "step": 156700 }, { "epoch": 2.1603152296712684, "grad_norm": 10.264758110046387, "learning_rate": 5.2552319040076056e-05, "loss": 0.9703, "step": 156800 }, { "epoch": 2.1616929817310075, "grad_norm": 7.09464693069458, "learning_rate": 5.254901224034054e-05, "loss": 0.8404, "step": 156900 }, { "epoch": 2.163070733790747, "grad_norm": 8.133198738098145, "learning_rate": 5.254570331256328e-05, "loss": 0.8998, "step": 157000 }, { "epoch": 
2.164448485850486, "grad_norm": 22.95881462097168, "learning_rate": 5.254239225702539e-05, "loss": 0.9318, "step": 157100 }, { "epoch": 2.1658262379102258, "grad_norm": 6.442829132080078, "learning_rate": 5.2539079074008164e-05, "loss": 0.9243, "step": 157200 }, { "epoch": 2.167203989969965, "grad_norm": 4.950994491577148, "learning_rate": 5.253576376379307e-05, "loss": 0.8612, "step": 157300 }, { "epoch": 2.1685817420297044, "grad_norm": 26.30369758605957, "learning_rate": 5.2532446326661764e-05, "loss": 0.9551, "step": 157400 }, { "epoch": 2.1699594940894436, "grad_norm": 16.370683670043945, "learning_rate": 5.2529126762896076e-05, "loss": 0.879, "step": 157500 }, { "epoch": 2.171337246149183, "grad_norm": 5.687817096710205, "learning_rate": 5.2525805072778024e-05, "loss": 0.8551, "step": 157600 }, { "epoch": 2.172714998208922, "grad_norm": 8.148140907287598, "learning_rate": 5.252248125658979e-05, "loss": 0.9787, "step": 157700 }, { "epoch": 2.1740927502686618, "grad_norm": 9.443391799926758, "learning_rate": 5.251915531461377e-05, "loss": 0.9364, "step": 157800 }, { "epoch": 2.175470502328401, "grad_norm": 10.445510864257812, "learning_rate": 5.2515827247132505e-05, "loss": 0.8426, "step": 157900 }, { "epoch": 2.1768482543881404, "grad_norm": 12.912979125976562, "learning_rate": 5.251249705442874e-05, "loss": 0.9348, "step": 158000 }, { "epoch": 2.1782260064478796, "grad_norm": 5.275145053863525, "learning_rate": 5.250916473678538e-05, "loss": 0.8936, "step": 158100 }, { "epoch": 2.179603758507619, "grad_norm": 6.103515148162842, "learning_rate": 5.250583029448553e-05, "loss": 0.9887, "step": 158200 }, { "epoch": 2.1809815105673582, "grad_norm": 10.050488471984863, "learning_rate": 5.250249372781247e-05, "loss": 0.941, "step": 158300 }, { "epoch": 2.182359262627098, "grad_norm": 26.635068893432617, "learning_rate": 5.249915503704966e-05, "loss": 1.0517, "step": 158400 }, { "epoch": 2.183737014686837, "grad_norm": 12.143385887145996, "learning_rate": 
5.249581422248073e-05, "loss": 0.933, "step": 158500 }, { "epoch": 2.1851147667465765, "grad_norm": 5.439852714538574, "learning_rate": 5.24924712843895e-05, "loss": 0.9008, "step": 158600 }, { "epoch": 2.1864925188063156, "grad_norm": 9.545210838317871, "learning_rate": 5.2489126223059984e-05, "loss": 0.9218, "step": 158700 }, { "epoch": 2.187870270866055, "grad_norm": 13.009220123291016, "learning_rate": 5.248577903877635e-05, "loss": 0.9617, "step": 158800 }, { "epoch": 2.1892480229257942, "grad_norm": 15.801142692565918, "learning_rate": 5.248242973182296e-05, "loss": 1.0053, "step": 158900 }, { "epoch": 2.190625774985534, "grad_norm": 20.715030670166016, "learning_rate": 5.2479078302484346e-05, "loss": 0.9531, "step": 159000 }, { "epoch": 2.192003527045273, "grad_norm": 2.668189525604248, "learning_rate": 5.247572475104524e-05, "loss": 0.9391, "step": 159100 }, { "epoch": 2.1933812791050125, "grad_norm": 10.495641708374023, "learning_rate": 5.247236907779055e-05, "loss": 0.8596, "step": 159200 }, { "epoch": 2.1947590311647516, "grad_norm": 5.542959690093994, "learning_rate": 5.246901128300534e-05, "loss": 0.9221, "step": 159300 }, { "epoch": 2.1961367832244907, "grad_norm": 8.880916595458984, "learning_rate": 5.2465651366974884e-05, "loss": 0.8193, "step": 159400 }, { "epoch": 2.1975145352842302, "grad_norm": 3.6114373207092285, "learning_rate": 5.2462289329984626e-05, "loss": 0.8921, "step": 159500 }, { "epoch": 2.19889228734397, "grad_norm": 27.882787704467773, "learning_rate": 5.245892517232018e-05, "loss": 0.8985, "step": 159600 }, { "epoch": 2.200270039403709, "grad_norm": 13.388383865356445, "learning_rate": 5.245555889426735e-05, "loss": 0.9079, "step": 159700 }, { "epoch": 2.201647791463448, "grad_norm": 16.630842208862305, "learning_rate": 5.245219049611212e-05, "loss": 0.9598, "step": 159800 }, { "epoch": 2.2030255435231876, "grad_norm": 4.307223796844482, "learning_rate": 5.2448819978140654e-05, "loss": 0.9551, "step": 159900 }, { "epoch": 
2.2044032955829267, "grad_norm": 13.902462005615234, "learning_rate": 5.2445447340639293e-05, "loss": 0.9426, "step": 160000 }, { "epoch": 2.2057810476426662, "grad_norm": 5.826269149780273, "learning_rate": 5.244207258389456e-05, "loss": 0.9591, "step": 160100 }, { "epoch": 2.2071587997024054, "grad_norm": 3.053081512451172, "learning_rate": 5.2438695708193164e-05, "loss": 0.926, "step": 160200 }, { "epoch": 2.208536551762145, "grad_norm": 23.950437545776367, "learning_rate": 5.243531671382198e-05, "loss": 0.9329, "step": 160300 }, { "epoch": 2.209914303821884, "grad_norm": 5.027472019195557, "learning_rate": 5.243193560106806e-05, "loss": 1.0093, "step": 160400 }, { "epoch": 2.2112920558816236, "grad_norm": 37.77529525756836, "learning_rate": 5.242855237021868e-05, "loss": 0.9378, "step": 160500 }, { "epoch": 2.2126698079413627, "grad_norm": 26.79740333557129, "learning_rate": 5.242516702156123e-05, "loss": 0.9427, "step": 160600 }, { "epoch": 2.2140475600011023, "grad_norm": 7.521678924560547, "learning_rate": 5.2421779555383325e-05, "loss": 0.9531, "step": 160700 }, { "epoch": 2.2154253120608414, "grad_norm": 14.960001945495605, "learning_rate": 5.2418389971972756e-05, "loss": 0.972, "step": 160800 }, { "epoch": 2.216803064120581, "grad_norm": 47.28500747680664, "learning_rate": 5.241499827161746e-05, "loss": 0.8699, "step": 160900 }, { "epoch": 2.21818081618032, "grad_norm": 8.531850814819336, "learning_rate": 5.2411604454605614e-05, "loss": 1.0195, "step": 161000 }, { "epoch": 2.2195585682400596, "grad_norm": 5.627152442932129, "learning_rate": 5.240820852122551e-05, "loss": 0.9033, "step": 161100 }, { "epoch": 2.2209363202997987, "grad_norm": 38.93098068237305, "learning_rate": 5.240481047176568e-05, "loss": 0.9048, "step": 161200 }, { "epoch": 2.2223140723595383, "grad_norm": 7.977657794952393, "learning_rate": 5.240141030651477e-05, "loss": 0.9873, "step": 161300 }, { "epoch": 2.2236918244192774, "grad_norm": 8.564409255981445, "learning_rate": 
5.239800802576167e-05, "loss": 0.9275, "step": 161400 }, { "epoch": 2.225069576479017, "grad_norm": 11.705387115478516, "learning_rate": 5.239460362979541e-05, "loss": 0.7914, "step": 161500 }, { "epoch": 2.226447328538756, "grad_norm": 13.273726463317871, "learning_rate": 5.2391197118905204e-05, "loss": 0.9548, "step": 161600 }, { "epoch": 2.2278250805984956, "grad_norm": 4.967896938323975, "learning_rate": 5.238778849338048e-05, "loss": 0.932, "step": 161700 }, { "epoch": 2.2292028326582347, "grad_norm": 9.133955955505371, "learning_rate": 5.238437775351078e-05, "loss": 0.8389, "step": 161800 }, { "epoch": 2.2305805847179743, "grad_norm": 14.688834190368652, "learning_rate": 5.238096489958589e-05, "loss": 0.8829, "step": 161900 }, { "epoch": 2.2319583367777134, "grad_norm": 40.99361038208008, "learning_rate": 5.2377549931895733e-05, "loss": 0.8876, "step": 162000 }, { "epoch": 2.233336088837453, "grad_norm": 54.10387420654297, "learning_rate": 5.237413285073044e-05, "loss": 0.9795, "step": 162100 }, { "epoch": 2.234713840897192, "grad_norm": 31.367080688476562, "learning_rate": 5.237071365638031e-05, "loss": 0.9148, "step": 162200 }, { "epoch": 2.2360915929569316, "grad_norm": 9.466647148132324, "learning_rate": 5.2367326572666135e-05, "loss": 1.0082, "step": 162300 }, { "epoch": 2.2374693450166707, "grad_norm": 10.659646987915039, "learning_rate": 5.236390317394253e-05, "loss": 0.9343, "step": 162400 }, { "epoch": 2.2388470970764103, "grad_norm": 8.352649688720703, "learning_rate": 5.2360477662903155e-05, "loss": 0.8674, "step": 162500 }, { "epoch": 2.2402248491361494, "grad_norm": 11.551562309265137, "learning_rate": 5.235705003983901e-05, "loss": 0.9781, "step": 162600 }, { "epoch": 2.241602601195889, "grad_norm": 101.57791900634766, "learning_rate": 5.23536203050413e-05, "loss": 1.0003, "step": 162700 }, { "epoch": 2.242980353255628, "grad_norm": 33.70100402832031, "learning_rate": 5.235018845880139e-05, "loss": 1.015, "step": 162800 }, { "epoch": 
2.2443581053153676, "grad_norm": 8.317302703857422, "learning_rate": 5.234675450141084e-05, "loss": 0.9795, "step": 162900 }, { "epoch": 2.2457358573751067, "grad_norm": 11.664278030395508, "learning_rate": 5.234331843316138e-05, "loss": 0.9885, "step": 163000 }, { "epoch": 2.2471136094348463, "grad_norm": 5.981961250305176, "learning_rate": 5.233988025434493e-05, "loss": 1.014, "step": 163100 }, { "epoch": 2.2484913614945854, "grad_norm": 9.815963745117188, "learning_rate": 5.233643996525356e-05, "loss": 1.007, "step": 163200 }, { "epoch": 2.249869113554325, "grad_norm": 32.083648681640625, "learning_rate": 5.2332997566179563e-05, "loss": 1.0017, "step": 163300 }, { "epoch": 2.251246865614064, "grad_norm": 64.1452407836914, "learning_rate": 5.2329553057415375e-05, "loss": 1.0477, "step": 163400 }, { "epoch": 2.2526246176738036, "grad_norm": 5.024203300476074, "learning_rate": 5.2326106439253624e-05, "loss": 0.8959, "step": 163500 }, { "epoch": 2.2540023697335427, "grad_norm": 14.767755508422852, "learning_rate": 5.232272670719978e-05, "loss": 0.9719, "step": 163600 }, { "epoch": 2.255380121793282, "grad_norm": 18.60188102722168, "learning_rate": 5.231927591329488e-05, "loss": 0.9632, "step": 163700 }, { "epoch": 2.2567578738530214, "grad_norm": 12.099367141723633, "learning_rate": 5.231582301086551e-05, "loss": 0.9621, "step": 163800 }, { "epoch": 2.258135625912761, "grad_norm": 16.069604873657227, "learning_rate": 5.2312368000205026e-05, "loss": 1.016, "step": 163900 }, { "epoch": 2.2595133779725, "grad_norm": 8.607207298278809, "learning_rate": 5.230891088160694e-05, "loss": 0.9365, "step": 164000 }, { "epoch": 2.260891130032239, "grad_norm": 5.177475929260254, "learning_rate": 5.230545165536495e-05, "loss": 1.0815, "step": 164100 }, { "epoch": 2.2622688820919787, "grad_norm": 153.39865112304688, "learning_rate": 5.230199032177294e-05, "loss": 1.0885, "step": 164200 }, { "epoch": 2.2636466341517183, "grad_norm": 9.966987609863281, "learning_rate": 
5.229852688112496e-05, "loss": 0.955, "step": 164300 }, { "epoch": 2.2650243862114574, "grad_norm": 6.414557456970215, "learning_rate": 5.229506133371527e-05, "loss": 1.0635, "step": 164400 }, { "epoch": 2.2664021382711965, "grad_norm": 64.12127685546875, "learning_rate": 5.2291593679838246e-05, "loss": 0.922, "step": 164500 }, { "epoch": 2.267779890330936, "grad_norm": 17.174057006835938, "learning_rate": 5.228812391978852e-05, "loss": 0.9401, "step": 164600 }, { "epoch": 2.269157642390675, "grad_norm": 6.744351863861084, "learning_rate": 5.2284652053860846e-05, "loss": 1.02, "step": 164700 }, { "epoch": 2.2705353944504147, "grad_norm": 7.3058390617370605, "learning_rate": 5.228121283248696e-05, "loss": 0.9649, "step": 164800 }, { "epoch": 2.271913146510154, "grad_norm": 14.383864402770996, "learning_rate": 5.227773677673986e-05, "loss": 1.0199, "step": 164900 }, { "epoch": 2.2732908985698934, "grad_norm": 212.670654296875, "learning_rate": 5.227425861599725e-05, "loss": 0.9731, "step": 165000 }, { "epoch": 2.2746686506296325, "grad_norm": 8.136039733886719, "learning_rate": 5.2270778350554635e-05, "loss": 0.9479, "step": 165100 }, { "epoch": 2.276046402689372, "grad_norm": 59.46452331542969, "learning_rate": 5.2267295980707675e-05, "loss": 0.978, "step": 165200 }, { "epoch": 2.277424154749111, "grad_norm": 15.516117095947266, "learning_rate": 5.2263811506752215e-05, "loss": 1.0419, "step": 165300 }, { "epoch": 2.2788019068088508, "grad_norm": 18.471847534179688, "learning_rate": 5.226032492898427e-05, "loss": 0.9516, "step": 165400 }, { "epoch": 2.28017965886859, "grad_norm": 9.486163139343262, "learning_rate": 5.225683624770004e-05, "loss": 0.9295, "step": 165500 }, { "epoch": 2.2815574109283294, "grad_norm": 6.116447925567627, "learning_rate": 5.2253345463195935e-05, "loss": 0.9983, "step": 165600 }, { "epoch": 2.2829351629880685, "grad_norm": 10.514097213745117, "learning_rate": 5.224985257576848e-05, "loss": 0.9038, "step": 165700 }, { "epoch": 
2.284312915047808, "grad_norm": 27.088058471679688, "learning_rate": 5.224635758571444e-05, "loss": 0.98, "step": 165800 }, { "epoch": 2.285690667107547, "grad_norm": 27.673587799072266, "learning_rate": 5.224286049333071e-05, "loss": 0.9489, "step": 165900 }, { "epoch": 2.2870684191672868, "grad_norm": 11.980297088623047, "learning_rate": 5.223936129891441e-05, "loss": 1.0186, "step": 166000 }, { "epoch": 2.288446171227026, "grad_norm": 120.99385833740234, "learning_rate": 5.22358600027628e-05, "loss": 0.9645, "step": 166100 }, { "epoch": 2.2898239232867654, "grad_norm": 59.4659538269043, "learning_rate": 5.2232356605173326e-05, "loss": 1.0393, "step": 166200 }, { "epoch": 2.2912016753465045, "grad_norm": 112.47696685791016, "learning_rate": 5.222885110644364e-05, "loss": 1.1404, "step": 166300 }, { "epoch": 2.292579427406244, "grad_norm": 22.85833168029785, "learning_rate": 5.2225378593265444e-05, "loss": 1.0471, "step": 166400 }, { "epoch": 2.293957179465983, "grad_norm": 16.62672233581543, "learning_rate": 5.222186891415288e-05, "loss": 1.0863, "step": 166500 }, { "epoch": 2.2953349315257228, "grad_norm": 52.5628547668457, "learning_rate": 5.2218357134791074e-05, "loss": 0.9931, "step": 166600 }, { "epoch": 2.296712683585462, "grad_norm": 32.63602828979492, "learning_rate": 5.2214843255478376e-05, "loss": 1.0147, "step": 166700 }, { "epoch": 2.2980904356452014, "grad_norm": 5.233877658843994, "learning_rate": 5.221132727651331e-05, "loss": 1.0359, "step": 166800 }, { "epoch": 2.2994681877049405, "grad_norm": 14.243330001831055, "learning_rate": 5.220780919819456e-05, "loss": 1.1741, "step": 166900 }, { "epoch": 2.30084593976468, "grad_norm": 15.849098205566406, "learning_rate": 5.220428902082102e-05, "loss": 1.0128, "step": 167000 }, { "epoch": 2.302223691824419, "grad_norm": 51.943416595458984, "learning_rate": 5.220076674469173e-05, "loss": 1.0337, "step": 167100 }, { "epoch": 2.3036014438841588, "grad_norm": 16.928619384765625, "learning_rate": 
5.219724237010594e-05, "loss": 1.0639, "step": 167200 }, { "epoch": 2.304979195943898, "grad_norm": 10.309880256652832, "learning_rate": 5.219371589736307e-05, "loss": 1.019, "step": 167300 }, { "epoch": 2.3063569480036374, "grad_norm": 16.806623458862305, "learning_rate": 5.21901873267627e-05, "loss": 1.0678, "step": 167400 }, { "epoch": 2.3077347000633766, "grad_norm": 17.494747161865234, "learning_rate": 5.21866566586046e-05, "loss": 0.9179, "step": 167500 }, { "epoch": 2.309112452123116, "grad_norm": 40.3848876953125, "learning_rate": 5.2183123893188716e-05, "loss": 0.9432, "step": 167600 }, { "epoch": 2.3104902041828552, "grad_norm": 135.49258422851562, "learning_rate": 5.217958903081518e-05, "loss": 1.0602, "step": 167700 }, { "epoch": 2.311867956242595, "grad_norm": 14.116009712219238, "learning_rate": 5.2176052071784306e-05, "loss": 1.0863, "step": 167800 }, { "epoch": 2.313245708302334, "grad_norm": 15.387771606445312, "learning_rate": 5.217251301639656e-05, "loss": 1.1613, "step": 167900 }, { "epoch": 2.314623460362073, "grad_norm": 88.38285064697266, "learning_rate": 5.216897186495261e-05, "loss": 1.1406, "step": 168000 }, { "epoch": 2.3160012124218126, "grad_norm": 11.002593040466309, "learning_rate": 5.216542861775329e-05, "loss": 1.0048, "step": 168100 }, { "epoch": 2.317378964481552, "grad_norm": 8.143379211425781, "learning_rate": 5.216188327509963e-05, "loss": 0.9663, "step": 168200 }, { "epoch": 2.3187567165412912, "grad_norm": 33.01509475708008, "learning_rate": 5.21583358372928e-05, "loss": 1.0433, "step": 168300 }, { "epoch": 2.3201344686010303, "grad_norm": 13.792733192443848, "learning_rate": 5.2154786304634206e-05, "loss": 0.9347, "step": 168400 }, { "epoch": 2.32151222066077, "grad_norm": 38.95616912841797, "learning_rate": 5.215123467742538e-05, "loss": 1.1345, "step": 168500 }, { "epoch": 2.3228899727205095, "grad_norm": 17.847572326660156, "learning_rate": 5.214768095596805e-05, "loss": 1.0973, "step": 168600 }, { "epoch": 
2.3242677247802486, "grad_norm": 120.91565704345703, "learning_rate": 5.214412514056413e-05, "loss": 1.1131, "step": 168700 }, { "epoch": 2.3256454768399877, "grad_norm": 11.627106666564941, "learning_rate": 5.21405672315157e-05, "loss": 0.9853, "step": 168800 }, { "epoch": 2.3270232288997272, "grad_norm": 17.328262329101562, "learning_rate": 5.2137007229125026e-05, "loss": 1.0606, "step": 168900 }, { "epoch": 2.3284009809594663, "grad_norm": 29.377147674560547, "learning_rate": 5.2133445133694536e-05, "loss": 1.0498, "step": 169000 }, { "epoch": 2.329778733019206, "grad_norm": 52.74782943725586, "learning_rate": 5.212988094552686e-05, "loss": 1.0497, "step": 169100 }, { "epoch": 2.331156485078945, "grad_norm": 25.169078826904297, "learning_rate": 5.21263146649248e-05, "loss": 1.0042, "step": 169200 }, { "epoch": 2.3325342371386846, "grad_norm": 66.08953857421875, "learning_rate": 5.2122746292191314e-05, "loss": 1.0627, "step": 169300 }, { "epoch": 2.3339119891984237, "grad_norm": 34.516014099121094, "learning_rate": 5.211917582762956e-05, "loss": 1.1022, "step": 169400 }, { "epoch": 2.3352897412581632, "grad_norm": 25.2557373046875, "learning_rate": 5.211560327154288e-05, "loss": 1.0437, "step": 169500 }, { "epoch": 2.3366674933179024, "grad_norm": 7.747533798217773, "learning_rate": 5.211202862423476e-05, "loss": 1.0407, "step": 169600 }, { "epoch": 2.338045245377642, "grad_norm": 9.991101264953613, "learning_rate": 5.2108451886008894e-05, "loss": 1.0045, "step": 169700 }, { "epoch": 2.339422997437381, "grad_norm": 10.435796737670898, "learning_rate": 5.210487305716914e-05, "loss": 1.0129, "step": 169800 }, { "epoch": 2.3408007494971206, "grad_norm": 8.149819374084473, "learning_rate": 5.210129213801955e-05, "loss": 0.9709, "step": 169900 }, { "epoch": 2.3421785015568597, "grad_norm": 32.893741607666016, "learning_rate": 5.209770912886434e-05, "loss": 1.0459, "step": 170000 }, { "epoch": 2.3435562536165992, "grad_norm": 10.252907752990723, "learning_rate": 
5.209412403000789e-05, "loss": 0.9632, "step": 170100 }, { "epoch": 2.3449340056763384, "grad_norm": 123.37541961669922, "learning_rate": 5.2090536841754784e-05, "loss": 1.0678, "step": 170200 }, { "epoch": 2.346311757736078, "grad_norm": 15.898186683654785, "learning_rate": 5.208694756440977e-05, "loss": 1.0805, "step": 170300 }, { "epoch": 2.347689509795817, "grad_norm": 11.285989761352539, "learning_rate": 5.2083356198277765e-05, "loss": 0.9717, "step": 170400 }, { "epoch": 2.3490672618555566, "grad_norm": 62.288143157958984, "learning_rate": 5.2079762743663884e-05, "loss": 1.1064, "step": 170500 }, { "epoch": 2.3504450139152957, "grad_norm": 27.738956451416016, "learning_rate": 5.207616720087343e-05, "loss": 1.0938, "step": 170600 }, { "epoch": 2.3518227659750353, "grad_norm": 9.988628387451172, "learning_rate": 5.2072569570211815e-05, "loss": 1.108, "step": 170700 }, { "epoch": 2.3532005180347744, "grad_norm": 24.56854820251465, "learning_rate": 5.206896985198471e-05, "loss": 1.0553, "step": 170800 }, { "epoch": 2.354578270094514, "grad_norm": 4.584719181060791, "learning_rate": 5.206536804649793e-05, "loss": 0.9619, "step": 170900 }, { "epoch": 2.355956022154253, "grad_norm": 24.88559341430664, "learning_rate": 5.2061764154057456e-05, "loss": 1.0665, "step": 171000 }, { "epoch": 2.3573337742139926, "grad_norm": 27.593503952026367, "learning_rate": 5.205815817496946e-05, "loss": 1.0, "step": 171100 }, { "epoch": 2.3587115262737317, "grad_norm": 35.88474655151367, "learning_rate": 5.2054550109540284e-05, "loss": 0.9997, "step": 171200 }, { "epoch": 2.3600892783334713, "grad_norm": 32.62916946411133, "learning_rate": 5.205093995807646e-05, "loss": 1.0596, "step": 171300 }, { "epoch": 2.3614670303932104, "grad_norm": 11.110734939575195, "learning_rate": 5.204732772088468e-05, "loss": 1.0892, "step": 171400 }, { "epoch": 2.36284478245295, "grad_norm": 69.90022277832031, "learning_rate": 5.204371339827183e-05, "loss": 1.0014, "step": 171500 }, { "epoch": 
2.364222534512689, "grad_norm": 32.14912414550781, "learning_rate": 5.2040096990544966e-05, "loss": 1.0531, "step": 171600 }, { "epoch": 2.3656002865724286, "grad_norm": 53.77708435058594, "learning_rate": 5.203647849801131e-05, "loss": 1.0277, "step": 171700 }, { "epoch": 2.3669780386321677, "grad_norm": 8.331936836242676, "learning_rate": 5.2032857920978283e-05, "loss": 0.845, "step": 171800 }, { "epoch": 2.3683557906919073, "grad_norm": 7.75789737701416, "learning_rate": 5.2029235259753464e-05, "loss": 0.9146, "step": 171900 }, { "epoch": 2.3697335427516464, "grad_norm": 48.754512786865234, "learning_rate": 5.2025610514644614e-05, "loss": 0.8927, "step": 172000 }, { "epoch": 2.371111294811386, "grad_norm": 4.2286248207092285, "learning_rate": 5.202198368595969e-05, "loss": 0.9599, "step": 172100 }, { "epoch": 2.372489046871125, "grad_norm": 95.67806243896484, "learning_rate": 5.201835477400679e-05, "loss": 1.065, "step": 172200 }, { "epoch": 2.373866798930864, "grad_norm": 24.00796127319336, "learning_rate": 5.2014723779094215e-05, "loss": 1.0615, "step": 172300 }, { "epoch": 2.3752445509906037, "grad_norm": 302.600830078125, "learning_rate": 5.201109070153044e-05, "loss": 0.9921, "step": 172400 }, { "epoch": 2.3766223030503433, "grad_norm": 16.198699951171875, "learning_rate": 5.200745554162412e-05, "loss": 1.0142, "step": 172500 }, { "epoch": 2.3780000551100824, "grad_norm": 6.950168132781982, "learning_rate": 5.2003818299684066e-05, "loss": 1.0386, "step": 172600 }, { "epoch": 2.3793778071698215, "grad_norm": 6.602731704711914, "learning_rate": 5.2000178976019284e-05, "loss": 1.0338, "step": 172700 }, { "epoch": 2.380755559229561, "grad_norm": 10.891432762145996, "learning_rate": 5.1996537570938964e-05, "loss": 0.9747, "step": 172800 }, { "epoch": 2.3821333112893006, "grad_norm": 104.41752624511719, "learning_rate": 5.199289408475245e-05, "loss": 0.951, "step": 172900 }, { "epoch": 2.3835110633490397, "grad_norm": 13.820969581604004, "learning_rate": 
5.198924851776928e-05, "loss": 0.9435, "step": 173000 }, { "epoch": 2.384888815408779, "grad_norm": 21.821395874023438, "learning_rate": 5.198560087029916e-05, "loss": 1.0227, "step": 173100 }, { "epoch": 2.3862665674685184, "grad_norm": 22.758373260498047, "learning_rate": 5.1981951142651974e-05, "loss": 1.0461, "step": 173200 }, { "epoch": 2.3876443195282575, "grad_norm": 22.348026275634766, "learning_rate": 5.197829933513779e-05, "loss": 1.0892, "step": 173300 }, { "epoch": 2.389022071587997, "grad_norm": 10.256428718566895, "learning_rate": 5.1974645448066855e-05, "loss": 0.9678, "step": 173400 }, { "epoch": 2.390399823647736, "grad_norm": 9.233114242553711, "learning_rate": 5.197098948174957e-05, "loss": 1.0123, "step": 173500 }, { "epoch": 2.3917775757074757, "grad_norm": 8.449286460876465, "learning_rate": 5.196733143649654e-05, "loss": 0.9685, "step": 173600 }, { "epoch": 2.393155327767215, "grad_norm": 19.234813690185547, "learning_rate": 5.1963671312618535e-05, "loss": 0.9747, "step": 173700 }, { "epoch": 2.3945330798269544, "grad_norm": 30.018075942993164, "learning_rate": 5.1960009110426495e-05, "loss": 0.9633, "step": 173800 }, { "epoch": 2.3959108318866935, "grad_norm": 65.04824829101562, "learning_rate": 5.195634483023154e-05, "loss": 1.0313, "step": 173900 }, { "epoch": 2.397288583946433, "grad_norm": 16.118972778320312, "learning_rate": 5.195267847234498e-05, "loss": 1.0113, "step": 174000 }, { "epoch": 2.398666336006172, "grad_norm": 32.16482162475586, "learning_rate": 5.194901003707827e-05, "loss": 1.0195, "step": 174100 }, { "epoch": 2.4000440880659117, "grad_norm": 33.35468673706055, "learning_rate": 5.194533952474309e-05, "loss": 1.0382, "step": 174200 }, { "epoch": 2.401421840125651, "grad_norm": 17.260162353515625, "learning_rate": 5.1941666935651253e-05, "loss": 1.0441, "step": 174300 }, { "epoch": 2.4027995921853904, "grad_norm": 10.887475967407227, "learning_rate": 5.1938029027047506e-05, "loss": 1.0326, "step": 174400 }, { "epoch": 
2.4041773442451295, "grad_norm": 187.56747436523438, "learning_rate": 5.193435230613833e-05, "loss": 1.0227, "step": 174500 }, { "epoch": 2.405555096304869, "grad_norm": 25.112625122070312, "learning_rate": 5.1930673509405926e-05, "loss": 1.05, "step": 174600 }, { "epoch": 2.406932848364608, "grad_norm": 15.029642105102539, "learning_rate": 5.192699263716282e-05, "loss": 1.1454, "step": 174700 }, { "epoch": 2.4083106004243477, "grad_norm": 97.6905288696289, "learning_rate": 5.192330968972171e-05, "loss": 1.0397, "step": 174800 }, { "epoch": 2.409688352484087, "grad_norm": 7.099515914916992, "learning_rate": 5.19196246673955e-05, "loss": 1.0869, "step": 174900 }, { "epoch": 2.4110661045438264, "grad_norm": 29.096044540405273, "learning_rate": 5.191593757049726e-05, "loss": 1.0363, "step": 175000 }, { "epoch": 2.4124438566035655, "grad_norm": 10.986734390258789, "learning_rate": 5.1912248399340194e-05, "loss": 1.0172, "step": 175100 }, { "epoch": 2.413821608663305, "grad_norm": 25.678735733032227, "learning_rate": 5.1908594076953767e-05, "loss": 1.0575, "step": 175200 }, { "epoch": 2.415199360723044, "grad_norm": 20.406526565551758, "learning_rate": 5.190490077895427e-05, "loss": 1.0295, "step": 175300 }, { "epoch": 2.4165771127827838, "grad_norm": 21.655616760253906, "learning_rate": 5.19012054076336e-05, "loss": 1.0918, "step": 175400 }, { "epoch": 2.417954864842523, "grad_norm": 26.264522552490234, "learning_rate": 5.18975079633057e-05, "loss": 1.0617, "step": 175500 }, { "epoch": 2.4193326169022624, "grad_norm": 77.81526947021484, "learning_rate": 5.1893808446284675e-05, "loss": 1.0516, "step": 175600 }, { "epoch": 2.4207103689620015, "grad_norm": 67.17523193359375, "learning_rate": 5.1890106856884824e-05, "loss": 1.0606, "step": 175700 }, { "epoch": 2.422088121021741, "grad_norm": 30.02215576171875, "learning_rate": 5.188640319542062e-05, "loss": 1.2209, "step": 175800 }, { "epoch": 2.42346587308148, "grad_norm": 206.22950744628906, "learning_rate": 
5.1882697462206705e-05, "loss": 1.0157, "step": 175900 }, { "epoch": 2.4248436251412198, "grad_norm": 14.736010551452637, "learning_rate": 5.18789896575579e-05, "loss": 1.0217, "step": 176000 }, { "epoch": 2.426221377200959, "grad_norm": 204.29269409179688, "learning_rate": 5.18752797817892e-05, "loss": 1.1247, "step": 176100 }, { "epoch": 2.4275991292606984, "grad_norm": 140.85768127441406, "learning_rate": 5.187156783521578e-05, "loss": 0.9863, "step": 176200 }, { "epoch": 2.4289768813204375, "grad_norm": 22.160306930541992, "learning_rate": 5.186785381815299e-05, "loss": 0.9873, "step": 176300 }, { "epoch": 2.430354633380177, "grad_norm": 8.243571281433105, "learning_rate": 5.186413773091634e-05, "loss": 1.0438, "step": 176400 }, { "epoch": 2.431732385439916, "grad_norm": 16.818946838378906, "learning_rate": 5.186041957382156e-05, "loss": 1.0263, "step": 176500 }, { "epoch": 2.4331101374996553, "grad_norm": 4.928921222686768, "learning_rate": 5.1856699347184505e-05, "loss": 1.1613, "step": 176600 }, { "epoch": 2.434487889559395, "grad_norm": 129.56832885742188, "learning_rate": 5.1853014284521496e-05, "loss": 1.1088, "step": 176700 }, { "epoch": 2.4358656416191344, "grad_norm": 44.18943405151367, "learning_rate": 5.184928994043577e-05, "loss": 1.1488, "step": 176800 }, { "epoch": 2.4372433936788735, "grad_norm": 123.67875671386719, "learning_rate": 5.184556352775329e-05, "loss": 1.119, "step": 176900 }, { "epoch": 2.4386211457386127, "grad_norm": 9.233516693115234, "learning_rate": 5.184183504679064e-05, "loss": 1.0325, "step": 177000 }, { "epoch": 2.439998897798352, "grad_norm": 10.702132225036621, "learning_rate": 5.183810449786457e-05, "loss": 1.0569, "step": 177100 }, { "epoch": 2.4413766498580918, "grad_norm": 21.172279357910156, "learning_rate": 5.183437188129201e-05, "loss": 1.1594, "step": 177200 }, { "epoch": 2.442754401917831, "grad_norm": 129.290771484375, "learning_rate": 5.1830637197390064e-05, "loss": 1.038, "step": 177300 }, { "epoch": 
2.44413215397757, "grad_norm": 7.982485771179199, "learning_rate": 5.182690044647601e-05, "loss": 1.1337, "step": 177400 }, { "epoch": 2.4455099060373096, "grad_norm": 12.42431354522705, "learning_rate": 5.182316162886731e-05, "loss": 1.1621, "step": 177500 }, { "epoch": 2.4468876580970487, "grad_norm": 176.325927734375, "learning_rate": 5.1819420744881596e-05, "loss": 1.0764, "step": 177600 }, { "epoch": 2.4482654101567882, "grad_norm": 24.430160522460938, "learning_rate": 5.181567779483667e-05, "loss": 0.9985, "step": 177700 }, { "epoch": 2.4496431622165273, "grad_norm": 4.872775077819824, "learning_rate": 5.1811932779050515e-05, "loss": 1.0255, "step": 177800 }, { "epoch": 2.451020914276267, "grad_norm": 10.347536087036133, "learning_rate": 5.1808185697841296e-05, "loss": 1.0522, "step": 177900 }, { "epoch": 2.452398666336006, "grad_norm": 3.4556097984313965, "learning_rate": 5.180443655152734e-05, "loss": 1.0476, "step": 178000 }, { "epoch": 2.4537764183957456, "grad_norm": 18.893156051635742, "learning_rate": 5.1800685340427155e-05, "loss": 1.0728, "step": 178100 }, { "epoch": 2.4551541704554847, "grad_norm": 18.359251022338867, "learning_rate": 5.179693206485944e-05, "loss": 0.9061, "step": 178200 }, { "epoch": 2.4565319225152242, "grad_norm": 147.23263549804688, "learning_rate": 5.1793176725143034e-05, "loss": 1.0574, "step": 178300 }, { "epoch": 2.4579096745749633, "grad_norm": 5.758816242218018, "learning_rate": 5.178941932159698e-05, "loss": 0.9852, "step": 178400 }, { "epoch": 2.459287426634703, "grad_norm": 33.01610565185547, "learning_rate": 5.1785659854540494e-05, "loss": 0.9077, "step": 178500 }, { "epoch": 2.460665178694442, "grad_norm": 17.409378051757812, "learning_rate": 5.178189832429296e-05, "loss": 0.9479, "step": 178600 }, { "epoch": 2.4620429307541816, "grad_norm": 17.433063507080078, "learning_rate": 5.1778134731173935e-05, "loss": 0.9505, "step": 178700 }, { "epoch": 2.4634206828139207, "grad_norm": 7.8560261726379395, "learning_rate": 
5.177436907550317e-05, "loss": 1.1017, "step": 178800 }, { "epoch": 2.4647984348736602, "grad_norm": 8.070338249206543, "learning_rate": 5.177060135760056e-05, "loss": 0.9516, "step": 178900 }, { "epoch": 2.4661761869333994, "grad_norm": 6.128262519836426, "learning_rate": 5.176683157778619e-05, "loss": 0.9875, "step": 179000 }, { "epoch": 2.467553938993139, "grad_norm": 13.260516166687012, "learning_rate": 5.176305973638033e-05, "loss": 0.8901, "step": 179100 }, { "epoch": 2.468931691052878, "grad_norm": 4.990565299987793, "learning_rate": 5.175928583370342e-05, "loss": 1.0459, "step": 179200 }, { "epoch": 2.4703094431126176, "grad_norm": 19.502981185913086, "learning_rate": 5.1755509870076066e-05, "loss": 0.9791, "step": 179300 }, { "epoch": 2.4716871951723567, "grad_norm": 9.662759780883789, "learning_rate": 5.175173184581906e-05, "loss": 0.8444, "step": 179400 }, { "epoch": 2.4730649472320962, "grad_norm": 164.9662628173828, "learning_rate": 5.174795176125336e-05, "loss": 0.983, "step": 179500 }, { "epoch": 2.4744426992918354, "grad_norm": 54.17001724243164, "learning_rate": 5.174416961670011e-05, "loss": 0.9716, "step": 179600 }, { "epoch": 2.475820451351575, "grad_norm": 12.545207977294922, "learning_rate": 5.174038541248062e-05, "loss": 0.9388, "step": 179700 }, { "epoch": 2.477198203411314, "grad_norm": 7.745179176330566, "learning_rate": 5.1736599148916374e-05, "loss": 0.9997, "step": 179800 }, { "epoch": 2.4785759554710536, "grad_norm": 14.741572380065918, "learning_rate": 5.173281082632903e-05, "loss": 1.0515, "step": 179900 }, { "epoch": 2.4799537075307927, "grad_norm": 10.962214469909668, "learning_rate": 5.172902044504045e-05, "loss": 0.9865, "step": 180000 }, { "epoch": 2.4813314595905323, "grad_norm": 6.911766052246094, "learning_rate": 5.172522800537261e-05, "loss": 0.8742, "step": 180100 }, { "epoch": 2.4827092116502714, "grad_norm": 7.514871120452881, "learning_rate": 5.172143350764773e-05, "loss": 0.9228, "step": 180200 }, { "epoch": 
2.484086963710011, "grad_norm": 7.460342884063721, "learning_rate": 5.171763695218814e-05, "loss": 1.0701, "step": 180300 }, { "epoch": 2.48546471576975, "grad_norm": 12.996829986572266, "learning_rate": 5.1713838339316415e-05, "loss": 0.912, "step": 180400 }, { "epoch": 2.4868424678294896, "grad_norm": 12.467066764831543, "learning_rate": 5.171003766935523e-05, "loss": 0.9402, "step": 180500 }, { "epoch": 2.4882202198892287, "grad_norm": 7.390972137451172, "learning_rate": 5.170623494262749e-05, "loss": 1.0215, "step": 180600 }, { "epoch": 2.4895979719489683, "grad_norm": 5.795259952545166, "learning_rate": 5.170243015945626e-05, "loss": 0.8947, "step": 180700 }, { "epoch": 2.4909757240087074, "grad_norm": 12.004082679748535, "learning_rate": 5.169862332016476e-05, "loss": 1.0102, "step": 180800 }, { "epoch": 2.4923534760684465, "grad_norm": 12.660604476928711, "learning_rate": 5.169481442507642e-05, "loss": 0.9221, "step": 180900 }, { "epoch": 2.493731228128186, "grad_norm": 7.932154178619385, "learning_rate": 5.169100347451481e-05, "loss": 0.9068, "step": 181000 }, { "epoch": 2.4951089801879256, "grad_norm": 14.801414489746094, "learning_rate": 5.168719046880369e-05, "loss": 0.8937, "step": 181100 }, { "epoch": 2.4964867322476647, "grad_norm": 41.460079193115234, "learning_rate": 5.1683375408267006e-05, "loss": 0.9151, "step": 181200 }, { "epoch": 2.497864484307404, "grad_norm": 6.121121406555176, "learning_rate": 5.167955829322886e-05, "loss": 0.8914, "step": 181300 }, { "epoch": 2.4992422363671434, "grad_norm": 8.627535820007324, "learning_rate": 5.167573912401353e-05, "loss": 0.8863, "step": 181400 }, { "epoch": 2.500619988426883, "grad_norm": 9.960061073303223, "learning_rate": 5.167191790094548e-05, "loss": 0.8927, "step": 181500 }, { "epoch": 2.501997740486622, "grad_norm": 6.799235820770264, "learning_rate": 5.1668094624349345e-05, "loss": 0.868, "step": 181600 }, { "epoch": 2.503375492546361, "grad_norm": 5.950893402099609, "learning_rate": 
5.166426929454992e-05, "loss": 0.8329, "step": 181700 }, { "epoch": 2.5047532446061007, "grad_norm": 12.782723426818848, "learning_rate": 5.16604419118722e-05, "loss": 0.9383, "step": 181800 }, { "epoch": 2.5061309966658403, "grad_norm": 11.052231788635254, "learning_rate": 5.1656612476641346e-05, "loss": 0.9806, "step": 181900 }, { "epoch": 2.5075087487255794, "grad_norm": 15.06791877746582, "learning_rate": 5.165278098918266e-05, "loss": 0.9745, "step": 182000 }, { "epoch": 2.5088865007853185, "grad_norm": 9.330639839172363, "learning_rate": 5.164894744982167e-05, "loss": 0.8876, "step": 182100 }, { "epoch": 2.510264252845058, "grad_norm": 15.254342079162598, "learning_rate": 5.164511185888406e-05, "loss": 0.9242, "step": 182200 }, { "epoch": 2.5116420049047976, "grad_norm": 14.822551727294922, "learning_rate": 5.1641274216695665e-05, "loss": 0.9226, "step": 182300 }, { "epoch": 2.5130197569645367, "grad_norm": 4.59323263168335, "learning_rate": 5.1637434523582514e-05, "loss": 0.9647, "step": 182400 }, { "epoch": 2.514397509024276, "grad_norm": 18.973844528198242, "learning_rate": 5.163359277987081e-05, "loss": 0.8483, "step": 182500 }, { "epoch": 2.5157752610840154, "grad_norm": 10.772690773010254, "learning_rate": 5.1629748985886946e-05, "loss": 0.9064, "step": 182600 }, { "epoch": 2.5171530131437545, "grad_norm": 7.380763053894043, "learning_rate": 5.1625903141957455e-05, "loss": 0.9273, "step": 182700 }, { "epoch": 2.518530765203494, "grad_norm": 4.440961837768555, "learning_rate": 5.1622055248409066e-05, "loss": 0.8626, "step": 182800 }, { "epoch": 2.519908517263233, "grad_norm": 4.579497337341309, "learning_rate": 5.161820530556867e-05, "loss": 0.9964, "step": 182900 }, { "epoch": 2.5212862693229727, "grad_norm": 7.019351005554199, "learning_rate": 5.161435331376335e-05, "loss": 0.8699, "step": 183000 }, { "epoch": 2.522664021382712, "grad_norm": 7.681735515594482, "learning_rate": 5.161049927332034e-05, "loss": 0.9026, "step": 183100 }, { "epoch": 
2.5240417734424514, "grad_norm": 12.842411041259766, "learning_rate": 5.1606643184567076e-05, "loss": 0.8147, "step": 183200 }, { "epoch": 2.5254195255021905, "grad_norm": 5.394651412963867, "learning_rate": 5.160278504783114e-05, "loss": 1.0233, "step": 183300 }, { "epoch": 2.52679727756193, "grad_norm": 11.24244499206543, "learning_rate": 5.159892486344031e-05, "loss": 0.7975, "step": 183400 }, { "epoch": 2.528175029621669, "grad_norm": 3.0735654830932617, "learning_rate": 5.1595062631722525e-05, "loss": 0.8718, "step": 183500 }, { "epoch": 2.5295527816814087, "grad_norm": 15.372994422912598, "learning_rate": 5.1591198353005896e-05, "loss": 0.9473, "step": 183600 }, { "epoch": 2.530930533741148, "grad_norm": 10.221742630004883, "learning_rate": 5.158733202761872e-05, "loss": 0.8634, "step": 183700 }, { "epoch": 2.5323082858008874, "grad_norm": 6.982173442840576, "learning_rate": 5.1583463655889457e-05, "loss": 0.8723, "step": 183800 }, { "epoch": 2.5336860378606265, "grad_norm": 6.5016303062438965, "learning_rate": 5.1579593238146746e-05, "loss": 0.9272, "step": 183900 }, { "epoch": 2.535063789920366, "grad_norm": 6.839928150177002, "learning_rate": 5.1575720774719396e-05, "loss": 0.9469, "step": 184000 }, { "epoch": 2.536441541980105, "grad_norm": 7.967733860015869, "learning_rate": 5.15718462659364e-05, "loss": 1.0197, "step": 184100 }, { "epoch": 2.5378192940398447, "grad_norm": 11.0763521194458, "learning_rate": 5.1567969712126904e-05, "loss": 0.8719, "step": 184200 }, { "epoch": 2.539197046099584, "grad_norm": 5.825921535491943, "learning_rate": 5.1564091113620266e-05, "loss": 0.8551, "step": 184300 }, { "epoch": 2.5405747981593234, "grad_norm": 8.032723426818848, "learning_rate": 5.156021047074596e-05, "loss": 0.8672, "step": 184400 }, { "epoch": 2.5419525502190625, "grad_norm": 27.34185791015625, "learning_rate": 5.15563277838337e-05, "loss": 0.9162, "step": 184500 }, { "epoch": 2.543330302278802, "grad_norm": 13.580251693725586, "learning_rate": 
5.1552443053213316e-05, "loss": 0.9123, "step": 184600 }, { "epoch": 2.544708054338541, "grad_norm": 13.947186470031738, "learning_rate": 5.154855627921483e-05, "loss": 0.9009, "step": 184700 }, { "epoch": 2.5460858063982803, "grad_norm": 14.822541236877441, "learning_rate": 5.1544667462168475e-05, "loss": 0.9732, "step": 184800 }, { "epoch": 2.54746355845802, "grad_norm": 28.581523895263672, "learning_rate": 5.1540776602404595e-05, "loss": 0.9473, "step": 184900 }, { "epoch": 2.5488413105177594, "grad_norm": 80.29841613769531, "learning_rate": 5.1536883700253764e-05, "loss": 0.8771, "step": 185000 }, { "epoch": 2.5502190625774985, "grad_norm": 1.720413327217102, "learning_rate": 5.1532988756046684e-05, "loss": 0.9222, "step": 185100 }, { "epoch": 2.5515968146372376, "grad_norm": 6.913283348083496, "learning_rate": 5.1529091770114254e-05, "loss": 0.9653, "step": 185200 }, { "epoch": 2.552974566696977, "grad_norm": 18.21382713317871, "learning_rate": 5.152519274278755e-05, "loss": 0.837, "step": 185300 }, { "epoch": 2.5543523187567168, "grad_norm": 8.810418128967285, "learning_rate": 5.1521291674397807e-05, "loss": 0.9495, "step": 185400 }, { "epoch": 2.555730070816456, "grad_norm": 30.632896423339844, "learning_rate": 5.1517388565276446e-05, "loss": 0.947, "step": 185500 }, { "epoch": 2.557107822876195, "grad_norm": 21.800125122070312, "learning_rate": 5.151348341575506e-05, "loss": 0.9462, "step": 185600 }, { "epoch": 2.5584855749359345, "grad_norm": 4.775737285614014, "learning_rate": 5.15095762261654e-05, "loss": 0.9657, "step": 185700 }, { "epoch": 2.559863326995674, "grad_norm": 6.699219703674316, "learning_rate": 5.15056669968394e-05, "loss": 0.888, "step": 185800 }, { "epoch": 2.561241079055413, "grad_norm": 14.519498825073242, "learning_rate": 5.150179485089045e-05, "loss": 0.9088, "step": 185900 }, { "epoch": 2.5626188311151523, "grad_norm": 13.107704162597656, "learning_rate": 5.149788156347737e-05, "loss": 1.0211, "step": 186000 }, { "epoch": 
2.563996583174892, "grad_norm": 169.83856201171875, "learning_rate": 5.149396623732147e-05, "loss": 0.886, "step": 186100 }, { "epoch": 2.5653743352346314, "grad_norm": 4.7520647048950195, "learning_rate": 5.1490048872755394e-05, "loss": 0.9538, "step": 186200 }, { "epoch": 2.5667520872943705, "grad_norm": 51.831214904785156, "learning_rate": 5.148612947011193e-05, "loss": 0.9556, "step": 186300 }, { "epoch": 2.5681298393541097, "grad_norm": 21.457719802856445, "learning_rate": 5.148220802972406e-05, "loss": 0.9755, "step": 186400 }, { "epoch": 2.569507591413849, "grad_norm": 13.799973487854004, "learning_rate": 5.147828455192492e-05, "loss": 0.9267, "step": 186500 }, { "epoch": 2.5708853434735888, "grad_norm": 4.978957653045654, "learning_rate": 5.147435903704784e-05, "loss": 0.8939, "step": 186600 }, { "epoch": 2.572263095533328, "grad_norm": 44.25135803222656, "learning_rate": 5.147043148542631e-05, "loss": 0.9141, "step": 186700 }, { "epoch": 2.573640847593067, "grad_norm": 6.83934211730957, "learning_rate": 5.1466501897393984e-05, "loss": 0.7704, "step": 186800 }, { "epoch": 2.5750185996528066, "grad_norm": 18.938186645507812, "learning_rate": 5.146257027328471e-05, "loss": 0.8558, "step": 186900 }, { "epoch": 2.5763963517125457, "grad_norm": 10.271759033203125, "learning_rate": 5.1458636613432517e-05, "loss": 0.8964, "step": 187000 }, { "epoch": 2.577774103772285, "grad_norm": 9.096436500549316, "learning_rate": 5.145470091817155e-05, "loss": 0.8865, "step": 187100 }, { "epoch": 2.5791518558320243, "grad_norm": 102.71835327148438, "learning_rate": 5.145076318783621e-05, "loss": 1.016, "step": 187200 }, { "epoch": 2.580529607891764, "grad_norm": 20.117971420288086, "learning_rate": 5.144682342276099e-05, "loss": 0.9383, "step": 187300 }, { "epoch": 2.581907359951503, "grad_norm": 9.525062561035156, "learning_rate": 5.1442881623280605e-05, "loss": 1.0051, "step": 187400 }, { "epoch": 2.5832851120112426, "grad_norm": 4.5403666496276855, "learning_rate": 
5.143893778972995e-05, "loss": 0.7994, "step": 187500 }, { "epoch": 2.5846628640709817, "grad_norm": 27.31873893737793, "learning_rate": 5.1434991922444053e-05, "loss": 0.9004, "step": 187600 }, { "epoch": 2.5860406161307212, "grad_norm": 4.647820949554443, "learning_rate": 5.1431044021758145e-05, "loss": 0.9793, "step": 187700 }, { "epoch": 2.5874183681904603, "grad_norm": 19.6611270904541, "learning_rate": 5.142709408800761e-05, "loss": 1.0231, "step": 187800 }, { "epoch": 2.5887961202502, "grad_norm": 54.47097396850586, "learning_rate": 5.1423142121528026e-05, "loss": 0.9445, "step": 187900 }, { "epoch": 2.590173872309939, "grad_norm": 6.3395609855651855, "learning_rate": 5.14192276727031e-05, "loss": 0.8839, "step": 188000 }, { "epoch": 2.5915516243696786, "grad_norm": 14.5242919921875, "learning_rate": 5.141527166209171e-05, "loss": 0.9234, "step": 188100 }, { "epoch": 2.5929293764294177, "grad_norm": 19.29517936706543, "learning_rate": 5.1411313619755646e-05, "loss": 0.8739, "step": 188200 }, { "epoch": 2.5943071284891572, "grad_norm": 23.12270736694336, "learning_rate": 5.140735354603116e-05, "loss": 0.9486, "step": 188300 }, { "epoch": 2.5956848805488963, "grad_norm": 6.882327556610107, "learning_rate": 5.140339144125468e-05, "loss": 0.8628, "step": 188400 }, { "epoch": 2.597062632608636, "grad_norm": 314.453857421875, "learning_rate": 5.139946695716866e-05, "loss": 0.9973, "step": 188500 }, { "epoch": 2.598440384668375, "grad_norm": 4.135373115539551, "learning_rate": 5.139550081160029e-05, "loss": 0.9938, "step": 188600 }, { "epoch": 2.5998181367281146, "grad_norm": 18.16997528076172, "learning_rate": 5.139153263598688e-05, "loss": 0.8673, "step": 188700 }, { "epoch": 2.6011958887878537, "grad_norm": 20.263628005981445, "learning_rate": 5.138756243066554e-05, "loss": 0.9005, "step": 188800 }, { "epoch": 2.6025736408475932, "grad_norm": 40.79197692871094, "learning_rate": 5.138359019597356e-05, "loss": 0.9471, "step": 188900 }, { "epoch": 
2.6039513929073324, "grad_norm": 11.931047439575195, "learning_rate": 5.137961593224842e-05, "loss": 0.9217, "step": 189000 }, { "epoch": 2.6053291449670715, "grad_norm": 10.099672317504883, "learning_rate": 5.137563963982772e-05, "loss": 0.9391, "step": 189100 }, { "epoch": 2.606706897026811, "grad_norm": 7.533517837524414, "learning_rate": 5.137166131904929e-05, "loss": 0.9268, "step": 189200 }, { "epoch": 2.6080846490865506, "grad_norm": 18.0084228515625, "learning_rate": 5.1367680970251106e-05, "loss": 0.9717, "step": 189300 }, { "epoch": 2.6094624011462897, "grad_norm": 18.37828254699707, "learning_rate": 5.136369859377133e-05, "loss": 0.8855, "step": 189400 }, { "epoch": 2.610840153206029, "grad_norm": 12.89180850982666, "learning_rate": 5.135971418994826e-05, "loss": 0.9168, "step": 189500 }, { "epoch": 2.6122179052657684, "grad_norm": 17.849620819091797, "learning_rate": 5.1355767633461244e-05, "loss": 0.9399, "step": 189600 }, { "epoch": 2.613595657325508, "grad_norm": 32.4993782043457, "learning_rate": 5.1351779196232266e-05, "loss": 0.9172, "step": 189700 }, { "epoch": 2.614973409385247, "grad_norm": 2.5051157474517822, "learning_rate": 5.13477887326726e-05, "loss": 0.9514, "step": 189800 }, { "epoch": 2.616351161444986, "grad_norm": 4.378296852111816, "learning_rate": 5.134379624312129e-05, "loss": 1.0027, "step": 189900 }, { "epoch": 2.6177289135047257, "grad_norm": 7.234669208526611, "learning_rate": 5.133980172791751e-05, "loss": 0.9112, "step": 190000 }, { "epoch": 2.6191066655644653, "grad_norm": 8.032721519470215, "learning_rate": 5.1335805187400596e-05, "loss": 0.9584, "step": 190100 }, { "epoch": 2.6204844176242044, "grad_norm": 9.139363288879395, "learning_rate": 5.1331806621910094e-05, "loss": 0.9918, "step": 190200 }, { "epoch": 2.6218621696839435, "grad_norm": 8.141135215759277, "learning_rate": 5.13278060317857e-05, "loss": 0.9646, "step": 190300 }, { "epoch": 2.623239921743683, "grad_norm": 12.721752166748047, "learning_rate": 
5.132380341736727e-05, "loss": 0.8484, "step": 190400 }, { "epoch": 2.6246176738034226, "grad_norm": 1.455106496810913, "learning_rate": 5.1319798778994874e-05, "loss": 0.9257, "step": 190500 }, { "epoch": 2.6259954258631617, "grad_norm": 9.383966445922852, "learning_rate": 5.1315792117008703e-05, "loss": 0.8106, "step": 190600 }, { "epoch": 2.627373177922901, "grad_norm": 7.546175003051758, "learning_rate": 5.131178343174915e-05, "loss": 0.9302, "step": 190700 }, { "epoch": 2.6287509299826404, "grad_norm": 18.524019241333008, "learning_rate": 5.1307772723556774e-05, "loss": 0.8488, "step": 190800 }, { "epoch": 2.63012868204238, "grad_norm": 6.301628589630127, "learning_rate": 5.130375999277231e-05, "loss": 0.9353, "step": 190900 }, { "epoch": 2.631506434102119, "grad_norm": 6.803900718688965, "learning_rate": 5.1299745239736646e-05, "loss": 1.0234, "step": 191000 }, { "epoch": 2.632884186161858, "grad_norm": 11.359065055847168, "learning_rate": 5.129572846479088e-05, "loss": 0.9681, "step": 191100 }, { "epoch": 2.6342619382215977, "grad_norm": 7.186391830444336, "learning_rate": 5.129170966827623e-05, "loss": 0.9555, "step": 191200 }, { "epoch": 2.635639690281337, "grad_norm": 35.95524597167969, "learning_rate": 5.128768885053413e-05, "loss": 0.9275, "step": 191300 }, { "epoch": 2.6370174423410764, "grad_norm": 10.369431495666504, "learning_rate": 5.1283666011906165e-05, "loss": 0.8901, "step": 191400 }, { "epoch": 2.6383951944008155, "grad_norm": 6.025608062744141, "learning_rate": 5.1279641152734096e-05, "loss": 0.9045, "step": 191500 }, { "epoch": 2.639772946460555, "grad_norm": 16.589677810668945, "learning_rate": 5.127561427335986e-05, "loss": 0.9506, "step": 191600 }, { "epoch": 2.641150698520294, "grad_norm": 61.594337463378906, "learning_rate": 5.1271585374125547e-05, "loss": 1.0512, "step": 191700 }, { "epoch": 2.6425284505800337, "grad_norm": 14.661096572875977, "learning_rate": 5.126755445537345e-05, "loss": 1.042, "step": 191800 }, { "epoch": 
2.643906202639773, "grad_norm": 6.875641345977783, "learning_rate": 5.1263521517446e-05, "loss": 1.0386, "step": 191900 }, { "epoch": 2.6452839546995124, "grad_norm": 19.642898559570312, "learning_rate": 5.1259486560685824e-05, "loss": 0.9108, "step": 192000 }, { "epoch": 2.6466617067592515, "grad_norm": 3.6995697021484375, "learning_rate": 5.125544958543572e-05, "loss": 0.925, "step": 192100 }, { "epoch": 2.648039458818991, "grad_norm": 25.81755256652832, "learning_rate": 5.125141059203864e-05, "loss": 0.992, "step": 192200 }, { "epoch": 2.64941721087873, "grad_norm": 16.51807975769043, "learning_rate": 5.124736958083771e-05, "loss": 0.9433, "step": 192300 }, { "epoch": 2.6507949629384697, "grad_norm": 17.831552505493164, "learning_rate": 5.1243326552176254e-05, "loss": 0.964, "step": 192400 }, { "epoch": 2.652172714998209, "grad_norm": 28.03275489807129, "learning_rate": 5.1239321966839124e-05, "loss": 0.8869, "step": 192500 }, { "epoch": 2.6535504670579484, "grad_norm": 7.905217170715332, "learning_rate": 5.123527492445322e-05, "loss": 0.9856, "step": 192600 }, { "epoch": 2.6549282191176875, "grad_norm": 52.83687210083008, "learning_rate": 5.1231225865634286e-05, "loss": 0.9353, "step": 192700 }, { "epoch": 2.656305971177427, "grad_norm": 16.814350128173828, "learning_rate": 5.12271747907263e-05, "loss": 0.8925, "step": 192800 }, { "epoch": 2.657683723237166, "grad_norm": 23.393108367919922, "learning_rate": 5.122312170007344e-05, "loss": 0.9281, "step": 192900 }, { "epoch": 2.6590614752969057, "grad_norm": 27.19084930419922, "learning_rate": 5.121906659402001e-05, "loss": 0.9208, "step": 193000 }, { "epoch": 2.660439227356645, "grad_norm": 8.059378623962402, "learning_rate": 5.121500947291054e-05, "loss": 1.0676, "step": 193100 }, { "epoch": 2.6618169794163844, "grad_norm": 36.84059524536133, "learning_rate": 5.121095033708969e-05, "loss": 0.9761, "step": 193200 }, { "epoch": 2.6631947314761235, "grad_norm": 12.709820747375488, "learning_rate": 
5.12068891869023e-05, "loss": 1.0557, "step": 193300 }, { "epoch": 2.6645724835358626, "grad_norm": 18.64215087890625, "learning_rate": 5.120282602269339e-05, "loss": 0.9581, "step": 193400 }, { "epoch": 2.665950235595602, "grad_norm": 12.242551803588867, "learning_rate": 5.119876084480814e-05, "loss": 0.9817, "step": 193500 }, { "epoch": 2.6673279876553417, "grad_norm": 31.557125091552734, "learning_rate": 5.119469365359192e-05, "loss": 0.9564, "step": 193600 }, { "epoch": 2.668705739715081, "grad_norm": 33.50242614746094, "learning_rate": 5.119062444939026e-05, "loss": 0.9729, "step": 193700 }, { "epoch": 2.67008349177482, "grad_norm": 7.800500869750977, "learning_rate": 5.1186553232548844e-05, "loss": 0.854, "step": 193800 }, { "epoch": 2.6714612438345595, "grad_norm": 89.7582015991211, "learning_rate": 5.1182480003413555e-05, "loss": 1.0125, "step": 193900 }, { "epoch": 2.672838995894299, "grad_norm": 50.66563415527344, "learning_rate": 5.117840476233043e-05, "loss": 1.0552, "step": 194000 }, { "epoch": 2.674216747954038, "grad_norm": 10.461341857910156, "learning_rate": 5.117432750964568e-05, "loss": 0.9723, "step": 194100 }, { "epoch": 2.6755945000137773, "grad_norm": 126.65116119384766, "learning_rate": 5.117024824570569e-05, "loss": 1.0072, "step": 194200 }, { "epoch": 2.676972252073517, "grad_norm": 23.27111053466797, "learning_rate": 5.116616697085702e-05, "loss": 0.9332, "step": 194300 }, { "epoch": 2.6783500041332564, "grad_norm": 14.857584953308105, "learning_rate": 5.116208368544639e-05, "loss": 1.0109, "step": 194400 }, { "epoch": 2.6797277561929955, "grad_norm": 3.397592544555664, "learning_rate": 5.11579983898207e-05, "loss": 0.9242, "step": 194500 }, { "epoch": 2.6811055082527346, "grad_norm": 4.395207405090332, "learning_rate": 5.1153911084327004e-05, "loss": 0.9252, "step": 194600 }, { "epoch": 2.682483260312474, "grad_norm": 5.647323131561279, "learning_rate": 5.114982176931255e-05, "loss": 0.9692, "step": 194700 }, { "epoch": 
2.6838610123722138, "grad_norm": 10.908419609069824, "learning_rate": 5.114573044512475e-05, "loss": 0.916, "step": 194800 }, { "epoch": 2.685238764431953, "grad_norm": 13.371127128601074, "learning_rate": 5.114163711211117e-05, "loss": 0.955, "step": 194900 }, { "epoch": 2.686616516491692, "grad_norm": 8.732426643371582, "learning_rate": 5.1137541770619586e-05, "loss": 0.9925, "step": 195000 }, { "epoch": 2.6879942685514315, "grad_norm": 4.614773750305176, "learning_rate": 5.113344442099789e-05, "loss": 0.9404, "step": 195100 }, { "epoch": 2.689372020611171, "grad_norm": 7.362372398376465, "learning_rate": 5.112934506359417e-05, "loss": 0.9214, "step": 195200 }, { "epoch": 2.69074977267091, "grad_norm": 7.540863037109375, "learning_rate": 5.1125243698756713e-05, "loss": 0.9476, "step": 195300 }, { "epoch": 2.6921275247306493, "grad_norm": 15.06735610961914, "learning_rate": 5.1121140326833934e-05, "loss": 0.9528, "step": 195400 }, { "epoch": 2.693505276790389, "grad_norm": 3.328019857406616, "learning_rate": 5.111703494817444e-05, "loss": 1.0005, "step": 195500 }, { "epoch": 2.694883028850128, "grad_norm": 11.551410675048828, "learning_rate": 5.111292756312701e-05, "loss": 0.9917, "step": 195600 }, { "epoch": 2.6962607809098675, "grad_norm": 18.742382049560547, "learning_rate": 5.110881817204057e-05, "loss": 0.9887, "step": 195700 }, { "epoch": 2.6976385329696067, "grad_norm": 3.999192476272583, "learning_rate": 5.110470677526425e-05, "loss": 0.9466, "step": 195800 }, { "epoch": 2.699016285029346, "grad_norm": 6.355814456939697, "learning_rate": 5.1100593373147325e-05, "loss": 0.9488, "step": 195900 }, { "epoch": 2.7003940370890853, "grad_norm": 12.663365364074707, "learning_rate": 5.109647796603925e-05, "loss": 0.9965, "step": 196000 }, { "epoch": 2.701771789148825, "grad_norm": 47.599945068359375, "learning_rate": 5.1092360554289656e-05, "loss": 0.9193, "step": 196100 }, { "epoch": 2.703149541208564, "grad_norm": 31.9816837310791, "learning_rate": 
5.108824113824835e-05, "loss": 0.9268, "step": 196200 }, { "epoch": 2.7045272932683035, "grad_norm": 7.92744779586792, "learning_rate": 5.1084119718265264e-05, "loss": 0.9374, "step": 196300 }, { "epoch": 2.7059050453280427, "grad_norm": 10.099730491638184, "learning_rate": 5.1079996294690556e-05, "loss": 1.0393, "step": 196400 }, { "epoch": 2.707282797387782, "grad_norm": 35.509029388427734, "learning_rate": 5.107587086787453e-05, "loss": 0.991, "step": 196500 }, { "epoch": 2.7086605494475213, "grad_norm": 24.436607360839844, "learning_rate": 5.107174343816766e-05, "loss": 0.9262, "step": 196600 }, { "epoch": 2.710038301507261, "grad_norm": 33.455020904541016, "learning_rate": 5.1067614005920595e-05, "loss": 0.9521, "step": 196700 }, { "epoch": 2.711416053567, "grad_norm": 11.710322380065918, "learning_rate": 5.106348257148415e-05, "loss": 0.874, "step": 196800 }, { "epoch": 2.7127938056267396, "grad_norm": 22.525588989257812, "learning_rate": 5.105934913520931e-05, "loss": 1.0296, "step": 196900 }, { "epoch": 2.7141715576864787, "grad_norm": 6.8205976486206055, "learning_rate": 5.105521369744723e-05, "loss": 0.9052, "step": 197000 }, { "epoch": 2.715549309746218, "grad_norm": 10.318865776062012, "learning_rate": 5.1051076258549236e-05, "loss": 0.9833, "step": 197100 }, { "epoch": 2.7169270618059573, "grad_norm": 14.404354095458984, "learning_rate": 5.104693681886684e-05, "loss": 0.8851, "step": 197200 }, { "epoch": 2.718304813865697, "grad_norm": 9.110835075378418, "learning_rate": 5.104279537875168e-05, "loss": 0.8997, "step": 197300 }, { "epoch": 2.719682565925436, "grad_norm": 35.47287368774414, "learning_rate": 5.103865193855561e-05, "loss": 0.9696, "step": 197400 }, { "epoch": 2.7210603179851756, "grad_norm": 23.111448287963867, "learning_rate": 5.10345479629274e-05, "loss": 1.0111, "step": 197500 }, { "epoch": 2.7224380700449147, "grad_norm": 7.040343284606934, "learning_rate": 5.103040054361772e-05, "loss": 0.8598, "step": 197600 }, { "epoch": 
2.723815822104654, "grad_norm": 27.314756393432617, "learning_rate": 5.1026251125280146e-05, "loss": 0.989, "step": 197700 }, { "epoch": 2.7251935741643933, "grad_norm": 9.709699630737305, "learning_rate": 5.102209970826717e-05, "loss": 0.8676, "step": 197800 }, { "epoch": 2.726571326224133, "grad_norm": 58.49681091308594, "learning_rate": 5.101794629293148e-05, "loss": 0.8694, "step": 197900 }, { "epoch": 2.727949078283872, "grad_norm": 2.6799161434173584, "learning_rate": 5.1013790879625944e-05, "loss": 0.9155, "step": 198000 }, { "epoch": 2.729326830343611, "grad_norm": 2.6764700412750244, "learning_rate": 5.100963346870358e-05, "loss": 0.9121, "step": 198100 }, { "epoch": 2.7307045824033507, "grad_norm": 9.626672744750977, "learning_rate": 5.100547406051757e-05, "loss": 0.8487, "step": 198200 }, { "epoch": 2.7320823344630902, "grad_norm": 23.776607513427734, "learning_rate": 5.100131265542129e-05, "loss": 1.0353, "step": 198300 }, { "epoch": 2.7334600865228293, "grad_norm": 6.544618606567383, "learning_rate": 5.0997232541365294e-05, "loss": 0.9686, "step": 198400 }, { "epoch": 2.7348378385825685, "grad_norm": 126.95794677734375, "learning_rate": 5.0993067183429824e-05, "loss": 0.9406, "step": 198500 }, { "epoch": 2.736215590642308, "grad_norm": 11.303043365478516, "learning_rate": 5.098889982963811e-05, "loss": 0.9429, "step": 198600 }, { "epoch": 2.7375933427020476, "grad_norm": 59.51509094238281, "learning_rate": 5.098473048034419e-05, "loss": 0.8943, "step": 198700 }, { "epoch": 2.7389710947617867, "grad_norm": 55.9721565246582, "learning_rate": 5.0980559135902254e-05, "loss": 0.9932, "step": 198800 }, { "epoch": 2.740348846821526, "grad_norm": 16.333709716796875, "learning_rate": 5.097638579666671e-05, "loss": 0.914, "step": 198900 }, { "epoch": 2.7417265988812654, "grad_norm": 100.32839965820312, "learning_rate": 5.097221046299208e-05, "loss": 0.9003, "step": 199000 }, { "epoch": 2.743104350941005, "grad_norm": 11.707318305969238, "learning_rate": 
5.0968033135233093e-05, "loss": 0.9718, "step": 199100 }, { "epoch": 2.744482103000744, "grad_norm": 24.461414337158203, "learning_rate": 5.096385381374462e-05, "loss": 1.0454, "step": 199200 }, { "epoch": 2.745859855060483, "grad_norm": 23.23276710510254, "learning_rate": 5.0959672498881734e-05, "loss": 1.017, "step": 199300 }, { "epoch": 2.7472376071202227, "grad_norm": 7.538372039794922, "learning_rate": 5.095548919099965e-05, "loss": 1.0469, "step": 199400 }, { "epoch": 2.7486153591799622, "grad_norm": 6.8459296226501465, "learning_rate": 5.0951303890453756e-05, "loss": 0.9409, "step": 199500 }, { "epoch": 2.7499931112397014, "grad_norm": 14.233335494995117, "learning_rate": 5.094711659759962e-05, "loss": 0.9833, "step": 199600 }, { "epoch": 2.7513708632994405, "grad_norm": 12.74863052368164, "learning_rate": 5.094292731279298e-05, "loss": 0.8698, "step": 199700 }, { "epoch": 2.75274861535918, "grad_norm": 4.602093696594238, "learning_rate": 5.0938736036389734e-05, "loss": 1.0049, "step": 199800 }, { "epoch": 2.754126367418919, "grad_norm": 5.2265753746032715, "learning_rate": 5.093454276874594e-05, "loss": 1.1012, "step": 199900 }, { "epoch": 2.7555041194786587, "grad_norm": 66.78414916992188, "learning_rate": 5.093034751021785e-05, "loss": 0.9655, "step": 200000 }, { "epoch": 2.756881871538398, "grad_norm": 95.28353118896484, "learning_rate": 5.092615026116188e-05, "loss": 1.0562, "step": 200100 }, { "epoch": 2.7582596235981374, "grad_norm": 15.570234298706055, "learning_rate": 5.092195102193459e-05, "loss": 0.9923, "step": 200200 }, { "epoch": 2.7596373756578765, "grad_norm": 30.775211334228516, "learning_rate": 5.091774979289274e-05, "loss": 1.049, "step": 200300 }, { "epoch": 2.761015127717616, "grad_norm": 18.46593475341797, "learning_rate": 5.0913546574393234e-05, "loss": 1.0008, "step": 200400 }, { "epoch": 2.762392879777355, "grad_norm": 22.81820297241211, "learning_rate": 5.090934136679317e-05, "loss": 1.1333, "step": 200500 }, { "epoch": 
2.7637706318370947, "grad_norm": 47.288787841796875, "learning_rate": 5.090513417044979e-05, "loss": 1.2612, "step": 200600 }, { "epoch": 2.765148383896834, "grad_norm": 11.055154800415039, "learning_rate": 5.0900924985720516e-05, "loss": 1.0377, "step": 200700 }, { "epoch": 2.7665261359565734, "grad_norm": 13.395950317382812, "learning_rate": 5.0896713812962964e-05, "loss": 1.0821, "step": 200800 }, { "epoch": 2.7679038880163125, "grad_norm": 13.816061019897461, "learning_rate": 5.089250065253486e-05, "loss": 1.0163, "step": 200900 }, { "epoch": 2.769281640076052, "grad_norm": 30.19580078125, "learning_rate": 5.088828550479416e-05, "loss": 1.0871, "step": 201000 }, { "epoch": 2.770659392135791, "grad_norm": 28.16486358642578, "learning_rate": 5.088406837009895e-05, "loss": 1.0917, "step": 201100 }, { "epoch": 2.7720371441955307, "grad_norm": 31.24152374267578, "learning_rate": 5.0879849248807506e-05, "loss": 0.9505, "step": 201200 }, { "epoch": 2.77341489625527, "grad_norm": 68.79824829101562, "learning_rate": 5.0875628141278246e-05, "loss": 1.0807, "step": 201300 }, { "epoch": 2.7747926483150094, "grad_norm": 33.98255157470703, "learning_rate": 5.087140504786979e-05, "loss": 1.0745, "step": 201400 }, { "epoch": 2.7761704003747485, "grad_norm": 14.746597290039062, "learning_rate": 5.0867179968940906e-05, "loss": 1.0066, "step": 201500 }, { "epoch": 2.777548152434488, "grad_norm": 147.15943908691406, "learning_rate": 5.086299518531682e-05, "loss": 1.0723, "step": 201600 }, { "epoch": 2.778925904494227, "grad_norm": 25.66536521911621, "learning_rate": 5.085876615627033e-05, "loss": 1.0457, "step": 201700 }, { "epoch": 2.7803036565539667, "grad_norm": 10.483482360839844, "learning_rate": 5.085457746273391e-05, "loss": 0.9462, "step": 201800 }, { "epoch": 2.781681408613706, "grad_norm": 16.32695770263672, "learning_rate": 5.0850344484992584e-05, "loss": 1.0422, "step": 201900 }, { "epoch": 2.783059160673445, "grad_norm": 8.703271865844727, "learning_rate": 
5.084610952352003e-05, "loss": 1.0052, "step": 202000 }, { "epoch": 2.7844369127331845, "grad_norm": 11.93905258178711, "learning_rate": 5.0841872578676045e-05, "loss": 0.9117, "step": 202100 }, { "epoch": 2.785814664792924, "grad_norm": 8.124862670898438, "learning_rate": 5.0837633650820556e-05, "loss": 0.9941, "step": 202200 }, { "epoch": 2.787192416852663, "grad_norm": 29.387733459472656, "learning_rate": 5.08333927403137e-05, "loss": 0.9597, "step": 202300 }, { "epoch": 2.7885701689124023, "grad_norm": 64.52640533447266, "learning_rate": 5.0829149847515754e-05, "loss": 0.957, "step": 202400 }, { "epoch": 2.789947920972142, "grad_norm": 13.357098579406738, "learning_rate": 5.082490497278717e-05, "loss": 1.0003, "step": 202500 }, { "epoch": 2.7913256730318814, "grad_norm": 14.107404708862305, "learning_rate": 5.08206581164886e-05, "loss": 1.0488, "step": 202600 }, { "epoch": 2.7927034250916205, "grad_norm": 28.695308685302734, "learning_rate": 5.08164092789808e-05, "loss": 1.0696, "step": 202700 }, { "epoch": 2.7940811771513596, "grad_norm": 7.728165149688721, "learning_rate": 5.081215846062475e-05, "loss": 1.2595, "step": 202800 }, { "epoch": 2.795458929211099, "grad_norm": 41.977813720703125, "learning_rate": 5.0807905661781575e-05, "loss": 1.1636, "step": 202900 }, { "epoch": 2.7968366812708387, "grad_norm": 32.05202865600586, "learning_rate": 5.080365088281257e-05, "loss": 0.9333, "step": 203000 }, { "epoch": 2.798214433330578, "grad_norm": 58.59223175048828, "learning_rate": 5.0799394124079204e-05, "loss": 0.9932, "step": 203100 }, { "epoch": 2.799592185390317, "grad_norm": 14.666093826293945, "learning_rate": 5.07951353859431e-05, "loss": 1.0129, "step": 203200 }, { "epoch": 2.8009699374500565, "grad_norm": 402.260009765625, "learning_rate": 5.079087466876607e-05, "loss": 1.0459, "step": 203300 }, { "epoch": 2.802347689509796, "grad_norm": 15.870574951171875, "learning_rate": 5.078661197291009e-05, "loss": 1.0175, "step": 203400 }, { "epoch": 
2.803725441569535, "grad_norm": 10.65597152709961, "learning_rate": 5.078234729873729e-05, "loss": 1.0481, "step": 203500 }, { "epoch": 2.8051031936292743, "grad_norm": 49.02125549316406, "learning_rate": 5.077808064660997e-05, "loss": 1.0732, "step": 203600 }, { "epoch": 2.806480945689014, "grad_norm": 27.337841033935547, "learning_rate": 5.077381201689061e-05, "loss": 1.0755, "step": 203700 }, { "epoch": 2.8078586977487534, "grad_norm": 7.510376930236816, "learning_rate": 5.076954140994185e-05, "loss": 1.1307, "step": 203800 }, { "epoch": 2.8092364498084925, "grad_norm": 4.652828216552734, "learning_rate": 5.0765268826126506e-05, "loss": 1.0991, "step": 203900 }, { "epoch": 2.8106142018682316, "grad_norm": 27.985675811767578, "learning_rate": 5.076099426580754e-05, "loss": 1.162, "step": 204000 }, { "epoch": 2.811991953927971, "grad_norm": 14.408230781555176, "learning_rate": 5.075671772934812e-05, "loss": 1.0234, "step": 204100 }, { "epoch": 2.8133697059877103, "grad_norm": 8.758142471313477, "learning_rate": 5.0752439217111545e-05, "loss": 1.076, "step": 204200 }, { "epoch": 2.81474745804745, "grad_norm": 38.64096450805664, "learning_rate": 5.074815872946129e-05, "loss": 0.9636, "step": 204300 }, { "epoch": 2.816125210107189, "grad_norm": 18.52665138244629, "learning_rate": 5.074387626676103e-05, "loss": 1.144, "step": 204400 }, { "epoch": 2.8175029621669285, "grad_norm": 19.902381896972656, "learning_rate": 5.073959182937455e-05, "loss": 0.9249, "step": 204500 }, { "epoch": 2.8188807142266676, "grad_norm": 182.37232971191406, "learning_rate": 5.0735305417665864e-05, "loss": 1.0091, "step": 204600 }, { "epoch": 2.820258466286407, "grad_norm": 12.161040306091309, "learning_rate": 5.073101703199911e-05, "loss": 1.0376, "step": 204700 }, { "epoch": 2.8216362183461463, "grad_norm": 17.16762924194336, "learning_rate": 5.0726726672738606e-05, "loss": 0.9096, "step": 204800 }, { "epoch": 2.823013970405886, "grad_norm": 6.061741828918457, "learning_rate": 
5.072243434024885e-05, "loss": 0.9391, "step": 204900 }, { "epoch": 2.824391722465625, "grad_norm": 22.281923294067383, "learning_rate": 5.0718140034894485e-05, "loss": 0.9729, "step": 205000 }, { "epoch": 2.8257694745253645, "grad_norm": 4.719633102416992, "learning_rate": 5.0713843757040345e-05, "loss": 0.8728, "step": 205100 }, { "epoch": 2.8271472265851036, "grad_norm": 100.57250213623047, "learning_rate": 5.0709545507051414e-05, "loss": 0.9244, "step": 205200 }, { "epoch": 2.828524978644843, "grad_norm": 8.920479774475098, "learning_rate": 5.070524528529285e-05, "loss": 0.9892, "step": 205300 }, { "epoch": 2.8299027307045823, "grad_norm": 5.643247604370117, "learning_rate": 5.070094309212999e-05, "loss": 0.9452, "step": 205400 }, { "epoch": 2.831280482764322, "grad_norm": 4.509352207183838, "learning_rate": 5.069663892792831e-05, "loss": 0.9497, "step": 205500 }, { "epoch": 2.832658234824061, "grad_norm": 8.125088691711426, "learning_rate": 5.069233279305349e-05, "loss": 0.8737, "step": 205600 }, { "epoch": 2.8340359868838005, "grad_norm": 10.599855422973633, "learning_rate": 5.068802468787134e-05, "loss": 0.9742, "step": 205700 }, { "epoch": 2.8354137389435397, "grad_norm": 8.889613151550293, "learning_rate": 5.068371461274787e-05, "loss": 0.9359, "step": 205800 }, { "epoch": 2.836791491003279, "grad_norm": 19.9876651763916, "learning_rate": 5.067940256804923e-05, "loss": 0.8721, "step": 205900 }, { "epoch": 2.8381692430630183, "grad_norm": 3.8612754344940186, "learning_rate": 5.067513170402722e-05, "loss": 0.9702, "step": 206000 }, { "epoch": 2.839546995122758, "grad_norm": 10.275188446044922, "learning_rate": 5.067081574096402e-05, "loss": 1.1416, "step": 206100 }, { "epoch": 2.840924747182497, "grad_norm": 5.825127124786377, "learning_rate": 5.066649780942148e-05, "loss": 0.8829, "step": 206200 }, { "epoch": 2.842302499242236, "grad_norm": 14.092430114746094, "learning_rate": 5.066217790976644e-05, "loss": 1.0462, "step": 206300 }, { "epoch": 
2.8436802513019757, "grad_norm": 10.614217758178711, "learning_rate": 5.0657856042365886e-05, "loss": 0.9353, "step": 206400 }, { "epoch": 2.845058003361715, "grad_norm": 17.03887176513672, "learning_rate": 5.065353220758699e-05, "loss": 0.9406, "step": 206500 }, { "epoch": 2.8464357554214543, "grad_norm": 82.23941802978516, "learning_rate": 5.064920640579707e-05, "loss": 0.8721, "step": 206600 }, { "epoch": 2.8478135074811934, "grad_norm": 12.375205039978027, "learning_rate": 5.064487863736364e-05, "loss": 0.8346, "step": 206700 }, { "epoch": 2.849191259540933, "grad_norm": 16.048620223999023, "learning_rate": 5.064054890265437e-05, "loss": 0.9715, "step": 206800 }, { "epoch": 2.8505690116006726, "grad_norm": 16.478927612304688, "learning_rate": 5.063621720203708e-05, "loss": 1.0036, "step": 206900 }, { "epoch": 2.8519467636604117, "grad_norm": 16.252845764160156, "learning_rate": 5.063188353587978e-05, "loss": 0.8991, "step": 207000 }, { "epoch": 2.853324515720151, "grad_norm": 15.043439865112305, "learning_rate": 5.062754790455062e-05, "loss": 0.9867, "step": 207100 }, { "epoch": 2.8547022677798903, "grad_norm": 11.19311237335205, "learning_rate": 5.0623210308417944e-05, "loss": 0.9143, "step": 207200 }, { "epoch": 2.85608001983963, "grad_norm": 11.718269348144531, "learning_rate": 5.061887074785026e-05, "loss": 0.9183, "step": 207300 }, { "epoch": 2.857457771899369, "grad_norm": 18.80988883972168, "learning_rate": 5.0614529223216235e-05, "loss": 0.9199, "step": 207400 }, { "epoch": 2.858835523959108, "grad_norm": 36.35650634765625, "learning_rate": 5.06101857348847e-05, "loss": 0.998, "step": 207500 }, { "epoch": 2.8602132760188477, "grad_norm": 46.12152099609375, "learning_rate": 5.060584028322465e-05, "loss": 1.0043, "step": 207600 }, { "epoch": 2.8615910280785872, "grad_norm": 6.979639053344727, "learning_rate": 5.0601492868605255e-05, "loss": 0.9806, "step": 207700 }, { "epoch": 2.8629687801383263, "grad_norm": 827.8052978515625, "learning_rate": 
5.0597143491395866e-05, "loss": 1.0127, "step": 207800 }, { "epoch": 2.8643465321980655, "grad_norm": 62.50857925415039, "learning_rate": 5.059279215196597e-05, "loss": 0.7998, "step": 207900 }, { "epoch": 2.865724284257805, "grad_norm": 41.70628356933594, "learning_rate": 5.0588438850685234e-05, "loss": 0.9539, "step": 208000 }, { "epoch": 2.8671020363175446, "grad_norm": 59.98887252807617, "learning_rate": 5.05840835879235e-05, "loss": 0.9108, "step": 208100 }, { "epoch": 2.8684797883772837, "grad_norm": 8.437341690063477, "learning_rate": 5.057972636405077e-05, "loss": 0.8788, "step": 208200 }, { "epoch": 2.869857540437023, "grad_norm": 9.392892837524414, "learning_rate": 5.057536717943721e-05, "loss": 0.8857, "step": 208300 }, { "epoch": 2.8712352924967623, "grad_norm": 6.860851764678955, "learning_rate": 5.057100603445315e-05, "loss": 1.0502, "step": 208400 }, { "epoch": 2.8726130445565015, "grad_norm": 34.85822296142578, "learning_rate": 5.056664292946911e-05, "loss": 0.9415, "step": 208500 }, { "epoch": 2.873990796616241, "grad_norm": 12.538309097290039, "learning_rate": 5.0562277864855726e-05, "loss": 0.9393, "step": 208600 }, { "epoch": 2.87536854867598, "grad_norm": 7.2170090675354, "learning_rate": 5.0557910840983865e-05, "loss": 1.0151, "step": 208700 }, { "epoch": 2.8767463007357197, "grad_norm": 181.87399291992188, "learning_rate": 5.0553541858224504e-05, "loss": 0.956, "step": 208800 }, { "epoch": 2.878124052795459, "grad_norm": 57.523582458496094, "learning_rate": 5.0549170916948834e-05, "loss": 0.9321, "step": 208900 }, { "epoch": 2.8795018048551984, "grad_norm": 19.816213607788086, "learning_rate": 5.054479801752817e-05, "loss": 0.9569, "step": 209000 }, { "epoch": 2.8808795569149375, "grad_norm": 5.597317218780518, "learning_rate": 5.054042316033402e-05, "loss": 0.9008, "step": 209100 }, { "epoch": 2.882257308974677, "grad_norm": 9.473077774047852, "learning_rate": 5.0536046345738044e-05, "loss": 1.0578, "step": 209200 }, { "epoch": 
2.883635061034416, "grad_norm": 63.83018112182617, "learning_rate": 5.053166757411207e-05, "loss": 0.9711, "step": 209300 }, { "epoch": 2.8850128130941557, "grad_norm": 7.8863043785095215, "learning_rate": 5.052728684582813e-05, "loss": 1.0625, "step": 209400 }, { "epoch": 2.886390565153895, "grad_norm": 62.159629821777344, "learning_rate": 5.052290416125835e-05, "loss": 0.9356, "step": 209500 }, { "epoch": 2.8877683172136344, "grad_norm": 10.068209648132324, "learning_rate": 5.051851952077508e-05, "loss": 0.9454, "step": 209600 }, { "epoch": 2.8891460692733735, "grad_norm": 10.165882110595703, "learning_rate": 5.0514132924750814e-05, "loss": 0.9771, "step": 209700 }, { "epoch": 2.890523821333113, "grad_norm": 7.802192211151123, "learning_rate": 5.050974437355822e-05, "loss": 0.9354, "step": 209800 }, { "epoch": 2.891901573392852, "grad_norm": 14.152220726013184, "learning_rate": 5.0505353867570126e-05, "loss": 0.9706, "step": 209900 }, { "epoch": 2.8932793254525917, "grad_norm": 28.09430503845215, "learning_rate": 5.050096140715952e-05, "loss": 0.9906, "step": 210000 }, { "epoch": 2.894657077512331, "grad_norm": 22.049821853637695, "learning_rate": 5.049656699269957e-05, "loss": 0.9114, "step": 210100 }, { "epoch": 2.8960348295720704, "grad_norm": 5.832665920257568, "learning_rate": 5.049217062456361e-05, "loss": 0.8558, "step": 210200 }, { "epoch": 2.8974125816318095, "grad_norm": 102.17266082763672, "learning_rate": 5.048781629600713e-05, "loss": 0.8317, "step": 210300 }, { "epoch": 2.898790333691549, "grad_norm": 21.945758819580078, "learning_rate": 5.0483416041167227e-05, "loss": 0.9463, "step": 210400 }, { "epoch": 2.900168085751288, "grad_norm": 5.362823963165283, "learning_rate": 5.047901383376854e-05, "loss": 0.9946, "step": 210500 }, { "epoch": 2.9015458378110273, "grad_norm": 19.6251163482666, "learning_rate": 5.047460967418507e-05, "loss": 0.929, "step": 210600 }, { "epoch": 2.902923589870767, "grad_norm": 2.2251875400543213, "learning_rate": 
5.047020356279097e-05, "loss": 0.8858, "step": 210700 }, { "epoch": 2.9043013419305064, "grad_norm": 4.737329959869385, "learning_rate": 5.0465795499960574e-05, "loss": 0.9394, "step": 210800 }, { "epoch": 2.9056790939902455, "grad_norm": 7.203439712524414, "learning_rate": 5.046138548606834e-05, "loss": 0.9765, "step": 210900 }, { "epoch": 2.9070568460499846, "grad_norm": 13.028100967407227, "learning_rate": 5.0456973521488954e-05, "loss": 0.9495, "step": 211000 }, { "epoch": 2.908434598109724, "grad_norm": 109.66581726074219, "learning_rate": 5.045255960659722e-05, "loss": 0.8785, "step": 211100 }, { "epoch": 2.9098123501694637, "grad_norm": 33.24901580810547, "learning_rate": 5.044814374176812e-05, "loss": 0.9389, "step": 211200 }, { "epoch": 2.911190102229203, "grad_norm": 176.90284729003906, "learning_rate": 5.0443725927376795e-05, "loss": 0.8288, "step": 211300 }, { "epoch": 2.912567854288942, "grad_norm": 12.923941612243652, "learning_rate": 5.043930616379859e-05, "loss": 0.9319, "step": 211400 }, { "epoch": 2.9139456063486815, "grad_norm": 20.760425567626953, "learning_rate": 5.043492867817825e-05, "loss": 0.9642, "step": 211500 }, { "epoch": 2.915323358408421, "grad_norm": 11.829825401306152, "learning_rate": 5.043050503683535e-05, "loss": 1.0146, "step": 211600 }, { "epoch": 2.91670111046816, "grad_norm": 20.66179847717285, "learning_rate": 5.042607944742873e-05, "loss": 0.9887, "step": 211700 }, { "epoch": 2.9180788625278993, "grad_norm": 8.777021408081055, "learning_rate": 5.042165191033439e-05, "loss": 0.9347, "step": 211800 }, { "epoch": 2.919456614587639, "grad_norm": 3.8520824909210205, "learning_rate": 5.041722242592844e-05, "loss": 0.9659, "step": 211900 }, { "epoch": 2.9208343666473784, "grad_norm": 8.009589195251465, "learning_rate": 5.0412790994587204e-05, "loss": 0.9064, "step": 212000 }, { "epoch": 2.9222121187071175, "grad_norm": 6.041582107543945, "learning_rate": 5.040835761668715e-05, "loss": 0.8501, "step": 212100 }, { "epoch": 
2.9235898707668566, "grad_norm": 12.624541282653809, "learning_rate": 5.040392229260492e-05, "loss": 0.8915, "step": 212200 }, { "epoch": 2.924967622826596, "grad_norm": 5.336921691894531, "learning_rate": 5.0399485022717325e-05, "loss": 0.8739, "step": 212300 }, { "epoch": 2.9263453748863357, "grad_norm": 20.81460189819336, "learning_rate": 5.039504580740131e-05, "loss": 0.9489, "step": 212400 }, { "epoch": 2.927723126946075, "grad_norm": 12.097780227661133, "learning_rate": 5.039060464703403e-05, "loss": 0.8683, "step": 212500 }, { "epoch": 2.929100879005814, "grad_norm": 15.456731796264648, "learning_rate": 5.038616154199278e-05, "loss": 1.0058, "step": 212600 }, { "epoch": 2.9304786310655535, "grad_norm": 6.603039264678955, "learning_rate": 5.038171649265502e-05, "loss": 0.9007, "step": 212700 }, { "epoch": 2.9318563831252926, "grad_norm": 12.64572525024414, "learning_rate": 5.037726949939838e-05, "loss": 1.0257, "step": 212800 }, { "epoch": 2.933234135185032, "grad_norm": 13.877869606018066, "learning_rate": 5.0372820562600654e-05, "loss": 0.9532, "step": 212900 }, { "epoch": 2.9346118872447713, "grad_norm": 7.404748916625977, "learning_rate": 5.036836968263981e-05, "loss": 0.8473, "step": 213000 }, { "epoch": 2.935989639304511, "grad_norm": 9.050270080566406, "learning_rate": 5.036391685989397e-05, "loss": 0.8868, "step": 213100 }, { "epoch": 2.93736739136425, "grad_norm": 6.573676586151123, "learning_rate": 5.0359462094741415e-05, "loss": 0.8853, "step": 213200 }, { "epoch": 2.9387451434239895, "grad_norm": 8.138660430908203, "learning_rate": 5.035500538756061e-05, "loss": 1.056, "step": 213300 }, { "epoch": 2.9401228954837286, "grad_norm": 3.7192952632904053, "learning_rate": 5.0350546738730174e-05, "loss": 0.8238, "step": 213400 }, { "epoch": 2.941500647543468, "grad_norm": 145.99395751953125, "learning_rate": 5.034608614862889e-05, "loss": 0.9578, "step": 213500 }, { "epoch": 2.9428783996032073, "grad_norm": 5.892977714538574, "learning_rate": 
5.0341623617635706e-05, "loss": 0.8992, "step": 213600 }, { "epoch": 2.944256151662947, "grad_norm": 9.625137329101562, "learning_rate": 5.0337159146129735e-05, "loss": 0.898, "step": 213700 }, { "epoch": 2.945633903722686, "grad_norm": 4.5275983810424805, "learning_rate": 5.0332692734490266e-05, "loss": 0.8915, "step": 213800 }, { "epoch": 2.9470116557824255, "grad_norm": 6.736204147338867, "learning_rate": 5.032822438309673e-05, "loss": 0.8381, "step": 213900 }, { "epoch": 2.9483894078421646, "grad_norm": 9.65224838256836, "learning_rate": 5.0323754092328755e-05, "loss": 0.9175, "step": 214000 }, { "epoch": 2.949767159901904, "grad_norm": 9.735228538513184, "learning_rate": 5.03192818625661e-05, "loss": 0.8673, "step": 214100 }, { "epoch": 2.9511449119616433, "grad_norm": 11.951399803161621, "learning_rate": 5.03148076941887e-05, "loss": 0.8943, "step": 214200 }, { "epoch": 2.952522664021383, "grad_norm": 20.874267578125, "learning_rate": 5.0310331587576676e-05, "loss": 0.868, "step": 214300 }, { "epoch": 2.953900416081122, "grad_norm": 11.858498573303223, "learning_rate": 5.030585354311028e-05, "loss": 0.824, "step": 214400 }, { "epoch": 2.9552781681408615, "grad_norm": 22.140026092529297, "learning_rate": 5.0301373561169965e-05, "loss": 0.8791, "step": 214500 }, { "epoch": 2.9566559202006006, "grad_norm": 2.468700647354126, "learning_rate": 5.0296891642136306e-05, "loss": 0.9103, "step": 214600 }, { "epoch": 2.95803367226034, "grad_norm": 11.627277374267578, "learning_rate": 5.0292407786390076e-05, "loss": 0.9136, "step": 214700 }, { "epoch": 2.9594114243200793, "grad_norm": 9.476028442382812, "learning_rate": 5.028792199431219e-05, "loss": 0.9554, "step": 214800 }, { "epoch": 2.9607891763798184, "grad_norm": 13.155414581298828, "learning_rate": 5.028343426628377e-05, "loss": 0.8592, "step": 214900 }, { "epoch": 2.962166928439558, "grad_norm": 17.985248565673828, "learning_rate": 5.027894460268603e-05, "loss": 0.9618, "step": 215000 }, { "epoch": 
2.9635446804992975, "grad_norm": 10.667896270751953, "learning_rate": 5.027445300390041e-05, "loss": 0.9412, "step": 215100 }, { "epoch": 2.9649224325590366, "grad_norm": 18.606653213500977, "learning_rate": 5.02699594703085e-05, "loss": 0.8715, "step": 215200 }, { "epoch": 2.9663001846187758, "grad_norm": 2.976524829864502, "learning_rate": 5.026546400229204e-05, "loss": 0.874, "step": 215300 }, { "epoch": 2.9676779366785153, "grad_norm": 4.229238033294678, "learning_rate": 5.0260966600232956e-05, "loss": 0.8868, "step": 215400 }, { "epoch": 2.969055688738255, "grad_norm": 3.0284242630004883, "learning_rate": 5.0256467264513304e-05, "loss": 0.797, "step": 215500 }, { "epoch": 2.970433440797994, "grad_norm": 8.253076553344727, "learning_rate": 5.025196599551534e-05, "loss": 0.7905, "step": 215600 }, { "epoch": 2.971811192857733, "grad_norm": 5.734689712524414, "learning_rate": 5.024746279362146e-05, "loss": 0.8638, "step": 215700 }, { "epoch": 2.9731889449174727, "grad_norm": 10.03180980682373, "learning_rate": 5.024295765921424e-05, "loss": 0.9138, "step": 215800 }, { "epoch": 2.974566696977212, "grad_norm": 4.002693176269531, "learning_rate": 5.0238450592676434e-05, "loss": 0.8836, "step": 215900 }, { "epoch": 2.9759444490369513, "grad_norm": 40.967979431152344, "learning_rate": 5.02339415943909e-05, "loss": 0.7907, "step": 216000 }, { "epoch": 2.9773222010966904, "grad_norm": 14.720328330993652, "learning_rate": 5.0229430664740736e-05, "loss": 0.8197, "step": 216100 }, { "epoch": 2.97869995315643, "grad_norm": 3.3215177059173584, "learning_rate": 5.022491780410915e-05, "loss": 0.9149, "step": 216200 }, { "epoch": 2.9800777052161695, "grad_norm": 6.096555709838867, "learning_rate": 5.022040301287953e-05, "loss": 0.768, "step": 216300 }, { "epoch": 2.9814554572759087, "grad_norm": 5.482794761657715, "learning_rate": 5.0215886291435445e-05, "loss": 0.8891, "step": 216400 }, { "epoch": 2.9828332093356478, "grad_norm": 10.852657318115234, "learning_rate": 
5.02113676401606e-05, "loss": 0.9312, "step": 216500 }, { "epoch": 2.9842109613953873, "grad_norm": 4.301094055175781, "learning_rate": 5.020684705943889e-05, "loss": 0.9136, "step": 216600 }, { "epoch": 2.985588713455127, "grad_norm": 5.961721420288086, "learning_rate": 5.020232454965435e-05, "loss": 0.8515, "step": 216700 }, { "epoch": 2.986966465514866, "grad_norm": 9.368277549743652, "learning_rate": 5.019780011119119e-05, "loss": 0.961, "step": 216800 }, { "epoch": 2.988344217574605, "grad_norm": 5.15329122543335, "learning_rate": 5.019327374443379e-05, "loss": 0.7407, "step": 216900 }, { "epoch": 2.9897219696343447, "grad_norm": 13.162623405456543, "learning_rate": 5.01887454497667e-05, "loss": 0.8083, "step": 217000 }, { "epoch": 2.991099721694084, "grad_norm": 7.091092109680176, "learning_rate": 5.01842152275746e-05, "loss": 0.9746, "step": 217100 }, { "epoch": 2.9924774737538233, "grad_norm": 6.582491874694824, "learning_rate": 5.017968307824236e-05, "loss": 0.9766, "step": 217200 }, { "epoch": 2.9938552258135624, "grad_norm": 6.723458290100098, "learning_rate": 5.017514900215502e-05, "loss": 0.9296, "step": 217300 }, { "epoch": 2.995232977873302, "grad_norm": 3.6613121032714844, "learning_rate": 5.017061299969777e-05, "loss": 0.7399, "step": 217400 }, { "epoch": 2.996610729933041, "grad_norm": 6.171582221984863, "learning_rate": 5.016607507125596e-05, "loss": 0.8053, "step": 217500 }, { "epoch": 2.9979884819927807, "grad_norm": 6.335168361663818, "learning_rate": 5.016153521721512e-05, "loss": 0.7905, "step": 217600 }, { "epoch": 2.99936623405252, "grad_norm": 11.291169166564941, "learning_rate": 5.0156993437960915e-05, "loss": 0.8497, "step": 217700 }, { "epoch": 3.0007439861122593, "grad_norm": 9.230731964111328, "learning_rate": 5.015244973387922e-05, "loss": 0.8362, "step": 217800 }, { "epoch": 3.0021217381719985, "grad_norm": 10.072036743164062, "learning_rate": 5.0147904105356024e-05, "loss": 0.6933, "step": 217900 }, { "epoch": 3.003499490231738, 
"grad_norm": 4.5025634765625, "learning_rate": 5.014335655277751e-05, "loss": 0.7591, "step": 218000 }, { "epoch": 3.004877242291477, "grad_norm": 11.540807723999023, "learning_rate": 5.013880707653001e-05, "loss": 0.8106, "step": 218100 }, { "epoch": 3.0062549943512167, "grad_norm": 19.40630340576172, "learning_rate": 5.013425567700003e-05, "loss": 0.7955, "step": 218200 }, { "epoch": 3.007632746410956, "grad_norm": 21.899789810180664, "learning_rate": 5.0129702354574234e-05, "loss": 0.7447, "step": 218300 }, { "epoch": 3.0090104984706953, "grad_norm": 5.635985851287842, "learning_rate": 5.012514710963945e-05, "loss": 0.8361, "step": 218400 }, { "epoch": 3.0103882505304345, "grad_norm": 2.316899538040161, "learning_rate": 5.0120589942582674e-05, "loss": 0.6912, "step": 218500 }, { "epoch": 3.011766002590174, "grad_norm": 4.447273254394531, "learning_rate": 5.011603085379106e-05, "loss": 0.709, "step": 218600 }, { "epoch": 3.013143754649913, "grad_norm": 8.26756763458252, "learning_rate": 5.011146984365191e-05, "loss": 0.8466, "step": 218700 }, { "epoch": 3.0145215067096527, "grad_norm": 2.827197313308716, "learning_rate": 5.0106906912552724e-05, "loss": 0.7533, "step": 218800 }, { "epoch": 3.015899258769392, "grad_norm": 8.207708358764648, "learning_rate": 5.010234206088114e-05, "loss": 0.8498, "step": 218900 }, { "epoch": 3.0172770108291314, "grad_norm": 3.69254207611084, "learning_rate": 5.009777528902496e-05, "loss": 0.8627, "step": 219000 }, { "epoch": 3.0186547628888705, "grad_norm": 12.752203941345215, "learning_rate": 5.009320659737217e-05, "loss": 0.7736, "step": 219100 }, { "epoch": 3.02003251494861, "grad_norm": 10.074353218078613, "learning_rate": 5.008863598631088e-05, "loss": 0.7728, "step": 219200 }, { "epoch": 3.021410267008349, "grad_norm": 10.07119369506836, "learning_rate": 5.008406345622941e-05, "loss": 0.8828, "step": 219300 }, { "epoch": 3.0227880190680887, "grad_norm": 11.883198738098145, "learning_rate": 5.00794890075162e-05, "loss": 0.7723, 
"step": 219400 }, { "epoch": 3.024165771127828, "grad_norm": 3.8973376750946045, "learning_rate": 5.0074912640559895e-05, "loss": 0.7811, "step": 219500 }, { "epoch": 3.0255435231875674, "grad_norm": 15.14806842803955, "learning_rate": 5.0070334355749264e-05, "loss": 0.8236, "step": 219600 }, { "epoch": 3.0269212752473065, "grad_norm": 4.638752460479736, "learning_rate": 5.006575415347326e-05, "loss": 0.7919, "step": 219700 }, { "epoch": 3.028299027307046, "grad_norm": 7.042424201965332, "learning_rate": 5.0061172034121e-05, "loss": 0.8563, "step": 219800 }, { "epoch": 3.029676779366785, "grad_norm": 5.502007007598877, "learning_rate": 5.0056587998081746e-05, "loss": 0.8177, "step": 219900 }, { "epoch": 3.0310545314265247, "grad_norm": 17.79983901977539, "learning_rate": 5.005200204574495e-05, "loss": 0.8254, "step": 220000 }, { "epoch": 3.032432283486264, "grad_norm": 43.76643753051758, "learning_rate": 5.00474141775002e-05, "loss": 0.897, "step": 220100 }, { "epoch": 3.033810035546003, "grad_norm": 4.057260990142822, "learning_rate": 5.004282439373726e-05, "loss": 0.8396, "step": 220200 }, { "epoch": 3.0351877876057425, "grad_norm": 10.954686164855957, "learning_rate": 5.003823269484607e-05, "loss": 0.7823, "step": 220300 }, { "epoch": 3.0365655396654816, "grad_norm": 10.25699520111084, "learning_rate": 5.00336390812167e-05, "loss": 0.8326, "step": 220400 }, { "epoch": 3.037943291725221, "grad_norm": 6.70409631729126, "learning_rate": 5.002908951799392e-05, "loss": 0.8036, "step": 220500 }, { "epoch": 3.0393210437849603, "grad_norm": 9.063064575195312, "learning_rate": 5.0024492095196766e-05, "loss": 0.782, "step": 220600 }, { "epoch": 3.0406987958447, "grad_norm": 2.866466522216797, "learning_rate": 5.0019892758828774e-05, "loss": 0.9106, "step": 220700 }, { "epoch": 3.042076547904439, "grad_norm": 26.142011642456055, "learning_rate": 5.001529150928068e-05, "loss": 0.7895, "step": 220800 }, { "epoch": 3.0434542999641785, "grad_norm": 9.043352127075195, 
"learning_rate": 5.0010688346943393e-05, "loss": 0.7718, "step": 220900 }, { "epoch": 3.0448320520239176, "grad_norm": 23.54694175720215, "learning_rate": 5.000608327220795e-05, "loss": 0.7454, "step": 221000 }, { "epoch": 3.046209804083657, "grad_norm": 5.502631187438965, "learning_rate": 5.000147628546561e-05, "loss": 0.8136, "step": 221100 }, { "epoch": 3.0475875561433963, "grad_norm": 3.3556718826293945, "learning_rate": 4.999686738710772e-05, "loss": 0.7063, "step": 221200 }, { "epoch": 3.048965308203136, "grad_norm": 7.735686779022217, "learning_rate": 4.999225657752587e-05, "loss": 0.8155, "step": 221300 }, { "epoch": 3.050343060262875, "grad_norm": 8.710448265075684, "learning_rate": 4.998764385711175e-05, "loss": 0.8199, "step": 221400 }, { "epoch": 3.0517208123226145, "grad_norm": 8.710021018981934, "learning_rate": 4.9983029226257244e-05, "loss": 0.7873, "step": 221500 }, { "epoch": 3.0530985643823536, "grad_norm": 2.5617220401763916, "learning_rate": 4.9978412685354383e-05, "loss": 0.8495, "step": 221600 }, { "epoch": 3.054476316442093, "grad_norm": 12.85446834564209, "learning_rate": 4.997379423479536e-05, "loss": 0.7541, "step": 221700 }, { "epoch": 3.0558540685018323, "grad_norm": 10.130171775817871, "learning_rate": 4.9969173874972534e-05, "loss": 0.7851, "step": 221800 }, { "epoch": 3.057231820561572, "grad_norm": 4.199276924133301, "learning_rate": 4.9964597838413016e-05, "loss": 0.8581, "step": 221900 }, { "epoch": 3.058609572621311, "grad_norm": 9.939815521240234, "learning_rate": 4.995997368032318e-05, "loss": 0.7881, "step": 222000 }, { "epoch": 3.0599873246810505, "grad_norm": 4.393372535705566, "learning_rate": 4.995534761414367e-05, "loss": 0.8115, "step": 222100 }, { "epoch": 3.0613650767407896, "grad_norm": 4.2641215324401855, "learning_rate": 4.9950719640267503e-05, "loss": 0.7293, "step": 222200 }, { "epoch": 3.062742828800529, "grad_norm": 8.543421745300293, "learning_rate": 4.994608975908785e-05, "loss": 0.7358, "step": 222300 }, { 
"epoch": 3.0641205808602683, "grad_norm": 26.361059188842773, "learning_rate": 4.994145797099804e-05, "loss": 0.8521, "step": 222400 }, { "epoch": 3.065498332920008, "grad_norm": 6.281571865081787, "learning_rate": 4.993682427639156e-05, "loss": 0.8118, "step": 222500 }, { "epoch": 3.066876084979747, "grad_norm": 18.130990982055664, "learning_rate": 4.993218867566208e-05, "loss": 0.78, "step": 222600 }, { "epoch": 3.0682538370394865, "grad_norm": 14.497140884399414, "learning_rate": 4.99275511692034e-05, "loss": 0.8034, "step": 222700 }, { "epoch": 3.0696315890992256, "grad_norm": 3.5376479625701904, "learning_rate": 4.992291175740951e-05, "loss": 0.8179, "step": 222800 }, { "epoch": 3.071009341158965, "grad_norm": 14.781637191772461, "learning_rate": 4.991827044067455e-05, "loss": 0.8105, "step": 222900 }, { "epoch": 3.0723870932187043, "grad_norm": 7.163496494293213, "learning_rate": 4.991362721939284e-05, "loss": 0.7548, "step": 223000 }, { "epoch": 3.073764845278444, "grad_norm": 2.9227442741394043, "learning_rate": 4.9908982093958814e-05, "loss": 0.7841, "step": 223100 }, { "epoch": 3.075142597338183, "grad_norm": 10.931370735168457, "learning_rate": 4.9904335064767126e-05, "loss": 0.7604, "step": 223200 }, { "epoch": 3.0765203493979225, "grad_norm": 11.45769214630127, "learning_rate": 4.989968613221255e-05, "loss": 0.8351, "step": 223300 }, { "epoch": 3.0778981014576616, "grad_norm": 99.72901153564453, "learning_rate": 4.989503529669004e-05, "loss": 0.8157, "step": 223400 }, { "epoch": 3.079275853517401, "grad_norm": 5.26624870300293, "learning_rate": 4.9890382558594705e-05, "loss": 0.8175, "step": 223500 }, { "epoch": 3.0806536055771403, "grad_norm": 16.809803009033203, "learning_rate": 4.9885727918321825e-05, "loss": 0.7716, "step": 223600 }, { "epoch": 3.08203135763688, "grad_norm": 10.616503715515137, "learning_rate": 4.988107137626684e-05, "loss": 0.8469, "step": 223700 }, { "epoch": 3.083409109696619, "grad_norm": 42.45695877075195, "learning_rate": 
4.9876412932825345e-05, "loss": 0.7514, "step": 223800 }, { "epoch": 3.0847868617563585, "grad_norm": 13.717066764831543, "learning_rate": 4.9871752588393085e-05, "loss": 0.8221, "step": 223900 }, { "epoch": 3.0861646138160976, "grad_norm": 40.778228759765625, "learning_rate": 4.986709034336599e-05, "loss": 0.7813, "step": 224000 }, { "epoch": 3.087542365875837, "grad_norm": 23.3128662109375, "learning_rate": 4.9862426198140146e-05, "loss": 0.6969, "step": 224100 }, { "epoch": 3.0889201179355763, "grad_norm": 6.894881248474121, "learning_rate": 4.985776015311178e-05, "loss": 0.7658, "step": 224200 }, { "epoch": 3.0902978699953154, "grad_norm": 20.713045120239258, "learning_rate": 4.985309220867732e-05, "loss": 0.7492, "step": 224300 }, { "epoch": 3.091675622055055, "grad_norm": 9.274797439575195, "learning_rate": 4.984842236523331e-05, "loss": 0.8069, "step": 224400 }, { "epoch": 3.0930533741147945, "grad_norm": 51.10271453857422, "learning_rate": 4.9843750623176493e-05, "loss": 0.7797, "step": 224500 }, { "epoch": 3.0944311261745336, "grad_norm": 3.907578468322754, "learning_rate": 4.983907698290375e-05, "loss": 0.7626, "step": 224600 }, { "epoch": 3.0958088782342728, "grad_norm": 10.407511711120605, "learning_rate": 4.983440144481213e-05, "loss": 0.7279, "step": 224700 }, { "epoch": 3.0971866302940123, "grad_norm": 6.9903435707092285, "learning_rate": 4.9829724009298844e-05, "loss": 0.7881, "step": 224800 }, { "epoch": 3.0985643823537514, "grad_norm": 15.715108871459961, "learning_rate": 4.9825044676761265e-05, "loss": 0.7097, "step": 224900 }, { "epoch": 3.099942134413491, "grad_norm": 6.130674362182617, "learning_rate": 4.9820363447596936e-05, "loss": 0.8622, "step": 225000 }, { "epoch": 3.10131988647323, "grad_norm": 4.548302173614502, "learning_rate": 4.981568032220353e-05, "loss": 0.6775, "step": 225100 }, { "epoch": 3.1026976385329696, "grad_norm": 1.7215290069580078, "learning_rate": 4.9810995300978915e-05, "loss": 0.8629, "step": 225200 }, { "epoch": 
3.1040753905927088, "grad_norm": 10.366414070129395, "learning_rate": 4.9806308384321115e-05, "loss": 0.8022, "step": 225300 }, { "epoch": 3.1054531426524483, "grad_norm": 37.325042724609375, "learning_rate": 4.9801619572628296e-05, "loss": 0.7219, "step": 225400 }, { "epoch": 3.1068308947121874, "grad_norm": 8.17691421508789, "learning_rate": 4.9796928866298794e-05, "loss": 0.7952, "step": 225500 }, { "epoch": 3.108208646771927, "grad_norm": 5.913504123687744, "learning_rate": 4.979223626573112e-05, "loss": 0.8158, "step": 225600 }, { "epoch": 3.109586398831666, "grad_norm": 15.628220558166504, "learning_rate": 4.9787541771323926e-05, "loss": 0.7727, "step": 225700 }, { "epoch": 3.1109641508914057, "grad_norm": 6.115756034851074, "learning_rate": 4.978284538347604e-05, "loss": 0.7998, "step": 225800 }, { "epoch": 3.1123419029511448, "grad_norm": 1.9399501085281372, "learning_rate": 4.977814710258644e-05, "loss": 0.7646, "step": 225900 }, { "epoch": 3.1137196550108843, "grad_norm": 52.01592254638672, "learning_rate": 4.977344692905427e-05, "loss": 0.7687, "step": 226000 }, { "epoch": 3.1150974070706234, "grad_norm": 5.268695831298828, "learning_rate": 4.9768744863278826e-05, "loss": 0.8519, "step": 226100 }, { "epoch": 3.116475159130363, "grad_norm": 8.1824951171875, "learning_rate": 4.976404090565958e-05, "loss": 0.8065, "step": 226200 }, { "epoch": 3.117852911190102, "grad_norm": 9.205942153930664, "learning_rate": 4.975933505659617e-05, "loss": 0.8016, "step": 226300 }, { "epoch": 3.1192306632498417, "grad_norm": 4.955207347869873, "learning_rate": 4.9754627316488365e-05, "loss": 0.7826, "step": 226400 }, { "epoch": 3.1206084153095808, "grad_norm": 24.628856658935547, "learning_rate": 4.974991768573611e-05, "loss": 0.8405, "step": 226500 }, { "epoch": 3.1219861673693203, "grad_norm": 15.530047416687012, "learning_rate": 4.9745206164739515e-05, "loss": 0.8171, "step": 226600 }, { "epoch": 3.1233639194290594, "grad_norm": 14.046979904174805, "learning_rate": 
4.974049275389886e-05, "loss": 0.7727, "step": 226700 }, { "epoch": 3.124741671488799, "grad_norm": 3.7635655403137207, "learning_rate": 4.973577745361455e-05, "loss": 0.8115, "step": 226800 }, { "epoch": 3.126119423548538, "grad_norm": 12.773242950439453, "learning_rate": 4.97310602642872e-05, "loss": 0.8537, "step": 226900 }, { "epoch": 3.1274971756082777, "grad_norm": 6.3351030349731445, "learning_rate": 4.9726341186317545e-05, "loss": 0.8511, "step": 227000 }, { "epoch": 3.128874927668017, "grad_norm": 2.8498470783233643, "learning_rate": 4.9721620220106495e-05, "loss": 0.8589, "step": 227100 }, { "epoch": 3.1302526797277563, "grad_norm": 6.089428901672363, "learning_rate": 4.9716897366055124e-05, "loss": 0.8427, "step": 227200 }, { "epoch": 3.1316304317874954, "grad_norm": 2.1297292709350586, "learning_rate": 4.971217262456466e-05, "loss": 0.7443, "step": 227300 }, { "epoch": 3.133008183847235, "grad_norm": 5.4099040031433105, "learning_rate": 4.97074459960365e-05, "loss": 0.8339, "step": 227400 }, { "epoch": 3.134385935906974, "grad_norm": 6.637537956237793, "learning_rate": 4.9702717480872186e-05, "loss": 0.7741, "step": 227500 }, { "epoch": 3.1357636879667137, "grad_norm": 9.33049201965332, "learning_rate": 4.9697987079473424e-05, "loss": 0.8442, "step": 227600 }, { "epoch": 3.137141440026453, "grad_norm": 17.611705780029297, "learning_rate": 4.969325479224211e-05, "loss": 0.7534, "step": 227700 }, { "epoch": 3.1385191920861923, "grad_norm": 5.704777240753174, "learning_rate": 4.968852061958025e-05, "loss": 0.8559, "step": 227800 }, { "epoch": 3.1398969441459315, "grad_norm": 16.293712615966797, "learning_rate": 4.9683784561890056e-05, "loss": 0.6788, "step": 227900 }, { "epoch": 3.141274696205671, "grad_norm": 7.516731262207031, "learning_rate": 4.967904661957387e-05, "loss": 0.7837, "step": 228000 }, { "epoch": 3.14265244826541, "grad_norm": 4.9556498527526855, "learning_rate": 4.967430679303421e-05, "loss": 0.7702, "step": 228100 }, { "epoch": 
3.1440302003251497, "grad_norm": 2.994645357131958, "learning_rate": 4.9669565082673737e-05, "loss": 0.8642, "step": 228200 }, { "epoch": 3.145407952384889, "grad_norm": 3.5711493492126465, "learning_rate": 4.96648214888953e-05, "loss": 0.8233, "step": 228300 }, { "epoch": 3.1467857044446284, "grad_norm": 15.144733428955078, "learning_rate": 4.9660076012101875e-05, "loss": 0.796, "step": 228400 }, { "epoch": 3.1481634565043675, "grad_norm": 0.978360652923584, "learning_rate": 4.965537613560828e-05, "loss": 0.7949, "step": 228500 }, { "epoch": 3.1495412085641066, "grad_norm": 8.534567832946777, "learning_rate": 4.965062691281461e-05, "loss": 0.6975, "step": 228600 }, { "epoch": 3.150918960623846, "grad_norm": 6.644782543182373, "learning_rate": 4.964587580821185e-05, "loss": 0.8, "step": 228700 }, { "epoch": 3.1522967126835857, "grad_norm": 2.691181182861328, "learning_rate": 4.964112282220365e-05, "loss": 0.8339, "step": 228800 }, { "epoch": 3.153674464743325, "grad_norm": 14.243195533752441, "learning_rate": 4.963641551317352e-05, "loss": 0.7791, "step": 228900 }, { "epoch": 3.155052216803064, "grad_norm": 3.695988893508911, "learning_rate": 4.963165878436993e-05, "loss": 0.7923, "step": 229000 }, { "epoch": 3.1564299688628035, "grad_norm": 10.843060493469238, "learning_rate": 4.962690017536871e-05, "loss": 0.6995, "step": 229100 }, { "epoch": 3.1578077209225426, "grad_norm": 9.659454345703125, "learning_rate": 4.962213968657411e-05, "loss": 0.8871, "step": 229200 }, { "epoch": 3.159185472982282, "grad_norm": 4.380928993225098, "learning_rate": 4.961737731839058e-05, "loss": 0.7582, "step": 229300 }, { "epoch": 3.1605632250420213, "grad_norm": 8.227472305297852, "learning_rate": 4.961261307122267e-05, "loss": 0.7525, "step": 229400 }, { "epoch": 3.161940977101761, "grad_norm": 5.710347652435303, "learning_rate": 4.960784694547517e-05, "loss": 0.8346, "step": 229500 }, { "epoch": 3.1633187291615, "grad_norm": 64.2777328491211, "learning_rate": 
4.9603078941552974e-05, "loss": 0.7342, "step": 229600 }, { "epoch": 3.1646964812212395, "grad_norm": 15.041741371154785, "learning_rate": 4.959830905986113e-05, "loss": 0.7812, "step": 229700 }, { "epoch": 3.1660742332809786, "grad_norm": 9.057526588439941, "learning_rate": 4.9593537300804886e-05, "loss": 0.825, "step": 229800 }, { "epoch": 3.167451985340718, "grad_norm": 7.043004035949707, "learning_rate": 4.958876366478962e-05, "loss": 0.8314, "step": 229900 }, { "epoch": 3.1688297374004573, "grad_norm": 8.075080871582031, "learning_rate": 4.9583988152220866e-05, "loss": 0.7655, "step": 230000 }, { "epoch": 3.170207489460197, "grad_norm": 9.349101066589355, "learning_rate": 4.957921076350434e-05, "loss": 0.6921, "step": 230100 }, { "epoch": 3.171585241519936, "grad_norm": 53.45508575439453, "learning_rate": 4.957443149904591e-05, "loss": 0.9097, "step": 230200 }, { "epoch": 3.1729629935796755, "grad_norm": 4.940711498260498, "learning_rate": 4.9569650359251587e-05, "loss": 0.7577, "step": 230300 }, { "epoch": 3.1743407456394146, "grad_norm": 10.162055969238281, "learning_rate": 4.956486734452756e-05, "loss": 0.7773, "step": 230400 }, { "epoch": 3.175718497699154, "grad_norm": 4.496201038360596, "learning_rate": 4.9560082455280166e-05, "loss": 0.8011, "step": 230500 }, { "epoch": 3.1770962497588933, "grad_norm": 8.428885459899902, "learning_rate": 4.9555295691915915e-05, "loss": 0.7555, "step": 230600 }, { "epoch": 3.178474001818633, "grad_norm": 68.13356018066406, "learning_rate": 4.955050705484146e-05, "loss": 0.7661, "step": 230700 }, { "epoch": 3.179851753878372, "grad_norm": 3.0933313369750977, "learning_rate": 4.9545716544463616e-05, "loss": 0.7648, "step": 230800 }, { "epoch": 3.1812295059381115, "grad_norm": 9.075510025024414, "learning_rate": 4.9540924161189365e-05, "loss": 0.8338, "step": 230900 }, { "epoch": 3.1826072579978506, "grad_norm": 6.76422643661499, "learning_rate": 4.953612990542585e-05, "loss": 0.8421, "step": 231000 }, { "epoch": 
3.18398501005759, "grad_norm": 17.76362419128418, "learning_rate": 4.953133377758037e-05, "loss": 0.7331, "step": 231100 }, { "epoch": 3.1853627621173293, "grad_norm": 18.08804702758789, "learning_rate": 4.952653577806036e-05, "loss": 0.85, "step": 231200 }, { "epoch": 3.186740514177069, "grad_norm": 32.398704528808594, "learning_rate": 4.952178391524276e-05, "loss": 0.9564, "step": 231300 }, { "epoch": 3.188118266236808, "grad_norm": 4.246829986572266, "learning_rate": 4.95169821923033e-05, "loss": 0.8004, "step": 231400 }, { "epoch": 3.1894960182965475, "grad_norm": 57.084571838378906, "learning_rate": 4.951217859890856e-05, "loss": 0.8262, "step": 231500 }, { "epoch": 3.1908737703562866, "grad_norm": 6.124798774719238, "learning_rate": 4.950737313546664e-05, "loss": 0.725, "step": 231600 }, { "epoch": 3.192251522416026, "grad_norm": 55.82466506958008, "learning_rate": 4.950256580238578e-05, "loss": 0.8259, "step": 231700 }, { "epoch": 3.1936292744757653, "grad_norm": 9.523139953613281, "learning_rate": 4.949775660007439e-05, "loss": 0.7991, "step": 231800 }, { "epoch": 3.195007026535505, "grad_norm": 9.465994834899902, "learning_rate": 4.949294552894105e-05, "loss": 0.6896, "step": 231900 }, { "epoch": 3.196384778595244, "grad_norm": 10.29808521270752, "learning_rate": 4.948813258939446e-05, "loss": 0.8282, "step": 232000 }, { "epoch": 3.1977625306549835, "grad_norm": 4.562504291534424, "learning_rate": 4.948331778184352e-05, "loss": 0.8014, "step": 232100 }, { "epoch": 3.1991402827147226, "grad_norm": 5.956326961517334, "learning_rate": 4.9478501106697264e-05, "loss": 0.8166, "step": 232200 }, { "epoch": 3.200518034774462, "grad_norm": 4.902593612670898, "learning_rate": 4.94736825643649e-05, "loss": 0.9226, "step": 232300 }, { "epoch": 3.2018957868342013, "grad_norm": 1.2039607763290405, "learning_rate": 4.9468862155255785e-05, "loss": 0.632, "step": 232400 }, { "epoch": 3.203273538893941, "grad_norm": 9.601936340332031, "learning_rate": 4.946403987977944e-05, 
"loss": 0.76, "step": 232500 }, { "epoch": 3.20465129095368, "grad_norm": 8.302136421203613, "learning_rate": 4.9459215738345545e-05, "loss": 0.8161, "step": 232600 }, { "epoch": 3.2060290430134195, "grad_norm": 3.656987428665161, "learning_rate": 4.945438973136392e-05, "loss": 0.7357, "step": 232700 }, { "epoch": 3.2074067950731586, "grad_norm": 20.771461486816406, "learning_rate": 4.944961014719685e-05, "loss": 0.7025, "step": 232800 }, { "epoch": 3.2087845471328977, "grad_norm": 6.188084125518799, "learning_rate": 4.944478042899518e-05, "loss": 0.7883, "step": 232900 }, { "epoch": 3.2101622991926373, "grad_norm": 4.967450141906738, "learning_rate": 4.943994884647214e-05, "loss": 0.718, "step": 233000 }, { "epoch": 3.211540051252377, "grad_norm": 3.7283780574798584, "learning_rate": 4.9435115400038196e-05, "loss": 0.8945, "step": 233100 }, { "epoch": 3.212917803312116, "grad_norm": 6.124407768249512, "learning_rate": 4.943028009010399e-05, "loss": 0.7377, "step": 233200 }, { "epoch": 3.214295555371855, "grad_norm": 6.074686050415039, "learning_rate": 4.9425442917080286e-05, "loss": 0.7844, "step": 233300 }, { "epoch": 3.2156733074315946, "grad_norm": 7.642630100250244, "learning_rate": 4.942060388137804e-05, "loss": 0.7258, "step": 233400 }, { "epoch": 3.2170510594913337, "grad_norm": 10.451384544372559, "learning_rate": 4.9415762983408353e-05, "loss": 0.8141, "step": 233500 }, { "epoch": 3.2184288115510733, "grad_norm": 6.519449234008789, "learning_rate": 4.941092022358248e-05, "loss": 0.8014, "step": 233600 }, { "epoch": 3.2198065636108124, "grad_norm": 2.3638296127319336, "learning_rate": 4.9406075602311826e-05, "loss": 0.8337, "step": 233700 }, { "epoch": 3.221184315670552, "grad_norm": 5.68211555480957, "learning_rate": 4.9401229120008e-05, "loss": 0.9516, "step": 233800 }, { "epoch": 3.222562067730291, "grad_norm": 13.048260688781738, "learning_rate": 4.9396380777082695e-05, "loss": 0.8023, "step": 233900 }, { "epoch": 3.2239398197900306, "grad_norm": 
9.451485633850098, "learning_rate": 4.9391530573947836e-05, "loss": 0.8365, "step": 234000 }, { "epoch": 3.2253175718497697, "grad_norm": 7.705934047698975, "learning_rate": 4.938667851101545e-05, "loss": 0.8248, "step": 234100 }, { "epoch": 3.2266953239095093, "grad_norm": 12.55395221710205, "learning_rate": 4.938182458869775e-05, "loss": 0.7854, "step": 234200 }, { "epoch": 3.2280730759692484, "grad_norm": 5.107557773590088, "learning_rate": 4.937696880740711e-05, "loss": 0.8152, "step": 234300 }, { "epoch": 3.229450828028988, "grad_norm": 6.501389026641846, "learning_rate": 4.937211116755605e-05, "loss": 0.7882, "step": 234400 }, { "epoch": 3.230828580088727, "grad_norm": 42.70077133178711, "learning_rate": 4.936725166955725e-05, "loss": 0.818, "step": 234500 }, { "epoch": 3.2322063321484666, "grad_norm": 9.153830528259277, "learning_rate": 4.9362390313823545e-05, "loss": 0.8303, "step": 234600 }, { "epoch": 3.2335840842082058, "grad_norm": 25.342079162597656, "learning_rate": 4.935752710076793e-05, "loss": 0.6923, "step": 234700 }, { "epoch": 3.2349618362679453, "grad_norm": 7.59571647644043, "learning_rate": 4.9352662030803575e-05, "loss": 0.7765, "step": 234800 }, { "epoch": 3.2363395883276844, "grad_norm": 18.282854080200195, "learning_rate": 4.934779510434378e-05, "loss": 0.8527, "step": 234900 }, { "epoch": 3.237717340387424, "grad_norm": 7.603454113006592, "learning_rate": 4.934292632180202e-05, "loss": 0.8148, "step": 235000 }, { "epoch": 3.239095092447163, "grad_norm": 30.90890884399414, "learning_rate": 4.9338055683591914e-05, "loss": 0.7449, "step": 235100 }, { "epoch": 3.2404728445069026, "grad_norm": 9.09598445892334, "learning_rate": 4.933318319012727e-05, "loss": 0.8409, "step": 235200 }, { "epoch": 3.2418505965666418, "grad_norm": 19.666332244873047, "learning_rate": 4.9328308841822e-05, "loss": 0.804, "step": 235300 }, { "epoch": 3.2432283486263813, "grad_norm": 28.167722702026367, "learning_rate": 4.932343263909023e-05, "loss": 0.7589, "step": 
235400 }, { "epoch": 3.2446061006861204, "grad_norm": 6.728652000427246, "learning_rate": 4.93185545823462e-05, "loss": 0.7256, "step": 235500 }, { "epoch": 3.24598385274586, "grad_norm": 10.087672233581543, "learning_rate": 4.931367467200435e-05, "loss": 0.8402, "step": 235600 }, { "epoch": 3.247361604805599, "grad_norm": 11.853771209716797, "learning_rate": 4.930879290847923e-05, "loss": 0.8326, "step": 235700 }, { "epoch": 3.2487393568653387, "grad_norm": 78.4873046875, "learning_rate": 4.930390929218558e-05, "loss": 0.8416, "step": 235800 }, { "epoch": 3.2501171089250778, "grad_norm": 42.94315719604492, "learning_rate": 4.929902382353828e-05, "loss": 0.8264, "step": 235900 }, { "epoch": 3.2514948609848173, "grad_norm": 5.60575532913208, "learning_rate": 4.92941365029524e-05, "loss": 0.813, "step": 236000 }, { "epoch": 3.2528726130445564, "grad_norm": 4.687264919281006, "learning_rate": 4.928924733084311e-05, "loss": 0.839, "step": 236100 }, { "epoch": 3.254250365104296, "grad_norm": 8.667098999023438, "learning_rate": 4.92843563076258e-05, "loss": 0.8067, "step": 236200 }, { "epoch": 3.255628117164035, "grad_norm": 6.4489946365356445, "learning_rate": 4.927946343371596e-05, "loss": 0.8006, "step": 236300 }, { "epoch": 3.2570058692237747, "grad_norm": 24.710708618164062, "learning_rate": 4.927456870952929e-05, "loss": 0.759, "step": 236400 }, { "epoch": 3.2583836212835138, "grad_norm": 6.834887981414795, "learning_rate": 4.92696721354816e-05, "loss": 0.7577, "step": 236500 }, { "epoch": 3.2597613733432533, "grad_norm": 2.1907448768615723, "learning_rate": 4.9264773711988895e-05, "loss": 0.8124, "step": 236600 }, { "epoch": 3.2611391254029924, "grad_norm": 5.3939337730407715, "learning_rate": 4.925987343946731e-05, "loss": 0.8307, "step": 236700 }, { "epoch": 3.2625168774627316, "grad_norm": 4.920928955078125, "learning_rate": 4.925497131833316e-05, "loss": 0.7474, "step": 236800 }, { "epoch": 3.263894629522471, "grad_norm": 7.566749572753906, "learning_rate": 
4.92500673490029e-05, "loss": 0.7992, "step": 236900 }, { "epoch": 3.2652723815822107, "grad_norm": 6.4035539627075195, "learning_rate": 4.924516153189315e-05, "loss": 0.7469, "step": 237000 }, { "epoch": 3.26665013364195, "grad_norm": 13.303295135498047, "learning_rate": 4.924025386742067e-05, "loss": 0.7973, "step": 237100 }, { "epoch": 3.268027885701689, "grad_norm": 11.453577041625977, "learning_rate": 4.9235344356002404e-05, "loss": 0.7975, "step": 237200 }, { "epoch": 3.2694056377614285, "grad_norm": 15.064271926879883, "learning_rate": 4.9230432998055435e-05, "loss": 0.7601, "step": 237300 }, { "epoch": 3.270783389821168, "grad_norm": 70.24581146240234, "learning_rate": 4.922551979399702e-05, "loss": 0.8, "step": 237400 }, { "epoch": 3.272161141880907, "grad_norm": 13.285362243652344, "learning_rate": 4.922060474424454e-05, "loss": 0.7647, "step": 237500 }, { "epoch": 3.2735388939406462, "grad_norm": 20.040693283081055, "learning_rate": 4.921568784921557e-05, "loss": 0.7933, "step": 237600 }, { "epoch": 3.274916646000386, "grad_norm": 27.109277725219727, "learning_rate": 4.9210769109327825e-05, "loss": 0.8198, "step": 237700 }, { "epoch": 3.2762943980601253, "grad_norm": 58.7088737487793, "learning_rate": 4.9205848524999166e-05, "loss": 0.8131, "step": 237800 }, { "epoch": 3.2776721501198645, "grad_norm": 26.576208114624023, "learning_rate": 4.9200926096647635e-05, "loss": 0.8115, "step": 237900 }, { "epoch": 3.2790499021796036, "grad_norm": 9.232324600219727, "learning_rate": 4.919600182469141e-05, "loss": 0.7825, "step": 238000 }, { "epoch": 3.280427654239343, "grad_norm": 20.623849868774414, "learning_rate": 4.919112497982265e-05, "loss": 0.8295, "step": 238100 }, { "epoch": 3.2818054062990822, "grad_norm": 21.29308319091797, "learning_rate": 4.918619704033783e-05, "loss": 0.739, "step": 238200 }, { "epoch": 3.283183158358822, "grad_norm": 44.78761291503906, "learning_rate": 4.9181267258499615e-05, "loss": 0.8126, "step": 238300 }, { "epoch": 
3.284560910418561, "grad_norm": 27.85205078125, "learning_rate": 4.917633563472684e-05, "loss": 0.7799, "step": 238400 }, { "epoch": 3.2859386624783005, "grad_norm": 9.93897819519043, "learning_rate": 4.917140216943844e-05, "loss": 0.7927, "step": 238500 }, { "epoch": 3.2873164145380396, "grad_norm": 20.88914680480957, "learning_rate": 4.916646686305357e-05, "loss": 0.8038, "step": 238600 }, { "epoch": 3.288694166597779, "grad_norm": 16.16457748413086, "learning_rate": 4.9161529715991495e-05, "loss": 0.8225, "step": 238700 }, { "epoch": 3.2900719186575182, "grad_norm": 9.250761032104492, "learning_rate": 4.915659072867164e-05, "loss": 0.7767, "step": 238800 }, { "epoch": 3.291449670717258, "grad_norm": 14.04985523223877, "learning_rate": 4.915164990151362e-05, "loss": 0.7441, "step": 238900 }, { "epoch": 3.292827422776997, "grad_norm": 5.425111770629883, "learning_rate": 4.9146707234937165e-05, "loss": 0.858, "step": 239000 }, { "epoch": 3.2942051748367365, "grad_norm": 6.852377891540527, "learning_rate": 4.914176272936219e-05, "loss": 0.6745, "step": 239100 }, { "epoch": 3.2955829268964756, "grad_norm": 3.4075093269348145, "learning_rate": 4.913681638520875e-05, "loss": 0.8322, "step": 239200 }, { "epoch": 3.296960678956215, "grad_norm": 16.69704246520996, "learning_rate": 4.9131868202897055e-05, "loss": 0.791, "step": 239300 }, { "epoch": 3.2983384310159543, "grad_norm": 9.13206672668457, "learning_rate": 4.9126918182847506e-05, "loss": 0.843, "step": 239400 }, { "epoch": 3.299716183075694, "grad_norm": 8.079787254333496, "learning_rate": 4.9121966325480604e-05, "loss": 0.8134, "step": 239500 }, { "epoch": 3.301093935135433, "grad_norm": 11.790070533752441, "learning_rate": 4.911701263121705e-05, "loss": 0.8127, "step": 239600 }, { "epoch": 3.3024716871951725, "grad_norm": 5.893110275268555, "learning_rate": 4.911205710047768e-05, "loss": 0.8124, "step": 239700 }, { "epoch": 3.3038494392549116, "grad_norm": 9.047781944274902, "learning_rate": 
4.910709973368349e-05, "loss": 0.8461, "step": 239800 }, { "epoch": 3.305227191314651, "grad_norm": 2.1555025577545166, "learning_rate": 4.910214053125564e-05, "loss": 0.6842, "step": 239900 }, { "epoch": 3.3066049433743903, "grad_norm": 7.02500581741333, "learning_rate": 4.9097179493615434e-05, "loss": 0.7742, "step": 240000 }, { "epoch": 3.30798269543413, "grad_norm": 6.156063556671143, "learning_rate": 4.9092216621184346e-05, "loss": 0.7644, "step": 240100 }, { "epoch": 3.309360447493869, "grad_norm": 8.203913688659668, "learning_rate": 4.908725191438398e-05, "loss": 0.8096, "step": 240200 }, { "epoch": 3.3107381995536085, "grad_norm": 12.512221336364746, "learning_rate": 4.9082285373636135e-05, "loss": 0.7629, "step": 240300 }, { "epoch": 3.3121159516133476, "grad_norm": 35.61933135986328, "learning_rate": 4.9077316999362725e-05, "loss": 0.81, "step": 240400 }, { "epoch": 3.313493703673087, "grad_norm": 7.461085796356201, "learning_rate": 4.907234679198585e-05, "loss": 0.8275, "step": 240500 }, { "epoch": 3.3148714557328263, "grad_norm": 6.215442657470703, "learning_rate": 4.9067374751927755e-05, "loss": 0.7793, "step": 240600 }, { "epoch": 3.316249207792566, "grad_norm": 17.07806968688965, "learning_rate": 4.906245062740229e-05, "loss": 0.8535, "step": 240700 }, { "epoch": 3.317626959852305, "grad_norm": 27.341754913330078, "learning_rate": 4.905747494156537e-05, "loss": 0.8013, "step": 240800 }, { "epoch": 3.3190047119120445, "grad_norm": 5.159193515777588, "learning_rate": 4.9052497424310676e-05, "loss": 0.8102, "step": 240900 }, { "epoch": 3.3203824639717836, "grad_norm": 5.837748050689697, "learning_rate": 4.9047518076061055e-05, "loss": 0.8491, "step": 241000 }, { "epoch": 3.321760216031523, "grad_norm": 19.702241897583008, "learning_rate": 4.904253689723953e-05, "loss": 0.8197, "step": 241100 }, { "epoch": 3.3231379680912623, "grad_norm": 32.154605865478516, "learning_rate": 4.90375538882693e-05, "loss": 0.8, "step": 241200 }, { "epoch": 
3.324515720151002, "grad_norm": 5.821761608123779, "learning_rate": 4.903256904957367e-05, "loss": 0.7286, "step": 241300 }, { "epoch": 3.325893472210741, "grad_norm": 6.600139141082764, "learning_rate": 4.902758238157615e-05, "loss": 0.8019, "step": 241400 }, { "epoch": 3.32727122427048, "grad_norm": 21.331989288330078, "learning_rate": 4.902264377872067e-05, "loss": 0.879, "step": 241500 }, { "epoch": 3.3286489763302196, "grad_norm": 5.771658420562744, "learning_rate": 4.901765347167288e-05, "loss": 0.8432, "step": 241600 }, { "epoch": 3.330026728389959, "grad_norm": 8.749160766601562, "learning_rate": 4.901266133659034e-05, "loss": 0.724, "step": 241700 }, { "epoch": 3.3314044804496983, "grad_norm": 5.671021938323975, "learning_rate": 4.900766737389716e-05, "loss": 0.7721, "step": 241800 }, { "epoch": 3.3327822325094374, "grad_norm": 4.128930568695068, "learning_rate": 4.90026715840176e-05, "loss": 0.7717, "step": 241900 }, { "epoch": 3.334159984569177, "grad_norm": 3.1032347679138184, "learning_rate": 4.8997673967376095e-05, "loss": 0.8735, "step": 242000 }, { "epoch": 3.3355377366289165, "grad_norm": 3.7344799041748047, "learning_rate": 4.899267452439718e-05, "loss": 0.7318, "step": 242100 }, { "epoch": 3.3369154886886556, "grad_norm": 9.54202938079834, "learning_rate": 4.898767325550563e-05, "loss": 0.8009, "step": 242200 }, { "epoch": 3.3382932407483947, "grad_norm": 2.4578840732574463, "learning_rate": 4.8982670161126306e-05, "loss": 0.7739, "step": 242300 }, { "epoch": 3.3396709928081343, "grad_norm": 10.420729637145996, "learning_rate": 4.897766524168424e-05, "loss": 0.7757, "step": 242400 }, { "epoch": 3.3410487448678734, "grad_norm": 1.3552196025848389, "learning_rate": 4.897265849760463e-05, "loss": 0.8338, "step": 242500 }, { "epoch": 3.342426496927613, "grad_norm": 8.202522277832031, "learning_rate": 4.8967649929312836e-05, "loss": 0.8542, "step": 242600 }, { "epoch": 3.343804248987352, "grad_norm": 5.955030918121338, "learning_rate": 
4.8962639537234344e-05, "loss": 0.7837, "step": 242700 }, { "epoch": 3.3451820010470916, "grad_norm": 6.349592208862305, "learning_rate": 4.895762732179483e-05, "loss": 0.8793, "step": 242800 }, { "epoch": 3.3465597531068307, "grad_norm": 2.8601889610290527, "learning_rate": 4.895261328342009e-05, "loss": 0.8422, "step": 242900 }, { "epoch": 3.3479375051665703, "grad_norm": 9.999490737915039, "learning_rate": 4.894759742253611e-05, "loss": 0.7839, "step": 243000 }, { "epoch": 3.3493152572263094, "grad_norm": 6.5584516525268555, "learning_rate": 4.8942579739568993e-05, "loss": 0.7936, "step": 243100 }, { "epoch": 3.350693009286049, "grad_norm": 3.6478030681610107, "learning_rate": 4.893756023494503e-05, "loss": 0.7194, "step": 243200 }, { "epoch": 3.352070761345788, "grad_norm": 8.727842330932617, "learning_rate": 4.893253890909066e-05, "loss": 0.7699, "step": 243300 }, { "epoch": 3.3534485134055276, "grad_norm": 6.058422088623047, "learning_rate": 4.892751576243245e-05, "loss": 0.7151, "step": 243400 }, { "epoch": 3.3548262654652667, "grad_norm": 20.173702239990234, "learning_rate": 4.8922490795397165e-05, "loss": 0.8029, "step": 243500 }, { "epoch": 3.3562040175250063, "grad_norm": 14.083977699279785, "learning_rate": 4.891746400841168e-05, "loss": 0.8296, "step": 243600 }, { "epoch": 3.3575817695847454, "grad_norm": 4.4388651847839355, "learning_rate": 4.8912435401903064e-05, "loss": 0.7198, "step": 243700 }, { "epoch": 3.358959521644485, "grad_norm": 5.750528812408447, "learning_rate": 4.890740497629851e-05, "loss": 0.7084, "step": 243800 }, { "epoch": 3.360337273704224, "grad_norm": 7.963659763336182, "learning_rate": 4.8902372732025387e-05, "loss": 0.8059, "step": 243900 }, { "epoch": 3.3617150257639636, "grad_norm": 2.1787214279174805, "learning_rate": 4.889733866951121e-05, "loss": 0.7413, "step": 244000 }, { "epoch": 3.3630927778237028, "grad_norm": 6.035741329193115, "learning_rate": 4.889230278918364e-05, "loss": 0.7222, "step": 244100 }, { "epoch": 
3.3644705298834423, "grad_norm": 15.489537239074707, "learning_rate": 4.8887265091470505e-05, "loss": 0.8222, "step": 244200 }, { "epoch": 3.3658482819431814, "grad_norm": 10.179112434387207, "learning_rate": 4.8882225576799784e-05, "loss": 0.7614, "step": 244300 }, { "epoch": 3.367226034002921, "grad_norm": 3.5675954818725586, "learning_rate": 4.887718424559961e-05, "loss": 0.821, "step": 244400 }, { "epoch": 3.36860378606266, "grad_norm": 11.235749244689941, "learning_rate": 4.8872141098298277e-05, "loss": 0.6896, "step": 244500 }, { "epoch": 3.3699815381223996, "grad_norm": 8.315887451171875, "learning_rate": 4.886709613532421e-05, "loss": 0.7544, "step": 244600 }, { "epoch": 3.3713592901821388, "grad_norm": 10.005486488342285, "learning_rate": 4.8862049357106016e-05, "loss": 0.7465, "step": 244700 }, { "epoch": 3.3727370422418783, "grad_norm": 11.52979850769043, "learning_rate": 4.8857000764072436e-05, "loss": 0.8028, "step": 244800 }, { "epoch": 3.3741147943016174, "grad_norm": 7.157665252685547, "learning_rate": 4.8851950356652386e-05, "loss": 0.7716, "step": 244900 }, { "epoch": 3.375492546361357, "grad_norm": 6.1718339920043945, "learning_rate": 4.884689813527491e-05, "loss": 0.7281, "step": 245000 }, { "epoch": 3.376870298421096, "grad_norm": 17.761404037475586, "learning_rate": 4.8841844100369234e-05, "loss": 0.7585, "step": 245100 }, { "epoch": 3.3782480504808357, "grad_norm": 5.098033428192139, "learning_rate": 4.883678825236472e-05, "loss": 0.7591, "step": 245200 }, { "epoch": 3.3796258025405748, "grad_norm": 11.7904634475708, "learning_rate": 4.8831730591690885e-05, "loss": 0.6035, "step": 245300 }, { "epoch": 3.3810035546003143, "grad_norm": 102.54789733886719, "learning_rate": 4.8826671118777396e-05, "loss": 0.7313, "step": 245400 }, { "epoch": 3.3823813066600534, "grad_norm": 18.690048217773438, "learning_rate": 4.88216098340541e-05, "loss": 0.7894, "step": 245500 }, { "epoch": 3.383759058719793, "grad_norm": 4.71035623550415, "learning_rate": 
4.8816546737950955e-05, "loss": 0.8117, "step": 245600 }, { "epoch": 3.385136810779532, "grad_norm": 4.706634044647217, "learning_rate": 4.8811481830898115e-05, "loss": 0.7514, "step": 245700 }, { "epoch": 3.386514562839271, "grad_norm": 4.474785804748535, "learning_rate": 4.880641511332587e-05, "loss": 0.7543, "step": 245800 }, { "epoch": 3.3878923148990108, "grad_norm": 62.48046112060547, "learning_rate": 4.880134658566466e-05, "loss": 0.6726, "step": 245900 }, { "epoch": 3.3892700669587503, "grad_norm": 5.013173580169678, "learning_rate": 4.8796276248345066e-05, "loss": 0.7154, "step": 246000 }, { "epoch": 3.3906478190184894, "grad_norm": 20.278827667236328, "learning_rate": 4.879120410179787e-05, "loss": 0.6962, "step": 246100 }, { "epoch": 3.3920255710782286, "grad_norm": 4.829151630401611, "learning_rate": 4.8786130146453954e-05, "loss": 0.7029, "step": 246200 }, { "epoch": 3.393403323137968, "grad_norm": 7.272677421569824, "learning_rate": 4.8781054382744374e-05, "loss": 0.6493, "step": 246300 }, { "epoch": 3.3947810751977077, "grad_norm": 9.106263160705566, "learning_rate": 4.8775976811100356e-05, "loss": 0.8466, "step": 246400 }, { "epoch": 3.3961588272574468, "grad_norm": 4.589905261993408, "learning_rate": 4.877089743195327e-05, "loss": 0.7432, "step": 246500 }, { "epoch": 3.397536579317186, "grad_norm": 10.127010345458984, "learning_rate": 4.8765816245734616e-05, "loss": 0.7921, "step": 246600 }, { "epoch": 3.3989143313769254, "grad_norm": 1.9080126285552979, "learning_rate": 4.876073325287608e-05, "loss": 0.7825, "step": 246700 }, { "epoch": 3.4002920834366646, "grad_norm": 6.177097797393799, "learning_rate": 4.875564845380949e-05, "loss": 0.7109, "step": 246800 }, { "epoch": 3.401669835496404, "grad_norm": 4.822579383850098, "learning_rate": 4.875056184896682e-05, "loss": 0.6683, "step": 246900 }, { "epoch": 3.4030475875561432, "grad_norm": 2.6892597675323486, "learning_rate": 4.874547343878019e-05, "loss": 0.6962, "step": 247000 }, { "epoch": 
3.404425339615883, "grad_norm": 4.09142541885376, "learning_rate": 4.874038322368192e-05, "loss": 0.6677, "step": 247100 }, { "epoch": 3.405803091675622, "grad_norm": 22.466520309448242, "learning_rate": 4.8735291204104416e-05, "loss": 0.7752, "step": 247200 }, { "epoch": 3.4071808437353615, "grad_norm": 14.71407413482666, "learning_rate": 4.873019738048029e-05, "loss": 0.7925, "step": 247300 }, { "epoch": 3.4085585957951006, "grad_norm": 1.8812305927276611, "learning_rate": 4.8725101753242285e-05, "loss": 0.8216, "step": 247400 }, { "epoch": 3.40993634785484, "grad_norm": 20.897409439086914, "learning_rate": 4.872000432282329e-05, "loss": 0.7097, "step": 247500 }, { "epoch": 3.4113140999145792, "grad_norm": 8.49991512298584, "learning_rate": 4.871490508965638e-05, "loss": 0.7486, "step": 247600 }, { "epoch": 3.412691851974319, "grad_norm": 9.218452453613281, "learning_rate": 4.870980405417475e-05, "loss": 0.8095, "step": 247700 }, { "epoch": 3.414069604034058, "grad_norm": 10.462718963623047, "learning_rate": 4.870470121681174e-05, "loss": 0.713, "step": 247800 }, { "epoch": 3.4154473560937975, "grad_norm": 18.287776947021484, "learning_rate": 4.86995965780009e-05, "loss": 0.7127, "step": 247900 }, { "epoch": 3.4168251081535366, "grad_norm": 48.037933349609375, "learning_rate": 4.869449013817585e-05, "loss": 0.7077, "step": 248000 }, { "epoch": 3.418202860213276, "grad_norm": 7.83187198638916, "learning_rate": 4.8689381897770454e-05, "loss": 0.7141, "step": 248100 }, { "epoch": 3.4195806122730152, "grad_norm": 20.0269775390625, "learning_rate": 4.8684271857218645e-05, "loss": 0.8318, "step": 248200 }, { "epoch": 3.420958364332755, "grad_norm": 13.097688674926758, "learning_rate": 4.867916001695457e-05, "loss": 0.6663, "step": 248300 }, { "epoch": 3.422336116392494, "grad_norm": 8.687681198120117, "learning_rate": 4.8674046377412505e-05, "loss": 0.753, "step": 248400 }, { "epoch": 3.4237138684522335, "grad_norm": 5.758730888366699, "learning_rate": 
4.866893093902687e-05, "loss": 0.7695, "step": 248500 }, { "epoch": 3.4250916205119726, "grad_norm": 7.05889892578125, "learning_rate": 4.8663864883500895e-05, "loss": 0.6969, "step": 248600 }, { "epoch": 3.426469372571712, "grad_norm": 41.88780975341797, "learning_rate": 4.865874586670963e-05, "loss": 0.6694, "step": 248700 }, { "epoch": 3.4278471246314512, "grad_norm": 4.313061237335205, "learning_rate": 4.865362505237464e-05, "loss": 0.7804, "step": 248800 }, { "epoch": 3.429224876691191, "grad_norm": 10.696359634399414, "learning_rate": 4.864850244093099e-05, "loss": 0.7183, "step": 248900 }, { "epoch": 3.43060262875093, "grad_norm": 4.369784355163574, "learning_rate": 4.864337803281386e-05, "loss": 0.8142, "step": 249000 }, { "epoch": 3.4319803808106695, "grad_norm": 5.5349202156066895, "learning_rate": 4.86383030993921e-05, "loss": 0.8455, "step": 249100 }, { "epoch": 3.4333581328704086, "grad_norm": 25.62873649597168, "learning_rate": 4.863317511719007e-05, "loss": 0.7835, "step": 249200 }, { "epoch": 3.434735884930148, "grad_norm": 7.503832817077637, "learning_rate": 4.8628045339616694e-05, "loss": 0.796, "step": 249300 }, { "epoch": 3.4361136369898873, "grad_norm": 6.2945685386657715, "learning_rate": 4.8622913767107786e-05, "loss": 0.7037, "step": 249400 }, { "epoch": 3.437491389049627, "grad_norm": 12.687012672424316, "learning_rate": 4.8617780400099285e-05, "loss": 0.8203, "step": 249500 }, { "epoch": 3.438869141109366, "grad_norm": 5.0454816818237305, "learning_rate": 4.861264523902731e-05, "loss": 0.7008, "step": 249600 }, { "epoch": 3.4402468931691055, "grad_norm": 3.082151412963867, "learning_rate": 4.8607508284328097e-05, "loss": 0.8666, "step": 249700 }, { "epoch": 3.4416246452288446, "grad_norm": 29.07480812072754, "learning_rate": 4.860236953643807e-05, "loss": 0.7788, "step": 249800 }, { "epoch": 3.443002397288584, "grad_norm": 4.837975978851318, "learning_rate": 4.859722899579379e-05, "loss": 0.6349, "step": 249900 }, { "epoch": 
3.4443801493483233, "grad_norm": 6.527149200439453, "learning_rate": 4.859208666283199e-05, "loss": 0.7828, "step": 250000 }, { "epoch": 3.4457579014080624, "grad_norm": 3.296645402908325, "learning_rate": 4.858694253798951e-05, "loss": 0.722, "step": 250100 }, { "epoch": 3.447135653467802, "grad_norm": 10.018839836120605, "learning_rate": 4.8581848089732456e-05, "loss": 0.7493, "step": 250200 }, { "epoch": 3.4485134055275415, "grad_norm": 22.27680206298828, "learning_rate": 4.857670040034775e-05, "loss": 0.6785, "step": 250300 }, { "epoch": 3.4498911575872806, "grad_norm": 5.179969787597656, "learning_rate": 4.857155092038952e-05, "loss": 0.7364, "step": 250400 }, { "epoch": 3.4512689096470197, "grad_norm": 5.054967403411865, "learning_rate": 4.856639965029524e-05, "loss": 0.6686, "step": 250500 }, { "epoch": 3.4526466617067593, "grad_norm": 17.167804718017578, "learning_rate": 4.856124659050253e-05, "loss": 0.7993, "step": 250600 }, { "epoch": 3.454024413766499, "grad_norm": 7.192329406738281, "learning_rate": 4.8556091741449175e-05, "loss": 0.7498, "step": 250700 }, { "epoch": 3.455402165826238, "grad_norm": 18.25777244567871, "learning_rate": 4.855093510357311e-05, "loss": 0.7532, "step": 250800 }, { "epoch": 3.456779917885977, "grad_norm": 6.189223766326904, "learning_rate": 4.854577667731241e-05, "loss": 0.7239, "step": 250900 }, { "epoch": 3.4581576699457166, "grad_norm": 5.249884605407715, "learning_rate": 4.854061646310531e-05, "loss": 0.7946, "step": 251000 }, { "epoch": 3.4595354220054557, "grad_norm": 5.6971659660339355, "learning_rate": 4.8535454461390194e-05, "loss": 0.6995, "step": 251100 }, { "epoch": 3.4609131740651953, "grad_norm": 40.974647521972656, "learning_rate": 4.8530290672605605e-05, "loss": 0.7671, "step": 251200 }, { "epoch": 3.4622909261249344, "grad_norm": 4.172917366027832, "learning_rate": 4.852512509719024e-05, "loss": 0.7805, "step": 251300 }, { "epoch": 3.463668678184674, "grad_norm": 18.657901763916016, "learning_rate": 
4.851995773558293e-05, "loss": 0.7516, "step": 251400 }, { "epoch": 3.465046430244413, "grad_norm": 9.104113578796387, "learning_rate": 4.851478858822267e-05, "loss": 0.7881, "step": 251500 }, { "epoch": 3.4664241823041526, "grad_norm": 15.249149322509766, "learning_rate": 4.8509617655548614e-05, "loss": 0.7141, "step": 251600 }, { "epoch": 3.4678019343638917, "grad_norm": 4.647822380065918, "learning_rate": 4.8504444938000054e-05, "loss": 0.7246, "step": 251700 }, { "epoch": 3.4691796864236313, "grad_norm": 9.48563289642334, "learning_rate": 4.849927043601644e-05, "loss": 0.7333, "step": 251800 }, { "epoch": 3.4705574384833704, "grad_norm": 4.856204986572266, "learning_rate": 4.849409415003737e-05, "loss": 0.7581, "step": 251900 }, { "epoch": 3.47193519054311, "grad_norm": 3.4496560096740723, "learning_rate": 4.8488916080502594e-05, "loss": 0.6648, "step": 252000 }, { "epoch": 3.473312942602849, "grad_norm": 4.758912086486816, "learning_rate": 4.848373622785202e-05, "loss": 0.6926, "step": 252100 }, { "epoch": 3.4746906946625886, "grad_norm": 7.947057723999023, "learning_rate": 4.847855459252569e-05, "loss": 0.7147, "step": 252200 }, { "epoch": 3.4760684467223277, "grad_norm": 3.94535756111145, "learning_rate": 4.847337117496384e-05, "loss": 0.7929, "step": 252300 }, { "epoch": 3.4774461987820673, "grad_norm": 4.1371917724609375, "learning_rate": 4.8468185975606806e-05, "loss": 0.7555, "step": 252400 }, { "epoch": 3.4788239508418064, "grad_norm": 22.970508575439453, "learning_rate": 4.8462998994895086e-05, "loss": 0.6991, "step": 252500 }, { "epoch": 3.480201702901546, "grad_norm": 24.19109344482422, "learning_rate": 4.845781023326937e-05, "loss": 0.7397, "step": 252600 }, { "epoch": 3.481579454961285, "grad_norm": 8.861376762390137, "learning_rate": 4.845261969117046e-05, "loss": 0.7637, "step": 252700 }, { "epoch": 3.4829572070210246, "grad_norm": 3.156024694442749, "learning_rate": 4.84474273690393e-05, "loss": 0.7181, "step": 252800 }, { "epoch": 
3.4843349590807637, "grad_norm": 5.054050922393799, "learning_rate": 4.8442233267317024e-05, "loss": 0.6469, "step": 252900 }, { "epoch": 3.4857127111405033, "grad_norm": 11.35214614868164, "learning_rate": 4.843703738644489e-05, "loss": 0.7373, "step": 253000 }, { "epoch": 3.4870904632002424, "grad_norm": 3.2364251613616943, "learning_rate": 4.843183972686432e-05, "loss": 0.7847, "step": 253100 }, { "epoch": 3.488468215259982, "grad_norm": 20.48175811767578, "learning_rate": 4.842664028901688e-05, "loss": 0.6701, "step": 253200 }, { "epoch": 3.489845967319721, "grad_norm": 10.260986328125, "learning_rate": 4.842143907334429e-05, "loss": 0.7859, "step": 253300 }, { "epoch": 3.4912237193794606, "grad_norm": 5.544047832489014, "learning_rate": 4.84162360802884e-05, "loss": 0.678, "step": 253400 }, { "epoch": 3.4926014714391997, "grad_norm": 16.02749252319336, "learning_rate": 4.841103131029127e-05, "loss": 0.7296, "step": 253500 }, { "epoch": 3.4939792234989393, "grad_norm": 1.9412063360214233, "learning_rate": 4.8405824763795035e-05, "loss": 0.7713, "step": 253600 }, { "epoch": 3.4953569755586784, "grad_norm": 8.680977821350098, "learning_rate": 4.840061644124204e-05, "loss": 0.7368, "step": 253700 }, { "epoch": 3.496734727618418, "grad_norm": 6.058280944824219, "learning_rate": 4.8395406343074755e-05, "loss": 0.7059, "step": 253800 }, { "epoch": 3.498112479678157, "grad_norm": 7.602729797363281, "learning_rate": 4.8390194469735795e-05, "loss": 0.7328, "step": 253900 }, { "epoch": 3.4994902317378966, "grad_norm": 35.16118240356445, "learning_rate": 4.8384980821667946e-05, "loss": 0.7251, "step": 254000 }, { "epoch": 3.5008679837976358, "grad_norm": 10.049656867980957, "learning_rate": 4.8379765399314125e-05, "loss": 0.6453, "step": 254100 }, { "epoch": 3.5022457358573753, "grad_norm": 1.452939510345459, "learning_rate": 4.837454820311741e-05, "loss": 0.7423, "step": 254200 }, { "epoch": 3.5036234879171144, "grad_norm": 4.007043361663818, "learning_rate": 
4.836932923352104e-05, "loss": 0.6879, "step": 254300 }, { "epoch": 3.5050012399768535, "grad_norm": 10.145913124084473, "learning_rate": 4.836410849096838e-05, "loss": 0.7283, "step": 254400 }, { "epoch": 3.506378992036593, "grad_norm": 4.904376029968262, "learning_rate": 4.835888597590297e-05, "loss": 0.7373, "step": 254500 }, { "epoch": 3.5077567440963326, "grad_norm": 6.015730857849121, "learning_rate": 4.835366168876848e-05, "loss": 0.6974, "step": 254600 }, { "epoch": 3.5091344961560718, "grad_norm": 10.618706703186035, "learning_rate": 4.834843563000873e-05, "loss": 0.7305, "step": 254700 }, { "epoch": 3.510512248215811, "grad_norm": 6.833013534545898, "learning_rate": 4.834320780006773e-05, "loss": 0.754, "step": 254800 }, { "epoch": 3.5118900002755504, "grad_norm": 3.680297374725342, "learning_rate": 4.833797819938959e-05, "loss": 0.706, "step": 254900 }, { "epoch": 3.51326775233529, "grad_norm": 3.6445000171661377, "learning_rate": 4.833274682841859e-05, "loss": 0.6842, "step": 255000 }, { "epoch": 3.514645504395029, "grad_norm": 7.004006862640381, "learning_rate": 4.832751368759917e-05, "loss": 0.6952, "step": 255100 }, { "epoch": 3.516023256454768, "grad_norm": 5.290282726287842, "learning_rate": 4.832227877737591e-05, "loss": 0.7236, "step": 255200 }, { "epoch": 3.5174010085145078, "grad_norm": 5.978407382965088, "learning_rate": 4.831704209819354e-05, "loss": 0.7177, "step": 255300 }, { "epoch": 3.5187787605742473, "grad_norm": 8.913142204284668, "learning_rate": 4.8311803650496945e-05, "loss": 0.7054, "step": 255400 }, { "epoch": 3.5201565126339864, "grad_norm": 6.210485935211182, "learning_rate": 4.8306563434731164e-05, "loss": 0.6782, "step": 255500 }, { "epoch": 3.5215342646937255, "grad_norm": 49.2089958190918, "learning_rate": 4.8301321451341366e-05, "loss": 0.784, "step": 255600 }, { "epoch": 3.522912016753465, "grad_norm": 3.744565010070801, "learning_rate": 4.829607770077289e-05, "loss": 0.7407, "step": 255700 }, { "epoch": 3.524289768813204, 
"grad_norm": 13.391924858093262, "learning_rate": 4.829083218347123e-05, "loss": 0.6471, "step": 255800 }, { "epoch": 3.5256675208729438, "grad_norm": 3.9713025093078613, "learning_rate": 4.8285584899882e-05, "loss": 0.7265, "step": 255900 }, { "epoch": 3.527045272932683, "grad_norm": 14.249966621398926, "learning_rate": 4.8280335850451005e-05, "loss": 0.7673, "step": 256000 }, { "epoch": 3.5284230249924224, "grad_norm": 3.9654293060302734, "learning_rate": 4.8275085035624166e-05, "loss": 0.7313, "step": 256100 }, { "epoch": 3.5298007770521616, "grad_norm": 3.889775276184082, "learning_rate": 4.826983245584756e-05, "loss": 0.7627, "step": 256200 }, { "epoch": 3.531178529111901, "grad_norm": 7.995856285095215, "learning_rate": 4.826457811156744e-05, "loss": 0.743, "step": 256300 }, { "epoch": 3.5325562811716402, "grad_norm": 4.204479217529297, "learning_rate": 4.825932200323017e-05, "loss": 0.6754, "step": 256400 }, { "epoch": 3.53393403323138, "grad_norm": 15.757999420166016, "learning_rate": 4.8254064131282305e-05, "loss": 0.6966, "step": 256500 }, { "epoch": 3.535311785291119, "grad_norm": 8.793876647949219, "learning_rate": 4.82488044961705e-05, "loss": 0.7616, "step": 256600 }, { "epoch": 3.5366895373508584, "grad_norm": 8.58087158203125, "learning_rate": 4.8243543098341614e-05, "loss": 0.6844, "step": 256700 }, { "epoch": 3.5380672894105976, "grad_norm": 3.6597142219543457, "learning_rate": 4.823833257856536e-05, "loss": 0.7325, "step": 256800 }, { "epoch": 3.539445041470337, "grad_norm": 5.672914028167725, "learning_rate": 4.823306767425941e-05, "loss": 0.6947, "step": 256900 }, { "epoch": 3.5408227935300762, "grad_norm": 9.928915977478027, "learning_rate": 4.8227801008573277e-05, "loss": 0.7355, "step": 257000 }, { "epoch": 3.542200545589816, "grad_norm": 6.378506660461426, "learning_rate": 4.822258527493575e-05, "loss": 0.7494, "step": 257100 }, { "epoch": 3.543578297649555, "grad_norm": 9.644248008728027, "learning_rate": 4.8217315105434356e-05, "loss": 
0.7181, "step": 257200 }, { "epoch": 3.5449560497092945, "grad_norm": 3.024136781692505, "learning_rate": 4.8212043175891056e-05, "loss": 0.659, "step": 257300 }, { "epoch": 3.5463338017690336, "grad_norm": 29.016172409057617, "learning_rate": 4.820676948675373e-05, "loss": 0.7913, "step": 257400 }, { "epoch": 3.547711553828773, "grad_norm": 97.14000701904297, "learning_rate": 4.820149403847039e-05, "loss": 0.738, "step": 257500 }, { "epoch": 3.5490893058885122, "grad_norm": 2.647348642349243, "learning_rate": 4.819621683148924e-05, "loss": 0.7436, "step": 257600 }, { "epoch": 3.550467057948252, "grad_norm": 4.298648357391357, "learning_rate": 4.819093786625858e-05, "loss": 0.6504, "step": 257700 }, { "epoch": 3.551844810007991, "grad_norm": 2.510650396347046, "learning_rate": 4.818565714322689e-05, "loss": 0.7227, "step": 257800 }, { "epoch": 3.55322256206773, "grad_norm": 7.00123405456543, "learning_rate": 4.81803746628428e-05, "loss": 0.7115, "step": 257900 }, { "epoch": 3.5546003141274696, "grad_norm": 10.8532075881958, "learning_rate": 4.817509042555509e-05, "loss": 0.7083, "step": 258000 }, { "epoch": 3.555978066187209, "grad_norm": 21.37672233581543, "learning_rate": 4.8169804431812665e-05, "loss": 0.7287, "step": 258100 }, { "epoch": 3.5573558182469482, "grad_norm": 9.328338623046875, "learning_rate": 4.816451668206462e-05, "loss": 0.7438, "step": 258200 }, { "epoch": 3.5587335703066874, "grad_norm": 5.649765491485596, "learning_rate": 4.8159227176760155e-05, "loss": 0.7996, "step": 258300 }, { "epoch": 3.560111322366427, "grad_norm": 5.415292739868164, "learning_rate": 4.815393591634866e-05, "loss": 0.7156, "step": 258400 }, { "epoch": 3.5614890744261665, "grad_norm": 17.550783157348633, "learning_rate": 4.814864290127963e-05, "loss": 0.8034, "step": 258500 }, { "epoch": 3.5628668264859056, "grad_norm": 13.280574798583984, "learning_rate": 4.814334813200275e-05, "loss": 0.7627, "step": 258600 }, { "epoch": 3.5642445785456447, "grad_norm": 
5.680484771728516, "learning_rate": 4.813805160896784e-05, "loss": 0.6671, "step": 258700 }, { "epoch": 3.5656223306053842, "grad_norm": 3.3929390907287598, "learning_rate": 4.8132753332624864e-05, "loss": 0.7065, "step": 258800 }, { "epoch": 3.567000082665124, "grad_norm": 4.375004291534424, "learning_rate": 4.812745330342393e-05, "loss": 0.7072, "step": 258900 }, { "epoch": 3.568377834724863, "grad_norm": 4.153993606567383, "learning_rate": 4.8122151521815315e-05, "loss": 0.8178, "step": 259000 }, { "epoch": 3.569755586784602, "grad_norm": 9.471790313720703, "learning_rate": 4.811684798824942e-05, "loss": 0.6827, "step": 259100 }, { "epoch": 3.5711333388443416, "grad_norm": 7.491828441619873, "learning_rate": 4.811154270317682e-05, "loss": 0.7637, "step": 259200 }, { "epoch": 3.572511090904081, "grad_norm": 53.97184371948242, "learning_rate": 4.810623566704822e-05, "loss": 0.7672, "step": 259300 }, { "epoch": 3.5738888429638203, "grad_norm": 10.1900634765625, "learning_rate": 4.810092688031447e-05, "loss": 0.7222, "step": 259400 }, { "epoch": 3.5752665950235594, "grad_norm": 25.034801483154297, "learning_rate": 4.809561634342659e-05, "loss": 0.7186, "step": 259500 }, { "epoch": 3.576644347083299, "grad_norm": 42.99748611450195, "learning_rate": 4.809030405683574e-05, "loss": 0.7698, "step": 259600 }, { "epoch": 3.5780220991430385, "grad_norm": 7.716062545776367, "learning_rate": 4.808499002099322e-05, "loss": 0.7963, "step": 259700 }, { "epoch": 3.5793998512027776, "grad_norm": 4.706084728240967, "learning_rate": 4.8079674236350485e-05, "loss": 0.8121, "step": 259800 }, { "epoch": 3.5807776032625167, "grad_norm": 7.900924205780029, "learning_rate": 4.807435670335913e-05, "loss": 0.7499, "step": 259900 }, { "epoch": 3.5821553553222563, "grad_norm": 43.2916145324707, "learning_rate": 4.806903742247093e-05, "loss": 0.6282, "step": 260000 }, { "epoch": 3.5835331073819954, "grad_norm": 6.506124019622803, "learning_rate": 4.806371639413777e-05, "loss": 0.7603, "step": 
260100 }, { "epoch": 3.584910859441735, "grad_norm": 4.683910369873047, "learning_rate": 4.8058393618811685e-05, "loss": 0.7214, "step": 260200 }, { "epoch": 3.586288611501474, "grad_norm": 8.051076889038086, "learning_rate": 4.80530690969449e-05, "loss": 0.7195, "step": 260300 }, { "epoch": 3.5876663635612136, "grad_norm": 25.076473236083984, "learning_rate": 4.804774282898974e-05, "loss": 0.6599, "step": 260400 }, { "epoch": 3.5890441156209527, "grad_norm": 11.387847900390625, "learning_rate": 4.8042414815398704e-05, "loss": 0.7632, "step": 260500 }, { "epoch": 3.5904218676806923, "grad_norm": 41.54713821411133, "learning_rate": 4.803708505662444e-05, "loss": 0.7081, "step": 260600 }, { "epoch": 3.5917996197404314, "grad_norm": 37.84764099121094, "learning_rate": 4.803175355311973e-05, "loss": 0.7368, "step": 260700 }, { "epoch": 3.593177371800171, "grad_norm": 14.859453201293945, "learning_rate": 4.8026420305337515e-05, "loss": 0.7185, "step": 260800 }, { "epoch": 3.59455512385991, "grad_norm": 51.62633514404297, "learning_rate": 4.802108531373088e-05, "loss": 0.7543, "step": 260900 }, { "epoch": 3.5959328759196496, "grad_norm": 19.463062286376953, "learning_rate": 4.801574857875307e-05, "loss": 0.692, "step": 261000 }, { "epoch": 3.5973106279793887, "grad_norm": 9.161649703979492, "learning_rate": 4.801041010085746e-05, "loss": 0.6862, "step": 261100 }, { "epoch": 3.5986883800391283, "grad_norm": 13.147780418395996, "learning_rate": 4.800506988049757e-05, "loss": 0.8044, "step": 261200 }, { "epoch": 3.6000661320988674, "grad_norm": 15.129803657531738, "learning_rate": 4.79997279181271e-05, "loss": 0.7687, "step": 261300 }, { "epoch": 3.601443884158607, "grad_norm": 31.365673065185547, "learning_rate": 4.799438421419987e-05, "loss": 0.7516, "step": 261400 }, { "epoch": 3.602821636218346, "grad_norm": 9.364607810974121, "learning_rate": 4.7989038769169845e-05, "loss": 0.741, "step": 261500 }, { "epoch": 3.6041993882780856, "grad_norm": 2.6934702396392822, 
"learning_rate": 4.7983745063962666e-05, "loss": 0.7155, "step": 261600 }, { "epoch": 3.6055771403378247, "grad_norm": 14.262542724609375, "learning_rate": 4.797839615548928e-05, "loss": 0.735, "step": 261700 }, { "epoch": 3.6069548923975643, "grad_norm": 29.470951080322266, "learning_rate": 4.797304550727137e-05, "loss": 0.7384, "step": 261800 }, { "epoch": 3.6083326444573034, "grad_norm": 3.852599859237671, "learning_rate": 4.796769311976351e-05, "loss": 0.7171, "step": 261900 }, { "epoch": 3.609710396517043, "grad_norm": 2.499370574951172, "learning_rate": 4.796233899342041e-05, "loss": 0.8475, "step": 262000 }, { "epoch": 3.611088148576782, "grad_norm": 12.833796501159668, "learning_rate": 4.795698312869693e-05, "loss": 0.7111, "step": 262100 }, { "epoch": 3.612465900636521, "grad_norm": 16.0988712310791, "learning_rate": 4.795162552604806e-05, "loss": 0.689, "step": 262200 }, { "epoch": 3.6138436526962607, "grad_norm": 10.9572114944458, "learning_rate": 4.794626618592899e-05, "loss": 0.7906, "step": 262300 }, { "epoch": 3.6152214047560003, "grad_norm": 5.4418416023254395, "learning_rate": 4.794090510879499e-05, "loss": 0.7051, "step": 262400 }, { "epoch": 3.6165991568157394, "grad_norm": 3.5932974815368652, "learning_rate": 4.793554229510154e-05, "loss": 0.7169, "step": 262500 }, { "epoch": 3.6179769088754785, "grad_norm": 4.044643878936768, "learning_rate": 4.793017774530421e-05, "loss": 0.7507, "step": 262600 }, { "epoch": 3.619354660935218, "grad_norm": 3.4287917613983154, "learning_rate": 4.792481145985877e-05, "loss": 0.7921, "step": 262700 }, { "epoch": 3.6207324129949576, "grad_norm": 8.930657386779785, "learning_rate": 4.7919443439221106e-05, "loss": 0.8014, "step": 262800 }, { "epoch": 3.6221101650546967, "grad_norm": 163.52113342285156, "learning_rate": 4.7914073683847254e-05, "loss": 0.766, "step": 262900 }, { "epoch": 3.623487917114436, "grad_norm": 2.648787021636963, "learning_rate": 4.7908702194193406e-05, "loss": 0.7713, "step": 263000 }, { 
"epoch": 3.6248656691741754, "grad_norm": 4.38862943649292, "learning_rate": 4.790338271153159e-05, "loss": 0.662, "step": 263100 }, { "epoch": 3.626243421233915, "grad_norm": 8.300777435302734, "learning_rate": 4.7898007772018314e-05, "loss": 0.7063, "step": 263200 }, { "epoch": 3.627621173293654, "grad_norm": 3.531403064727783, "learning_rate": 4.789263109958992e-05, "loss": 0.7463, "step": 263300 }, { "epoch": 3.628998925353393, "grad_norm": 3.786421775817871, "learning_rate": 4.788725269470319e-05, "loss": 0.6612, "step": 263400 }, { "epoch": 3.6303766774131327, "grad_norm": 5.484480381011963, "learning_rate": 4.788187255781504e-05, "loss": 0.8124, "step": 263500 }, { "epoch": 3.6317544294728723, "grad_norm": 7.373251438140869, "learning_rate": 4.787649068938254e-05, "loss": 0.7311, "step": 263600 }, { "epoch": 3.6331321815326114, "grad_norm": 25.674911499023438, "learning_rate": 4.787110708986291e-05, "loss": 0.6926, "step": 263700 }, { "epoch": 3.6345099335923505, "grad_norm": 18.366029739379883, "learning_rate": 4.7865721759713516e-05, "loss": 0.7466, "step": 263800 }, { "epoch": 3.63588768565209, "grad_norm": 7.999305248260498, "learning_rate": 4.786033469939187e-05, "loss": 0.7021, "step": 263900 }, { "epoch": 3.6372654377118296, "grad_norm": 8.078387260437012, "learning_rate": 4.7854945909355624e-05, "loss": 0.7951, "step": 264000 }, { "epoch": 3.6386431897715688, "grad_norm": 10.383421897888184, "learning_rate": 4.78495553900626e-05, "loss": 0.751, "step": 264100 }, { "epoch": 3.640020941831308, "grad_norm": 8.066372871398926, "learning_rate": 4.784416314197073e-05, "loss": 0.8231, "step": 264200 }, { "epoch": 3.6413986938910474, "grad_norm": 5.808138847351074, "learning_rate": 4.783882311385623e-05, "loss": 0.7748, "step": 264300 }, { "epoch": 3.6427764459507865, "grad_norm": 11.194897651672363, "learning_rate": 4.783342742681768e-05, "loss": 0.6328, "step": 264400 }, { "epoch": 3.644154198010526, "grad_norm": 8.608333587646484, "learning_rate": 
4.782803001235045e-05, "loss": 0.7876, "step": 264500 }, { "epoch": 3.645531950070265, "grad_norm": 4.367278575897217, "learning_rate": 4.782263087091307e-05, "loss": 0.695, "step": 264600 }, { "epoch": 3.6469097021300048, "grad_norm": 1.2145054340362549, "learning_rate": 4.7817230002964225e-05, "loss": 0.7885, "step": 264700 }, { "epoch": 3.648287454189744, "grad_norm": 8.021724700927734, "learning_rate": 4.7811827408962756e-05, "loss": 0.7201, "step": 264800 }, { "epoch": 3.6496652062494834, "grad_norm": 14.330703735351562, "learning_rate": 4.7806423089367634e-05, "loss": 0.7589, "step": 264900 }, { "epoch": 3.6510429583092225, "grad_norm": 11.726005554199219, "learning_rate": 4.780101704463798e-05, "loss": 0.7641, "step": 265000 }, { "epoch": 3.652420710368962, "grad_norm": 6.068086624145508, "learning_rate": 4.779560927523307e-05, "loss": 0.837, "step": 265100 }, { "epoch": 3.653798462428701, "grad_norm": 25.767248153686523, "learning_rate": 4.779019978161232e-05, "loss": 0.802, "step": 265200 }, { "epoch": 3.6551762144884408, "grad_norm": 4.487380027770996, "learning_rate": 4.7784788564235295e-05, "loss": 0.8065, "step": 265300 }, { "epoch": 3.65655396654818, "grad_norm": 4.216097831726074, "learning_rate": 4.77793756235617e-05, "loss": 0.8177, "step": 265400 }, { "epoch": 3.6579317186079194, "grad_norm": 9.052308082580566, "learning_rate": 4.7773960960051406e-05, "loss": 0.7106, "step": 265500 }, { "epoch": 3.6593094706676585, "grad_norm": 8.382248878479004, "learning_rate": 4.77685445741644e-05, "loss": 0.8162, "step": 265600 }, { "epoch": 3.660687222727398, "grad_norm": 6.573047161102295, "learning_rate": 4.776312646636085e-05, "loss": 0.763, "step": 265700 }, { "epoch": 3.662064974787137, "grad_norm": 16.751585006713867, "learning_rate": 4.775770663710103e-05, "loss": 0.7174, "step": 265800 }, { "epoch": 3.6634427268468768, "grad_norm": 4.9187822341918945, "learning_rate": 4.775228508684539e-05, "loss": 0.7844, "step": 265900 }, { "epoch": 
3.664820478906616, "grad_norm": 11.159941673278809, "learning_rate": 4.7746861816054535e-05, "loss": 0.7923, "step": 266000 }, { "epoch": 3.6661982309663554, "grad_norm": 8.07771110534668, "learning_rate": 4.774143682518918e-05, "loss": 0.697, "step": 266100 }, { "epoch": 3.6675759830260946, "grad_norm": 3.0037763118743896, "learning_rate": 4.7736010114710215e-05, "loss": 0.7301, "step": 266200 }, { "epoch": 3.668953735085834, "grad_norm": 15.279373168945312, "learning_rate": 4.7730581685078664e-05, "loss": 0.7591, "step": 266300 }, { "epoch": 3.6703314871455732, "grad_norm": 12.624349594116211, "learning_rate": 4.77251515367557e-05, "loss": 0.8636, "step": 266400 }, { "epoch": 3.6717092392053123, "grad_norm": 57.029048919677734, "learning_rate": 4.771971967020264e-05, "loss": 0.8478, "step": 266500 }, { "epoch": 3.673086991265052, "grad_norm": 4.073955059051514, "learning_rate": 4.771428608588095e-05, "loss": 0.8166, "step": 266600 }, { "epoch": 3.6744647433247914, "grad_norm": 38.54369354248047, "learning_rate": 4.7708850784252244e-05, "loss": 0.7079, "step": 266700 }, { "epoch": 3.6758424953845306, "grad_norm": 16.506465911865234, "learning_rate": 4.770341376577827e-05, "loss": 0.8103, "step": 266800 }, { "epoch": 3.6772202474442697, "grad_norm": 22.867197036743164, "learning_rate": 4.769797503092094e-05, "loss": 0.816, "step": 266900 }, { "epoch": 3.6785979995040092, "grad_norm": 9.203018188476562, "learning_rate": 4.76925345801423e-05, "loss": 0.7172, "step": 267000 }, { "epoch": 3.679975751563749, "grad_norm": 7.021424293518066, "learning_rate": 4.768709241390455e-05, "loss": 0.73, "step": 267100 }, { "epoch": 3.681353503623488, "grad_norm": 7.541938781738281, "learning_rate": 4.768164853267001e-05, "loss": 0.7382, "step": 267200 }, { "epoch": 3.682731255683227, "grad_norm": 36.13368225097656, "learning_rate": 4.767620293690118e-05, "loss": 0.7394, "step": 267300 }, { "epoch": 3.6841090077429666, "grad_norm": 3.0906050205230713, "learning_rate": 
4.76707556270607e-05, "loss": 0.7198, "step": 267400 }, { "epoch": 3.685486759802706, "grad_norm": 40.52449417114258, "learning_rate": 4.7665306603611334e-05, "loss": 0.8418, "step": 267500 }, { "epoch": 3.6868645118624452, "grad_norm": 6.134153366088867, "learning_rate": 4.7659855867016004e-05, "loss": 0.7729, "step": 267600 }, { "epoch": 3.6882422639221843, "grad_norm": 11.829121589660645, "learning_rate": 4.7654403417737775e-05, "loss": 0.8, "step": 267700 }, { "epoch": 3.689620015981924, "grad_norm": 87.53887939453125, "learning_rate": 4.764894925623988e-05, "loss": 0.7595, "step": 267800 }, { "epoch": 3.6909977680416635, "grad_norm": 22.38325309753418, "learning_rate": 4.764349338298565e-05, "loss": 0.752, "step": 267900 }, { "epoch": 3.6923755201014026, "grad_norm": 13.194493293762207, "learning_rate": 4.763803579843861e-05, "loss": 0.7732, "step": 268000 }, { "epoch": 3.6937532721611417, "grad_norm": 4.538683891296387, "learning_rate": 4.7632576503062405e-05, "loss": 0.8146, "step": 268100 }, { "epoch": 3.6951310242208812, "grad_norm": 12.0274019241333, "learning_rate": 4.762711549732083e-05, "loss": 0.7495, "step": 268200 }, { "epoch": 3.696508776280621, "grad_norm": 21.319379806518555, "learning_rate": 4.762165278167782e-05, "loss": 0.6809, "step": 268300 }, { "epoch": 3.69788652834036, "grad_norm": 6.008383274078369, "learning_rate": 4.761618835659746e-05, "loss": 0.725, "step": 268400 }, { "epoch": 3.699264280400099, "grad_norm": 34.65384292602539, "learning_rate": 4.761072222254399e-05, "loss": 0.7692, "step": 268500 }, { "epoch": 3.7006420324598386, "grad_norm": 73.18565368652344, "learning_rate": 4.760525437998178e-05, "loss": 0.7571, "step": 268600 }, { "epoch": 3.7020197845195777, "grad_norm": 1.4377433061599731, "learning_rate": 4.7599784829375354e-05, "loss": 0.6696, "step": 268700 }, { "epoch": 3.7033975365793173, "grad_norm": 8.201079368591309, "learning_rate": 4.7594313571189385e-05, "loss": 0.762, "step": 268800 }, { "epoch": 
3.7047752886390564, "grad_norm": 4.367722034454346, "learning_rate": 4.758884060588867e-05, "loss": 0.7242, "step": 268900 }, { "epoch": 3.706153040698796, "grad_norm": 4.735632419586182, "learning_rate": 4.758336593393817e-05, "loss": 0.7481, "step": 269000 }, { "epoch": 3.707530792758535, "grad_norm": 9.206520080566406, "learning_rate": 4.757788955580298e-05, "loss": 0.7383, "step": 269100 }, { "epoch": 3.7089085448182746, "grad_norm": 13.853857040405273, "learning_rate": 4.757241147194837e-05, "loss": 0.7151, "step": 269200 }, { "epoch": 3.7102862968780137, "grad_norm": 5.568164348602295, "learning_rate": 4.756693168283971e-05, "loss": 0.8428, "step": 269300 }, { "epoch": 3.7116640489377533, "grad_norm": 3.7256813049316406, "learning_rate": 4.756145018894254e-05, "loss": 0.7207, "step": 269400 }, { "epoch": 3.7130418009974924, "grad_norm": 4.671421527862549, "learning_rate": 4.755596699072254e-05, "loss": 0.6761, "step": 269500 }, { "epoch": 3.714419553057232, "grad_norm": 14.45839786529541, "learning_rate": 4.755048208864555e-05, "loss": 0.7421, "step": 269600 }, { "epoch": 3.715797305116971, "grad_norm": 12.860676765441895, "learning_rate": 4.7544995483177516e-05, "loss": 0.6924, "step": 269700 }, { "epoch": 3.7171750571767106, "grad_norm": 5.4139251708984375, "learning_rate": 4.753950717478457e-05, "loss": 0.6845, "step": 269800 }, { "epoch": 3.7185528092364497, "grad_norm": 14.10604190826416, "learning_rate": 4.753401716393297e-05, "loss": 0.7336, "step": 269900 }, { "epoch": 3.7199305612961893, "grad_norm": 5.590238571166992, "learning_rate": 4.752858037664089e-05, "loss": 0.7572, "step": 270000 }, { "epoch": 3.7213083133559284, "grad_norm": 68.34678649902344, "learning_rate": 4.752308697928428e-05, "loss": 0.7648, "step": 270100 }, { "epoch": 3.722686065415668, "grad_norm": 3.155557155609131, "learning_rate": 4.7517591880864e-05, "loss": 0.7174, "step": 270200 }, { "epoch": 3.724063817475407, "grad_norm": 11.392009735107422, "learning_rate": 
4.751209508184687e-05, "loss": 0.7653, "step": 270300 }, { "epoch": 3.7254415695351466, "grad_norm": 17.823469161987305, "learning_rate": 4.750659658269989e-05, "loss": 0.8423, "step": 270400 }, { "epoch": 3.7268193215948857, "grad_norm": 6.774659633636475, "learning_rate": 4.750109638389017e-05, "loss": 0.8087, "step": 270500 }, { "epoch": 3.7281970736546253, "grad_norm": 15.389320373535156, "learning_rate": 4.7495594485885e-05, "loss": 0.6586, "step": 270600 }, { "epoch": 3.7295748257143644, "grad_norm": 14.331101417541504, "learning_rate": 4.749009088915177e-05, "loss": 0.7198, "step": 270700 }, { "epoch": 3.7309525777741035, "grad_norm": 4.198037624359131, "learning_rate": 4.748458559415806e-05, "loss": 0.7551, "step": 270800 }, { "epoch": 3.732330329833843, "grad_norm": 3.555859088897705, "learning_rate": 4.747907860137156e-05, "loss": 0.6778, "step": 270900 }, { "epoch": 3.7337080818935826, "grad_norm": 13.764531135559082, "learning_rate": 4.7473569911260116e-05, "loss": 0.7844, "step": 271000 }, { "epoch": 3.7350858339533217, "grad_norm": 13.592623710632324, "learning_rate": 4.7468059524291725e-05, "loss": 0.739, "step": 271100 }, { "epoch": 3.736463586013061, "grad_norm": 3.8996264934539795, "learning_rate": 4.7462547440934524e-05, "loss": 0.8977, "step": 271200 }, { "epoch": 3.7378413380728004, "grad_norm": 3.623610734939575, "learning_rate": 4.745703366165679e-05, "loss": 0.6747, "step": 271300 }, { "epoch": 3.73921909013254, "grad_norm": 3.714876174926758, "learning_rate": 4.745151818692695e-05, "loss": 0.7487, "step": 271400 }, { "epoch": 3.740596842192279, "grad_norm": 8.382322311401367, "learning_rate": 4.744600101721356e-05, "loss": 0.7621, "step": 271500 }, { "epoch": 3.741974594252018, "grad_norm": 6.062857151031494, "learning_rate": 4.744048215298535e-05, "loss": 0.6358, "step": 271600 }, { "epoch": 3.7433523463117577, "grad_norm": 5.738988399505615, "learning_rate": 4.7434961594711166e-05, "loss": 0.7446, "step": 271700 }, { "epoch": 
3.7447300983714973, "grad_norm": 29.230758666992188, "learning_rate": 4.742943934286e-05, "loss": 0.7758, "step": 271800 }, { "epoch": 3.7461078504312364, "grad_norm": 19.700021743774414, "learning_rate": 4.7423915397901004e-05, "loss": 0.7268, "step": 271900 }, { "epoch": 3.7474856024909755, "grad_norm": 1.5176565647125244, "learning_rate": 4.741838976030347e-05, "loss": 0.7574, "step": 272000 }, { "epoch": 3.748863354550715, "grad_norm": 4.334427356719971, "learning_rate": 4.741286243053683e-05, "loss": 0.7013, "step": 272100 }, { "epoch": 3.7502411066104546, "grad_norm": 6.970549583435059, "learning_rate": 4.740733340907064e-05, "loss": 0.715, "step": 272200 }, { "epoch": 3.7516188586701937, "grad_norm": 6.901124954223633, "learning_rate": 4.7401802696374635e-05, "loss": 0.6704, "step": 272300 }, { "epoch": 3.752996610729933, "grad_norm": 11.396589279174805, "learning_rate": 4.7396270292918674e-05, "loss": 0.8068, "step": 272400 }, { "epoch": 3.7543743627896724, "grad_norm": 13.870627403259277, "learning_rate": 4.7390791548475615e-05, "loss": 0.7341, "step": 272500 }, { "epoch": 3.755752114849412, "grad_norm": 9.901688575744629, "learning_rate": 4.738525578180578e-05, "loss": 0.6902, "step": 272600 }, { "epoch": 3.757129866909151, "grad_norm": 7.905740737915039, "learning_rate": 4.7379718325781725e-05, "loss": 0.7451, "step": 272700 }, { "epoch": 3.75850761896889, "grad_norm": 3.3949713706970215, "learning_rate": 4.7374179180873905e-05, "loss": 0.7406, "step": 272800 }, { "epoch": 3.7598853710286297, "grad_norm": 10.167157173156738, "learning_rate": 4.736863834755288e-05, "loss": 0.861, "step": 272900 }, { "epoch": 3.761263123088369, "grad_norm": 566.754150390625, "learning_rate": 4.736309582628938e-05, "loss": 0.7593, "step": 273000 }, { "epoch": 3.7626408751481084, "grad_norm": 18.167667388916016, "learning_rate": 4.7357551617554274e-05, "loss": 0.7495, "step": 273100 }, { "epoch": 3.7640186272078475, "grad_norm": 9.464631080627441, "learning_rate": 
4.735200572181857e-05, "loss": 0.7617, "step": 273200 }, { "epoch": 3.765396379267587, "grad_norm": 5.3268232345581055, "learning_rate": 4.734645813955341e-05, "loss": 0.8335, "step": 273300 }, { "epoch": 3.766774131327326, "grad_norm": 3.9660980701446533, "learning_rate": 4.7340908871230105e-05, "loss": 0.6887, "step": 273400 }, { "epoch": 3.7681518833870657, "grad_norm": 7.923286437988281, "learning_rate": 4.733535791732008e-05, "loss": 0.8255, "step": 273500 }, { "epoch": 3.769529635446805, "grad_norm": 3.753889322280884, "learning_rate": 4.732980527829493e-05, "loss": 0.8136, "step": 273600 }, { "epoch": 3.7709073875065444, "grad_norm": 2.2283871173858643, "learning_rate": 4.732430650620049e-05, "loss": 0.7593, "step": 273700 }, { "epoch": 3.7722851395662835, "grad_norm": 6.431221008300781, "learning_rate": 4.731875051519976e-05, "loss": 0.8895, "step": 273800 }, { "epoch": 3.773662891626023, "grad_norm": 1.4079269170761108, "learning_rate": 4.731319284049479e-05, "loss": 0.7761, "step": 273900 }, { "epoch": 3.775040643685762, "grad_norm": 15.714146614074707, "learning_rate": 4.730763348255773e-05, "loss": 0.7908, "step": 274000 }, { "epoch": 3.7764183957455018, "grad_norm": 3.988079071044922, "learning_rate": 4.730207244186087e-05, "loss": 0.7625, "step": 274100 }, { "epoch": 3.777796147805241, "grad_norm": 62.9211311340332, "learning_rate": 4.7296509718876633e-05, "loss": 0.7935, "step": 274200 }, { "epoch": 3.7791738998649804, "grad_norm": 6.144111156463623, "learning_rate": 4.7290945314077636e-05, "loss": 0.7979, "step": 274300 }, { "epoch": 3.7805516519247195, "grad_norm": 29.91961097717285, "learning_rate": 4.7285379227936575e-05, "loss": 0.8063, "step": 274400 }, { "epoch": 3.781929403984459, "grad_norm": 2.6208667755126953, "learning_rate": 4.727981146092633e-05, "loss": 0.8775, "step": 274500 }, { "epoch": 3.783307156044198, "grad_norm": 16.388765335083008, "learning_rate": 4.727424201351991e-05, "loss": 0.7705, "step": 274600 }, { "epoch": 
3.7846849081039378, "grad_norm": 22.490192413330078, "learning_rate": 4.726867088619047e-05, "loss": 0.8121, "step": 274700 }, { "epoch": 3.786062660163677, "grad_norm": 3.434366464614868, "learning_rate": 4.7263098079411297e-05, "loss": 0.8411, "step": 274800 }, { "epoch": 3.7874404122234164, "grad_norm": 2.3183443546295166, "learning_rate": 4.725752359365584e-05, "loss": 0.8552, "step": 274900 }, { "epoch": 3.7888181642831555, "grad_norm": 10.366097450256348, "learning_rate": 4.725194742939766e-05, "loss": 0.8053, "step": 275000 }, { "epoch": 3.7901959163428947, "grad_norm": 12.962300300598145, "learning_rate": 4.724636958711051e-05, "loss": 0.7189, "step": 275100 }, { "epoch": 3.791573668402634, "grad_norm": 5.601036071777344, "learning_rate": 4.7240790067268236e-05, "loss": 0.8333, "step": 275200 }, { "epoch": 3.7929514204623738, "grad_norm": 16.954980850219727, "learning_rate": 4.723520887034485e-05, "loss": 0.7332, "step": 275300 }, { "epoch": 3.794329172522113, "grad_norm": 6.905393123626709, "learning_rate": 4.7229625996814516e-05, "loss": 0.7401, "step": 275400 }, { "epoch": 3.795706924581852, "grad_norm": 36.58599853515625, "learning_rate": 4.722404144715151e-05, "loss": 0.7407, "step": 275500 }, { "epoch": 3.7970846766415915, "grad_norm": 20.370412826538086, "learning_rate": 4.721845522183028e-05, "loss": 0.8583, "step": 275600 }, { "epoch": 3.798462428701331, "grad_norm": 80.74333190917969, "learning_rate": 4.72128673213254e-05, "loss": 0.8026, "step": 275700 }, { "epoch": 3.79984018076107, "grad_norm": 9.61959457397461, "learning_rate": 4.7207277746111575e-05, "loss": 0.7635, "step": 275800 }, { "epoch": 3.8012179328208093, "grad_norm": 21.183671951293945, "learning_rate": 4.7201686496663705e-05, "loss": 0.7851, "step": 275900 }, { "epoch": 3.802595684880549, "grad_norm": 195.9647216796875, "learning_rate": 4.719609357345677e-05, "loss": 0.787, "step": 276000 }, { "epoch": 3.8039734369402884, "grad_norm": 3.6417524814605713, "learning_rate": 
4.7190498976965914e-05, "loss": 0.7718, "step": 276100 }, { "epoch": 3.8053511890000276, "grad_norm": 11.64912223815918, "learning_rate": 4.718490270766643e-05, "loss": 0.7545, "step": 276200 }, { "epoch": 3.8067289410597667, "grad_norm": 10.296647071838379, "learning_rate": 4.717930476603377e-05, "loss": 0.8476, "step": 276300 }, { "epoch": 3.8081066931195062, "grad_norm": 11.872393608093262, "learning_rate": 4.717370515254348e-05, "loss": 0.8365, "step": 276400 }, { "epoch": 3.809484445179246, "grad_norm": 23.96843719482422, "learning_rate": 4.7168103867671286e-05, "loss": 0.8164, "step": 276500 }, { "epoch": 3.810862197238985, "grad_norm": 11.160082817077637, "learning_rate": 4.716255694972026e-05, "loss": 0.7699, "step": 276600 }, { "epoch": 3.812239949298724, "grad_norm": 12.165009498596191, "learning_rate": 4.715695234021391e-05, "loss": 0.7202, "step": 276700 }, { "epoch": 3.8136177013584636, "grad_norm": 9.358080863952637, "learning_rate": 4.7151346060748914e-05, "loss": 0.8503, "step": 276800 }, { "epoch": 3.814995453418203, "grad_norm": 35.57636642456055, "learning_rate": 4.7145738111801534e-05, "loss": 0.8482, "step": 276900 }, { "epoch": 3.8163732054779422, "grad_norm": 12.582289695739746, "learning_rate": 4.7140128493848183e-05, "loss": 0.8424, "step": 277000 }, { "epoch": 3.8177509575376813, "grad_norm": 19.38596534729004, "learning_rate": 4.713451720736544e-05, "loss": 0.7397, "step": 277100 }, { "epoch": 3.819128709597421, "grad_norm": 3.3069958686828613, "learning_rate": 4.712896039063067e-05, "loss": 0.8366, "step": 277200 }, { "epoch": 3.82050646165716, "grad_norm": 10.138157844543457, "learning_rate": 4.7123345785192806e-05, "loss": 0.823, "step": 277300 }, { "epoch": 3.8218842137168996, "grad_norm": 2.509814739227295, "learning_rate": 4.711772951265132e-05, "loss": 0.703, "step": 277400 }, { "epoch": 3.8232619657766387, "grad_norm": 6.17867374420166, "learning_rate": 4.7112111573483355e-05, "loss": 0.8611, "step": 277500 }, { "epoch": 
3.8246397178363782, "grad_norm": 5.795092582702637, "learning_rate": 4.710649196816617e-05, "loss": 0.8168, "step": 277600 }, { "epoch": 3.8260174698961174, "grad_norm": 46.700477600097656, "learning_rate": 4.71008706971772e-05, "loss": 0.874, "step": 277700 }, { "epoch": 3.827395221955857, "grad_norm": 31.78653907775879, "learning_rate": 4.7095247760993974e-05, "loss": 0.7207, "step": 277800 }, { "epoch": 3.828772974015596, "grad_norm": 28.1950740814209, "learning_rate": 4.70896231600942e-05, "loss": 0.7582, "step": 277900 }, { "epoch": 3.8301507260753356, "grad_norm": 8.862968444824219, "learning_rate": 4.7083996894955734e-05, "loss": 0.7385, "step": 278000 }, { "epoch": 3.8315284781350747, "grad_norm": 9.290912628173828, "learning_rate": 4.707836896605653e-05, "loss": 0.7466, "step": 278100 }, { "epoch": 3.8329062301948142, "grad_norm": 6.6076340675354, "learning_rate": 4.707273937387472e-05, "loss": 0.8101, "step": 278200 }, { "epoch": 3.8342839822545534, "grad_norm": 6.146384239196777, "learning_rate": 4.7067108118888566e-05, "loss": 0.8619, "step": 278300 }, { "epoch": 3.835661734314293, "grad_norm": 4.0692572593688965, "learning_rate": 4.7061475201576475e-05, "loss": 0.7975, "step": 278400 }, { "epoch": 3.837039486374032, "grad_norm": 5.783301830291748, "learning_rate": 4.705584062241699e-05, "loss": 0.8069, "step": 278500 }, { "epoch": 3.8384172384337716, "grad_norm": 24.528738021850586, "learning_rate": 4.705020438188879e-05, "loss": 0.8094, "step": 278600 }, { "epoch": 3.8397949904935107, "grad_norm": 16.147613525390625, "learning_rate": 4.704456648047072e-05, "loss": 0.8363, "step": 278700 }, { "epoch": 3.8411727425532503, "grad_norm": 10.864917755126953, "learning_rate": 4.7038926918641735e-05, "loss": 0.7708, "step": 278800 }, { "epoch": 3.8425504946129894, "grad_norm": 2.2151923179626465, "learning_rate": 4.703328569688094e-05, "loss": 0.7813, "step": 278900 }, { "epoch": 3.843928246672729, "grad_norm": 10.98679256439209, "learning_rate": 
4.702764281566761e-05, "loss": 0.8556, "step": 279000 }, { "epoch": 3.845305998732468, "grad_norm": 6.018759727478027, "learning_rate": 4.702199827548111e-05, "loss": 0.7443, "step": 279100 }, { "epoch": 3.8466837507922076, "grad_norm": 3.0387134552001953, "learning_rate": 4.701635207680098e-05, "loss": 0.6786, "step": 279200 }, { "epoch": 3.8480615028519467, "grad_norm": 9.06083869934082, "learning_rate": 4.7010704220106896e-05, "loss": 0.808, "step": 279300 }, { "epoch": 3.8494392549116863, "grad_norm": 3.589306592941284, "learning_rate": 4.700505470587868e-05, "loss": 0.7647, "step": 279400 }, { "epoch": 3.8508170069714254, "grad_norm": 7.072883605957031, "learning_rate": 4.699940353459628e-05, "loss": 0.7335, "step": 279500 }, { "epoch": 3.852194759031165, "grad_norm": 11.459481239318848, "learning_rate": 4.699375070673978e-05, "loss": 0.7742, "step": 279600 }, { "epoch": 3.853572511090904, "grad_norm": 7.030880451202393, "learning_rate": 4.698809622278943e-05, "loss": 0.8076, "step": 279700 }, { "epoch": 3.854950263150643, "grad_norm": 109.31099700927734, "learning_rate": 4.698249665281496e-05, "loss": 0.6946, "step": 279800 }, { "epoch": 3.8563280152103827, "grad_norm": 3.794438600540161, "learning_rate": 4.697683887466713e-05, "loss": 0.7673, "step": 279900 }, { "epoch": 3.8577057672701223, "grad_norm": 4.457272529602051, "learning_rate": 4.69711794418622e-05, "loss": 0.747, "step": 280000 }, { "epoch": 3.8590835193298614, "grad_norm": 3.3575732707977295, "learning_rate": 4.6965518354880966e-05, "loss": 0.8331, "step": 280100 }, { "epoch": 3.8604612713896005, "grad_norm": 7.636532783508301, "learning_rate": 4.6959855614204354e-05, "loss": 0.7142, "step": 280200 }, { "epoch": 3.86183902344934, "grad_norm": 13.00849723815918, "learning_rate": 4.695419122031346e-05, "loss": 0.8094, "step": 280300 }, { "epoch": 3.8632167755090796, "grad_norm": 5.083268165588379, "learning_rate": 4.694852517368949e-05, "loss": 0.7171, "step": 280400 }, { "epoch": 
3.8645945275688187, "grad_norm": 13.605998039245605, "learning_rate": 4.69428574748138e-05, "loss": 0.8005, "step": 280500 }, { "epoch": 3.865972279628558, "grad_norm": 10.626866340637207, "learning_rate": 4.693718812416792e-05, "loss": 0.7407, "step": 280600 }, { "epoch": 3.8673500316882974, "grad_norm": 5.20609712600708, "learning_rate": 4.693151712223346e-05, "loss": 0.7583, "step": 280700 }, { "epoch": 3.868727783748037, "grad_norm": 12.129583358764648, "learning_rate": 4.69258444694922e-05, "loss": 0.6739, "step": 280800 }, { "epoch": 3.870105535807776, "grad_norm": 15.339916229248047, "learning_rate": 4.692017016642607e-05, "loss": 0.6958, "step": 280900 }, { "epoch": 3.871483287867515, "grad_norm": 27.290355682373047, "learning_rate": 4.691449421351715e-05, "loss": 0.756, "step": 281000 }, { "epoch": 3.8728610399272547, "grad_norm": 15.555962562561035, "learning_rate": 4.690881661124761e-05, "loss": 0.6909, "step": 281100 }, { "epoch": 3.8742387919869943, "grad_norm": 3.7568814754486084, "learning_rate": 4.690313736009979e-05, "loss": 0.7812, "step": 281200 }, { "epoch": 3.8756165440467334, "grad_norm": 4.715430736541748, "learning_rate": 4.6897456460556204e-05, "loss": 0.7335, "step": 281300 }, { "epoch": 3.8769942961064725, "grad_norm": 4.514675617218018, "learning_rate": 4.6891773913099454e-05, "loss": 0.688, "step": 281400 }, { "epoch": 3.878372048166212, "grad_norm": 3.296437978744507, "learning_rate": 4.6886089718212295e-05, "loss": 0.7317, "step": 281500 }, { "epoch": 3.879749800225951, "grad_norm": 11.520378112792969, "learning_rate": 4.6880403876377646e-05, "loss": 0.8156, "step": 281600 }, { "epoch": 3.8811275522856907, "grad_norm": 12.717551231384277, "learning_rate": 4.687471638807853e-05, "loss": 0.8177, "step": 281700 }, { "epoch": 3.88250530434543, "grad_norm": 8.70654010772705, "learning_rate": 4.686902725379814e-05, "loss": 0.8383, "step": 281800 }, { "epoch": 3.8838830564051694, "grad_norm": 26.836902618408203, "learning_rate": 
4.686333647401979e-05, "loss": 0.8805, "step": 281900 }, { "epoch": 3.8852608084649085, "grad_norm": 18.273624420166016, "learning_rate": 4.685764404922695e-05, "loss": 0.8539, "step": 282000 }, { "epoch": 3.886638560524648, "grad_norm": 16.71436309814453, "learning_rate": 4.685194997990321e-05, "loss": 0.6822, "step": 282100 }, { "epoch": 3.888016312584387, "grad_norm": 13.167102813720703, "learning_rate": 4.684625426653232e-05, "loss": 0.8249, "step": 282200 }, { "epoch": 3.8893940646441267, "grad_norm": 9.252867698669434, "learning_rate": 4.684055690959815e-05, "loss": 0.7913, "step": 282300 }, { "epoch": 3.890771816703866, "grad_norm": 18.08372688293457, "learning_rate": 4.683485790958472e-05, "loss": 0.735, "step": 282400 }, { "epoch": 3.8921495687636054, "grad_norm": 22.017805099487305, "learning_rate": 4.682915726697621e-05, "loss": 0.6777, "step": 282500 }, { "epoch": 3.8935273208233445, "grad_norm": 7.59044075012207, "learning_rate": 4.6823454982256896e-05, "loss": 0.7403, "step": 282600 }, { "epoch": 3.894905072883084, "grad_norm": 19.335826873779297, "learning_rate": 4.681775105591122e-05, "loss": 0.8477, "step": 282700 }, { "epoch": 3.896282824942823, "grad_norm": 3.237865924835205, "learning_rate": 4.681204548842376e-05, "loss": 0.7686, "step": 282800 }, { "epoch": 3.8976605770025627, "grad_norm": 3.7802419662475586, "learning_rate": 4.680633828027924e-05, "loss": 0.7711, "step": 282900 }, { "epoch": 3.899038329062302, "grad_norm": 9.453742027282715, "learning_rate": 4.680062943196252e-05, "loss": 0.7047, "step": 283000 }, { "epoch": 3.9004160811220414, "grad_norm": 7.233980655670166, "learning_rate": 4.679491894395857e-05, "loss": 0.6415, "step": 283100 }, { "epoch": 3.9017938331817805, "grad_norm": 3.4469568729400635, "learning_rate": 4.678920681675256e-05, "loss": 0.6709, "step": 283200 }, { "epoch": 3.90317158524152, "grad_norm": 4.129087924957275, "learning_rate": 4.678349305082975e-05, "loss": 0.7867, "step": 283300 }, { "epoch": 
3.904549337301259, "grad_norm": 8.282999992370605, "learning_rate": 4.677777764667554e-05, "loss": 0.8287, "step": 283400 }, { "epoch": 3.9059270893609987, "grad_norm": 12.684904098510742, "learning_rate": 4.67720606047755e-05, "loss": 0.751, "step": 283500 }, { "epoch": 3.907304841420738, "grad_norm": 1.003591775894165, "learning_rate": 4.6766341925615316e-05, "loss": 0.8116, "step": 283600 }, { "epoch": 3.9086825934804774, "grad_norm": 6.440825939178467, "learning_rate": 4.676062160968082e-05, "loss": 0.7158, "step": 283700 }, { "epoch": 3.9100603455402165, "grad_norm": 2.9218430519104004, "learning_rate": 4.6754899657457974e-05, "loss": 0.7279, "step": 283800 }, { "epoch": 3.911438097599956, "grad_norm": 8.239198684692383, "learning_rate": 4.674917606943291e-05, "loss": 0.7192, "step": 283900 }, { "epoch": 3.912815849659695, "grad_norm": 5.853105545043945, "learning_rate": 4.674345084609184e-05, "loss": 0.8082, "step": 284000 }, { "epoch": 3.9141936017194343, "grad_norm": 11.474648475646973, "learning_rate": 4.6737723987921185e-05, "loss": 0.7637, "step": 284100 }, { "epoch": 3.915571353779174, "grad_norm": 16.13724136352539, "learning_rate": 4.673199549540746e-05, "loss": 0.7011, "step": 284200 }, { "epoch": 3.9169491058389134, "grad_norm": 8.205422401428223, "learning_rate": 4.6726265369037325e-05, "loss": 0.7268, "step": 284300 }, { "epoch": 3.9183268578986525, "grad_norm": 11.68458080291748, "learning_rate": 4.6720533609297583e-05, "loss": 0.7169, "step": 284400 }, { "epoch": 3.9197046099583917, "grad_norm": 12.15461254119873, "learning_rate": 4.671480021667518e-05, "loss": 0.7506, "step": 284500 }, { "epoch": 3.921082362018131, "grad_norm": 14.598043441772461, "learning_rate": 4.67090651916572e-05, "loss": 0.717, "step": 284600 }, { "epoch": 3.9224601140778708, "grad_norm": 35.984710693359375, "learning_rate": 4.6703328534730857e-05, "loss": 0.7073, "step": 284700 }, { "epoch": 3.92383786613761, "grad_norm": 15.192087173461914, "learning_rate": 
4.6697590246383505e-05, "loss": 0.727, "step": 284800 }, { "epoch": 3.925215618197349, "grad_norm": 12.10468578338623, "learning_rate": 4.669185032710266e-05, "loss": 0.7586, "step": 284900 }, { "epoch": 3.9265933702570885, "grad_norm": 17.737178802490234, "learning_rate": 4.668610877737594e-05, "loss": 0.9379, "step": 285000 }, { "epoch": 3.927971122316828, "grad_norm": 29.761802673339844, "learning_rate": 4.668036559769112e-05, "loss": 0.7204, "step": 285100 }, { "epoch": 3.929348874376567, "grad_norm": 18.861635208129883, "learning_rate": 4.667462078853611e-05, "loss": 0.688, "step": 285200 }, { "epoch": 3.9307266264363063, "grad_norm": 114.94448852539062, "learning_rate": 4.666887435039898e-05, "loss": 0.7467, "step": 285300 }, { "epoch": 3.932104378496046, "grad_norm": 6.358266830444336, "learning_rate": 4.66631262837679e-05, "loss": 0.7498, "step": 285400 }, { "epoch": 3.9334821305557854, "grad_norm": 6.687554836273193, "learning_rate": 4.665737658913121e-05, "loss": 0.723, "step": 285500 }, { "epoch": 3.9348598826155246, "grad_norm": 5.466715335845947, "learning_rate": 4.6651625266977366e-05, "loss": 0.7884, "step": 285600 }, { "epoch": 3.9362376346752637, "grad_norm": 30.31229591369629, "learning_rate": 4.664587231779498e-05, "loss": 0.7059, "step": 285700 }, { "epoch": 3.937615386735003, "grad_norm": 12.738252639770508, "learning_rate": 4.6640117742072786e-05, "loss": 0.7394, "step": 285800 }, { "epoch": 3.9389931387947423, "grad_norm": 14.36909294128418, "learning_rate": 4.663436154029967e-05, "loss": 0.7453, "step": 285900 }, { "epoch": 3.940370890854482, "grad_norm": 6.872807025909424, "learning_rate": 4.662860371296466e-05, "loss": 0.6916, "step": 286000 }, { "epoch": 3.941748642914221, "grad_norm": 9.961835861206055, "learning_rate": 4.6622844260556886e-05, "loss": 0.7001, "step": 286100 }, { "epoch": 3.9431263949739606, "grad_norm": 19.924083709716797, "learning_rate": 4.661708318356567e-05, "loss": 0.8202, "step": 286200 }, { "epoch": 
3.9445041470336997, "grad_norm": 18.48103141784668, "learning_rate": 4.6611320482480436e-05, "loss": 0.755, "step": 286300 }, { "epoch": 3.9458818990934392, "grad_norm": 7.876520156860352, "learning_rate": 4.660555615779075e-05, "loss": 0.8122, "step": 286400 }, { "epoch": 3.9472596511531783, "grad_norm": 23.327024459838867, "learning_rate": 4.659979020998634e-05, "loss": 0.6568, "step": 286500 }, { "epoch": 3.948637403212918, "grad_norm": 2.7605140209198, "learning_rate": 4.659402263955702e-05, "loss": 0.8279, "step": 286600 }, { "epoch": 3.950015155272657, "grad_norm": 11.242687225341797, "learning_rate": 4.65882534469928e-05, "loss": 0.7345, "step": 286700 }, { "epoch": 3.9513929073323966, "grad_norm": 4.0618438720703125, "learning_rate": 4.65824826327838e-05, "loss": 0.6334, "step": 286800 }, { "epoch": 3.9527706593921357, "grad_norm": 29.51956558227539, "learning_rate": 4.6576710197420264e-05, "loss": 0.6697, "step": 286900 }, { "epoch": 3.9541484114518752, "grad_norm": 3.382915735244751, "learning_rate": 4.6570936141392604e-05, "loss": 0.8269, "step": 287000 }, { "epoch": 3.9555261635116143, "grad_norm": 1.3904200792312622, "learning_rate": 4.656516046519135e-05, "loss": 0.7169, "step": 287100 }, { "epoch": 3.956903915571354, "grad_norm": 4.980300426483154, "learning_rate": 4.655938316930719e-05, "loss": 0.7243, "step": 287200 }, { "epoch": 3.958281667631093, "grad_norm": 11.517420768737793, "learning_rate": 4.6553604254230914e-05, "loss": 0.7269, "step": 287300 }, { "epoch": 3.9596594196908326, "grad_norm": 6.000971794128418, "learning_rate": 4.654782372045348e-05, "loss": 0.7721, "step": 287400 }, { "epoch": 3.9610371717505717, "grad_norm": 2.6846120357513428, "learning_rate": 4.654204156846597e-05, "loss": 0.6701, "step": 287500 }, { "epoch": 3.9624149238103112, "grad_norm": 7.725025653839111, "learning_rate": 4.653625779875961e-05, "loss": 0.614, "step": 287600 }, { "epoch": 3.9637926758700504, "grad_norm": 6.2518205642700195, "learning_rate": 
4.653047241182576e-05, "loss": 0.7316, "step": 287700 }, { "epoch": 3.96517042792979, "grad_norm": 31.462007522583008, "learning_rate": 4.652468540815593e-05, "loss": 0.7132, "step": 287800 }, { "epoch": 3.966548179989529, "grad_norm": 5.510139465332031, "learning_rate": 4.651889678824173e-05, "loss": 0.7492, "step": 287900 }, { "epoch": 3.9679259320492686, "grad_norm": 2.874608278274536, "learning_rate": 4.6513106552574964e-05, "loss": 0.6882, "step": 288000 }, { "epoch": 3.9693036841090077, "grad_norm": 85.63662719726562, "learning_rate": 4.650731470164752e-05, "loss": 0.7373, "step": 288100 }, { "epoch": 3.9706814361687472, "grad_norm": 9.946539878845215, "learning_rate": 4.650152123595144e-05, "loss": 0.7325, "step": 288200 }, { "epoch": 3.9720591882284864, "grad_norm": 18.722869873046875, "learning_rate": 4.6495726155978936e-05, "loss": 0.7257, "step": 288300 }, { "epoch": 3.9734369402882255, "grad_norm": 7.619026184082031, "learning_rate": 4.64899294622223e-05, "loss": 0.7141, "step": 288400 }, { "epoch": 3.974814692347965, "grad_norm": 10.969167709350586, "learning_rate": 4.648413115517401e-05, "loss": 0.7081, "step": 288500 }, { "epoch": 3.9761924444077046, "grad_norm": 11.60225772857666, "learning_rate": 4.647833123532667e-05, "loss": 0.6833, "step": 288600 }, { "epoch": 3.9775701964674437, "grad_norm": 6.163235664367676, "learning_rate": 4.647252970317297e-05, "loss": 0.6646, "step": 288700 }, { "epoch": 3.978947948527183, "grad_norm": 9.643515586853027, "learning_rate": 4.646672655920583e-05, "loss": 0.6888, "step": 288800 }, { "epoch": 3.9803257005869224, "grad_norm": 3.917431354522705, "learning_rate": 4.646092180391824e-05, "loss": 0.685, "step": 288900 }, { "epoch": 3.981703452646662, "grad_norm": 9.85342788696289, "learning_rate": 4.645511543780333e-05, "loss": 0.6969, "step": 289000 }, { "epoch": 3.983081204706401, "grad_norm": 9.19890308380127, "learning_rate": 4.644930746135438e-05, "loss": 0.7208, "step": 289100 }, { "epoch": 3.98445895676614, 
"grad_norm": 17.32145118713379, "learning_rate": 4.644349787506483e-05, "loss": 0.7199, "step": 289200 }, { "epoch": 3.9858367088258797, "grad_norm": 6.630151271820068, "learning_rate": 4.643768667942821e-05, "loss": 0.7026, "step": 289300 }, { "epoch": 3.9872144608856193, "grad_norm": 3.8577563762664795, "learning_rate": 4.6431873874938235e-05, "loss": 0.7031, "step": 289400 }, { "epoch": 3.9885922129453584, "grad_norm": 5.545560836791992, "learning_rate": 4.642611761417697e-05, "loss": 0.7269, "step": 289500 }, { "epoch": 3.9899699650050975, "grad_norm": 6.154033660888672, "learning_rate": 4.6420301609538085e-05, "loss": 0.7344, "step": 289600 }, { "epoch": 3.991347717064837, "grad_norm": 17.962711334228516, "learning_rate": 4.6414483997522785e-05, "loss": 0.6977, "step": 289700 }, { "epoch": 3.9927254691245766, "grad_norm": 12.827983856201172, "learning_rate": 4.6408664778625296e-05, "loss": 0.7511, "step": 289800 }, { "epoch": 3.9941032211843157, "grad_norm": 9.863675117492676, "learning_rate": 4.640284395334e-05, "loss": 0.6639, "step": 289900 }, { "epoch": 3.995480973244055, "grad_norm": 1.7848522663116455, "learning_rate": 4.639702152216141e-05, "loss": 0.7221, "step": 290000 }, { "epoch": 3.9968587253037944, "grad_norm": 9.894287109375, "learning_rate": 4.6391197485584164e-05, "loss": 0.6245, "step": 290100 }, { "epoch": 3.9982364773635335, "grad_norm": 3.5770010948181152, "learning_rate": 4.638537184410304e-05, "loss": 0.7844, "step": 290200 }, { "epoch": 3.999614229423273, "grad_norm": 5.768791198730469, "learning_rate": 4.637954459821296e-05, "loss": 0.6162, "step": 290300 }, { "epoch": 4.000991981483012, "grad_norm": 2.224738836288452, "learning_rate": 4.637371574840898e-05, "loss": 0.6058, "step": 290400 }, { "epoch": 4.002369733542752, "grad_norm": 3.0572924613952637, "learning_rate": 4.636788529518629e-05, "loss": 0.6076, "step": 290500 }, { "epoch": 4.003747485602491, "grad_norm": 2.082035541534424, "learning_rate": 4.6362053239040225e-05, "loss": 
0.6615, "step": 290600 }, { "epoch": 4.00512523766223, "grad_norm": 18.01354217529297, "learning_rate": 4.635621958046623e-05, "loss": 0.6938, "step": 290700 }, { "epoch": 4.0065029897219695, "grad_norm": 5.204951286315918, "learning_rate": 4.635038431995992e-05, "loss": 0.7008, "step": 290800 }, { "epoch": 4.007880741781709, "grad_norm": 12.204028129577637, "learning_rate": 4.634454745801702e-05, "loss": 0.6751, "step": 290900 }, { "epoch": 4.009258493841449, "grad_norm": 17.813941955566406, "learning_rate": 4.6338708995133405e-05, "loss": 0.6748, "step": 291000 }, { "epoch": 4.010636245901187, "grad_norm": 8.7006196975708, "learning_rate": 4.6332868931805086e-05, "loss": 0.6319, "step": 291100 }, { "epoch": 4.012013997960927, "grad_norm": 9.431085586547852, "learning_rate": 4.632702726852821e-05, "loss": 0.6698, "step": 291200 }, { "epoch": 4.013391750020666, "grad_norm": 3.8761565685272217, "learning_rate": 4.632118400579903e-05, "loss": 0.6349, "step": 291300 }, { "epoch": 4.014769502080406, "grad_norm": 6.5067338943481445, "learning_rate": 4.6315339144113996e-05, "loss": 0.6412, "step": 291400 }, { "epoch": 4.016147254140145, "grad_norm": 3.528282403945923, "learning_rate": 4.630949268396964e-05, "loss": 0.6492, "step": 291500 }, { "epoch": 4.017525006199884, "grad_norm": 10.56949520111084, "learning_rate": 4.630364462586265e-05, "loss": 0.7175, "step": 291600 }, { "epoch": 4.018902758259624, "grad_norm": 8.316420555114746, "learning_rate": 4.629779497028985e-05, "loss": 0.6765, "step": 291700 }, { "epoch": 4.020280510319363, "grad_norm": 4.828928470611572, "learning_rate": 4.629194371774819e-05, "loss": 0.6435, "step": 291800 }, { "epoch": 4.021658262379102, "grad_norm": 14.561111450195312, "learning_rate": 4.628609086873478e-05, "loss": 0.6342, "step": 291900 }, { "epoch": 4.0230360144388415, "grad_norm": 2.1733908653259277, "learning_rate": 4.628023642374684e-05, "loss": 0.6746, "step": 292000 }, { "epoch": 4.024413766498581, "grad_norm": 38.17127227783203, 
"learning_rate": 4.627438038328174e-05, "loss": 0.6031, "step": 292100 }, { "epoch": 4.025791518558321, "grad_norm": 151.87948608398438, "learning_rate": 4.6268522747836986e-05, "loss": 0.7576, "step": 292200 }, { "epoch": 4.027169270618059, "grad_norm": 6.559102535247803, "learning_rate": 4.626266351791019e-05, "loss": 0.6567, "step": 292300 }, { "epoch": 4.028547022677799, "grad_norm": 3.6919069290161133, "learning_rate": 4.6256802693999145e-05, "loss": 0.6294, "step": 292400 }, { "epoch": 4.029924774737538, "grad_norm": 6.815934658050537, "learning_rate": 4.625094027660175e-05, "loss": 0.7439, "step": 292500 }, { "epoch": 4.031302526797278, "grad_norm": 2.451958417892456, "learning_rate": 4.6245076266216055e-05, "loss": 0.7037, "step": 292600 }, { "epoch": 4.032680278857017, "grad_norm": 12.651769638061523, "learning_rate": 4.623921066334022e-05, "loss": 0.7634, "step": 292700 }, { "epoch": 4.034058030916756, "grad_norm": 3.5858993530273438, "learning_rate": 4.6233343468472586e-05, "loss": 0.7031, "step": 292800 }, { "epoch": 4.035435782976496, "grad_norm": 37.72840118408203, "learning_rate": 4.622747468211157e-05, "loss": 0.5645, "step": 292900 }, { "epoch": 4.036813535036235, "grad_norm": 8.41267204284668, "learning_rate": 4.622160430475579e-05, "loss": 0.5949, "step": 293000 }, { "epoch": 4.038191287095974, "grad_norm": 22.63103675842285, "learning_rate": 4.621579106445377e-05, "loss": 0.6729, "step": 293100 }, { "epoch": 4.0395690391557135, "grad_norm": 17.30426788330078, "learning_rate": 4.620991752250219e-05, "loss": 0.6312, "step": 293200 }, { "epoch": 4.040946791215453, "grad_norm": 10.727584838867188, "learning_rate": 4.620404239104742e-05, "loss": 0.5985, "step": 293300 }, { "epoch": 4.042324543275193, "grad_norm": 10.214902877807617, "learning_rate": 4.619816567058855e-05, "loss": 0.6537, "step": 293400 }, { "epoch": 4.043702295334931, "grad_norm": 7.953614234924316, "learning_rate": 4.6192287361624846e-05, "loss": 0.7053, "step": 293500 }, { "epoch": 
4.045080047394671, "grad_norm": 9.201581001281738, "learning_rate": 4.6186407464655706e-05, "loss": 0.6549, "step": 293600 }, { "epoch": 4.04645779945441, "grad_norm": 5.408073902130127, "learning_rate": 4.6180525980180656e-05, "loss": 0.6672, "step": 293700 }, { "epoch": 4.047835551514149, "grad_norm": 14.326773643493652, "learning_rate": 4.617464290869934e-05, "loss": 0.5852, "step": 293800 }, { "epoch": 4.049213303573889, "grad_norm": 4.585080146789551, "learning_rate": 4.6168758250711584e-05, "loss": 0.6757, "step": 293900 }, { "epoch": 4.050591055633628, "grad_norm": 7.5951080322265625, "learning_rate": 4.6162872006717305e-05, "loss": 0.6505, "step": 294000 }, { "epoch": 4.051968807693368, "grad_norm": 6.346579074859619, "learning_rate": 4.615698417721657e-05, "loss": 0.6555, "step": 294100 }, { "epoch": 4.053346559753106, "grad_norm": 14.753069877624512, "learning_rate": 4.6151094762709573e-05, "loss": 0.6137, "step": 294200 }, { "epoch": 4.054724311812846, "grad_norm": 5.630283355712891, "learning_rate": 4.6145203763696664e-05, "loss": 0.7107, "step": 294300 }, { "epoch": 4.0561020638725855, "grad_norm": 6.181244850158691, "learning_rate": 4.6139311180678305e-05, "loss": 0.7511, "step": 294400 }, { "epoch": 4.057479815932325, "grad_norm": 9.801922798156738, "learning_rate": 4.61334170141551e-05, "loss": 0.6808, "step": 294500 }, { "epoch": 4.058857567992064, "grad_norm": 23.463417053222656, "learning_rate": 4.6127521264627796e-05, "loss": 0.7575, "step": 294600 }, { "epoch": 4.060235320051803, "grad_norm": 13.619138717651367, "learning_rate": 4.6121623932597266e-05, "loss": 0.6982, "step": 294700 }, { "epoch": 4.061613072111543, "grad_norm": 1.987518548965454, "learning_rate": 4.611572501856451e-05, "loss": 0.7263, "step": 294800 }, { "epoch": 4.062990824171282, "grad_norm": 8.392542839050293, "learning_rate": 4.610982452303068e-05, "loss": 0.6535, "step": 294900 }, { "epoch": 4.064368576231021, "grad_norm": 14.59041976928711, "learning_rate": 
4.6103922446497044e-05, "loss": 0.6871, "step": 295000 }, { "epoch": 4.065746328290761, "grad_norm": 2.0860438346862793, "learning_rate": 4.6098018789465025e-05, "loss": 0.6721, "step": 295100 }, { "epoch": 4.0671240803505, "grad_norm": 8.673189163208008, "learning_rate": 4.6092113552436156e-05, "loss": 0.7174, "step": 295200 }, { "epoch": 4.06850183241024, "grad_norm": 2.694934368133545, "learning_rate": 4.6086206735912134e-05, "loss": 0.7058, "step": 295300 }, { "epoch": 4.069879584469978, "grad_norm": 22.90232276916504, "learning_rate": 4.608029834039475e-05, "loss": 0.6488, "step": 295400 }, { "epoch": 4.071257336529718, "grad_norm": 4.197026252746582, "learning_rate": 4.607438836638598e-05, "loss": 0.7081, "step": 295500 }, { "epoch": 4.0726350885894576, "grad_norm": 5.113185882568359, "learning_rate": 4.6068476814387886e-05, "loss": 0.5575, "step": 295600 }, { "epoch": 4.074012840649197, "grad_norm": 29.31284523010254, "learning_rate": 4.606256368490269e-05, "loss": 0.6179, "step": 295700 }, { "epoch": 4.075390592708936, "grad_norm": 3.3541550636291504, "learning_rate": 4.605664897843274e-05, "loss": 0.6748, "step": 295800 }, { "epoch": 4.076768344768675, "grad_norm": 21.39793586730957, "learning_rate": 4.6050732695480535e-05, "loss": 0.6716, "step": 295900 }, { "epoch": 4.078146096828415, "grad_norm": 3.5587472915649414, "learning_rate": 4.604481483654867e-05, "loss": 0.6151, "step": 296000 }, { "epoch": 4.0795238488881544, "grad_norm": 19.50777816772461, "learning_rate": 4.603889540213993e-05, "loss": 0.6334, "step": 296100 }, { "epoch": 4.080901600947893, "grad_norm": 3.8695263862609863, "learning_rate": 4.603297439275716e-05, "loss": 0.6793, "step": 296200 }, { "epoch": 4.082279353007633, "grad_norm": 10.028346061706543, "learning_rate": 4.602705180890341e-05, "loss": 0.723, "step": 296300 }, { "epoch": 4.083657105067372, "grad_norm": 9.337726593017578, "learning_rate": 4.602112765108182e-05, "loss": 0.6028, "step": 296400 }, { "epoch": 4.085034857127112, 
"grad_norm": 1.486716628074646, "learning_rate": 4.601526118489555e-05, "loss": 0.6733, "step": 296500 }, { "epoch": 4.0864126091868505, "grad_norm": 228.84585571289062, "learning_rate": 4.6009333896375405e-05, "loss": 0.6992, "step": 296600 }, { "epoch": 4.08779036124659, "grad_norm": 10.276219367980957, "learning_rate": 4.6003405035392656e-05, "loss": 0.619, "step": 296700 }, { "epoch": 4.08916811330633, "grad_norm": 7.710974216461182, "learning_rate": 4.599747460245098e-05, "loss": 0.6592, "step": 296800 }, { "epoch": 4.090545865366069, "grad_norm": 8.020652770996094, "learning_rate": 4.599154259805422e-05, "loss": 0.7993, "step": 296900 }, { "epoch": 4.091923617425808, "grad_norm": 3.6239333152770996, "learning_rate": 4.59856090227063e-05, "loss": 0.599, "step": 297000 }, { "epoch": 4.093301369485547, "grad_norm": 8.320853233337402, "learning_rate": 4.597967387691133e-05, "loss": 0.6745, "step": 297100 }, { "epoch": 4.094679121545287, "grad_norm": 4.688587665557861, "learning_rate": 4.5973737161173515e-05, "loss": 0.7208, "step": 297200 }, { "epoch": 4.0960568736050265, "grad_norm": 9.391998291015625, "learning_rate": 4.5967798875997224e-05, "loss": 0.7009, "step": 297300 }, { "epoch": 4.097434625664765, "grad_norm": 8.450928688049316, "learning_rate": 4.596185902188694e-05, "loss": 0.7006, "step": 297400 }, { "epoch": 4.098812377724505, "grad_norm": 17.44727325439453, "learning_rate": 4.595591759934728e-05, "loss": 0.6514, "step": 297500 }, { "epoch": 4.100190129784244, "grad_norm": 18.792360305786133, "learning_rate": 4.5949974608882994e-05, "loss": 0.6217, "step": 297600 }, { "epoch": 4.101567881843984, "grad_norm": 35.528717041015625, "learning_rate": 4.594403005099898e-05, "loss": 0.7896, "step": 297700 }, { "epoch": 4.1029456339037225, "grad_norm": 45.263118743896484, "learning_rate": 4.593808392620025e-05, "loss": 0.721, "step": 297800 }, { "epoch": 4.104323385963462, "grad_norm": 8.396429061889648, "learning_rate": 4.593213623499196e-05, "loss": 0.6781, 
"step": 297900 }, { "epoch": 4.105701138023202, "grad_norm": 6.4494147300720215, "learning_rate": 4.5926186977879415e-05, "loss": 0.6052, "step": 298000 }, { "epoch": 4.10707889008294, "grad_norm": 4.370274543762207, "learning_rate": 4.5920236155368e-05, "loss": 0.6555, "step": 298100 }, { "epoch": 4.10845664214268, "grad_norm": 5.55239200592041, "learning_rate": 4.59142837679633e-05, "loss": 0.6822, "step": 298200 }, { "epoch": 4.109834394202419, "grad_norm": 3.1298775672912598, "learning_rate": 4.590832981617098e-05, "loss": 0.6552, "step": 298300 }, { "epoch": 4.111212146262159, "grad_norm": 10.128996849060059, "learning_rate": 4.590237430049687e-05, "loss": 0.666, "step": 298400 }, { "epoch": 4.112589898321898, "grad_norm": 5.206035614013672, "learning_rate": 4.589641722144691e-05, "loss": 0.6689, "step": 298500 }, { "epoch": 4.113967650381637, "grad_norm": 5.965327739715576, "learning_rate": 4.5890518173680934e-05, "loss": 0.7026, "step": 298600 }, { "epoch": 4.115345402441377, "grad_norm": 3.657636880874634, "learning_rate": 4.58845579850188e-05, "loss": 0.5738, "step": 298700 }, { "epoch": 4.116723154501116, "grad_norm": 7.645824432373047, "learning_rate": 4.5878596234494424e-05, "loss": 0.7064, "step": 298800 }, { "epoch": 4.118100906560855, "grad_norm": 3.154310703277588, "learning_rate": 4.587263292261427e-05, "loss": 0.6868, "step": 298900 }, { "epoch": 4.1194786586205945, "grad_norm": 14.839678764343262, "learning_rate": 4.586666804988495e-05, "loss": 0.6765, "step": 299000 }, { "epoch": 4.120856410680334, "grad_norm": 3.0030577182769775, "learning_rate": 4.586070161681322e-05, "loss": 0.6241, "step": 299100 }, { "epoch": 4.122234162740074, "grad_norm": 6.3814697265625, "learning_rate": 4.585473362390595e-05, "loss": 0.7379, "step": 299200 }, { "epoch": 4.123611914799812, "grad_norm": 2.8625292778015137, "learning_rate": 4.5848764071670163e-05, "loss": 0.6466, "step": 299300 }, { "epoch": 4.124989666859552, "grad_norm": 4.646816253662109, 
"learning_rate": 4.5842792960613e-05, "loss": 0.702, "step": 299400 }, { "epoch": 4.126367418919291, "grad_norm": 2.1967198848724365, "learning_rate": 4.5836820291241724e-05, "loss": 0.6001, "step": 299500 }, { "epoch": 4.127745170979031, "grad_norm": 3.3459267616271973, "learning_rate": 4.583084606406376e-05, "loss": 0.6948, "step": 299600 }, { "epoch": 4.12912292303877, "grad_norm": 6.72745943069458, "learning_rate": 4.582487027958664e-05, "loss": 0.6781, "step": 299700 }, { "epoch": 4.130500675098509, "grad_norm": 4.786750793457031, "learning_rate": 4.5818892938318034e-05, "loss": 0.6623, "step": 299800 }, { "epoch": 4.131878427158249, "grad_norm": 4.595633029937744, "learning_rate": 4.5812914040765766e-05, "loss": 0.6556, "step": 299900 }, { "epoch": 4.133256179217988, "grad_norm": 11.3757963180542, "learning_rate": 4.580693358743776e-05, "loss": 0.6595, "step": 300000 }, { "epoch": 4.133256179217988, "eval_accuracy": 0.8888380360022988, "eval_cer": 0.12308031299907674, "eval_loss": 0.6750513911247253, "eval_runtime": 10417.9033, "eval_samples_per_second": 5.178, "eval_steps_per_second": 0.324, "eval_wer": 0.22392130254803114, "step": 300000 }, { "epoch": 4.134633931277727, "grad_norm": 3.223041296005249, "learning_rate": 4.580095157884208e-05, "loss": 0.6499, "step": 300100 }, { "epoch": 4.1360116833374665, "grad_norm": 8.322097778320312, "learning_rate": 4.579496801548694e-05, "loss": 0.6594, "step": 300200 }, { "epoch": 4.137389435397206, "grad_norm": 4.080111503601074, "learning_rate": 4.5788982897880676e-05, "loss": 0.6304, "step": 300300 }, { "epoch": 4.138767187456946, "grad_norm": 4.8094353675842285, "learning_rate": 4.578299622653174e-05, "loss": 0.6695, "step": 300400 }, { "epoch": 4.140144939516684, "grad_norm": 61.309207916259766, "learning_rate": 4.5777008001948746e-05, "loss": 0.6342, "step": 300500 }, { "epoch": 4.141522691576424, "grad_norm": 3.9324393272399902, "learning_rate": 4.5771078130097816e-05, "loss": 0.677, "step": 300600 }, { "epoch": 
4.142900443636163, "grad_norm": 6.68555212020874, "learning_rate": 4.576508681609266e-05, "loss": 0.7262, "step": 300700 }, { "epoch": 4.144278195695903, "grad_norm": 14.441776275634766, "learning_rate": 4.575909395037494e-05, "loss": 0.7022, "step": 300800 }, { "epoch": 4.145655947755642, "grad_norm": 3.4261410236358643, "learning_rate": 4.5753099533453766e-05, "loss": 0.6292, "step": 300900 }, { "epoch": 4.147033699815381, "grad_norm": 1.409772276878357, "learning_rate": 4.574716353318883e-05, "loss": 0.713, "step": 301000 }, { "epoch": 4.148411451875121, "grad_norm": 12.364428520202637, "learning_rate": 4.5741166030887994e-05, "loss": 0.5997, "step": 301100 }, { "epoch": 4.14978920393486, "grad_norm": 4.517096519470215, "learning_rate": 4.5735166978906784e-05, "loss": 0.7972, "step": 301200 }, { "epoch": 4.151166955994599, "grad_norm": 14.583377838134766, "learning_rate": 4.572916637775486e-05, "loss": 0.7412, "step": 301300 }, { "epoch": 4.1525447080543385, "grad_norm": 24.385231018066406, "learning_rate": 4.5723164227941985e-05, "loss": 0.6874, "step": 301400 }, { "epoch": 4.153922460114078, "grad_norm": 9.408326148986816, "learning_rate": 4.571716052997809e-05, "loss": 0.7456, "step": 301500 }, { "epoch": 4.155300212173818, "grad_norm": 37.284271240234375, "learning_rate": 4.5711155284373186e-05, "loss": 0.705, "step": 301600 }, { "epoch": 4.156677964233556, "grad_norm": 4.45246696472168, "learning_rate": 4.570514849163749e-05, "loss": 0.7105, "step": 301700 }, { "epoch": 4.158055716293296, "grad_norm": 5.8539958000183105, "learning_rate": 4.569914015228129e-05, "loss": 0.6115, "step": 301800 }, { "epoch": 4.159433468353035, "grad_norm": 2.451735734939575, "learning_rate": 4.569313026681503e-05, "loss": 0.6619, "step": 301900 }, { "epoch": 4.160811220412775, "grad_norm": 4.320910930633545, "learning_rate": 4.568711883574927e-05, "loss": 0.7227, "step": 302000 }, { "epoch": 4.162188972472514, "grad_norm": 516.494384765625, "learning_rate": 
4.568110585959473e-05, "loss": 0.6722, "step": 302100 }, { "epoch": 4.163566724532253, "grad_norm": 6.573520183563232, "learning_rate": 4.567509133886223e-05, "loss": 0.7271, "step": 302200 }, { "epoch": 4.164944476591993, "grad_norm": 8.189583778381348, "learning_rate": 4.5669075274062726e-05, "loss": 0.6514, "step": 302300 }, { "epoch": 4.166322228651731, "grad_norm": 10.18826675415039, "learning_rate": 4.5663057665707346e-05, "loss": 0.6119, "step": 302400 }, { "epoch": 4.167699980711471, "grad_norm": 7.887221336364746, "learning_rate": 4.565703851430728e-05, "loss": 0.685, "step": 302500 }, { "epoch": 4.1690777327712105, "grad_norm": 6.623042583465576, "learning_rate": 4.565101782037391e-05, "loss": 0.594, "step": 302600 }, { "epoch": 4.17045548483095, "grad_norm": 5.593002796173096, "learning_rate": 4.564499558441871e-05, "loss": 0.5991, "step": 302700 }, { "epoch": 4.171833236890689, "grad_norm": 5.588982582092285, "learning_rate": 4.563897180695331e-05, "loss": 0.5978, "step": 302800 }, { "epoch": 4.173210988950428, "grad_norm": 12.168755531311035, "learning_rate": 4.563294648848946e-05, "loss": 0.5849, "step": 302900 }, { "epoch": 4.174588741010168, "grad_norm": 6.639806747436523, "learning_rate": 4.562691962953903e-05, "loss": 0.5988, "step": 303000 }, { "epoch": 4.175966493069907, "grad_norm": 11.007120132446289, "learning_rate": 4.562089123061404e-05, "loss": 0.6065, "step": 303100 }, { "epoch": 4.177344245129646, "grad_norm": 5.840813159942627, "learning_rate": 4.5614921599229176e-05, "loss": 0.6902, "step": 303200 }, { "epoch": 4.178721997189386, "grad_norm": 9.484322547912598, "learning_rate": 4.56088901372786e-05, "loss": 0.6262, "step": 303300 }, { "epoch": 4.180099749249125, "grad_norm": 13.334750175476074, "learning_rate": 4.560285713688516e-05, "loss": 0.6358, "step": 303400 }, { "epoch": 4.181477501308865, "grad_norm": 7.545956134796143, "learning_rate": 4.559682259856139e-05, "loss": 0.6416, "step": 303500 }, { "epoch": 4.182855253368603, 
"grad_norm": 11.94626522064209, "learning_rate": 4.559078652281996e-05, "loss": 0.6969, "step": 303600 }, { "epoch": 4.184233005428343, "grad_norm": 4.082379341125488, "learning_rate": 4.558474891017367e-05, "loss": 0.6951, "step": 303700 }, { "epoch": 4.1856107574880825, "grad_norm": 11.176995277404785, "learning_rate": 4.557870976113543e-05, "loss": 0.7071, "step": 303800 }, { "epoch": 4.186988509547822, "grad_norm": 10.222467422485352, "learning_rate": 4.557266907621831e-05, "loss": 0.7676, "step": 303900 }, { "epoch": 4.188366261607561, "grad_norm": 8.333457946777344, "learning_rate": 4.556662685593549e-05, "loss": 0.6495, "step": 304000 }, { "epoch": 4.1897440136673, "grad_norm": 15.739195823669434, "learning_rate": 4.5560583100800294e-05, "loss": 0.6683, "step": 304100 }, { "epoch": 4.19112176572704, "grad_norm": 6.781731128692627, "learning_rate": 4.555453781132616e-05, "loss": 0.621, "step": 304200 }, { "epoch": 4.192499517786779, "grad_norm": 47.04030227661133, "learning_rate": 4.554849098802668e-05, "loss": 0.6846, "step": 304300 }, { "epoch": 4.193877269846518, "grad_norm": 4.148467063903809, "learning_rate": 4.5542442631415555e-05, "loss": 0.6307, "step": 304400 }, { "epoch": 4.195255021906258, "grad_norm": 5.733232498168945, "learning_rate": 4.553639274200662e-05, "loss": 0.7009, "step": 304500 }, { "epoch": 4.196632773965997, "grad_norm": 5.764578819274902, "learning_rate": 4.553034132031384e-05, "loss": 0.7528, "step": 304600 }, { "epoch": 4.198010526025737, "grad_norm": 24.287540435791016, "learning_rate": 4.552434890396653e-05, "loss": 0.7154, "step": 304700 }, { "epoch": 4.199388278085475, "grad_norm": 7.267184257507324, "learning_rate": 4.551829443455853e-05, "loss": 0.6338, "step": 304800 }, { "epoch": 4.200766030145215, "grad_norm": 11.063650131225586, "learning_rate": 4.551223843440422e-05, "loss": 0.7643, "step": 304900 }, { "epoch": 4.2021437822049545, "grad_norm": 6.391530513763428, "learning_rate": 4.550618090401811e-05, "loss": 0.6596, 
"step": 305000 }, { "epoch": 4.203521534264694, "grad_norm": 19.334653854370117, "learning_rate": 4.550012184391482e-05, "loss": 0.594, "step": 305100 }, { "epoch": 4.204899286324433, "grad_norm": 16.1595516204834, "learning_rate": 4.5494061254609094e-05, "loss": 0.6308, "step": 305200 }, { "epoch": 4.206277038384172, "grad_norm": 2.512019157409668, "learning_rate": 4.5487999136615795e-05, "loss": 0.6856, "step": 305300 }, { "epoch": 4.207654790443912, "grad_norm": 11.41607666015625, "learning_rate": 4.548193549044996e-05, "loss": 0.6184, "step": 305400 }, { "epoch": 4.209032542503651, "grad_norm": 1.7704436779022217, "learning_rate": 4.5475870316626697e-05, "loss": 0.6761, "step": 305500 }, { "epoch": 4.21041029456339, "grad_norm": 4.644819736480713, "learning_rate": 4.546980361566129e-05, "loss": 0.7083, "step": 305600 }, { "epoch": 4.21178804662313, "grad_norm": 6.914735794067383, "learning_rate": 4.5463735388069126e-05, "loss": 0.6939, "step": 305700 }, { "epoch": 4.213165798682869, "grad_norm": 6.2788543701171875, "learning_rate": 4.545766563436575e-05, "loss": 0.6125, "step": 305800 }, { "epoch": 4.214543550742609, "grad_norm": 6.040369510650635, "learning_rate": 4.545159435506681e-05, "loss": 0.7533, "step": 305900 }, { "epoch": 4.2159213028023474, "grad_norm": 10.040986061096191, "learning_rate": 4.544552155068808e-05, "loss": 0.5517, "step": 306000 }, { "epoch": 4.217299054862087, "grad_norm": 13.150712966918945, "learning_rate": 4.543944722174548e-05, "loss": 0.7302, "step": 306100 }, { "epoch": 4.218676806921827, "grad_norm": 10.265639305114746, "learning_rate": 4.5433371368755074e-05, "loss": 0.7238, "step": 306200 }, { "epoch": 4.220054558981566, "grad_norm": 2.7123641967773438, "learning_rate": 4.5427293992233014e-05, "loss": 0.7632, "step": 306300 }, { "epoch": 4.221432311041305, "grad_norm": 10.464102745056152, "learning_rate": 4.542121509269562e-05, "loss": 0.6421, "step": 306400 }, { "epoch": 4.222810063101044, "grad_norm": 12.108945846557617, 
"learning_rate": 4.541513467065933e-05, "loss": 0.6621, "step": 306500 }, { "epoch": 4.224187815160784, "grad_norm": 7.261825084686279, "learning_rate": 4.5409113553612976e-05, "loss": 0.6362, "step": 306600 }, { "epoch": 4.225565567220523, "grad_norm": 12.817927360534668, "learning_rate": 4.5403030103340785e-05, "loss": 0.6375, "step": 306700 }, { "epoch": 4.226943319280262, "grad_norm": 5.216375350952148, "learning_rate": 4.539694513211459e-05, "loss": 0.6797, "step": 306800 }, { "epoch": 4.228321071340002, "grad_norm": 30.395282745361328, "learning_rate": 4.539085864045135e-05, "loss": 0.6771, "step": 306900 }, { "epoch": 4.229698823399741, "grad_norm": 6.5052666664123535, "learning_rate": 4.538477062886813e-05, "loss": 0.6184, "step": 307000 }, { "epoch": 4.23107657545948, "grad_norm": 4.310901165008545, "learning_rate": 4.5378681097882146e-05, "loss": 0.6836, "step": 307100 }, { "epoch": 4.2324543275192195, "grad_norm": 6.7607221603393555, "learning_rate": 4.5372590048010735e-05, "loss": 0.6863, "step": 307200 }, { "epoch": 4.233832079578959, "grad_norm": 5.41831636428833, "learning_rate": 4.536649747977138e-05, "loss": 0.6654, "step": 307300 }, { "epoch": 4.235209831638699, "grad_norm": 33.93351745605469, "learning_rate": 4.5360403393681644e-05, "loss": 0.6476, "step": 307400 }, { "epoch": 4.236587583698437, "grad_norm": 13.801878929138184, "learning_rate": 4.5354307790259276e-05, "loss": 0.7141, "step": 307500 }, { "epoch": 4.237965335758177, "grad_norm": 3.7954368591308594, "learning_rate": 4.534821067002212e-05, "loss": 0.7271, "step": 307600 }, { "epoch": 4.239343087817916, "grad_norm": 3.5893633365631104, "learning_rate": 4.534211203348815e-05, "loss": 0.7161, "step": 307700 }, { "epoch": 4.240720839877656, "grad_norm": 3.2294416427612305, "learning_rate": 4.533601188117549e-05, "loss": 0.5693, "step": 307800 }, { "epoch": 4.242098591937395, "grad_norm": 2.4786124229431152, "learning_rate": 4.5329910213602365e-05, "loss": 0.6949, "step": 307900 }, { 
"epoch": 4.243476343997134, "grad_norm": 3.1852173805236816, "learning_rate": 4.532380703128715e-05, "loss": 0.5627, "step": 308000 }, { "epoch": 4.244854096056874, "grad_norm": 6.080512046813965, "learning_rate": 4.531770233474835e-05, "loss": 0.6595, "step": 308100 }, { "epoch": 4.246231848116613, "grad_norm": 3.594705820083618, "learning_rate": 4.531159612450458e-05, "loss": 0.6604, "step": 308200 }, { "epoch": 4.247609600176352, "grad_norm": 9.806014060974121, "learning_rate": 4.530548840107458e-05, "loss": 0.7253, "step": 308300 }, { "epoch": 4.2489873522360915, "grad_norm": 19.69601821899414, "learning_rate": 4.529937916497727e-05, "loss": 0.688, "step": 308400 }, { "epoch": 4.250365104295831, "grad_norm": 9.94734001159668, "learning_rate": 4.529326841673162e-05, "loss": 0.6847, "step": 308500 }, { "epoch": 4.251742856355571, "grad_norm": 9.905821800231934, "learning_rate": 4.5287156156856795e-05, "loss": 0.6948, "step": 308600 }, { "epoch": 4.253120608415309, "grad_norm": 12.264094352722168, "learning_rate": 4.528104238587206e-05, "loss": 0.6775, "step": 308700 }, { "epoch": 4.254498360475049, "grad_norm": 23.35835075378418, "learning_rate": 4.527492710429681e-05, "loss": 0.637, "step": 308800 }, { "epoch": 4.255876112534788, "grad_norm": 4.416563034057617, "learning_rate": 4.526881031265056e-05, "loss": 0.6705, "step": 308900 }, { "epoch": 4.257253864594528, "grad_norm": 9.018585205078125, "learning_rate": 4.526269201145298e-05, "loss": 0.5894, "step": 309000 }, { "epoch": 4.258631616654267, "grad_norm": 7.559141635894775, "learning_rate": 4.525657220122382e-05, "loss": 0.7091, "step": 309100 }, { "epoch": 4.260009368714006, "grad_norm": 4.722597122192383, "learning_rate": 4.525045088248303e-05, "loss": 0.6905, "step": 309200 }, { "epoch": 4.261387120773746, "grad_norm": 20.97707176208496, "learning_rate": 4.524432805575062e-05, "loss": 0.6744, "step": 309300 }, { "epoch": 4.262764872833485, "grad_norm": 6.883830547332764, "learning_rate": 
4.5238203721546756e-05, "loss": 0.612, "step": 309400 }, { "epoch": 4.264142624893224, "grad_norm": 4.53001594543457, "learning_rate": 4.523207788039175e-05, "loss": 0.7528, "step": 309500 }, { "epoch": 4.2655203769529635, "grad_norm": 5.905134677886963, "learning_rate": 4.5225950532806e-05, "loss": 0.6995, "step": 309600 }, { "epoch": 4.266898129012703, "grad_norm": 21.644359588623047, "learning_rate": 4.521982167931007e-05, "loss": 0.5514, "step": 309700 }, { "epoch": 4.268275881072443, "grad_norm": 4.295462131500244, "learning_rate": 4.5213691320424635e-05, "loss": 0.6912, "step": 309800 }, { "epoch": 4.269653633132181, "grad_norm": 7.276356220245361, "learning_rate": 4.52075594566705e-05, "loss": 0.6998, "step": 309900 }, { "epoch": 4.271031385191921, "grad_norm": 16.975078582763672, "learning_rate": 4.52014260885686e-05, "loss": 0.6877, "step": 310000 }, { "epoch": 4.27240913725166, "grad_norm": 7.268409252166748, "learning_rate": 4.5195291216639985e-05, "loss": 0.6977, "step": 310100 }, { "epoch": 4.273786889311399, "grad_norm": 6.270153999328613, "learning_rate": 4.518915484140586e-05, "loss": 0.652, "step": 310200 }, { "epoch": 4.275164641371139, "grad_norm": 1.7775713205337524, "learning_rate": 4.5183016963387534e-05, "loss": 0.5626, "step": 310300 }, { "epoch": 4.276542393430878, "grad_norm": 11.436005592346191, "learning_rate": 4.517687758310645e-05, "loss": 0.5906, "step": 310400 }, { "epoch": 4.277920145490618, "grad_norm": 1.905484914779663, "learning_rate": 4.517073670108418e-05, "loss": 0.608, "step": 310500 }, { "epoch": 4.279297897550357, "grad_norm": 8.761491775512695, "learning_rate": 4.5164594317842434e-05, "loss": 0.6706, "step": 310600 }, { "epoch": 4.280675649610096, "grad_norm": 3.2594447135925293, "learning_rate": 4.5158450433903017e-05, "loss": 0.6344, "step": 310700 }, { "epoch": 4.2820534016698355, "grad_norm": 32.87424850463867, "learning_rate": 4.5152305049787904e-05, "loss": 0.6822, "step": 310800 }, { "epoch": 4.283431153729575, 
"grad_norm": 17.112659454345703, "learning_rate": 4.514615816601916e-05, "loss": 0.6699, "step": 310900 }, { "epoch": 4.284808905789314, "grad_norm": 581.0711669921875, "learning_rate": 4.514000978311901e-05, "loss": 0.681, "step": 311000 }, { "epoch": 4.286186657849053, "grad_norm": 3.072092056274414, "learning_rate": 4.513385990160977e-05, "loss": 0.5739, "step": 311100 }, { "epoch": 4.287564409908793, "grad_norm": 3.4997644424438477, "learning_rate": 4.512770852201394e-05, "loss": 0.684, "step": 311200 }, { "epoch": 4.288942161968532, "grad_norm": 2.147372007369995, "learning_rate": 4.5121555644854066e-05, "loss": 0.5565, "step": 311300 }, { "epoch": 4.290319914028271, "grad_norm": 9.841828346252441, "learning_rate": 4.5115401270652906e-05, "loss": 0.737, "step": 311400 }, { "epoch": 4.291697666088011, "grad_norm": 1.6798075437545776, "learning_rate": 4.5109245399933275e-05, "loss": 0.5695, "step": 311500 }, { "epoch": 4.29307541814775, "grad_norm": 8.864051818847656, "learning_rate": 4.5103088033218165e-05, "loss": 0.644, "step": 311600 }, { "epoch": 4.29445317020749, "grad_norm": 5.417300701141357, "learning_rate": 4.509692917103067e-05, "loss": 0.6097, "step": 311700 }, { "epoch": 4.295830922267228, "grad_norm": 24.710216522216797, "learning_rate": 4.5090768813894014e-05, "loss": 0.6593, "step": 311800 }, { "epoch": 4.297208674326968, "grad_norm": 27.967021942138672, "learning_rate": 4.5084606962331555e-05, "loss": 0.645, "step": 311900 }, { "epoch": 4.2985864263867075, "grad_norm": 23.30682373046875, "learning_rate": 4.5078443616866777e-05, "loss": 0.7004, "step": 312000 }, { "epoch": 4.299964178446447, "grad_norm": 1.977541446685791, "learning_rate": 4.5072278778023274e-05, "loss": 0.6215, "step": 312100 }, { "epoch": 4.301341930506186, "grad_norm": 5.870731353759766, "learning_rate": 4.50661124463248e-05, "loss": 0.6536, "step": 312200 }, { "epoch": 4.302719682565925, "grad_norm": 12.300503730773926, "learning_rate": 4.50599446222952e-05, "loss": 0.7482, 
"step": 312300 }, { "epoch": 4.304097434625665, "grad_norm": 108.59202575683594, "learning_rate": 4.505377530645846e-05, "loss": 0.685, "step": 312400 }, { "epoch": 4.305475186685404, "grad_norm": 145.99563598632812, "learning_rate": 4.504760449933871e-05, "loss": 0.6427, "step": 312500 }, { "epoch": 4.306852938745143, "grad_norm": 4.786005020141602, "learning_rate": 4.5041493931816513e-05, "loss": 0.6589, "step": 312600 }, { "epoch": 4.308230690804883, "grad_norm": 15.538369178771973, "learning_rate": 4.503532015860333e-05, "loss": 0.6376, "step": 312700 }, { "epoch": 4.309608442864622, "grad_norm": 6.881442546844482, "learning_rate": 4.5029144895674986e-05, "loss": 0.6426, "step": 312800 }, { "epoch": 4.310986194924362, "grad_norm": 7.188647270202637, "learning_rate": 4.502296814355612e-05, "loss": 0.7208, "step": 312900 }, { "epoch": 4.3123639469841, "grad_norm": 7.574893951416016, "learning_rate": 4.5016789902771444e-05, "loss": 0.649, "step": 313000 }, { "epoch": 4.31374169904384, "grad_norm": 7.721707820892334, "learning_rate": 4.501061017384586e-05, "loss": 0.6867, "step": 313100 }, { "epoch": 4.3151194511035795, "grad_norm": 40.91524124145508, "learning_rate": 4.500442895730436e-05, "loss": 0.5987, "step": 313200 }, { "epoch": 4.316497203163319, "grad_norm": 39.55015182495117, "learning_rate": 4.499824625367206e-05, "loss": 0.6682, "step": 313300 }, { "epoch": 4.317874955223058, "grad_norm": 53.41413116455078, "learning_rate": 4.499206206347423e-05, "loss": 0.6926, "step": 313400 }, { "epoch": 4.319252707282797, "grad_norm": 7.61053991317749, "learning_rate": 4.498587638723623e-05, "loss": 0.664, "step": 313500 }, { "epoch": 4.320630459342537, "grad_norm": 7.264420032501221, "learning_rate": 4.497968922548358e-05, "loss": 0.7242, "step": 313600 }, { "epoch": 4.322008211402276, "grad_norm": 2.119528293609619, "learning_rate": 4.49735005787419e-05, "loss": 0.6099, "step": 313700 }, { "epoch": 4.323385963462015, "grad_norm": 17.502540588378906, 
"learning_rate": 4.496731044753696e-05, "loss": 0.739, "step": 313800 }, { "epoch": 4.324763715521755, "grad_norm": 4.526692867279053, "learning_rate": 4.496111883239463e-05, "loss": 0.6673, "step": 313900 }, { "epoch": 4.326141467581494, "grad_norm": 1.956165075302124, "learning_rate": 4.495492573384092e-05, "loss": 0.7291, "step": 314000 }, { "epoch": 4.327519219641234, "grad_norm": 10.1044340133667, "learning_rate": 4.494873115240197e-05, "loss": 0.7305, "step": 314100 }, { "epoch": 4.328896971700972, "grad_norm": 9.427190780639648, "learning_rate": 4.494253508860405e-05, "loss": 0.6822, "step": 314200 }, { "epoch": 4.330274723760712, "grad_norm": 3.114659547805786, "learning_rate": 4.493633754297354e-05, "loss": 0.7219, "step": 314300 }, { "epoch": 4.3316524758204515, "grad_norm": 7.070925712585449, "learning_rate": 4.493013851603694e-05, "loss": 0.6383, "step": 314400 }, { "epoch": 4.33303022788019, "grad_norm": 4.399557113647461, "learning_rate": 4.4923938008320915e-05, "loss": 0.692, "step": 314500 }, { "epoch": 4.33440797993993, "grad_norm": 5.275290489196777, "learning_rate": 4.4917736020352207e-05, "loss": 0.6595, "step": 314600 }, { "epoch": 4.335785731999669, "grad_norm": 32.72828674316406, "learning_rate": 4.491153255265772e-05, "loss": 0.6366, "step": 314700 }, { "epoch": 4.337163484059409, "grad_norm": 11.588094711303711, "learning_rate": 4.490532760576447e-05, "loss": 0.6871, "step": 314800 }, { "epoch": 4.338541236119148, "grad_norm": 12.23223876953125, "learning_rate": 4.489912118019958e-05, "loss": 0.6105, "step": 314900 }, { "epoch": 4.339918988178887, "grad_norm": 7.590038776397705, "learning_rate": 4.489291327649034e-05, "loss": 0.6946, "step": 315000 }, { "epoch": 4.341296740238627, "grad_norm": 4.498335838317871, "learning_rate": 4.488670389516414e-05, "loss": 0.7135, "step": 315100 }, { "epoch": 4.342674492298366, "grad_norm": 5.7012038230896, "learning_rate": 4.488049303674848e-05, "loss": 0.7255, "step": 315200 }, { "epoch": 
4.344052244358105, "grad_norm": 21.774002075195312, "learning_rate": 4.487428070177104e-05, "loss": 0.6117, "step": 315300 }, { "epoch": 4.345429996417844, "grad_norm": 12.106348991394043, "learning_rate": 4.486806689075955e-05, "loss": 0.6473, "step": 315400 }, { "epoch": 4.346807748477584, "grad_norm": 4.872758865356445, "learning_rate": 4.486185160424194e-05, "loss": 0.6807, "step": 315500 }, { "epoch": 4.3481855005373236, "grad_norm": 14.71099853515625, "learning_rate": 4.4855634842746206e-05, "loss": 0.6505, "step": 315600 }, { "epoch": 4.349563252597062, "grad_norm": 4.2744622230529785, "learning_rate": 4.4849478796456743e-05, "loss": 0.6094, "step": 315700 }, { "epoch": 4.350941004656802, "grad_norm": 21.7128849029541, "learning_rate": 4.4843259101325936e-05, "loss": 0.7034, "step": 315800 }, { "epoch": 4.352318756716541, "grad_norm": 5.750769138336182, "learning_rate": 4.483703793279654e-05, "loss": 0.6062, "step": 315900 }, { "epoch": 4.353696508776281, "grad_norm": 3.0670692920684814, "learning_rate": 4.4830815291397086e-05, "loss": 0.6707, "step": 316000 }, { "epoch": 4.35507426083602, "grad_norm": 6.325472354888916, "learning_rate": 4.48245911776562e-05, "loss": 0.6914, "step": 316100 }, { "epoch": 4.356452012895759, "grad_norm": 4.057070732116699, "learning_rate": 4.481836559210266e-05, "loss": 0.6692, "step": 316200 }, { "epoch": 4.357829764955499, "grad_norm": 7.541365623474121, "learning_rate": 4.481213853526536e-05, "loss": 0.6871, "step": 316300 }, { "epoch": 4.359207517015238, "grad_norm": 3.7444095611572266, "learning_rate": 4.480591000767334e-05, "loss": 0.6726, "step": 316400 }, { "epoch": 4.360585269074977, "grad_norm": 10.029513359069824, "learning_rate": 4.479968000985572e-05, "loss": 0.6633, "step": 316500 }, { "epoch": 4.3619630211347165, "grad_norm": 15.464344024658203, "learning_rate": 4.4793448542341774e-05, "loss": 0.6757, "step": 316600 }, { "epoch": 4.363340773194456, "grad_norm": 16.177799224853516, "learning_rate": 
4.4787215605660905e-05, "loss": 0.5873, "step": 316700 }, { "epoch": 4.364718525254196, "grad_norm": 104.21092224121094, "learning_rate": 4.478098120034263e-05, "loss": 0.7217, "step": 316800 }, { "epoch": 4.366096277313934, "grad_norm": 11.626893997192383, "learning_rate": 4.477474532691659e-05, "loss": 0.6055, "step": 316900 }, { "epoch": 4.367474029373674, "grad_norm": 5.727044105529785, "learning_rate": 4.476850798591256e-05, "loss": 0.6408, "step": 317000 }, { "epoch": 4.368851781433413, "grad_norm": 19.25215721130371, "learning_rate": 4.476226917786043e-05, "loss": 0.6554, "step": 317100 }, { "epoch": 4.370229533493153, "grad_norm": 33.02938461303711, "learning_rate": 4.475602890329022e-05, "loss": 0.6764, "step": 317200 }, { "epoch": 4.371607285552892, "grad_norm": 7.360673904418945, "learning_rate": 4.474978716273207e-05, "loss": 0.6612, "step": 317300 }, { "epoch": 4.372985037612631, "grad_norm": 30.094141006469727, "learning_rate": 4.474354395671626e-05, "loss": 0.6373, "step": 317400 }, { "epoch": 4.374362789672371, "grad_norm": 6.94580602645874, "learning_rate": 4.4737299285773175e-05, "loss": 0.7158, "step": 317500 }, { "epoch": 4.37574054173211, "grad_norm": 3.3117239475250244, "learning_rate": 4.473105315043332e-05, "loss": 0.6187, "step": 317600 }, { "epoch": 4.377118293791849, "grad_norm": 24.79054069519043, "learning_rate": 4.472480555122735e-05, "loss": 0.6927, "step": 317700 }, { "epoch": 4.3784960458515885, "grad_norm": 3.6354496479034424, "learning_rate": 4.471855648868603e-05, "loss": 0.7071, "step": 317800 }, { "epoch": 4.379873797911328, "grad_norm": 10.32691478729248, "learning_rate": 4.471230596334024e-05, "loss": 0.7083, "step": 317900 }, { "epoch": 4.381251549971068, "grad_norm": 55.8413200378418, "learning_rate": 4.470605397572101e-05, "loss": 0.7576, "step": 318000 }, { "epoch": 4.382629302030806, "grad_norm": 5.073696613311768, "learning_rate": 4.469980052635946e-05, "loss": 0.6505, "step": 318100 }, { "epoch": 4.384007054090546, 
"grad_norm": 2.443202257156372, "learning_rate": 4.4693545615786866e-05, "loss": 0.5787, "step": 318200 }, { "epoch": 4.385384806150285, "grad_norm": 16.05838966369629, "learning_rate": 4.468728924453461e-05, "loss": 0.6481, "step": 318300 }, { "epoch": 4.386762558210025, "grad_norm": 4.32959508895874, "learning_rate": 4.468103141313421e-05, "loss": 0.7195, "step": 318400 }, { "epoch": 4.388140310269764, "grad_norm": 1.2014583349227905, "learning_rate": 4.467477212211728e-05, "loss": 0.6952, "step": 318500 }, { "epoch": 4.389518062329503, "grad_norm": 26.61128044128418, "learning_rate": 4.46685113720156e-05, "loss": 0.758, "step": 318600 }, { "epoch": 4.390895814389243, "grad_norm": 7.952206134796143, "learning_rate": 4.466231179266568e-05, "loss": 0.6511, "step": 318700 }, { "epoch": 4.392273566448981, "grad_norm": 23.078968048095703, "learning_rate": 4.4656048140567834e-05, "loss": 0.6312, "step": 318800 }, { "epoch": 4.393651318508721, "grad_norm": 8.388651847839355, "learning_rate": 4.464978303097593e-05, "loss": 0.6318, "step": 318900 }, { "epoch": 4.3950290705684605, "grad_norm": 1.8579869270324707, "learning_rate": 4.464351646442222e-05, "loss": 0.687, "step": 319000 }, { "epoch": 4.3964068226282, "grad_norm": 7.543272495269775, "learning_rate": 4.463724844143907e-05, "loss": 0.689, "step": 319100 }, { "epoch": 4.39778457468794, "grad_norm": 8.869715690612793, "learning_rate": 4.4630978962559005e-05, "loss": 0.7199, "step": 319200 }, { "epoch": 4.399162326747678, "grad_norm": 70.35514831542969, "learning_rate": 4.462470802831463e-05, "loss": 0.7331, "step": 319300 }, { "epoch": 4.400540078807418, "grad_norm": 13.741540908813477, "learning_rate": 4.46184356392387e-05, "loss": 0.6418, "step": 319400 }, { "epoch": 4.401917830867157, "grad_norm": 3.9362874031066895, "learning_rate": 4.461216179586408e-05, "loss": 0.5425, "step": 319500 }, { "epoch": 4.403295582926896, "grad_norm": 5.519320964813232, "learning_rate": 4.460588649872377e-05, "loss": 0.6122, "step": 
319600 }, { "epoch": 4.404673334986636, "grad_norm": 12.589254379272461, "learning_rate": 4.4599609748350895e-05, "loss": 0.6953, "step": 319700 }, { "epoch": 4.406051087046375, "grad_norm": 3.9567782878875732, "learning_rate": 4.45933315452787e-05, "loss": 0.6358, "step": 319800 }, { "epoch": 4.407428839106115, "grad_norm": 12.955341339111328, "learning_rate": 4.4587051890040515e-05, "loss": 0.6723, "step": 319900 }, { "epoch": 4.408806591165853, "grad_norm": 9.158559799194336, "learning_rate": 4.458077078316988e-05, "loss": 0.6046, "step": 320000 }, { "epoch": 4.410184343225593, "grad_norm": 9.448963165283203, "learning_rate": 4.4574488225200364e-05, "loss": 0.6838, "step": 320100 }, { "epoch": 4.4115620952853325, "grad_norm": 18.403839111328125, "learning_rate": 4.456820421666573e-05, "loss": 0.6272, "step": 320200 }, { "epoch": 4.412939847345072, "grad_norm": 3.124269723892212, "learning_rate": 4.4561918758099835e-05, "loss": 0.5848, "step": 320300 }, { "epoch": 4.414317599404811, "grad_norm": 4.586199760437012, "learning_rate": 4.455563185003664e-05, "loss": 0.5837, "step": 320400 }, { "epoch": 4.41569535146455, "grad_norm": 7.89439058303833, "learning_rate": 4.454934349301026e-05, "loss": 0.6138, "step": 320500 }, { "epoch": 4.41707310352429, "grad_norm": 9.500040054321289, "learning_rate": 4.454305368755494e-05, "loss": 0.5725, "step": 320600 }, { "epoch": 4.418450855584029, "grad_norm": 11.009655952453613, "learning_rate": 4.453676243420501e-05, "loss": 0.6691, "step": 320700 }, { "epoch": 4.419828607643768, "grad_norm": 40.77465057373047, "learning_rate": 4.4530469733494955e-05, "loss": 0.7356, "step": 320800 }, { "epoch": 4.421206359703508, "grad_norm": 2.06315016746521, "learning_rate": 4.4524238534594755e-05, "loss": 0.5915, "step": 320900 }, { "epoch": 4.422584111763247, "grad_norm": 4.759917259216309, "learning_rate": 4.451794295522862e-05, "loss": 0.6672, "step": 321000 }, { "epoch": 4.423961863822987, "grad_norm": 11.225339889526367, 
"learning_rate": 4.451164593010117e-05, "loss": 0.6679, "step": 321100 }, { "epoch": 4.425339615882725, "grad_norm": 7.234725475311279, "learning_rate": 4.450534745974736e-05, "loss": 0.6958, "step": 321200 }, { "epoch": 4.426717367942465, "grad_norm": 25.36356544494629, "learning_rate": 4.449904754470228e-05, "loss": 0.5903, "step": 321300 }, { "epoch": 4.4280951200022045, "grad_norm": 12.474896430969238, "learning_rate": 4.449274618550115e-05, "loss": 0.7154, "step": 321400 }, { "epoch": 4.429472872061944, "grad_norm": 28.230289459228516, "learning_rate": 4.44864433826793e-05, "loss": 0.7167, "step": 321500 }, { "epoch": 4.430850624121683, "grad_norm": 7.706596851348877, "learning_rate": 4.448013913677218e-05, "loss": 0.6774, "step": 321600 }, { "epoch": 4.432228376181422, "grad_norm": 47.81163787841797, "learning_rate": 4.447383344831538e-05, "loss": 0.6442, "step": 321700 }, { "epoch": 4.433606128241162, "grad_norm": 4.364405155181885, "learning_rate": 4.446752631784458e-05, "loss": 0.6635, "step": 321800 }, { "epoch": 4.434983880300901, "grad_norm": 14.025710105895996, "learning_rate": 4.4461217745895614e-05, "loss": 0.5836, "step": 321900 }, { "epoch": 4.43636163236064, "grad_norm": 3.8178043365478516, "learning_rate": 4.445490773300443e-05, "loss": 0.6871, "step": 322000 }, { "epoch": 4.43773938442038, "grad_norm": 8.0946044921875, "learning_rate": 4.4448596279707093e-05, "loss": 0.6875, "step": 322100 }, { "epoch": 4.439117136480119, "grad_norm": 3.0491600036621094, "learning_rate": 4.44422833865398e-05, "loss": 0.7648, "step": 322200 }, { "epoch": 4.440494888539859, "grad_norm": 27.746309280395508, "learning_rate": 4.443596905403885e-05, "loss": 0.6826, "step": 322300 }, { "epoch": 4.441872640599597, "grad_norm": 7.239984512329102, "learning_rate": 4.442965328274068e-05, "loss": 0.646, "step": 322400 }, { "epoch": 4.443250392659337, "grad_norm": 7.9559831619262695, "learning_rate": 4.442333607318186e-05, "loss": 0.7213, "step": 322500 }, { "epoch": 
4.4446281447190765, "grad_norm": 19.0515193939209, "learning_rate": 4.441701742589906e-05, "loss": 0.6669, "step": 322600 }, { "epoch": 4.446005896778816, "grad_norm": 12.214559555053711, "learning_rate": 4.4410697341429084e-05, "loss": 0.6711, "step": 322700 }, { "epoch": 4.447383648838555, "grad_norm": 3.0706989765167236, "learning_rate": 4.440437582030886e-05, "loss": 0.59, "step": 322800 }, { "epoch": 4.448761400898294, "grad_norm": 5.658199310302734, "learning_rate": 4.439805286307541e-05, "loss": 0.6901, "step": 322900 }, { "epoch": 4.450139152958034, "grad_norm": 8.028273582458496, "learning_rate": 4.439172847026593e-05, "loss": 0.6358, "step": 323000 }, { "epoch": 4.451516905017773, "grad_norm": 8.47390079498291, "learning_rate": 4.4385465907797866e-05, "loss": 0.6942, "step": 323100 }, { "epoch": 4.452894657077512, "grad_norm": 6.019741535186768, "learning_rate": 4.437913865979065e-05, "loss": 0.634, "step": 323200 }, { "epoch": 4.454272409137252, "grad_norm": 39.022335052490234, "learning_rate": 4.4372809977814246e-05, "loss": 0.7013, "step": 323300 }, { "epoch": 4.455650161196991, "grad_norm": 5.139946937561035, "learning_rate": 4.4366479862406316e-05, "loss": 0.6942, "step": 323400 }, { "epoch": 4.457027913256731, "grad_norm": 2.410353422164917, "learning_rate": 4.436014831410464e-05, "loss": 0.6475, "step": 323500 }, { "epoch": 4.458405665316469, "grad_norm": 39.79777145385742, "learning_rate": 4.43538153334471e-05, "loss": 0.6899, "step": 323600 }, { "epoch": 4.459783417376209, "grad_norm": 5.1380414962768555, "learning_rate": 4.434748092097172e-05, "loss": 0.6477, "step": 323700 }, { "epoch": 4.4611611694359485, "grad_norm": 5.901916980743408, "learning_rate": 4.434114507721666e-05, "loss": 0.6342, "step": 323800 }, { "epoch": 4.462538921495687, "grad_norm": 4.1130781173706055, "learning_rate": 4.433480780272016e-05, "loss": 0.6638, "step": 323900 }, { "epoch": 4.463916673555427, "grad_norm": 8.368687629699707, "learning_rate": 
4.4328469098020614e-05, "loss": 0.6708, "step": 324000 }, { "epoch": 4.465294425615166, "grad_norm": 3.7628891468048096, "learning_rate": 4.432212896365652e-05, "loss": 0.5853, "step": 324100 }, { "epoch": 4.466672177674906, "grad_norm": 12.795971870422363, "learning_rate": 4.4315787400166504e-05, "loss": 0.6824, "step": 324200 }, { "epoch": 4.4680499297346445, "grad_norm": 8.752735137939453, "learning_rate": 4.4309444408089334e-05, "loss": 0.7174, "step": 324300 }, { "epoch": 4.469427681794384, "grad_norm": 8.314553260803223, "learning_rate": 4.430309998796385e-05, "loss": 0.6286, "step": 324400 }, { "epoch": 4.470805433854124, "grad_norm": 6.607656478881836, "learning_rate": 4.429675414032906e-05, "loss": 0.6874, "step": 324500 }, { "epoch": 4.472183185913863, "grad_norm": 5.661682605743408, "learning_rate": 4.429040686572408e-05, "loss": 0.5944, "step": 324600 }, { "epoch": 4.473560937973602, "grad_norm": 4.260201930999756, "learning_rate": 4.428405816468814e-05, "loss": 0.6079, "step": 324700 }, { "epoch": 4.474938690033341, "grad_norm": 9.348628997802734, "learning_rate": 4.4277708037760586e-05, "loss": 0.6204, "step": 324800 }, { "epoch": 4.476316442093081, "grad_norm": 6.534801006317139, "learning_rate": 4.4271356485480895e-05, "loss": 0.5981, "step": 324900 }, { "epoch": 4.4776941941528206, "grad_norm": 12.314505577087402, "learning_rate": 4.4265003508388686e-05, "loss": 0.6903, "step": 325000 }, { "epoch": 4.479071946212559, "grad_norm": 2.194709300994873, "learning_rate": 4.425864910702364e-05, "loss": 0.6886, "step": 325100 }, { "epoch": 4.480449698272299, "grad_norm": 8.704954147338867, "learning_rate": 4.425229328192563e-05, "loss": 0.6035, "step": 325200 }, { "epoch": 4.481827450332038, "grad_norm": 3.1498658657073975, "learning_rate": 4.42459360336346e-05, "loss": 0.6406, "step": 325300 }, { "epoch": 4.483205202391778, "grad_norm": 18.84713363647461, "learning_rate": 4.423957736269063e-05, "loss": 0.6397, "step": 325400 }, { "epoch": 
4.484582954451517, "grad_norm": 16.160200119018555, "learning_rate": 4.4233217269633926e-05, "loss": 0.645, "step": 325500 }, { "epoch": 4.485960706511256, "grad_norm": 10.243626594543457, "learning_rate": 4.42268557550048e-05, "loss": 0.6115, "step": 325600 }, { "epoch": 4.487338458570996, "grad_norm": 12.973387718200684, "learning_rate": 4.422049281934371e-05, "loss": 0.6017, "step": 325700 }, { "epoch": 4.488716210630735, "grad_norm": 3.910337448120117, "learning_rate": 4.4214192113782396e-05, "loss": 0.6187, "step": 325800 }, { "epoch": 4.490093962690474, "grad_norm": 4.787367820739746, "learning_rate": 4.4207826351876004e-05, "loss": 0.5603, "step": 325900 }, { "epoch": 4.4914717147502135, "grad_norm": 4.082210063934326, "learning_rate": 4.420145917055429e-05, "loss": 0.7037, "step": 326000 }, { "epoch": 4.492849466809953, "grad_norm": 15.9624662399292, "learning_rate": 4.419509057035818e-05, "loss": 0.7349, "step": 326100 }, { "epoch": 4.494227218869693, "grad_norm": 6.187216281890869, "learning_rate": 4.4188720551828705e-05, "loss": 0.6383, "step": 326200 }, { "epoch": 4.495604970929431, "grad_norm": 3.234055757522583, "learning_rate": 4.418234911550705e-05, "loss": 0.6447, "step": 326300 }, { "epoch": 4.496982722989171, "grad_norm": 36.49329376220703, "learning_rate": 4.4175976261934476e-05, "loss": 0.704, "step": 326400 }, { "epoch": 4.49836047504891, "grad_norm": 6.525084018707275, "learning_rate": 4.416960199165242e-05, "loss": 0.6344, "step": 326500 }, { "epoch": 4.49973822710865, "grad_norm": 29.574024200439453, "learning_rate": 4.41632263052024e-05, "loss": 0.6527, "step": 326600 }, { "epoch": 4.501115979168389, "grad_norm": 19.798866271972656, "learning_rate": 4.4156849203126034e-05, "loss": 0.65, "step": 326700 }, { "epoch": 4.502493731228128, "grad_norm": 7.43825101852417, "learning_rate": 4.415047068596513e-05, "loss": 0.679, "step": 326800 }, { "epoch": 4.503871483287868, "grad_norm": 5.0385212898254395, "learning_rate": 4.414409075426155e-05, 
"loss": 0.6786, "step": 326900 }, { "epoch": 4.505249235347607, "grad_norm": 10.723860740661621, "learning_rate": 4.41377094085573e-05, "loss": 0.6308, "step": 327000 }, { "epoch": 4.506626987407346, "grad_norm": 30.588050842285156, "learning_rate": 4.413132664939454e-05, "loss": 0.6482, "step": 327100 }, { "epoch": 4.5080047394670855, "grad_norm": 3.961076259613037, "learning_rate": 4.412494247731549e-05, "loss": 0.5487, "step": 327200 }, { "epoch": 4.509382491526825, "grad_norm": 3.0004286766052246, "learning_rate": 4.411855689286252e-05, "loss": 0.6541, "step": 327300 }, { "epoch": 4.510760243586564, "grad_norm": 10.720518112182617, "learning_rate": 4.4112169896578116e-05, "loss": 0.7179, "step": 327400 }, { "epoch": 4.512137995646303, "grad_norm": 15.561443328857422, "learning_rate": 4.4105781489004896e-05, "loss": 0.6546, "step": 327500 }, { "epoch": 4.513515747706043, "grad_norm": 3.469061851501465, "learning_rate": 4.409939167068559e-05, "loss": 0.6741, "step": 327600 }, { "epoch": 4.514893499765782, "grad_norm": 5.806593894958496, "learning_rate": 4.4093000442163036e-05, "loss": 0.6833, "step": 327700 }, { "epoch": 4.516271251825522, "grad_norm": 5.602105617523193, "learning_rate": 4.4086607803980205e-05, "loss": 0.651, "step": 327800 }, { "epoch": 4.517649003885261, "grad_norm": 4.355697154998779, "learning_rate": 4.408021375668018e-05, "loss": 0.6217, "step": 327900 }, { "epoch": 4.519026755945, "grad_norm": 6.771401882171631, "learning_rate": 4.4073818300806174e-05, "loss": 0.6178, "step": 328000 }, { "epoch": 4.52040450800474, "grad_norm": 11.900725364685059, "learning_rate": 4.406742143690152e-05, "loss": 0.6847, "step": 328100 }, { "epoch": 4.521782260064478, "grad_norm": 5.0748372077941895, "learning_rate": 4.406102316550965e-05, "loss": 0.691, "step": 328200 }, { "epoch": 4.523160012124218, "grad_norm": 4.792394161224365, "learning_rate": 4.4054623487174137e-05, "loss": 0.7566, "step": 328300 }, { "epoch": 4.5245377641839575, "grad_norm": 
4.872078895568848, "learning_rate": 4.404828642024591e-05, "loss": 0.6961, "step": 328400 }, { "epoch": 4.525915516243697, "grad_norm": 7.041717529296875, "learning_rate": 4.404188394371016e-05, "loss": 0.6334, "step": 328500 }, { "epoch": 4.527293268303437, "grad_norm": 9.934708595275879, "learning_rate": 4.403548006185674e-05, "loss": 0.6869, "step": 328600 }, { "epoch": 4.528671020363175, "grad_norm": 7.335161209106445, "learning_rate": 4.402907477522969e-05, "loss": 0.6828, "step": 328700 }, { "epoch": 4.530048772422915, "grad_norm": 10.300129890441895, "learning_rate": 4.4022668084373176e-05, "loss": 0.591, "step": 328800 }, { "epoch": 4.531426524482654, "grad_norm": 4.763584136962891, "learning_rate": 4.4016259989831476e-05, "loss": 0.5627, "step": 328900 }, { "epoch": 4.532804276542393, "grad_norm": 6.448105335235596, "learning_rate": 4.4009850492148995e-05, "loss": 0.6706, "step": 329000 }, { "epoch": 4.534182028602133, "grad_norm": 7.568809986114502, "learning_rate": 4.400343959187026e-05, "loss": 0.7217, "step": 329100 }, { "epoch": 4.535559780661872, "grad_norm": 13.323205947875977, "learning_rate": 4.399702728953989e-05, "loss": 0.6055, "step": 329200 }, { "epoch": 4.536937532721612, "grad_norm": 2.8426015377044678, "learning_rate": 4.399061358570265e-05, "loss": 0.6613, "step": 329300 }, { "epoch": 4.53831528478135, "grad_norm": 2.6047165393829346, "learning_rate": 4.3984198480903436e-05, "loss": 0.6935, "step": 329400 }, { "epoch": 4.53969303684109, "grad_norm": 32.55445098876953, "learning_rate": 4.3977781975687215e-05, "loss": 0.6591, "step": 329500 }, { "epoch": 4.5410707889008295, "grad_norm": 11.835723876953125, "learning_rate": 4.397136407059912e-05, "loss": 0.6166, "step": 329600 }, { "epoch": 4.542448540960569, "grad_norm": 5.599956512451172, "learning_rate": 4.3964944766184374e-05, "loss": 0.607, "step": 329700 }, { "epoch": 4.543826293020308, "grad_norm": 3.1657302379608154, "learning_rate": 4.3958524062988346e-05, "loss": 0.6528, "step": 
329800 }, { "epoch": 4.545204045080047, "grad_norm": 4.893840312957764, "learning_rate": 4.3952101961556496e-05, "loss": 0.6333, "step": 329900 }, { "epoch": 4.546581797139787, "grad_norm": 16.13520622253418, "learning_rate": 4.3945678462434414e-05, "loss": 0.6527, "step": 330000 }, { "epoch": 4.547959549199526, "grad_norm": 3.2625110149383545, "learning_rate": 4.393925356616781e-05, "loss": 0.7119, "step": 330100 }, { "epoch": 4.549337301259265, "grad_norm": 9.866243362426758, "learning_rate": 4.393282727330252e-05, "loss": 0.6564, "step": 330200 }, { "epoch": 4.550715053319005, "grad_norm": 11.611804008483887, "learning_rate": 4.3926399584384474e-05, "loss": 0.63, "step": 330300 }, { "epoch": 4.552092805378744, "grad_norm": 38.45549392700195, "learning_rate": 4.3919970499959745e-05, "loss": 0.5968, "step": 330400 }, { "epoch": 4.553470557438484, "grad_norm": 4.421891689300537, "learning_rate": 4.391354002057453e-05, "loss": 0.6544, "step": 330500 }, { "epoch": 4.554848309498222, "grad_norm": 29.308429718017578, "learning_rate": 4.39071081467751e-05, "loss": 0.6482, "step": 330600 }, { "epoch": 4.556226061557962, "grad_norm": 8.412556648254395, "learning_rate": 4.390067487910791e-05, "loss": 0.6941, "step": 330700 }, { "epoch": 4.5576038136177015, "grad_norm": 5.871030807495117, "learning_rate": 4.389424021811948e-05, "loss": 0.6502, "step": 330800 }, { "epoch": 4.558981565677441, "grad_norm": 12.721481323242188, "learning_rate": 4.388780416435648e-05, "loss": 0.5806, "step": 330900 }, { "epoch": 4.56035931773718, "grad_norm": 5.437663555145264, "learning_rate": 4.3881366718365664e-05, "loss": 0.6978, "step": 331000 }, { "epoch": 4.561737069796919, "grad_norm": 3.4897611141204834, "learning_rate": 4.3874927880693945e-05, "loss": 0.6613, "step": 331100 }, { "epoch": 4.563114821856659, "grad_norm": 9.729260444641113, "learning_rate": 4.386848765188832e-05, "loss": 0.7368, "step": 331200 }, { "epoch": 4.564492573916398, "grad_norm": 4.953622341156006, 
"learning_rate": 4.3862046032495945e-05, "loss": 0.6203, "step": 331300 }, { "epoch": 4.565870325976137, "grad_norm": 11.730154991149902, "learning_rate": 4.385560302306403e-05, "loss": 0.6922, "step": 331400 }, { "epoch": 4.567248078035877, "grad_norm": 13.521852493286133, "learning_rate": 4.384915862413998e-05, "loss": 0.6582, "step": 331500 }, { "epoch": 4.568625830095616, "grad_norm": 5.167218208312988, "learning_rate": 4.384271283627126e-05, "loss": 0.6732, "step": 331600 }, { "epoch": 4.570003582155355, "grad_norm": 3.1702427864074707, "learning_rate": 4.3836265660005474e-05, "loss": 0.7069, "step": 331700 }, { "epoch": 4.571381334215094, "grad_norm": 9.022198677062988, "learning_rate": 4.382981709589034e-05, "loss": 0.6747, "step": 331800 }, { "epoch": 4.572759086274834, "grad_norm": 44.19960403442383, "learning_rate": 4.3823367144473715e-05, "loss": 0.6148, "step": 331900 }, { "epoch": 4.5741368383345735, "grad_norm": 1.9782874584197998, "learning_rate": 4.381691580630353e-05, "loss": 0.652, "step": 332000 }, { "epoch": 4.575514590394313, "grad_norm": 2.4546005725860596, "learning_rate": 4.381046308192787e-05, "loss": 0.596, "step": 332100 }, { "epoch": 4.576892342454052, "grad_norm": 7.0841522216796875, "learning_rate": 4.3804008971894926e-05, "loss": 0.564, "step": 332200 }, { "epoch": 4.578270094513791, "grad_norm": 3.3855671882629395, "learning_rate": 4.3797553476753016e-05, "loss": 0.6121, "step": 332300 }, { "epoch": 4.579647846573531, "grad_norm": 7.465083122253418, "learning_rate": 4.379109659705056e-05, "loss": 0.6452, "step": 332400 }, { "epoch": 4.5810255986332695, "grad_norm": 5.384669303894043, "learning_rate": 4.37846383333361e-05, "loss": 0.6134, "step": 332500 }, { "epoch": 4.582403350693009, "grad_norm": 3.685746192932129, "learning_rate": 4.3778178686158304e-05, "loss": 0.6748, "step": 332600 }, { "epoch": 4.583781102752749, "grad_norm": 17.18223762512207, "learning_rate": 4.3771717656065954e-05, "loss": 0.674, "step": 332700 }, { "epoch": 
4.585158854812488, "grad_norm": 14.064021110534668, "learning_rate": 4.376525524360793e-05, "loss": 0.68, "step": 332800 }, { "epoch": 4.586536606872228, "grad_norm": 5.873450756072998, "learning_rate": 4.3758791449333266e-05, "loss": 0.6449, "step": 332900 }, { "epoch": 4.587914358931966, "grad_norm": 4.995344638824463, "learning_rate": 4.37523262737911e-05, "loss": 0.6322, "step": 333000 }, { "epoch": 4.589292110991706, "grad_norm": 13.373953819274902, "learning_rate": 4.374585971753066e-05, "loss": 0.6165, "step": 333100 }, { "epoch": 4.5906698630514455, "grad_norm": 2.717437505722046, "learning_rate": 4.3739456467295646e-05, "loss": 0.6183, "step": 333200 }, { "epoch": 4.592047615111184, "grad_norm": 13.884819030761719, "learning_rate": 4.373298716504038e-05, "loss": 0.5447, "step": 333300 }, { "epoch": 4.593425367170924, "grad_norm": 14.432552337646484, "learning_rate": 4.37265164837098e-05, "loss": 0.7218, "step": 333400 }, { "epoch": 4.594803119230663, "grad_norm": 7.3644819259643555, "learning_rate": 4.372004442385363e-05, "loss": 0.5659, "step": 333500 }, { "epoch": 4.596180871290403, "grad_norm": 2.4393606185913086, "learning_rate": 4.37136357272192e-05, "loss": 0.6337, "step": 333600 }, { "epoch": 4.5975586233501415, "grad_norm": 4.671186923980713, "learning_rate": 4.3707160925733006e-05, "loss": 0.6657, "step": 333700 }, { "epoch": 4.598936375409881, "grad_norm": 5.4141340255737305, "learning_rate": 4.3700684747365585e-05, "loss": 0.6458, "step": 333800 }, { "epoch": 4.600314127469621, "grad_norm": 20.662593841552734, "learning_rate": 4.36942071926671e-05, "loss": 0.6444, "step": 333900 }, { "epoch": 4.60169187952936, "grad_norm": 7.323440074920654, "learning_rate": 4.368772826218787e-05, "loss": 0.6511, "step": 334000 }, { "epoch": 4.603069631589099, "grad_norm": 3.818000316619873, "learning_rate": 4.368124795647831e-05, "loss": 0.7214, "step": 334100 }, { "epoch": 4.604447383648838, "grad_norm": 13.386191368103027, "learning_rate": 
4.3674766276088964e-05, "loss": 0.692, "step": 334200 }, { "epoch": 4.605825135708578, "grad_norm": 5.456580638885498, "learning_rate": 4.366828322157046e-05, "loss": 0.7098, "step": 334300 }, { "epoch": 4.6072028877683175, "grad_norm": 2.9225990772247314, "learning_rate": 4.366179879347358e-05, "loss": 0.5972, "step": 334400 }, { "epoch": 4.608580639828056, "grad_norm": 3.7065563201904297, "learning_rate": 4.365531299234921e-05, "loss": 0.631, "step": 334500 }, { "epoch": 4.609958391887796, "grad_norm": 6.768597602844238, "learning_rate": 4.364882581874835e-05, "loss": 0.6423, "step": 334600 }, { "epoch": 4.611336143947535, "grad_norm": 4.486959934234619, "learning_rate": 4.364233727322213e-05, "loss": 0.6353, "step": 334700 }, { "epoch": 4.612713896007275, "grad_norm": 10.104267120361328, "learning_rate": 4.3635847356321765e-05, "loss": 0.6059, "step": 334800 }, { "epoch": 4.6140916480670136, "grad_norm": 25.838520050048828, "learning_rate": 4.3629356068598616e-05, "loss": 0.573, "step": 334900 }, { "epoch": 4.615469400126753, "grad_norm": 2.941793441772461, "learning_rate": 4.362286341060415e-05, "loss": 0.7028, "step": 335000 }, { "epoch": 4.616847152186493, "grad_norm": 7.484210968017578, "learning_rate": 4.361636938288997e-05, "loss": 0.6252, "step": 335100 }, { "epoch": 4.618224904246232, "grad_norm": 7.027438640594482, "learning_rate": 4.360987398600774e-05, "loss": 0.6118, "step": 335200 }, { "epoch": 4.619602656305971, "grad_norm": 4.250925064086914, "learning_rate": 4.360337722050931e-05, "loss": 0.6161, "step": 335300 }, { "epoch": 4.6209804083657104, "grad_norm": 32.049842834472656, "learning_rate": 4.359687908694659e-05, "loss": 0.6447, "step": 335400 }, { "epoch": 4.62235816042545, "grad_norm": 5.912929058074951, "learning_rate": 4.3590379585871654e-05, "loss": 0.6468, "step": 335500 }, { "epoch": 4.62373591248519, "grad_norm": 4.3188886642456055, "learning_rate": 4.3583878717836646e-05, "loss": 0.6896, "step": 335600 }, { "epoch": 4.625113664544928, 
"grad_norm": 4.989152908325195, "learning_rate": 4.357737648339386e-05, "loss": 0.6276, "step": 335700 }, { "epoch": 4.626491416604668, "grad_norm": 4.833415508270264, "learning_rate": 4.3570872883095676e-05, "loss": 0.611, "step": 335800 }, { "epoch": 4.627869168664407, "grad_norm": 46.401973724365234, "learning_rate": 4.356436791749464e-05, "loss": 0.5807, "step": 335900 }, { "epoch": 4.629246920724146, "grad_norm": 6.8872175216674805, "learning_rate": 4.355786158714336e-05, "loss": 0.5654, "step": 336000 }, { "epoch": 4.630624672783886, "grad_norm": 36.346012115478516, "learning_rate": 4.355135389259459e-05, "loss": 0.6942, "step": 336100 }, { "epoch": 4.632002424843625, "grad_norm": 52.620880126953125, "learning_rate": 4.354484483440118e-05, "loss": 0.6396, "step": 336200 }, { "epoch": 4.633380176903365, "grad_norm": 3.102942705154419, "learning_rate": 4.3538334413116125e-05, "loss": 0.6247, "step": 336300 }, { "epoch": 4.634757928963104, "grad_norm": 5.044652462005615, "learning_rate": 4.3531822629292505e-05, "loss": 0.6124, "step": 336400 }, { "epoch": 4.636135681022843, "grad_norm": 11.227763175964355, "learning_rate": 4.352530948348354e-05, "loss": 0.6181, "step": 336500 }, { "epoch": 4.6375134330825825, "grad_norm": 7.284191608428955, "learning_rate": 4.3518794976242536e-05, "loss": 0.6899, "step": 336600 }, { "epoch": 4.638891185142322, "grad_norm": 13.224371910095215, "learning_rate": 4.351227910812296e-05, "loss": 0.665, "step": 336700 }, { "epoch": 4.640268937202061, "grad_norm": 6.4130048751831055, "learning_rate": 4.3505761879678355e-05, "loss": 0.5789, "step": 336800 }, { "epoch": 4.6416466892618, "grad_norm": 45.207942962646484, "learning_rate": 4.3499243291462387e-05, "loss": 0.574, "step": 336900 }, { "epoch": 4.64302444132154, "grad_norm": 5.822239875793457, "learning_rate": 4.349272334402885e-05, "loss": 0.6108, "step": 337000 }, { "epoch": 4.644402193381279, "grad_norm": 10.178132057189941, "learning_rate": 4.3486202037931656e-05, "loss": 
0.6054, "step": 337100 }, { "epoch": 4.645779945441019, "grad_norm": 8.700674057006836, "learning_rate": 4.3479679373724806e-05, "loss": 0.6075, "step": 337200 }, { "epoch": 4.647157697500758, "grad_norm": 3.3527495861053467, "learning_rate": 4.347315535196244e-05, "loss": 0.6668, "step": 337300 }, { "epoch": 4.648535449560497, "grad_norm": 5.201707363128662, "learning_rate": 4.346662997319882e-05, "loss": 0.5955, "step": 337400 }, { "epoch": 4.649913201620237, "grad_norm": 4.6474690437316895, "learning_rate": 4.346010323798828e-05, "loss": 0.6044, "step": 337500 }, { "epoch": 4.651290953679975, "grad_norm": 1.3089040517807007, "learning_rate": 4.345357514688533e-05, "loss": 0.5305, "step": 337600 }, { "epoch": 4.652668705739715, "grad_norm": 8.0230712890625, "learning_rate": 4.3447045700444554e-05, "loss": 0.7267, "step": 337700 }, { "epoch": 4.6540464577994545, "grad_norm": 7.888185024261475, "learning_rate": 4.3440514899220656e-05, "loss": 0.6061, "step": 337800 }, { "epoch": 4.655424209859194, "grad_norm": 3.1337671279907227, "learning_rate": 4.343398274376847e-05, "loss": 0.6158, "step": 337900 }, { "epoch": 4.656801961918933, "grad_norm": 4.105374813079834, "learning_rate": 4.3427514576433055e-05, "loss": 0.607, "step": 338000 }, { "epoch": 4.658179713978672, "grad_norm": 138.5779266357422, "learning_rate": 4.3420979727717656e-05, "loss": 0.6822, "step": 338100 }, { "epoch": 4.659557466038412, "grad_norm": 16.21944236755371, "learning_rate": 4.341444352643358e-05, "loss": 0.6598, "step": 338200 }, { "epoch": 4.660935218098151, "grad_norm": 9.633851051330566, "learning_rate": 4.3407905973136104e-05, "loss": 0.678, "step": 338300 }, { "epoch": 4.66231297015789, "grad_norm": 12.092757225036621, "learning_rate": 4.3401367068380635e-05, "loss": 0.7248, "step": 338400 }, { "epoch": 4.66369072221763, "grad_norm": 7.795053482055664, "learning_rate": 4.339482681272268e-05, "loss": 0.6059, "step": 338500 }, { "epoch": 4.665068474277369, "grad_norm": 5.384261608123779, 
"learning_rate": 4.338828520671787e-05, "loss": 0.6675, "step": 338600 }, { "epoch": 4.666446226337109, "grad_norm": 7.8802947998046875, "learning_rate": 4.338174225092195e-05, "loss": 0.6546, "step": 338700 }, { "epoch": 4.667823978396847, "grad_norm": 3.0653738975524902, "learning_rate": 4.3375197945890775e-05, "loss": 0.61, "step": 338800 }, { "epoch": 4.669201730456587, "grad_norm": 12.49382495880127, "learning_rate": 4.336865229218032e-05, "loss": 0.5998, "step": 338900 }, { "epoch": 4.6705794825163265, "grad_norm": 31.43951416015625, "learning_rate": 4.336210529034667e-05, "loss": 0.5845, "step": 339000 }, { "epoch": 4.671957234576066, "grad_norm": 9.029288291931152, "learning_rate": 4.335555694094601e-05, "loss": 0.7507, "step": 339100 }, { "epoch": 4.673334986635805, "grad_norm": 4.097837448120117, "learning_rate": 4.334900724453469e-05, "loss": 0.6854, "step": 339200 }, { "epoch": 4.674712738695544, "grad_norm": 4.363492012023926, "learning_rate": 4.334245620166911e-05, "loss": 0.6521, "step": 339300 }, { "epoch": 4.676090490755284, "grad_norm": 5.626641273498535, "learning_rate": 4.3335903812905835e-05, "loss": 0.6246, "step": 339400 }, { "epoch": 4.677468242815023, "grad_norm": 7.819990634918213, "learning_rate": 4.33293500788015e-05, "loss": 0.639, "step": 339500 }, { "epoch": 4.678845994874762, "grad_norm": 9.912444114685059, "learning_rate": 4.3322794999912916e-05, "loss": 0.6164, "step": 339600 }, { "epoch": 4.680223746934502, "grad_norm": 5.36249303817749, "learning_rate": 4.331623857679693e-05, "loss": 0.666, "step": 339700 }, { "epoch": 4.681601498994241, "grad_norm": 7.043378829956055, "learning_rate": 4.330968081001057e-05, "loss": 0.658, "step": 339800 }, { "epoch": 4.682979251053981, "grad_norm": 81.09571838378906, "learning_rate": 4.330312170011095e-05, "loss": 0.6157, "step": 339900 }, { "epoch": 4.684357003113719, "grad_norm": 4.815480709075928, "learning_rate": 4.3296561247655285e-05, "loss": 0.5948, "step": 340000 }, { "epoch": 
4.685734755173459, "grad_norm": 2.4301512241363525, "learning_rate": 4.3289999453200924e-05, "loss": 0.698, "step": 340100 }, { "epoch": 4.6871125072331985, "grad_norm": 4.584778785705566, "learning_rate": 4.328343631730533e-05, "loss": 0.6204, "step": 340200 }, { "epoch": 4.688490259292937, "grad_norm": 5.054748058319092, "learning_rate": 4.3276871840526074e-05, "loss": 0.5633, "step": 340300 }, { "epoch": 4.689868011352677, "grad_norm": 5.237736225128174, "learning_rate": 4.327030602342085e-05, "loss": 0.6407, "step": 340400 }, { "epoch": 4.691245763412416, "grad_norm": 7.613414764404297, "learning_rate": 4.326380454474619e-05, "loss": 0.5855, "step": 340500 }, { "epoch": 4.692623515472156, "grad_norm": 6.827023029327393, "learning_rate": 4.325723606205186e-05, "loss": 0.6241, "step": 340600 }, { "epoch": 4.694001267531895, "grad_norm": 14.288708686828613, "learning_rate": 4.3250666240699716e-05, "loss": 0.7284, "step": 340700 }, { "epoch": 4.695379019591634, "grad_norm": 2.823258638381958, "learning_rate": 4.324409508124788e-05, "loss": 0.6481, "step": 340800 }, { "epoch": 4.696756771651374, "grad_norm": 4.392832279205322, "learning_rate": 4.323752258425464e-05, "loss": 0.6244, "step": 340900 }, { "epoch": 4.698134523711113, "grad_norm": 4.342037200927734, "learning_rate": 4.323094875027833e-05, "loss": 0.655, "step": 341000 }, { "epoch": 4.699512275770852, "grad_norm": 11.47658634185791, "learning_rate": 4.3224373579877446e-05, "loss": 0.6078, "step": 341100 }, { "epoch": 4.700890027830591, "grad_norm": 2.0359914302825928, "learning_rate": 4.321779707361059e-05, "loss": 0.6016, "step": 341200 }, { "epoch": 4.702267779890331, "grad_norm": 12.72111701965332, "learning_rate": 4.321121923203645e-05, "loss": 0.6964, "step": 341300 }, { "epoch": 4.7036455319500705, "grad_norm": 10.523122787475586, "learning_rate": 4.320464005571386e-05, "loss": 0.6005, "step": 341400 }, { "epoch": 4.70502328400981, "grad_norm": 9.06456184387207, "learning_rate": 
4.3198059545201766e-05, "loss": 0.5746, "step": 341500 }, { "epoch": 4.706401036069549, "grad_norm": 8.78666877746582, "learning_rate": 4.319154352610025e-05, "loss": 0.5869, "step": 341600 }, { "epoch": 4.707778788129288, "grad_norm": 10.096076011657715, "learning_rate": 4.318496036221431e-05, "loss": 0.6518, "step": 341700 }, { "epoch": 4.709156540189028, "grad_norm": 4.437751293182373, "learning_rate": 4.317837586581075e-05, "loss": 0.6404, "step": 341800 }, { "epoch": 4.7105342922487665, "grad_norm": 1.933472990989685, "learning_rate": 4.317179003744895e-05, "loss": 0.6197, "step": 341900 }, { "epoch": 4.711912044308506, "grad_norm": 7.073274612426758, "learning_rate": 4.316520287768841e-05, "loss": 0.6471, "step": 342000 }, { "epoch": 4.713289796368246, "grad_norm": 10.69509506225586, "learning_rate": 4.315861438708874e-05, "loss": 0.6068, "step": 342100 }, { "epoch": 4.714667548427985, "grad_norm": 8.324593544006348, "learning_rate": 4.3152024566209665e-05, "loss": 0.6434, "step": 342200 }, { "epoch": 4.716045300487724, "grad_norm": 10.524890899658203, "learning_rate": 4.3145499333697296e-05, "loss": 0.6683, "step": 342300 }, { "epoch": 4.717423052547463, "grad_norm": 6.639632225036621, "learning_rate": 4.313890686722788e-05, "loss": 0.6266, "step": 342400 }, { "epoch": 4.718800804607203, "grad_norm": 5.75840425491333, "learning_rate": 4.313231307215331e-05, "loss": 0.6337, "step": 342500 }, { "epoch": 4.7201785566669425, "grad_norm": 6.58840274810791, "learning_rate": 4.312571794903378e-05, "loss": 0.6424, "step": 342600 }, { "epoch": 4.721556308726681, "grad_norm": 45.629913330078125, "learning_rate": 4.311912149842956e-05, "loss": 0.7007, "step": 342700 }, { "epoch": 4.722934060786421, "grad_norm": 5.736390113830566, "learning_rate": 4.311252372090107e-05, "loss": 0.6642, "step": 342800 }, { "epoch": 4.72431181284616, "grad_norm": 54.273075103759766, "learning_rate": 4.3105924617008807e-05, "loss": 0.6461, "step": 342900 }, { "epoch": 4.7256895649059, 
"grad_norm": 2.6785812377929688, "learning_rate": 4.3099324187313416e-05, "loss": 0.7001, "step": 343000 }, { "epoch": 4.7270673169656385, "grad_norm": 11.368042945861816, "learning_rate": 4.309272243237563e-05, "loss": 0.6648, "step": 343100 }, { "epoch": 4.728445069025378, "grad_norm": 7.296229362487793, "learning_rate": 4.3086119352756296e-05, "loss": 0.6033, "step": 343200 }, { "epoch": 4.729822821085118, "grad_norm": 4.869019985198975, "learning_rate": 4.3079514949016397e-05, "loss": 0.6019, "step": 343300 }, { "epoch": 4.731200573144857, "grad_norm": 21.18593978881836, "learning_rate": 4.307290922171699e-05, "loss": 0.7382, "step": 343400 }, { "epoch": 4.732578325204596, "grad_norm": 13.06953239440918, "learning_rate": 4.306630217141928e-05, "loss": 0.6229, "step": 343500 }, { "epoch": 4.733956077264335, "grad_norm": 24.202054977416992, "learning_rate": 4.305969379868455e-05, "loss": 0.7006, "step": 343600 }, { "epoch": 4.735333829324075, "grad_norm": 42.04108428955078, "learning_rate": 4.305308410407424e-05, "loss": 0.6311, "step": 343700 }, { "epoch": 4.7367115813838145, "grad_norm": 7.695801734924316, "learning_rate": 4.3046473088149866e-05, "loss": 0.6048, "step": 343800 }, { "epoch": 4.738089333443553, "grad_norm": 14.970787048339844, "learning_rate": 4.303986075147307e-05, "loss": 0.7253, "step": 343900 }, { "epoch": 4.739467085503293, "grad_norm": 7.628849029541016, "learning_rate": 4.303324709460559e-05, "loss": 0.6739, "step": 344000 }, { "epoch": 4.740844837563032, "grad_norm": 5.008774280548096, "learning_rate": 4.302663211810931e-05, "loss": 0.688, "step": 344100 }, { "epoch": 4.742222589622772, "grad_norm": 21.251340866088867, "learning_rate": 4.302001582254619e-05, "loss": 0.5997, "step": 344200 }, { "epoch": 4.7436003416825105, "grad_norm": 4.002758026123047, "learning_rate": 4.301339820847834e-05, "loss": 0.6583, "step": 344300 }, { "epoch": 4.74497809374225, "grad_norm": 7.359635829925537, "learning_rate": 4.300677927646794e-05, "loss": 
0.5987, "step": 344400 }, { "epoch": 4.74635584580199, "grad_norm": 25.74188232421875, "learning_rate": 4.300015902707731e-05, "loss": 0.5773, "step": 344500 }, { "epoch": 4.747733597861728, "grad_norm": 9.259632110595703, "learning_rate": 4.299353746086887e-05, "loss": 0.6032, "step": 344600 }, { "epoch": 4.749111349921468, "grad_norm": 248.90753173828125, "learning_rate": 4.2986914578405154e-05, "loss": 0.6006, "step": 344700 }, { "epoch": 4.750489101981207, "grad_norm": 29.208189010620117, "learning_rate": 4.298029038024883e-05, "loss": 0.7067, "step": 344800 }, { "epoch": 4.751866854040947, "grad_norm": 3.7965328693389893, "learning_rate": 4.2973664866962616e-05, "loss": 0.6119, "step": 344900 }, { "epoch": 4.7532446061006866, "grad_norm": 6.786052227020264, "learning_rate": 4.296703803910942e-05, "loss": 0.6829, "step": 345000 }, { "epoch": 4.754622358160425, "grad_norm": 9.718019485473633, "learning_rate": 4.2960409897252224e-05, "loss": 0.6432, "step": 345100 }, { "epoch": 4.756000110220165, "grad_norm": 15.900662422180176, "learning_rate": 4.2953780441954105e-05, "loss": 0.651, "step": 345200 }, { "epoch": 4.757377862279904, "grad_norm": 5.348118305206299, "learning_rate": 4.2947149673778275e-05, "loss": 0.6101, "step": 345300 }, { "epoch": 4.758755614339643, "grad_norm": 5.125334739685059, "learning_rate": 4.294051759328806e-05, "loss": 0.6619, "step": 345400 }, { "epoch": 4.760133366399383, "grad_norm": 11.220720291137695, "learning_rate": 4.293388420104687e-05, "loss": 0.6458, "step": 345500 }, { "epoch": 4.761511118459122, "grad_norm": 18.100278854370117, "learning_rate": 4.292724949761827e-05, "loss": 0.5987, "step": 345600 }, { "epoch": 4.762888870518862, "grad_norm": 30.092592239379883, "learning_rate": 4.292061348356589e-05, "loss": 0.7135, "step": 345700 }, { "epoch": 4.764266622578601, "grad_norm": 2.859104871749878, "learning_rate": 4.291397615945351e-05, "loss": 0.679, "step": 345800 }, { "epoch": 4.76564437463834, "grad_norm": 
2.6178338527679443, "learning_rate": 4.2907337525844995e-05, "loss": 0.6723, "step": 345900 }, { "epoch": 4.7670221266980795, "grad_norm": 3.2841413021087646, "learning_rate": 4.290069758330433e-05, "loss": 0.6428, "step": 346000 }, { "epoch": 4.768399878757819, "grad_norm": 14.654231071472168, "learning_rate": 4.289405633239563e-05, "loss": 0.6737, "step": 346100 }, { "epoch": 4.769777630817558, "grad_norm": 3.0337975025177, "learning_rate": 4.288741377368307e-05, "loss": 0.5654, "step": 346200 }, { "epoch": 4.771155382877297, "grad_norm": 16.27220916748047, "learning_rate": 4.2880769907731e-05, "loss": 0.5557, "step": 346300 }, { "epoch": 4.772533134937037, "grad_norm": 2.7087109088897705, "learning_rate": 4.287419119329629e-05, "loss": 0.5715, "step": 346400 }, { "epoch": 4.773910886996776, "grad_norm": 4.579300880432129, "learning_rate": 4.28675447276169e-05, "loss": 0.7386, "step": 346500 }, { "epoch": 4.775288639056515, "grad_norm": 7.389023303985596, "learning_rate": 4.2860896956385955e-05, "loss": 0.6614, "step": 346600 }, { "epoch": 4.776666391116255, "grad_norm": 8.012945175170898, "learning_rate": 4.285424788016822e-05, "loss": 0.6886, "step": 346700 }, { "epoch": 4.778044143175994, "grad_norm": 23.281654357910156, "learning_rate": 4.284759749952858e-05, "loss": 0.6703, "step": 346800 }, { "epoch": 4.779421895235734, "grad_norm": 4.183088302612305, "learning_rate": 4.284094581503202e-05, "loss": 0.7064, "step": 346900 }, { "epoch": 4.780799647295472, "grad_norm": 4.177852630615234, "learning_rate": 4.283429282724363e-05, "loss": 0.7442, "step": 347000 }, { "epoch": 4.782177399355212, "grad_norm": 11.401872634887695, "learning_rate": 4.282763853672861e-05, "loss": 0.5458, "step": 347100 }, { "epoch": 4.7835551514149515, "grad_norm": 9.291254043579102, "learning_rate": 4.282098294405227e-05, "loss": 0.7176, "step": 347200 }, { "epoch": 4.784932903474691, "grad_norm": 9.33425521850586, "learning_rate": 4.2814326049780064e-05, "loss": 0.6523, "step": 347300 
}, { "epoch": 4.78631065553443, "grad_norm": 6.8329572677612305, "learning_rate": 4.280766785447751e-05, "loss": 0.6582, "step": 347400 }, { "epoch": 4.787688407594169, "grad_norm": 46.750179290771484, "learning_rate": 4.2801008358710255e-05, "loss": 0.6308, "step": 347500 }, { "epoch": 4.789066159653909, "grad_norm": 4.833880424499512, "learning_rate": 4.2794347563044064e-05, "loss": 0.5835, "step": 347600 }, { "epoch": 4.790443911713648, "grad_norm": 5.339344024658203, "learning_rate": 4.2787685468044795e-05, "loss": 0.7058, "step": 347700 }, { "epoch": 4.791821663773387, "grad_norm": 18.253280639648438, "learning_rate": 4.278102207427844e-05, "loss": 0.6121, "step": 347800 }, { "epoch": 4.793199415833127, "grad_norm": 30.998825073242188, "learning_rate": 4.2774357382311076e-05, "loss": 0.5985, "step": 347900 }, { "epoch": 4.794577167892866, "grad_norm": 6.988937854766846, "learning_rate": 4.276769139270891e-05, "loss": 0.6668, "step": 348000 }, { "epoch": 4.795954919952606, "grad_norm": 7.60188102722168, "learning_rate": 4.2761024106038264e-05, "loss": 0.6605, "step": 348100 }, { "epoch": 4.797332672012344, "grad_norm": 2.9623730182647705, "learning_rate": 4.2754355522865526e-05, "loss": 0.6786, "step": 348200 }, { "epoch": 4.798710424072084, "grad_norm": 3.613161563873291, "learning_rate": 4.274768564375726e-05, "loss": 0.5727, "step": 348300 }, { "epoch": 4.8000881761318235, "grad_norm": 2.2245612144470215, "learning_rate": 4.274101446928009e-05, "loss": 0.6603, "step": 348400 }, { "epoch": 4.801465928191563, "grad_norm": 5.619102478027344, "learning_rate": 4.273434200000077e-05, "loss": 0.6145, "step": 348500 }, { "epoch": 4.802843680251302, "grad_norm": 59.182674407958984, "learning_rate": 4.2727668236486144e-05, "loss": 0.6255, "step": 348600 }, { "epoch": 4.804221432311041, "grad_norm": 4.695216655731201, "learning_rate": 4.2720993179303215e-05, "loss": 0.6451, "step": 348700 }, { "epoch": 4.805599184370781, "grad_norm": 5.404073715209961, "learning_rate": 
4.271431682901903e-05, "loss": 0.5819, "step": 348800 }, { "epoch": 4.8069769364305195, "grad_norm": 8.825237274169922, "learning_rate": 4.270763918620081e-05, "loss": 0.6225, "step": 348900 }, { "epoch": 4.808354688490259, "grad_norm": 1.9902760982513428, "learning_rate": 4.270096025141583e-05, "loss": 0.6198, "step": 349000 }, { "epoch": 4.809732440549999, "grad_norm": 7.778998374938965, "learning_rate": 4.269428002523151e-05, "loss": 0.668, "step": 349100 }, { "epoch": 4.811110192609738, "grad_norm": 6.042855262756348, "learning_rate": 4.268759850821537e-05, "loss": 0.7092, "step": 349200 }, { "epoch": 4.812487944669478, "grad_norm": 8.571781158447266, "learning_rate": 4.2680915700935045e-05, "loss": 0.6834, "step": 349300 }, { "epoch": 4.813865696729216, "grad_norm": 6.674981117248535, "learning_rate": 4.267423160395825e-05, "loss": 0.7157, "step": 349400 }, { "epoch": 4.815243448788956, "grad_norm": 4.643914699554443, "learning_rate": 4.266754621785286e-05, "loss": 0.6347, "step": 349500 }, { "epoch": 4.8166212008486955, "grad_norm": 3.260669708251953, "learning_rate": 4.2660859543186825e-05, "loss": 0.636, "step": 349600 }, { "epoch": 4.817998952908434, "grad_norm": 59.931602478027344, "learning_rate": 4.2654171580528196e-05, "loss": 0.6207, "step": 349700 }, { "epoch": 4.819376704968174, "grad_norm": 7.203336238861084, "learning_rate": 4.264748233044518e-05, "loss": 0.6024, "step": 349800 }, { "epoch": 4.820754457027913, "grad_norm": 10.81579303741455, "learning_rate": 4.264079179350603e-05, "loss": 0.6292, "step": 349900 }, { "epoch": 4.822132209087653, "grad_norm": 9.10781192779541, "learning_rate": 4.2634099970279165e-05, "loss": 0.6698, "step": 350000 }, { "epoch": 4.823509961147392, "grad_norm": 7.323215961456299, "learning_rate": 4.262740686133308e-05, "loss": 0.6258, "step": 350100 }, { "epoch": 4.824887713207131, "grad_norm": 7.165981292724609, "learning_rate": 4.2620712467236394e-05, "loss": 0.6133, "step": 350200 }, { "epoch": 4.826265465266871, 
"grad_norm": 3.829420804977417, "learning_rate": 4.261401678855783e-05, "loss": 0.5866, "step": 350300 }, { "epoch": 4.82764321732661, "grad_norm": 9.75688362121582, "learning_rate": 4.260731982586621e-05, "loss": 0.6533, "step": 350400 }, { "epoch": 4.829020969386349, "grad_norm": 4.121615409851074, "learning_rate": 4.260068856854302e-05, "loss": 0.5646, "step": 350500 }, { "epoch": 4.830398721446088, "grad_norm": 5.265509605407715, "learning_rate": 4.259398905235817e-05, "loss": 0.6644, "step": 350600 }, { "epoch": 4.831776473505828, "grad_norm": 5.581333637237549, "learning_rate": 4.2587288253861736e-05, "loss": 0.6661, "step": 350700 }, { "epoch": 4.8331542255655675, "grad_norm": 6.327726364135742, "learning_rate": 4.258058617362297e-05, "loss": 0.6045, "step": 350800 }, { "epoch": 4.834531977625306, "grad_norm": 3.824824810028076, "learning_rate": 4.257388281221126e-05, "loss": 0.6226, "step": 350900 }, { "epoch": 4.835909729685046, "grad_norm": 2.384413480758667, "learning_rate": 4.2567178170196086e-05, "loss": 0.6302, "step": 351000 }, { "epoch": 4.837287481744785, "grad_norm": 6.625782012939453, "learning_rate": 4.256047224814705e-05, "loss": 0.6129, "step": 351100 }, { "epoch": 4.838665233804525, "grad_norm": 33.81018829345703, "learning_rate": 4.255376504663384e-05, "loss": 0.6418, "step": 351200 }, { "epoch": 4.8400429858642635, "grad_norm": 12.79318618774414, "learning_rate": 4.254712365735901e-05, "loss": 0.6311, "step": 351300 }, { "epoch": 4.841420737924003, "grad_norm": 5.097681045532227, "learning_rate": 4.254041391140744e-05, "loss": 0.6556, "step": 351400 }, { "epoch": 4.842798489983743, "grad_norm": 16.894330978393555, "learning_rate": 4.253370288769575e-05, "loss": 0.652, "step": 351500 }, { "epoch": 4.844176242043482, "grad_norm": 3.3176510334014893, "learning_rate": 4.252699058679409e-05, "loss": 0.554, "step": 351600 }, { "epoch": 4.845553994103221, "grad_norm": 13.652451515197754, "learning_rate": 4.2520277009272704e-05, "loss": 0.6247, 
"step": 351700 }, { "epoch": 4.84693174616296, "grad_norm": 4.763336658477783, "learning_rate": 4.251356215570195e-05, "loss": 0.6832, "step": 351800 }, { "epoch": 4.8483094982227, "grad_norm": 5.352202415466309, "learning_rate": 4.2506846026652275e-05, "loss": 0.6036, "step": 351900 }, { "epoch": 4.8496872502824395, "grad_norm": 3.9824962615966797, "learning_rate": 4.250012862269425e-05, "loss": 0.6436, "step": 352000 }, { "epoch": 4.851065002342178, "grad_norm": 9.790726661682129, "learning_rate": 4.249340994439858e-05, "loss": 0.6255, "step": 352100 }, { "epoch": 4.852442754401918, "grad_norm": 29.39607810974121, "learning_rate": 4.248668999233601e-05, "loss": 0.6255, "step": 352200 }, { "epoch": 4.853820506461657, "grad_norm": 4.3125104904174805, "learning_rate": 4.247996876707747e-05, "loss": 0.6313, "step": 352300 }, { "epoch": 4.855198258521397, "grad_norm": 5.499168872833252, "learning_rate": 4.247324626919392e-05, "loss": 0.5914, "step": 352400 }, { "epoch": 4.8565760105811355, "grad_norm": 4.054204940795898, "learning_rate": 4.246652249925652e-05, "loss": 0.6141, "step": 352500 }, { "epoch": 4.857953762640875, "grad_norm": 2.557570219039917, "learning_rate": 4.2459797457836454e-05, "loss": 0.6582, "step": 352600 }, { "epoch": 4.859331514700615, "grad_norm": 8.356891632080078, "learning_rate": 4.2453071145505064e-05, "loss": 0.6141, "step": 352700 }, { "epoch": 4.860709266760354, "grad_norm": 29.28074836730957, "learning_rate": 4.244634356283378e-05, "loss": 0.569, "step": 352800 }, { "epoch": 4.862087018820093, "grad_norm": 4.220701217651367, "learning_rate": 4.243961471039415e-05, "loss": 0.624, "step": 352900 }, { "epoch": 4.863464770879832, "grad_norm": 2.6034014225006104, "learning_rate": 4.243288458875781e-05, "loss": 0.6679, "step": 353000 }, { "epoch": 4.864842522939572, "grad_norm": 3.912343978881836, "learning_rate": 4.2426153198496535e-05, "loss": 0.6819, "step": 353100 }, { "epoch": 4.866220274999311, "grad_norm": 7.990548610687256, 
"learning_rate": 4.241942054018218e-05, "loss": 0.6395, "step": 353200 }, { "epoch": 4.86759802705905, "grad_norm": 15.660150527954102, "learning_rate": 4.2412686614386725e-05, "loss": 0.6778, "step": 353300 }, { "epoch": 4.86897577911879, "grad_norm": 29.208141326904297, "learning_rate": 4.240595142168226e-05, "loss": 0.6645, "step": 353400 }, { "epoch": 4.870353531178529, "grad_norm": 6.159826755523682, "learning_rate": 4.2399214962640954e-05, "loss": 0.7131, "step": 353500 }, { "epoch": 4.871731283238269, "grad_norm": 5.614286422729492, "learning_rate": 4.239247723783511e-05, "loss": 0.6363, "step": 353600 }, { "epoch": 4.8731090352980075, "grad_norm": 25.2165470123291, "learning_rate": 4.238573824783714e-05, "loss": 0.6855, "step": 353700 }, { "epoch": 4.874486787357747, "grad_norm": 4.608912944793701, "learning_rate": 4.237906540202371e-05, "loss": 0.7223, "step": 353800 }, { "epoch": 4.875864539417487, "grad_norm": 2.840367555618286, "learning_rate": 4.2372323895996754e-05, "loss": 0.5955, "step": 353900 }, { "epoch": 4.877242291477225, "grad_norm": 9.986611366271973, "learning_rate": 4.23655811264898e-05, "loss": 0.6123, "step": 354000 }, { "epoch": 4.878620043536965, "grad_norm": 9.702980995178223, "learning_rate": 4.2358837094075666e-05, "loss": 0.653, "step": 354100 }, { "epoch": 4.879997795596704, "grad_norm": 5.322096824645996, "learning_rate": 4.235209179932732e-05, "loss": 0.6583, "step": 354200 }, { "epoch": 4.881375547656444, "grad_norm": 3.6570732593536377, "learning_rate": 4.234534524281778e-05, "loss": 0.5954, "step": 354300 }, { "epoch": 4.8827532997161835, "grad_norm": 19.012914657592773, "learning_rate": 4.233859742512022e-05, "loss": 0.6734, "step": 354400 }, { "epoch": 4.884131051775922, "grad_norm": 7.837798595428467, "learning_rate": 4.2331848346807894e-05, "loss": 0.6474, "step": 354500 }, { "epoch": 4.885508803835662, "grad_norm": 2.2519407272338867, "learning_rate": 4.232509800845417e-05, "loss": 0.6591, "step": 354600 }, { "epoch": 
4.886886555895401, "grad_norm": 7.41689395904541, "learning_rate": 4.231834641063253e-05, "loss": 0.6365, "step": 354700 }, { "epoch": 4.88826430795514, "grad_norm": 11.385661125183105, "learning_rate": 4.231159355391655e-05, "loss": 0.6042, "step": 354800 }, { "epoch": 4.8896420600148796, "grad_norm": 11.584607124328613, "learning_rate": 4.230483943887991e-05, "loss": 0.6433, "step": 354900 }, { "epoch": 4.891019812074619, "grad_norm": 5.1976189613342285, "learning_rate": 4.229808406609644e-05, "loss": 0.7044, "step": 355000 }, { "epoch": 4.892397564134359, "grad_norm": 7.088685035705566, "learning_rate": 4.229132743614e-05, "loss": 0.5413, "step": 355100 }, { "epoch": 4.893775316194097, "grad_norm": 288.20904541015625, "learning_rate": 4.2284569549584636e-05, "loss": 0.6139, "step": 355200 }, { "epoch": 4.895153068253837, "grad_norm": 4.635827541351318, "learning_rate": 4.227781040700445e-05, "loss": 0.7333, "step": 355300 }, { "epoch": 4.8965308203135764, "grad_norm": 15.920924186706543, "learning_rate": 4.227105000897367e-05, "loss": 0.5883, "step": 355400 }, { "epoch": 4.897908572373316, "grad_norm": 6.5138421058654785, "learning_rate": 4.226428835606662e-05, "loss": 0.6135, "step": 355500 }, { "epoch": 4.899286324433055, "grad_norm": 4.578680038452148, "learning_rate": 4.2257525448857736e-05, "loss": 0.6835, "step": 355600 }, { "epoch": 4.900664076492794, "grad_norm": 10.249539375305176, "learning_rate": 4.225076128792157e-05, "loss": 0.7169, "step": 355700 }, { "epoch": 4.902041828552534, "grad_norm": 1.6489287614822388, "learning_rate": 4.224399587383277e-05, "loss": 0.6193, "step": 355800 }, { "epoch": 4.903419580612273, "grad_norm": 6.4056291580200195, "learning_rate": 4.223722920716609e-05, "loss": 0.6761, "step": 355900 }, { "epoch": 4.904797332672012, "grad_norm": 5.431517124176025, "learning_rate": 4.223046128849639e-05, "loss": 0.6166, "step": 356000 }, { "epoch": 4.906175084731752, "grad_norm": 5.034150123596191, "learning_rate": 
4.222369211839865e-05, "loss": 0.6374, "step": 356100 }, { "epoch": 4.907552836791491, "grad_norm": 4.222480297088623, "learning_rate": 4.2216921697447935e-05, "loss": 0.6185, "step": 356200 }, { "epoch": 4.908930588851231, "grad_norm": 8.255706787109375, "learning_rate": 4.22102177491187e-05, "loss": 0.62, "step": 356300 }, { "epoch": 4.910308340910969, "grad_norm": 13.287421226501465, "learning_rate": 4.220344484068188e-05, "loss": 0.617, "step": 356400 }, { "epoch": 4.911686092970709, "grad_norm": 8.075258255004883, "learning_rate": 4.219667068311218e-05, "loss": 0.6516, "step": 356500 }, { "epoch": 4.9130638450304485, "grad_norm": 4.107531547546387, "learning_rate": 4.218989527698513e-05, "loss": 0.6652, "step": 356600 }, { "epoch": 4.914441597090188, "grad_norm": 8.637262344360352, "learning_rate": 4.2183118622876325e-05, "loss": 0.5962, "step": 356700 }, { "epoch": 4.915819349149927, "grad_norm": 18.10711669921875, "learning_rate": 4.217634072136146e-05, "loss": 0.5902, "step": 356800 }, { "epoch": 4.917197101209666, "grad_norm": 3.852318048477173, "learning_rate": 4.2169629370669745e-05, "loss": 0.6784, "step": 356900 }, { "epoch": 4.918574853269406, "grad_norm": 2.2979257106781006, "learning_rate": 4.216284898853005e-05, "loss": 0.5952, "step": 357000 }, { "epoch": 4.919952605329145, "grad_norm": 11.620049476623535, "learning_rate": 4.215606736070633e-05, "loss": 0.616, "step": 357100 }, { "epoch": 4.921330357388884, "grad_norm": 5.2932515144348145, "learning_rate": 4.214928448777469e-05, "loss": 0.6233, "step": 357200 }, { "epoch": 4.922708109448624, "grad_norm": 5.9200849533081055, "learning_rate": 4.21425003703114e-05, "loss": 0.6537, "step": 357300 }, { "epoch": 4.924085861508363, "grad_norm": 13.241439819335938, "learning_rate": 4.2135715008892773e-05, "loss": 0.7043, "step": 357400 }, { "epoch": 4.925463613568102, "grad_norm": 24.02862548828125, "learning_rate": 4.21289284040953e-05, "loss": 0.541, "step": 357500 }, { "epoch": 4.926841365627841, 
"grad_norm": 2.1290619373321533, "learning_rate": 4.212214055649551e-05, "loss": 0.5816, "step": 357600 }, { "epoch": 4.928219117687581, "grad_norm": 5.67216682434082, "learning_rate": 4.211535146667008e-05, "loss": 0.6459, "step": 357700 }, { "epoch": 4.9295968697473205, "grad_norm": 33.091854095458984, "learning_rate": 4.210856113519577e-05, "loss": 0.6935, "step": 357800 }, { "epoch": 4.93097462180706, "grad_norm": 6.914583206176758, "learning_rate": 4.210176956264945e-05, "loss": 0.6591, "step": 357900 }, { "epoch": 4.932352373866799, "grad_norm": 17.032636642456055, "learning_rate": 4.2094976749608096e-05, "loss": 0.6472, "step": 358000 }, { "epoch": 4.933730125926538, "grad_norm": 2.5963149070739746, "learning_rate": 4.208818269664881e-05, "loss": 0.6091, "step": 358100 }, { "epoch": 4.935107877986278, "grad_norm": 9.638187408447266, "learning_rate": 4.2081387404348766e-05, "loss": 0.656, "step": 358200 }, { "epoch": 4.9364856300460165, "grad_norm": 12.491456031799316, "learning_rate": 4.2074590873285274e-05, "loss": 0.6281, "step": 358300 }, { "epoch": 4.937863382105756, "grad_norm": 3.607346534729004, "learning_rate": 4.206779310403572e-05, "loss": 0.686, "step": 358400 }, { "epoch": 4.939241134165496, "grad_norm": 23.04509735107422, "learning_rate": 4.20609940971776e-05, "loss": 0.6462, "step": 358500 }, { "epoch": 4.940618886225235, "grad_norm": 10.33632755279541, "learning_rate": 4.2054193853288566e-05, "loss": 0.6242, "step": 358600 }, { "epoch": 4.941996638284975, "grad_norm": 2.7325823307037354, "learning_rate": 4.2047392372946274e-05, "loss": 0.6599, "step": 358700 }, { "epoch": 4.943374390344713, "grad_norm": 4.85376501083374, "learning_rate": 4.204058965672859e-05, "loss": 0.6081, "step": 358800 }, { "epoch": 4.944752142404453, "grad_norm": 58.063602447509766, "learning_rate": 4.203378570521344e-05, "loss": 0.5858, "step": 358900 }, { "epoch": 4.9461298944641925, "grad_norm": 23.585988998413086, "learning_rate": 4.2026980518978816e-05, "loss": 
0.5981, "step": 359000 }, { "epoch": 4.947507646523931, "grad_norm": 7.902803897857666, "learning_rate": 4.202017409860289e-05, "loss": 0.7191, "step": 359100 }, { "epoch": 4.948885398583671, "grad_norm": 5.040465831756592, "learning_rate": 4.2013366444663885e-05, "loss": 0.6359, "step": 359200 }, { "epoch": 4.95026315064341, "grad_norm": 21.595529556274414, "learning_rate": 4.2006557557740155e-05, "loss": 0.6671, "step": 359300 }, { "epoch": 4.95164090270315, "grad_norm": 4.803142547607422, "learning_rate": 4.199974743841015e-05, "loss": 0.6007, "step": 359400 }, { "epoch": 4.9530186547628885, "grad_norm": 6.893428325653076, "learning_rate": 4.199293608725241e-05, "loss": 0.5709, "step": 359500 }, { "epoch": 4.954396406822628, "grad_norm": 7.387838363647461, "learning_rate": 4.1986123504845606e-05, "loss": 0.537, "step": 359600 }, { "epoch": 4.955774158882368, "grad_norm": 6.249050617218018, "learning_rate": 4.197930969176849e-05, "loss": 0.5397, "step": 359700 }, { "epoch": 4.957151910942107, "grad_norm": 18.658788681030273, "learning_rate": 4.1972494648599964e-05, "loss": 0.6408, "step": 359800 }, { "epoch": 4.958529663001846, "grad_norm": 2.8422133922576904, "learning_rate": 4.196567837591896e-05, "loss": 0.7516, "step": 359900 }, { "epoch": 4.959907415061585, "grad_norm": 4.506580829620361, "learning_rate": 4.1958860874304575e-05, "loss": 0.6432, "step": 360000 }, { "epoch": 4.961285167121325, "grad_norm": 5.355544567108154, "learning_rate": 4.195204214433599e-05, "loss": 0.6113, "step": 360100 }, { "epoch": 4.9626629191810645, "grad_norm": 4.7043776512146, "learning_rate": 4.194522218659249e-05, "loss": 0.5758, "step": 360200 }, { "epoch": 4.964040671240803, "grad_norm": 6.156986713409424, "learning_rate": 4.193840100165345e-05, "loss": 0.5164, "step": 360300 }, { "epoch": 4.965418423300543, "grad_norm": 52.45740509033203, "learning_rate": 4.1931578590098395e-05, "loss": 0.6351, "step": 360400 }, { "epoch": 4.966796175360282, "grad_norm": 3.1738970279693604, 
"learning_rate": 4.19247549525069e-05, "loss": 0.6273, "step": 360500 }, { "epoch": 4.968173927420022, "grad_norm": 16.057491302490234, "learning_rate": 4.191793008945868e-05, "loss": 0.6193, "step": 360600 }, { "epoch": 4.9695516794797605, "grad_norm": 4.419029235839844, "learning_rate": 4.191110400153353e-05, "loss": 0.6523, "step": 360700 }, { "epoch": 4.9709294315395, "grad_norm": 37.11264419555664, "learning_rate": 4.190427668931138e-05, "loss": 0.6745, "step": 360800 }, { "epoch": 4.97230718359924, "grad_norm": 5.937665939331055, "learning_rate": 4.189744815337223e-05, "loss": 0.6945, "step": 360900 }, { "epoch": 4.973684935658979, "grad_norm": 29.668067932128906, "learning_rate": 4.1890618394296186e-05, "loss": 0.5732, "step": 361000 }, { "epoch": 4.975062687718718, "grad_norm": 3.8287365436553955, "learning_rate": 4.1883787412663506e-05, "loss": 0.7104, "step": 361100 }, { "epoch": 4.976440439778457, "grad_norm": 6.143915176391602, "learning_rate": 4.18769552090545e-05, "loss": 0.6377, "step": 361200 }, { "epoch": 4.977818191838197, "grad_norm": 16.557329177856445, "learning_rate": 4.187012178404958e-05, "loss": 0.6204, "step": 361300 }, { "epoch": 4.9791959438979365, "grad_norm": 2.3030545711517334, "learning_rate": 4.186328713822931e-05, "loss": 0.6939, "step": 361400 }, { "epoch": 4.980573695957675, "grad_norm": 2.297210216522217, "learning_rate": 4.185645127217431e-05, "loss": 0.6302, "step": 361500 }, { "epoch": 4.981951448017415, "grad_norm": 7.1969194412231445, "learning_rate": 4.1849614186465337e-05, "loss": 0.6338, "step": 361600 }, { "epoch": 4.983329200077154, "grad_norm": 2.900331735610962, "learning_rate": 4.184277588168321e-05, "loss": 0.5577, "step": 361700 }, { "epoch": 4.984706952136893, "grad_norm": 3.4706006050109863, "learning_rate": 4.18359363584089e-05, "loss": 0.611, "step": 361800 }, { "epoch": 4.9860847041966325, "grad_norm": 8.389727592468262, "learning_rate": 4.182909561722347e-05, "loss": 0.611, "step": 361900 }, { "epoch": 
4.987462456256372, "grad_norm": 17.859580993652344, "learning_rate": 4.1822253658708045e-05, "loss": 0.5764, "step": 362000 }, { "epoch": 4.988840208316112, "grad_norm": 5.879579544067383, "learning_rate": 4.1815410483443906e-05, "loss": 0.6728, "step": 362100 }, { "epoch": 4.990217960375851, "grad_norm": 17.218673706054688, "learning_rate": 4.180856609201241e-05, "loss": 0.586, "step": 362200 }, { "epoch": 4.99159571243559, "grad_norm": 3.2508254051208496, "learning_rate": 4.180172048499501e-05, "loss": 0.6029, "step": 362300 }, { "epoch": 4.992973464495329, "grad_norm": 10.402902603149414, "learning_rate": 4.179487366297329e-05, "loss": 0.672, "step": 362400 }, { "epoch": 4.994351216555069, "grad_norm": 1.7490906715393066, "learning_rate": 4.1788025626528935e-05, "loss": 0.5079, "step": 362500 }, { "epoch": 4.995728968614808, "grad_norm": 4.2720794677734375, "learning_rate": 4.17811763762437e-05, "loss": 0.7242, "step": 362600 }, { "epoch": 4.997106720674547, "grad_norm": 12.797200202941895, "learning_rate": 4.177432591269947e-05, "loss": 0.6602, "step": 362700 }, { "epoch": 4.998484472734287, "grad_norm": 4.103048324584961, "learning_rate": 4.176747423647822e-05, "loss": 0.6779, "step": 362800 }, { "epoch": 4.999862224794026, "grad_norm": 5.677615642547607, "learning_rate": 4.176062134816204e-05, "loss": 0.5889, "step": 362900 }, { "epoch": 5.001239976853765, "grad_norm": 11.1349458694458, "learning_rate": 4.1753767248333136e-05, "loss": 0.5987, "step": 363000 }, { "epoch": 5.0026177289135045, "grad_norm": 28.439786911010742, "learning_rate": 4.1746911937573764e-05, "loss": 0.6101, "step": 363100 }, { "epoch": 5.003995480973244, "grad_norm": 22.641624450683594, "learning_rate": 4.174005541646635e-05, "loss": 0.5408, "step": 363200 }, { "epoch": 5.005373233032984, "grad_norm": 11.364294052124023, "learning_rate": 4.173319768559337e-05, "loss": 0.575, "step": 363300 }, { "epoch": 5.006750985092722, "grad_norm": 2.4644153118133545, "learning_rate": 
4.172633874553744e-05, "loss": 0.5785, "step": 363400 }, { "epoch": 5.008128737152462, "grad_norm": 7.052544116973877, "learning_rate": 4.1719478596881234e-05, "loss": 0.6256, "step": 363500 }, { "epoch": 5.009506489212201, "grad_norm": 2.3804948329925537, "learning_rate": 4.1712617240207584e-05, "loss": 0.5669, "step": 363600 }, { "epoch": 5.010884241271941, "grad_norm": 6.756119251251221, "learning_rate": 4.170575467609939e-05, "loss": 0.6099, "step": 363700 }, { "epoch": 5.01226199333168, "grad_norm": 1.881215214729309, "learning_rate": 4.1698890905139666e-05, "loss": 0.5922, "step": 363800 }, { "epoch": 5.013639745391419, "grad_norm": 38.156002044677734, "learning_rate": 4.16920945836529e-05, "loss": 0.6207, "step": 363900 }, { "epoch": 5.015017497451159, "grad_norm": 3.037376880645752, "learning_rate": 4.16852284127935e-05, "loss": 0.6101, "step": 364000 }, { "epoch": 5.016395249510898, "grad_norm": 5.417888164520264, "learning_rate": 4.1678361036826374e-05, "loss": 0.6517, "step": 364100 }, { "epoch": 5.017773001570637, "grad_norm": 82.02448272705078, "learning_rate": 4.167149245633494e-05, "loss": 0.5222, "step": 364200 }, { "epoch": 5.0191507536303765, "grad_norm": 6.891456127166748, "learning_rate": 4.1664622671902734e-05, "loss": 0.6018, "step": 364300 }, { "epoch": 5.020528505690116, "grad_norm": 17.33257293701172, "learning_rate": 4.165775168411336e-05, "loss": 0.57, "step": 364400 }, { "epoch": 5.021906257749856, "grad_norm": 12.862934112548828, "learning_rate": 4.165087949355055e-05, "loss": 0.6209, "step": 364500 }, { "epoch": 5.023284009809594, "grad_norm": 8.050983428955078, "learning_rate": 4.164400610079812e-05, "loss": 0.5868, "step": 364600 }, { "epoch": 5.024661761869334, "grad_norm": 10.230002403259277, "learning_rate": 4.163713150644002e-05, "loss": 0.5999, "step": 364700 }, { "epoch": 5.026039513929073, "grad_norm": 5.717103004455566, "learning_rate": 4.163025571106028e-05, "loss": 0.5354, "step": 364800 }, { "epoch": 5.027417265988813, 
"grad_norm": 4.582232475280762, "learning_rate": 4.162337871524302e-05, "loss": 0.6353, "step": 364900 }, { "epoch": 5.028795018048552, "grad_norm": 10.818204879760742, "learning_rate": 4.161650051957249e-05, "loss": 0.604, "step": 365000 }, { "epoch": 5.030172770108291, "grad_norm": 3.3800549507141113, "learning_rate": 4.160962112463302e-05, "loss": 0.5648, "step": 365100 }, { "epoch": 5.031550522168031, "grad_norm": 8.884862899780273, "learning_rate": 4.160274053100904e-05, "loss": 0.5043, "step": 365200 }, { "epoch": 5.03292827422777, "grad_norm": 7.920313835144043, "learning_rate": 4.159585873928511e-05, "loss": 0.5536, "step": 365300 }, { "epoch": 5.034306026287509, "grad_norm": 4.136593341827393, "learning_rate": 4.158897575004587e-05, "loss": 0.5865, "step": 365400 }, { "epoch": 5.035683778347249, "grad_norm": 5.959545612335205, "learning_rate": 4.1582091563876045e-05, "loss": 0.5471, "step": 365500 }, { "epoch": 5.037061530406988, "grad_norm": 6.247141361236572, "learning_rate": 4.157520618136052e-05, "loss": 0.6011, "step": 365600 }, { "epoch": 5.038439282466728, "grad_norm": 2.8242380619049072, "learning_rate": 4.156831960308421e-05, "loss": 0.5757, "step": 365700 }, { "epoch": 5.039817034526466, "grad_norm": 2.302687168121338, "learning_rate": 4.1561431829632176e-05, "loss": 0.6191, "step": 365800 }, { "epoch": 5.041194786586206, "grad_norm": 10.013916015625, "learning_rate": 4.155454286158957e-05, "loss": 0.5053, "step": 365900 }, { "epoch": 5.0425725386459455, "grad_norm": 2.516279697418213, "learning_rate": 4.154765269954165e-05, "loss": 0.7116, "step": 366000 }, { "epoch": 5.043950290705685, "grad_norm": 5.0328288078308105, "learning_rate": 4.1540761344073776e-05, "loss": 0.5504, "step": 366100 }, { "epoch": 5.045328042765424, "grad_norm": 1.6958752870559692, "learning_rate": 4.153386879577138e-05, "loss": 0.5612, "step": 366200 }, { "epoch": 5.046705794825163, "grad_norm": 9.392889022827148, "learning_rate": 4.152697505522003e-05, "loss": 0.5702, 
"step": 366300 }, { "epoch": 5.048083546884903, "grad_norm": 5.647604465484619, "learning_rate": 4.1520080123005394e-05, "loss": 0.6137, "step": 366400 }, { "epoch": 5.049461298944642, "grad_norm": 18.64179039001465, "learning_rate": 4.151318399971322e-05, "loss": 0.6157, "step": 366500 }, { "epoch": 5.050839051004381, "grad_norm": 57.31591796875, "learning_rate": 4.150635566495823e-05, "loss": 0.552, "step": 366600 }, { "epoch": 5.052216803064121, "grad_norm": 4.111088275909424, "learning_rate": 4.1499457173164834e-05, "loss": 0.5856, "step": 366700 }, { "epoch": 5.05359455512386, "grad_norm": 2.2314295768737793, "learning_rate": 4.1492626494742364e-05, "loss": 0.5392, "step": 366800 }, { "epoch": 5.054972307183599, "grad_norm": 4.245868682861328, "learning_rate": 4.148572563676861e-05, "loss": 0.5763, "step": 366900 }, { "epoch": 5.056350059243338, "grad_norm": 14.007862091064453, "learning_rate": 4.14788235906359e-05, "loss": 0.5775, "step": 367000 }, { "epoch": 5.057727811303078, "grad_norm": 8.79376220703125, "learning_rate": 4.147192035693063e-05, "loss": 0.5646, "step": 367100 }, { "epoch": 5.0591055633628175, "grad_norm": 4.9509477615356445, "learning_rate": 4.1465015936239236e-05, "loss": 0.6033, "step": 367200 }, { "epoch": 5.060483315422556, "grad_norm": 22.317596435546875, "learning_rate": 4.145811032914829e-05, "loss": 0.6139, "step": 367300 }, { "epoch": 5.061861067482296, "grad_norm": 3.3228743076324463, "learning_rate": 4.145120353624448e-05, "loss": 0.6206, "step": 367400 }, { "epoch": 5.063238819542035, "grad_norm": 1.2921619415283203, "learning_rate": 4.1444295558114534e-05, "loss": 0.4629, "step": 367500 }, { "epoch": 5.064616571601775, "grad_norm": 11.990192413330078, "learning_rate": 4.143738639534535e-05, "loss": 0.5826, "step": 367600 }, { "epoch": 5.0659943236615135, "grad_norm": 1.4604682922363281, "learning_rate": 4.14304760485239e-05, "loss": 0.6262, "step": 367700 }, { "epoch": 5.067372075721253, "grad_norm": 13.105561256408691, 
"learning_rate": 4.142356451823723e-05, "loss": 0.5814, "step": 367800 }, { "epoch": 5.068749827780993, "grad_norm": 4.268446445465088, "learning_rate": 4.141665180507252e-05, "loss": 0.5768, "step": 367900 }, { "epoch": 5.070127579840732, "grad_norm": 1.995708703994751, "learning_rate": 4.1409737909617046e-05, "loss": 0.5649, "step": 368000 }, { "epoch": 5.071505331900471, "grad_norm": 9.153339385986328, "learning_rate": 4.1402822832458176e-05, "loss": 0.5351, "step": 368100 }, { "epoch": 5.07288308396021, "grad_norm": 2.0645530223846436, "learning_rate": 4.1395906574183385e-05, "loss": 0.645, "step": 368200 }, { "epoch": 5.07426083601995, "grad_norm": 3.429266929626465, "learning_rate": 4.138898913538023e-05, "loss": 0.4891, "step": 368300 }, { "epoch": 5.0756385880796895, "grad_norm": 11.953442573547363, "learning_rate": 4.138207051663639e-05, "loss": 0.6215, "step": 368400 }, { "epoch": 5.077016340139428, "grad_norm": 3.2423934936523438, "learning_rate": 4.137515071853965e-05, "loss": 0.5465, "step": 368500 }, { "epoch": 5.078394092199168, "grad_norm": 1.6827054023742676, "learning_rate": 4.1368229741677865e-05, "loss": 0.5334, "step": 368600 }, { "epoch": 5.079771844258907, "grad_norm": 10.879039764404297, "learning_rate": 4.136130758663901e-05, "loss": 0.6364, "step": 368700 }, { "epoch": 5.081149596318647, "grad_norm": 2.1504204273223877, "learning_rate": 4.135438425401117e-05, "loss": 0.6347, "step": 368800 }, { "epoch": 5.0825273483783855, "grad_norm": 7.6515913009643555, "learning_rate": 4.1347459744382506e-05, "loss": 0.5976, "step": 368900 }, { "epoch": 5.083905100438125, "grad_norm": 21.53762435913086, "learning_rate": 4.134053405834129e-05, "loss": 0.5814, "step": 369000 }, { "epoch": 5.085282852497865, "grad_norm": 3.4068212509155273, "learning_rate": 4.1333676470912956e-05, "loss": 0.6395, "step": 369100 }, { "epoch": 5.086660604557604, "grad_norm": 2.371778726577759, "learning_rate": 4.13267484455613e-05, "loss": 0.4973, "step": 369200 }, { 
"epoch": 5.088038356617343, "grad_norm": 6.421906471252441, "learning_rate": 4.131981924555665e-05, "loss": 0.5746, "step": 369300 }, { "epoch": 5.089416108677082, "grad_norm": 2.853595733642578, "learning_rate": 4.1312888871487625e-05, "loss": 0.6155, "step": 369400 }, { "epoch": 5.090793860736822, "grad_norm": 3.2667086124420166, "learning_rate": 4.1305957323943044e-05, "loss": 0.62, "step": 369500 }, { "epoch": 5.0921716127965615, "grad_norm": 2.0608744621276855, "learning_rate": 4.129902460351175e-05, "loss": 0.5823, "step": 369600 }, { "epoch": 5.0935493648563, "grad_norm": 2.729482412338257, "learning_rate": 4.129209071078272e-05, "loss": 0.5176, "step": 369700 }, { "epoch": 5.09492711691604, "grad_norm": 8.697037696838379, "learning_rate": 4.1285155646345024e-05, "loss": 0.6126, "step": 369800 }, { "epoch": 5.096304868975779, "grad_norm": 21.503162384033203, "learning_rate": 4.127821941078783e-05, "loss": 0.641, "step": 369900 }, { "epoch": 5.097682621035519, "grad_norm": 5.574917793273926, "learning_rate": 4.127128200470041e-05, "loss": 0.6696, "step": 370000 }, { "epoch": 5.0990603730952575, "grad_norm": 18.982444763183594, "learning_rate": 4.1264343428672134e-05, "loss": 0.7233, "step": 370100 }, { "epoch": 5.100438125154997, "grad_norm": 2.718024253845215, "learning_rate": 4.125740368329246e-05, "loss": 0.6405, "step": 370200 }, { "epoch": 5.101815877214737, "grad_norm": 4.670779705047607, "learning_rate": 4.125046276915097e-05, "loss": 0.6026, "step": 370300 }, { "epoch": 5.103193629274476, "grad_norm": 8.902013778686523, "learning_rate": 4.124352068683731e-05, "loss": 0.5597, "step": 370400 }, { "epoch": 5.104571381334215, "grad_norm": 22.331748962402344, "learning_rate": 4.123657743694126e-05, "loss": 0.6467, "step": 370500 }, { "epoch": 5.105949133393954, "grad_norm": 10.051376342773438, "learning_rate": 4.1229633020052684e-05, "loss": 0.6235, "step": 370600 }, { "epoch": 5.107326885453694, "grad_norm": 6.93824577331543, "learning_rate": 
4.1222756898366214e-05, "loss": 0.6308, "step": 370700 }, { "epoch": 5.1087046375134335, "grad_norm": 11.9744291305542, "learning_rate": 4.121581016091777e-05, "loss": 0.6979, "step": 370800 }, { "epoch": 5.110082389573172, "grad_norm": 33.28582763671875, "learning_rate": 4.120886225824108e-05, "loss": 0.5997, "step": 370900 }, { "epoch": 5.111460141632912, "grad_norm": 4.222032070159912, "learning_rate": 4.1201913190926415e-05, "loss": 0.6096, "step": 371000 }, { "epoch": 5.112837893692651, "grad_norm": 14.958670616149902, "learning_rate": 4.119496295956412e-05, "loss": 0.5833, "step": 371100 }, { "epoch": 5.11421564575239, "grad_norm": 2.6425986289978027, "learning_rate": 4.118801156474466e-05, "loss": 0.5559, "step": 371200 }, { "epoch": 5.1155933978121295, "grad_norm": 126.07704162597656, "learning_rate": 4.1181059007058596e-05, "loss": 0.5604, "step": 371300 }, { "epoch": 5.116971149871869, "grad_norm": 6.945819854736328, "learning_rate": 4.1174105287096576e-05, "loss": 0.615, "step": 371400 }, { "epoch": 5.118348901931609, "grad_norm": 3.546926736831665, "learning_rate": 4.1167150405449353e-05, "loss": 0.6426, "step": 371500 }, { "epoch": 5.119726653991347, "grad_norm": 11.858908653259277, "learning_rate": 4.116019436270778e-05, "loss": 0.6276, "step": 371600 }, { "epoch": 5.121104406051087, "grad_norm": 5.070420265197754, "learning_rate": 4.115323715946281e-05, "loss": 0.6204, "step": 371700 }, { "epoch": 5.122482158110826, "grad_norm": 4.287106990814209, "learning_rate": 4.1146278796305495e-05, "loss": 0.5996, "step": 371800 }, { "epoch": 5.123859910170566, "grad_norm": 4.991305828094482, "learning_rate": 4.113931927382699e-05, "loss": 0.6209, "step": 371900 }, { "epoch": 5.125237662230305, "grad_norm": 4.611647605895996, "learning_rate": 4.113242820516437e-05, "loss": 0.6224, "step": 372000 }, { "epoch": 5.126615414290044, "grad_norm": 7.752304553985596, "learning_rate": 4.112546637739578e-05, "loss": 0.6273, "step": 372100 }, { "epoch": 5.127993166349784, 
"grad_norm": 15.775985717773438, "learning_rate": 4.1118503392074086e-05, "loss": 0.6353, "step": 372200 }, { "epoch": 5.129370918409523, "grad_norm": 6.6105241775512695, "learning_rate": 4.111160889693872e-05, "loss": 0.6562, "step": 372300 }, { "epoch": 5.130748670469262, "grad_norm": 2.3871562480926514, "learning_rate": 4.110464360984637e-05, "loss": 0.5681, "step": 372400 }, { "epoch": 5.1321264225290015, "grad_norm": 1.8231264352798462, "learning_rate": 4.1097677166969955e-05, "loss": 0.5977, "step": 372500 }, { "epoch": 5.133504174588741, "grad_norm": 5.6286115646362305, "learning_rate": 4.109070956890129e-05, "loss": 0.5465, "step": 372600 }, { "epoch": 5.134881926648481, "grad_norm": 9.957012176513672, "learning_rate": 4.1083740816232326e-05, "loss": 0.5163, "step": 372700 }, { "epoch": 5.136259678708219, "grad_norm": 9.75981616973877, "learning_rate": 4.107677090955508e-05, "loss": 0.5852, "step": 372800 }, { "epoch": 5.137637430767959, "grad_norm": 7.914633274078369, "learning_rate": 4.106979984946169e-05, "loss": 0.6376, "step": 372900 }, { "epoch": 5.139015182827698, "grad_norm": 3.5851895809173584, "learning_rate": 4.106282763654438e-05, "loss": 0.5709, "step": 373000 }, { "epoch": 5.140392934887438, "grad_norm": 7.711193084716797, "learning_rate": 4.105585427139547e-05, "loss": 0.6295, "step": 373100 }, { "epoch": 5.141770686947177, "grad_norm": 32.90951919555664, "learning_rate": 4.10488797546074e-05, "loss": 0.5489, "step": 373200 }, { "epoch": 5.143148439006916, "grad_norm": 29.91495704650879, "learning_rate": 4.1041904086772666e-05, "loss": 0.6764, "step": 373300 }, { "epoch": 5.144526191066656, "grad_norm": 43.439964294433594, "learning_rate": 4.103492726848391e-05, "loss": 0.6146, "step": 373400 }, { "epoch": 5.145903943126395, "grad_norm": 1.0201935768127441, "learning_rate": 4.102794930033383e-05, "loss": 0.6339, "step": 373500 }, { "epoch": 5.147281695186134, "grad_norm": 3.2723748683929443, "learning_rate": 4.102097018291525e-05, "loss": 
0.6299, "step": 373600 }, { "epoch": 5.1486594472458735, "grad_norm": 23.417081832885742, "learning_rate": 4.101398991682109e-05, "loss": 0.6017, "step": 373700 }, { "epoch": 5.150037199305613, "grad_norm": 3.727722406387329, "learning_rate": 4.100700850264434e-05, "loss": 0.5638, "step": 373800 }, { "epoch": 5.151414951365353, "grad_norm": 12.340597152709961, "learning_rate": 4.100002594097811e-05, "loss": 0.6044, "step": 373900 }, { "epoch": 5.152792703425091, "grad_norm": 77.85572052001953, "learning_rate": 4.099304223241562e-05, "loss": 0.5691, "step": 374000 }, { "epoch": 5.154170455484831, "grad_norm": 5.215542316436768, "learning_rate": 4.098605737755016e-05, "loss": 0.5507, "step": 374100 }, { "epoch": 5.15554820754457, "grad_norm": 3.6421759128570557, "learning_rate": 4.097907137697514e-05, "loss": 0.5906, "step": 374200 }, { "epoch": 5.15692595960431, "grad_norm": 2.5727028846740723, "learning_rate": 4.0972084231284044e-05, "loss": 0.5203, "step": 374300 }, { "epoch": 5.158303711664049, "grad_norm": 3.6209142208099365, "learning_rate": 4.0965095941070455e-05, "loss": 0.5257, "step": 374400 }, { "epoch": 5.159681463723788, "grad_norm": 206.15066528320312, "learning_rate": 4.09581065069281e-05, "loss": 0.6077, "step": 374500 }, { "epoch": 5.161059215783528, "grad_norm": 36.08523178100586, "learning_rate": 4.0951115929450726e-05, "loss": 0.6378, "step": 374600 }, { "epoch": 5.162436967843267, "grad_norm": 12.618553161621094, "learning_rate": 4.0944124209232255e-05, "loss": 0.6172, "step": 374700 }, { "epoch": 5.163814719903006, "grad_norm": 4.642848491668701, "learning_rate": 4.0937131346866656e-05, "loss": 0.6837, "step": 374800 }, { "epoch": 5.165192471962746, "grad_norm": 40.28071975708008, "learning_rate": 4.0930137342947985e-05, "loss": 0.6628, "step": 374900 }, { "epoch": 5.166570224022485, "grad_norm": 6.119065761566162, "learning_rate": 4.092314219807045e-05, "loss": 0.6317, "step": 375000 }, { "epoch": 5.167947976082225, "grad_norm": 
2.995183229446411, "learning_rate": 4.0916145912828314e-05, "loss": 0.6198, "step": 375100 }, { "epoch": 5.169325728141963, "grad_norm": 14.597306251525879, "learning_rate": 4.0909148487815946e-05, "loss": 0.5681, "step": 375200 }, { "epoch": 5.170703480201703, "grad_norm": 98.04617309570312, "learning_rate": 4.090214992362781e-05, "loss": 0.6087, "step": 375300 }, { "epoch": 5.1720812322614425, "grad_norm": 5.577531814575195, "learning_rate": 4.089515022085848e-05, "loss": 0.6834, "step": 375400 }, { "epoch": 5.173458984321181, "grad_norm": 5.794976234436035, "learning_rate": 4.088814938010259e-05, "loss": 0.587, "step": 375500 }, { "epoch": 5.174836736380921, "grad_norm": 10.025135040283203, "learning_rate": 4.088114740195494e-05, "loss": 0.6494, "step": 375600 }, { "epoch": 5.17621448844066, "grad_norm": 2.8370156288146973, "learning_rate": 4.0874144287010354e-05, "loss": 0.625, "step": 375700 }, { "epoch": 5.1775922405004, "grad_norm": 48.05025100708008, "learning_rate": 4.0867210083997494e-05, "loss": 0.6058, "step": 375800 }, { "epoch": 5.1789699925601385, "grad_norm": 8.473396301269531, "learning_rate": 4.0860204708597125e-05, "loss": 0.6052, "step": 375900 }, { "epoch": 5.180347744619878, "grad_norm": 42.94242858886719, "learning_rate": 4.0853198198179e-05, "loss": 0.5828, "step": 376000 }, { "epoch": 5.181725496679618, "grad_norm": 5.639178276062012, "learning_rate": 4.084619055333838e-05, "loss": 0.6109, "step": 376100 }, { "epoch": 5.183103248739357, "grad_norm": 24.022245407104492, "learning_rate": 4.083918177467061e-05, "loss": 0.5179, "step": 376200 }, { "epoch": 5.184481000799096, "grad_norm": 19.259923934936523, "learning_rate": 4.083217186277109e-05, "loss": 0.6196, "step": 376300 }, { "epoch": 5.185858752858835, "grad_norm": 7.789968013763428, "learning_rate": 4.0825160818235366e-05, "loss": 0.6328, "step": 376400 }, { "epoch": 5.187236504918575, "grad_norm": 5.4027814865112305, "learning_rate": 4.0818148641659055e-05, "loss": 0.602, "step": 
376500 }, { "epoch": 5.1886142569783145, "grad_norm": 10.9011869430542, "learning_rate": 4.0811135333637884e-05, "loss": 0.6472, "step": 376600 }, { "epoch": 5.189992009038053, "grad_norm": 7.006741523742676, "learning_rate": 4.080412089476767e-05, "loss": 0.5627, "step": 376700 }, { "epoch": 5.191369761097793, "grad_norm": 9.492364883422852, "learning_rate": 4.079710532564432e-05, "loss": 0.5759, "step": 376800 }, { "epoch": 5.192747513157532, "grad_norm": 16.567487716674805, "learning_rate": 4.079008862686385e-05, "loss": 0.5429, "step": 376900 }, { "epoch": 5.194125265217272, "grad_norm": 8.14239501953125, "learning_rate": 4.078307079902236e-05, "loss": 0.5523, "step": 377000 }, { "epoch": 5.1955030172770105, "grad_norm": 5.102200031280518, "learning_rate": 4.0776051842716044e-05, "loss": 0.538, "step": 377100 }, { "epoch": 5.19688076933675, "grad_norm": 4.339067459106445, "learning_rate": 4.0769031758541206e-05, "loss": 0.6056, "step": 377200 }, { "epoch": 5.19825852139649, "grad_norm": 2.2519538402557373, "learning_rate": 4.076201054709424e-05, "loss": 0.6395, "step": 377300 }, { "epoch": 5.199636273456229, "grad_norm": 3.2794904708862305, "learning_rate": 4.075498820897162e-05, "loss": 0.5668, "step": 377400 }, { "epoch": 5.201014025515968, "grad_norm": 4.9867777824401855, "learning_rate": 4.0747964744769946e-05, "loss": 0.514, "step": 377500 }, { "epoch": 5.202391777575707, "grad_norm": 1.9453870058059692, "learning_rate": 4.074094015508589e-05, "loss": 0.5455, "step": 377600 }, { "epoch": 5.203769529635447, "grad_norm": 4.1607346534729, "learning_rate": 4.073391444051623e-05, "loss": 0.603, "step": 377700 }, { "epoch": 5.2051472816951865, "grad_norm": 5.72292947769165, "learning_rate": 4.072688760165783e-05, "loss": 0.6327, "step": 377800 }, { "epoch": 5.206525033754925, "grad_norm": 1.0085132122039795, "learning_rate": 4.071985963910767e-05, "loss": 0.6038, "step": 377900 }, { "epoch": 5.207902785814665, "grad_norm": 7.772294044494629, "learning_rate": 
4.071283055346279e-05, "loss": 0.5819, "step": 378000 }, { "epoch": 5.209280537874404, "grad_norm": 5.603379726409912, "learning_rate": 4.070580034532036e-05, "loss": 0.5842, "step": 378100 }, { "epoch": 5.210658289934144, "grad_norm": 8.517966270446777, "learning_rate": 4.0698769015277634e-05, "loss": 0.574, "step": 378200 }, { "epoch": 5.2120360419938825, "grad_norm": 3.800159454345703, "learning_rate": 4.069173656393195e-05, "loss": 0.6174, "step": 378300 }, { "epoch": 5.213413794053622, "grad_norm": 13.190343856811523, "learning_rate": 4.068470299188076e-05, "loss": 0.5524, "step": 378400 }, { "epoch": 5.214791546113362, "grad_norm": 4.494522571563721, "learning_rate": 4.06776682997216e-05, "loss": 0.6131, "step": 378500 }, { "epoch": 5.216169298173101, "grad_norm": 3.176579236984253, "learning_rate": 4.0670632488052094e-05, "loss": 0.6198, "step": 378600 }, { "epoch": 5.21754705023284, "grad_norm": 2.469952344894409, "learning_rate": 4.066359555746999e-05, "loss": 0.6303, "step": 378700 }, { "epoch": 5.218924802292579, "grad_norm": 3.082827568054199, "learning_rate": 4.065655750857309e-05, "loss": 0.6535, "step": 378800 }, { "epoch": 5.220302554352319, "grad_norm": 18.763904571533203, "learning_rate": 4.0649518341959324e-05, "loss": 0.5767, "step": 378900 }, { "epoch": 5.2216803064120585, "grad_norm": 4.597332000732422, "learning_rate": 4.064247805822671e-05, "loss": 0.5857, "step": 379000 }, { "epoch": 5.223058058471797, "grad_norm": 7.7229743003845215, "learning_rate": 4.063543665797333e-05, "loss": 0.584, "step": 379100 }, { "epoch": 5.224435810531537, "grad_norm": 2.106656551361084, "learning_rate": 4.0628394141797424e-05, "loss": 0.508, "step": 379200 }, { "epoch": 5.225813562591276, "grad_norm": 13.566353797912598, "learning_rate": 4.062135051029726e-05, "loss": 0.5658, "step": 379300 }, { "epoch": 5.227191314651016, "grad_norm": 11.244034767150879, "learning_rate": 4.0614305764071236e-05, "loss": 0.6092, "step": 379400 }, { "epoch": 5.2285690667107545, 
"grad_norm": 1.7361465692520142, "learning_rate": 4.0607259903717864e-05, "loss": 0.5772, "step": 379500 }, { "epoch": 5.229946818770494, "grad_norm": 2.4289731979370117, "learning_rate": 4.060021292983569e-05, "loss": 0.6042, "step": 379600 }, { "epoch": 5.231324570830234, "grad_norm": 3.7731006145477295, "learning_rate": 4.0593164843023416e-05, "loss": 0.6586, "step": 379700 }, { "epoch": 5.232702322889972, "grad_norm": 8.631765365600586, "learning_rate": 4.058611564387981e-05, "loss": 0.5831, "step": 379800 }, { "epoch": 5.234080074949712, "grad_norm": 3.49064302444458, "learning_rate": 4.0579065333003715e-05, "loss": 0.5367, "step": 379900 }, { "epoch": 5.235457827009451, "grad_norm": 3.032696485519409, "learning_rate": 4.057201391099412e-05, "loss": 0.5582, "step": 380000 }, { "epoch": 5.236835579069191, "grad_norm": 6.222581386566162, "learning_rate": 4.056496137845007e-05, "loss": 0.5183, "step": 380100 }, { "epoch": 5.23821333112893, "grad_norm": 5.745354652404785, "learning_rate": 4.05579077359707e-05, "loss": 0.6431, "step": 380200 }, { "epoch": 5.239591083188669, "grad_norm": 37.070980072021484, "learning_rate": 4.055085298415527e-05, "loss": 0.5982, "step": 380300 }, { "epoch": 5.240968835248409, "grad_norm": 2.498462677001953, "learning_rate": 4.054379712360311e-05, "loss": 0.6013, "step": 380400 }, { "epoch": 5.242346587308148, "grad_norm": 7.718915939331055, "learning_rate": 4.0536740154913656e-05, "loss": 0.5188, "step": 380500 }, { "epoch": 5.243724339367887, "grad_norm": 6.567794322967529, "learning_rate": 4.052968207868643e-05, "loss": 0.557, "step": 380600 }, { "epoch": 5.2451020914276265, "grad_norm": 11.195625305175781, "learning_rate": 4.052262289552105e-05, "loss": 0.5007, "step": 380700 }, { "epoch": 5.246479843487366, "grad_norm": 5.9724507331848145, "learning_rate": 4.051556260601723e-05, "loss": 0.6212, "step": 380800 }, { "epoch": 5.247857595547106, "grad_norm": 4.797862529754639, "learning_rate": 4.050850121077478e-05, "loss": 0.5214, 
"step": 380900 }, { "epoch": 5.249235347606844, "grad_norm": 2.588568687438965, "learning_rate": 4.0501438710393606e-05, "loss": 0.5557, "step": 381000 }, { "epoch": 5.250613099666584, "grad_norm": 9.342585563659668, "learning_rate": 4.049437510547369e-05, "loss": 0.5424, "step": 381100 }, { "epoch": 5.251990851726323, "grad_norm": 836.8515625, "learning_rate": 4.0487310396615136e-05, "loss": 0.6345, "step": 381200 }, { "epoch": 5.253368603786063, "grad_norm": 11.792338371276855, "learning_rate": 4.048024458441812e-05, "loss": 0.5612, "step": 381300 }, { "epoch": 5.254746355845802, "grad_norm": 5.871984481811523, "learning_rate": 4.0473177669482916e-05, "loss": 0.5592, "step": 381400 }, { "epoch": 5.256124107905541, "grad_norm": 7.3336286544799805, "learning_rate": 4.046610965240991e-05, "loss": 0.5213, "step": 381500 }, { "epoch": 5.257501859965281, "grad_norm": 11.177349090576172, "learning_rate": 4.045904053379954e-05, "loss": 0.6225, "step": 381600 }, { "epoch": 5.25887961202502, "grad_norm": 14.836637496948242, "learning_rate": 4.045197031425239e-05, "loss": 0.5878, "step": 381700 }, { "epoch": 5.260257364084759, "grad_norm": 2.5071139335632324, "learning_rate": 4.04448989943691e-05, "loss": 0.5891, "step": 381800 }, { "epoch": 5.2616351161444985, "grad_norm": 22.321273803710938, "learning_rate": 4.0437826574750404e-05, "loss": 0.6396, "step": 381900 }, { "epoch": 5.263012868204238, "grad_norm": 7.139045715332031, "learning_rate": 4.043075305599716e-05, "loss": 0.5871, "step": 382000 }, { "epoch": 5.264390620263978, "grad_norm": 6.893263339996338, "learning_rate": 4.042367843871029e-05, "loss": 0.5377, "step": 382100 }, { "epoch": 5.265768372323716, "grad_norm": 4.769865036010742, "learning_rate": 4.041660272349082e-05, "loss": 0.5813, "step": 382200 }, { "epoch": 5.267146124383456, "grad_norm": 6.16351842880249, "learning_rate": 4.040952591093987e-05, "loss": 0.6101, "step": 382300 }, { "epoch": 5.268523876443195, "grad_norm": 7.79477596282959, 
"learning_rate": 4.040244800165864e-05, "loss": 0.6184, "step": 382400 }, { "epoch": 5.269901628502935, "grad_norm": 2.567798137664795, "learning_rate": 4.039536899624844e-05, "loss": 0.6354, "step": 382500 }, { "epoch": 5.271279380562674, "grad_norm": 5.0070672035217285, "learning_rate": 4.038828889531069e-05, "loss": 0.6699, "step": 382600 }, { "epoch": 5.272657132622413, "grad_norm": 5.721522808074951, "learning_rate": 4.038120769944685e-05, "loss": 0.6626, "step": 382700 }, { "epoch": 5.274034884682153, "grad_norm": 19.798641204833984, "learning_rate": 4.037412540925852e-05, "loss": 0.602, "step": 382800 }, { "epoch": 5.275412636741892, "grad_norm": 7.47581672668457, "learning_rate": 4.0367042025347376e-05, "loss": 0.5653, "step": 382900 }, { "epoch": 5.276790388801631, "grad_norm": 14.146540641784668, "learning_rate": 4.035995754831518e-05, "loss": 0.5903, "step": 383000 }, { "epoch": 5.2781681408613705, "grad_norm": 4.865891456604004, "learning_rate": 4.03528719787638e-05, "loss": 0.5535, "step": 383100 }, { "epoch": 5.27954589292111, "grad_norm": 13.20900821685791, "learning_rate": 4.034578531729518e-05, "loss": 0.5371, "step": 383200 }, { "epoch": 5.28092364498085, "grad_norm": 10.534146308898926, "learning_rate": 4.0338697564511395e-05, "loss": 0.5295, "step": 383300 }, { "epoch": 5.282301397040588, "grad_norm": 6.142459392547607, "learning_rate": 4.033167961484659e-05, "loss": 0.6659, "step": 383400 }, { "epoch": 5.283679149100328, "grad_norm": 8.958629608154297, "learning_rate": 4.0324589692137076e-05, "loss": 0.6162, "step": 383500 }, { "epoch": 5.285056901160067, "grad_norm": 90.62395477294922, "learning_rate": 4.031749867991306e-05, "loss": 0.6245, "step": 383600 }, { "epoch": 5.286434653219807, "grad_norm": 7.336931228637695, "learning_rate": 4.031040657877696e-05, "loss": 0.6219, "step": 383700 }, { "epoch": 5.287812405279546, "grad_norm": 2.8121542930603027, "learning_rate": 4.0303313389331274e-05, "loss": 0.6539, "step": 383800 }, { "epoch": 
5.289190157339285, "grad_norm": 8.606476783752441, "learning_rate": 4.029629006033232e-05, "loss": 0.5587, "step": 383900 }, { "epoch": 5.290567909399025, "grad_norm": 2.72857928276062, "learning_rate": 4.0289194706943446e-05, "loss": 0.6036, "step": 384000 }, { "epoch": 5.291945661458763, "grad_norm": 6.961001396179199, "learning_rate": 4.028209826704706e-05, "loss": 0.5443, "step": 384100 }, { "epoch": 5.293323413518503, "grad_norm": 1.7218700647354126, "learning_rate": 4.027500074124602e-05, "loss": 0.582, "step": 384200 }, { "epoch": 5.2947011655782426, "grad_norm": 2.7872872352600098, "learning_rate": 4.026790213014332e-05, "loss": 0.5201, "step": 384300 }, { "epoch": 5.296078917637982, "grad_norm": 7.193872928619385, "learning_rate": 4.026080243434201e-05, "loss": 0.5944, "step": 384400 }, { "epoch": 5.297456669697722, "grad_norm": 3.1091573238372803, "learning_rate": 4.025370165444525e-05, "loss": 0.6305, "step": 384500 }, { "epoch": 5.29883442175746, "grad_norm": 10.786712646484375, "learning_rate": 4.024659979105629e-05, "loss": 0.5025, "step": 384600 }, { "epoch": 5.3002121738172, "grad_norm": 3.10170578956604, "learning_rate": 4.0239496844778466e-05, "loss": 0.562, "step": 384700 }, { "epoch": 5.3015899258769394, "grad_norm": 8.986777305603027, "learning_rate": 4.023239281621521e-05, "loss": 0.554, "step": 384800 }, { "epoch": 5.302967677936678, "grad_norm": 13.616964340209961, "learning_rate": 4.0225287705970046e-05, "loss": 0.623, "step": 384900 }, { "epoch": 5.304345429996418, "grad_norm": 3.461974859237671, "learning_rate": 4.021818151464658e-05, "loss": 0.6051, "step": 385000 }, { "epoch": 5.305723182056157, "grad_norm": 3.282644033432007, "learning_rate": 4.021107424284854e-05, "loss": 0.5755, "step": 385100 }, { "epoch": 5.307100934115897, "grad_norm": 4.333539962768555, "learning_rate": 4.0203965891179716e-05, "loss": 0.5618, "step": 385200 }, { "epoch": 5.3084786861756355, "grad_norm": 4.0965189933776855, "learning_rate": 4.0196856460243986e-05, 
"loss": 0.6056, "step": 385300 }, { "epoch": 5.309856438235375, "grad_norm": 36.64126205444336, "learning_rate": 4.018974595064535e-05, "loss": 0.6409, "step": 385400 }, { "epoch": 5.311234190295115, "grad_norm": 31.54360580444336, "learning_rate": 4.018263436298787e-05, "loss": 0.4638, "step": 385500 }, { "epoch": 5.312611942354854, "grad_norm": 2.8543038368225098, "learning_rate": 4.017552169787572e-05, "loss": 0.6129, "step": 385600 }, { "epoch": 5.313989694414593, "grad_norm": 68.8862075805664, "learning_rate": 4.016840795591315e-05, "loss": 0.5871, "step": 385700 }, { "epoch": 5.315367446474332, "grad_norm": 46.42893600463867, "learning_rate": 4.0161293137704517e-05, "loss": 0.6657, "step": 385800 }, { "epoch": 5.316745198534072, "grad_norm": 3.993032932281494, "learning_rate": 4.0154177243854266e-05, "loss": 0.6036, "step": 385900 }, { "epoch": 5.3181229505938115, "grad_norm": 8.683631896972656, "learning_rate": 4.01470602749669e-05, "loss": 0.6358, "step": 386000 }, { "epoch": 5.31950070265355, "grad_norm": 38.39250564575195, "learning_rate": 4.013994223164708e-05, "loss": 0.6056, "step": 386100 }, { "epoch": 5.32087845471329, "grad_norm": 6.656564235687256, "learning_rate": 4.0132823114499494e-05, "loss": 0.5059, "step": 386200 }, { "epoch": 5.322256206773029, "grad_norm": 4.556617736816406, "learning_rate": 4.012570292412895e-05, "loss": 0.5801, "step": 386300 }, { "epoch": 5.323633958832769, "grad_norm": 4.628555774688721, "learning_rate": 4.0118581661140366e-05, "loss": 0.5386, "step": 386400 }, { "epoch": 5.3250117108925075, "grad_norm": 47.28902053833008, "learning_rate": 4.011145932613869e-05, "loss": 0.5367, "step": 386500 }, { "epoch": 5.326389462952247, "grad_norm": 12.198378562927246, "learning_rate": 4.0104335919729046e-05, "loss": 0.6269, "step": 386600 }, { "epoch": 5.327767215011987, "grad_norm": 20.024791717529297, "learning_rate": 4.009721144251658e-05, "loss": 0.5378, "step": 386700 }, { "epoch": 5.329144967071726, "grad_norm": 
6.95658016204834, "learning_rate": 4.0090085895106536e-05, "loss": 0.5614, "step": 386800 }, { "epoch": 5.330522719131465, "grad_norm": 2.594550609588623, "learning_rate": 4.0082959278104305e-05, "loss": 0.5719, "step": 386900 }, { "epoch": 5.331900471191204, "grad_norm": 9.886303901672363, "learning_rate": 4.0075831592115304e-05, "loss": 0.5286, "step": 387000 }, { "epoch": 5.333278223250944, "grad_norm": 6.014042854309082, "learning_rate": 4.0068702837745065e-05, "loss": 0.6058, "step": 387100 }, { "epoch": 5.3346559753106835, "grad_norm": 17.563846588134766, "learning_rate": 4.0061573015599225e-05, "loss": 0.4989, "step": 387200 }, { "epoch": 5.336033727370422, "grad_norm": 2.6834816932678223, "learning_rate": 4.00544421262835e-05, "loss": 0.5503, "step": 387300 }, { "epoch": 5.337411479430162, "grad_norm": 42.77082824707031, "learning_rate": 4.0047310170403675e-05, "loss": 0.6368, "step": 387400 }, { "epoch": 5.338789231489901, "grad_norm": 7.212562084197998, "learning_rate": 4.004017714856566e-05, "loss": 0.6061, "step": 387500 }, { "epoch": 5.340166983549641, "grad_norm": 11.456828117370605, "learning_rate": 4.003304306137545e-05, "loss": 0.5994, "step": 387600 }, { "epoch": 5.3415447356093795, "grad_norm": 13.261024475097656, "learning_rate": 4.002590790943911e-05, "loss": 0.5423, "step": 387700 }, { "epoch": 5.342922487669119, "grad_norm": 5.116427898406982, "learning_rate": 4.001877169336281e-05, "loss": 0.4804, "step": 387800 }, { "epoch": 5.344300239728859, "grad_norm": 9.281394958496094, "learning_rate": 4.0011634413752814e-05, "loss": 0.5409, "step": 387900 }, { "epoch": 5.345677991788598, "grad_norm": 3.0764312744140625, "learning_rate": 4.0004496071215456e-05, "loss": 0.5561, "step": 388000 }, { "epoch": 5.347055743848337, "grad_norm": 3.0495798587799072, "learning_rate": 3.999735666635719e-05, "loss": 0.653, "step": 388100 }, { "epoch": 5.348433495908076, "grad_norm": 5.166534900665283, "learning_rate": 3.999021619978453e-05, "loss": 0.5862, "step": 
388200 }, { "epoch": 5.349811247967816, "grad_norm": 2.615530014038086, "learning_rate": 3.998307467210411e-05, "loss": 0.6143, "step": 388300 }, { "epoch": 5.351189000027555, "grad_norm": 3.1502695083618164, "learning_rate": 3.997593208392264e-05, "loss": 0.5436, "step": 388400 }, { "epoch": 5.352566752087294, "grad_norm": 7.142763614654541, "learning_rate": 3.996878843584691e-05, "loss": 0.5761, "step": 388500 }, { "epoch": 5.353944504147034, "grad_norm": 2.311624765396118, "learning_rate": 3.9961643728483806e-05, "loss": 0.6069, "step": 388600 }, { "epoch": 5.355322256206773, "grad_norm": 4.66115665435791, "learning_rate": 3.995449796244033e-05, "loss": 0.5677, "step": 388700 }, { "epoch": 5.356700008266513, "grad_norm": 5.503378868103027, "learning_rate": 3.994735113832352e-05, "loss": 0.5195, "step": 388800 }, { "epoch": 5.3580777603262515, "grad_norm": 5.903469562530518, "learning_rate": 3.9940203256740554e-05, "loss": 0.5981, "step": 388900 }, { "epoch": 5.359455512385991, "grad_norm": 10.365694046020508, "learning_rate": 3.9933054318298684e-05, "loss": 0.6242, "step": 389000 }, { "epoch": 5.360833264445731, "grad_norm": 3.0333306789398193, "learning_rate": 3.992590432360523e-05, "loss": 0.601, "step": 389100 }, { "epoch": 5.362211016505469, "grad_norm": 4.537501811981201, "learning_rate": 3.991875327326764e-05, "loss": 0.636, "step": 389200 }, { "epoch": 5.363588768565209, "grad_norm": 6.623795509338379, "learning_rate": 3.991160116789343e-05, "loss": 0.5535, "step": 389300 }, { "epoch": 5.364966520624948, "grad_norm": 0.8408687710762024, "learning_rate": 3.9904448008090196e-05, "loss": 0.5941, "step": 389400 }, { "epoch": 5.366344272684688, "grad_norm": 9.286808013916016, "learning_rate": 3.989729379446565e-05, "loss": 0.6384, "step": 389500 }, { "epoch": 5.367722024744427, "grad_norm": 5.8450422286987305, "learning_rate": 3.989013852762757e-05, "loss": 0.5605, "step": 389600 }, { "epoch": 5.369099776804166, "grad_norm": 42.73175048828125, "learning_rate": 
3.988305377658667e-05, "loss": 0.6346, "step": 389700 }, { "epoch": 5.370477528863906, "grad_norm": 3.9566800594329834, "learning_rate": 3.987589641566221e-05, "loss": 0.6917, "step": 389800 }, { "epoch": 5.371855280923645, "grad_norm": 16.253812789916992, "learning_rate": 3.986873800334204e-05, "loss": 0.5924, "step": 389900 }, { "epoch": 5.373233032983384, "grad_norm": 19.51742172241211, "learning_rate": 3.986157854023429e-05, "loss": 0.6879, "step": 390000 }, { "epoch": 5.3746107850431235, "grad_norm": 1.9851157665252686, "learning_rate": 3.9854418026947206e-05, "loss": 0.5342, "step": 390100 }, { "epoch": 5.375988537102863, "grad_norm": 17.37412452697754, "learning_rate": 3.984725646408911e-05, "loss": 0.6825, "step": 390200 }, { "epoch": 5.377366289162603, "grad_norm": 30.969364166259766, "learning_rate": 3.98400938522684e-05, "loss": 0.571, "step": 390300 }, { "epoch": 5.378744041222341, "grad_norm": 62.48212814331055, "learning_rate": 3.983293019209359e-05, "loss": 0.6106, "step": 390400 }, { "epoch": 5.380121793282081, "grad_norm": 11.327045440673828, "learning_rate": 3.982576548417326e-05, "loss": 0.6305, "step": 390500 }, { "epoch": 5.38149954534182, "grad_norm": 9.365761756896973, "learning_rate": 3.981867139184801e-05, "loss": 0.774, "step": 390600 }, { "epoch": 5.38287729740156, "grad_norm": 1.2081712484359741, "learning_rate": 3.9811504600725044e-05, "loss": 0.5846, "step": 390700 }, { "epoch": 5.384255049461299, "grad_norm": 7.587369918823242, "learning_rate": 3.980433676367678e-05, "loss": 0.6117, "step": 390800 }, { "epoch": 5.385632801521038, "grad_norm": 3.743663787841797, "learning_rate": 3.979716788131216e-05, "loss": 0.6546, "step": 390900 }, { "epoch": 5.387010553580778, "grad_norm": 2.2835028171539307, "learning_rate": 3.9789997954240215e-05, "loss": 0.5886, "step": 391000 }, { "epoch": 5.388388305640517, "grad_norm": 59.94197463989258, "learning_rate": 3.9782826983070077e-05, "loss": 0.6517, "step": 391100 }, { "epoch": 5.389766057700256, 
"grad_norm": 9.970721244812012, "learning_rate": 3.9775654968410946e-05, "loss": 0.6597, "step": 391200 }, { "epoch": 5.3911438097599955, "grad_norm": 17.020177841186523, "learning_rate": 3.976848191087211e-05, "loss": 0.5796, "step": 391300 }, { "epoch": 5.392521561819735, "grad_norm": 2.878309965133667, "learning_rate": 3.976130781106298e-05, "loss": 0.6131, "step": 391400 }, { "epoch": 5.393899313879475, "grad_norm": 10.692058563232422, "learning_rate": 3.9754204426161944e-05, "loss": 0.6202, "step": 391500 }, { "epoch": 5.395277065939213, "grad_norm": 4.510809421539307, "learning_rate": 3.974702825404823e-05, "loss": 0.571, "step": 391600 }, { "epoch": 5.396654817998953, "grad_norm": 2.2511394023895264, "learning_rate": 3.9739851041486794e-05, "loss": 0.619, "step": 391700 }, { "epoch": 5.398032570058692, "grad_norm": 89.06232452392578, "learning_rate": 3.97326727890874e-05, "loss": 0.6232, "step": 391800 }, { "epoch": 5.399410322118432, "grad_norm": 3.4528465270996094, "learning_rate": 3.9725493497459873e-05, "loss": 0.5915, "step": 391900 }, { "epoch": 5.400788074178171, "grad_norm": 30.032114028930664, "learning_rate": 3.971831316721412e-05, "loss": 0.5203, "step": 392000 }, { "epoch": 5.40216582623791, "grad_norm": 2.21948504447937, "learning_rate": 3.9711131798960166e-05, "loss": 0.7044, "step": 392100 }, { "epoch": 5.40354357829765, "grad_norm": 206.881591796875, "learning_rate": 3.9703949393308086e-05, "loss": 0.6272, "step": 392200 }, { "epoch": 5.404921330357389, "grad_norm": 3.525752305984497, "learning_rate": 3.9696765950868074e-05, "loss": 0.6257, "step": 392300 }, { "epoch": 5.406299082417128, "grad_norm": 6.595919609069824, "learning_rate": 3.9689581472250394e-05, "loss": 0.6626, "step": 392400 }, { "epoch": 5.4076768344768675, "grad_norm": 57.39454650878906, "learning_rate": 3.968239595806541e-05, "loss": 0.68, "step": 392500 }, { "epoch": 5.409054586536607, "grad_norm": 7.214725017547607, "learning_rate": 3.967520940892356e-05, "loss": 0.5895, 
"step": 392600 }, { "epoch": 5.410432338596346, "grad_norm": 10.026534080505371, "learning_rate": 3.966802182543539e-05, "loss": 0.5966, "step": 392700 }, { "epoch": 5.411810090656085, "grad_norm": 9.579185485839844, "learning_rate": 3.96608332082115e-05, "loss": 0.6058, "step": 392800 }, { "epoch": 5.413187842715825, "grad_norm": 9.621077537536621, "learning_rate": 3.9653643557862635e-05, "loss": 0.5532, "step": 392900 }, { "epoch": 5.414565594775564, "grad_norm": 157.06382751464844, "learning_rate": 3.964645287499955e-05, "loss": 0.5576, "step": 393000 }, { "epoch": 5.415943346835304, "grad_norm": 6.495384216308594, "learning_rate": 3.9639261160233174e-05, "loss": 0.6186, "step": 393100 }, { "epoch": 5.417321098895043, "grad_norm": 8.68899154663086, "learning_rate": 3.963206841417444e-05, "loss": 0.5747, "step": 393200 }, { "epoch": 5.418698850954782, "grad_norm": 10.314475059509277, "learning_rate": 3.962487463743444e-05, "loss": 0.5613, "step": 393300 }, { "epoch": 5.420076603014522, "grad_norm": 2.918860912322998, "learning_rate": 3.961767983062431e-05, "loss": 0.5409, "step": 393400 }, { "epoch": 5.42145435507426, "grad_norm": 13.488360404968262, "learning_rate": 3.961048399435527e-05, "loss": 0.6498, "step": 393500 }, { "epoch": 5.422832107134, "grad_norm": 3.7352588176727295, "learning_rate": 3.960328712923867e-05, "loss": 0.5815, "step": 393600 }, { "epoch": 5.4242098591937395, "grad_norm": 11.204102516174316, "learning_rate": 3.9596089235885904e-05, "loss": 0.5885, "step": 393700 }, { "epoch": 5.425587611253479, "grad_norm": 4.194942474365234, "learning_rate": 3.9588890314908475e-05, "loss": 0.5427, "step": 393800 }, { "epoch": 5.426965363313218, "grad_norm": 6.698248386383057, "learning_rate": 3.9581690366917976e-05, "loss": 0.5564, "step": 393900 }, { "epoch": 5.428343115372957, "grad_norm": 9.856194496154785, "learning_rate": 3.9574489392526054e-05, "loss": 0.6457, "step": 394000 }, { "epoch": 5.429720867432697, "grad_norm": 2.5510141849517822, 
"learning_rate": 3.9567287392344497e-05, "loss": 0.6215, "step": 394100 }, { "epoch": 5.431098619492436, "grad_norm": 35.342491149902344, "learning_rate": 3.956008436698514e-05, "loss": 0.6565, "step": 394200 }, { "epoch": 5.432476371552175, "grad_norm": 42.12504577636719, "learning_rate": 3.9552880317059906e-05, "loss": 0.6903, "step": 394300 }, { "epoch": 5.433854123611915, "grad_norm": 6.624769687652588, "learning_rate": 3.954567524318084e-05, "loss": 0.5621, "step": 394400 }, { "epoch": 5.435231875671654, "grad_norm": 6.960642337799072, "learning_rate": 3.9538469145960036e-05, "loss": 0.5903, "step": 394500 }, { "epoch": 5.436609627731394, "grad_norm": 4.90280294418335, "learning_rate": 3.953126202600968e-05, "loss": 0.6016, "step": 394600 }, { "epoch": 5.4379873797911324, "grad_norm": 18.313831329345703, "learning_rate": 3.952405388394208e-05, "loss": 0.5806, "step": 394700 }, { "epoch": 5.439365131850872, "grad_norm": 7.8750433921813965, "learning_rate": 3.9516844720369566e-05, "loss": 0.5819, "step": 394800 }, { "epoch": 5.440742883910612, "grad_norm": 4.338384628295898, "learning_rate": 3.9509634535904625e-05, "loss": 0.6167, "step": 394900 }, { "epoch": 5.442120635970351, "grad_norm": 9.941856384277344, "learning_rate": 3.95024233311598e-05, "loss": 0.6363, "step": 395000 }, { "epoch": 5.44349838803009, "grad_norm": 3.9847335815429688, "learning_rate": 3.949521110674769e-05, "loss": 0.6363, "step": 395100 }, { "epoch": 5.444876140089829, "grad_norm": 5.161157608032227, "learning_rate": 3.948799786328104e-05, "loss": 0.4969, "step": 395200 }, { "epoch": 5.446253892149569, "grad_norm": 4.168668746948242, "learning_rate": 3.948078360137264e-05, "loss": 0.5912, "step": 395300 }, { "epoch": 5.4476316442093085, "grad_norm": 7.99725866317749, "learning_rate": 3.9473568321635385e-05, "loss": 0.5631, "step": 395400 }, { "epoch": 5.449009396269047, "grad_norm": 5.162728786468506, "learning_rate": 3.946635202468224e-05, "loss": 0.5957, "step": 395500 }, { "epoch": 
5.450387148328787, "grad_norm": 27.24070167541504, "learning_rate": 3.945913471112627e-05, "loss": 0.5731, "step": 395600 }, { "epoch": 5.451764900388526, "grad_norm": 23.206134796142578, "learning_rate": 3.945191638158062e-05, "loss": 0.6385, "step": 395700 }, { "epoch": 5.453142652448266, "grad_norm": 6.573702812194824, "learning_rate": 3.944469703665853e-05, "loss": 0.6125, "step": 395800 }, { "epoch": 5.4545204045080045, "grad_norm": 3.9449212551116943, "learning_rate": 3.943747667697333e-05, "loss": 0.6089, "step": 395900 }, { "epoch": 5.455898156567744, "grad_norm": 4.865775108337402, "learning_rate": 3.9430255303138405e-05, "loss": 0.5685, "step": 396000 }, { "epoch": 5.457275908627484, "grad_norm": 70.38639068603516, "learning_rate": 3.942303291576725e-05, "loss": 0.6121, "step": 396100 }, { "epoch": 5.458653660687223, "grad_norm": 7.472474575042725, "learning_rate": 3.941580951547346e-05, "loss": 0.5594, "step": 396200 }, { "epoch": 5.460031412746962, "grad_norm": 2.748600482940674, "learning_rate": 3.9408585102870694e-05, "loss": 0.5541, "step": 396300 }, { "epoch": 5.461409164806701, "grad_norm": 9.134875297546387, "learning_rate": 3.940135967857269e-05, "loss": 0.5642, "step": 396400 }, { "epoch": 5.462786916866441, "grad_norm": 18.136253356933594, "learning_rate": 3.93941332431933e-05, "loss": 0.5701, "step": 396500 }, { "epoch": 5.4641646689261805, "grad_norm": 5.452502250671387, "learning_rate": 3.938690579734644e-05, "loss": 0.6575, "step": 396600 }, { "epoch": 5.465542420985919, "grad_norm": 8.468731880187988, "learning_rate": 3.937967734164612e-05, "loss": 0.5374, "step": 396700 }, { "epoch": 5.466920173045659, "grad_norm": 6.156403064727783, "learning_rate": 3.937244787670644e-05, "loss": 0.5167, "step": 396800 }, { "epoch": 5.468297925105398, "grad_norm": 6.814610481262207, "learning_rate": 3.9365217403141564e-05, "loss": 0.5811, "step": 396900 }, { "epoch": 5.469675677165137, "grad_norm": 12.494441986083984, "learning_rate": 
3.9357985921565765e-05, "loss": 0.6224, "step": 397000 }, { "epoch": 5.4710534292248765, "grad_norm": 5.694009780883789, "learning_rate": 3.93507534325934e-05, "loss": 0.553, "step": 397100 }, { "epoch": 5.472431181284616, "grad_norm": 4.90285587310791, "learning_rate": 3.934351993683891e-05, "loss": 0.5698, "step": 397200 }, { "epoch": 5.473808933344356, "grad_norm": 12.405680656433105, "learning_rate": 3.933628543491681e-05, "loss": 0.5481, "step": 397300 }, { "epoch": 5.475186685404095, "grad_norm": 3.627758264541626, "learning_rate": 3.93290499274417e-05, "loss": 0.5757, "step": 397400 }, { "epoch": 5.476564437463834, "grad_norm": 2.3824477195739746, "learning_rate": 3.932188578512485e-05, "loss": 0.6218, "step": 397500 }, { "epoch": 5.477942189523573, "grad_norm": 8.639028549194336, "learning_rate": 3.9314648278428094e-05, "loss": 0.5181, "step": 397600 }, { "epoch": 5.479319941583313, "grad_norm": 12.686386108398438, "learning_rate": 3.9307409768016524e-05, "loss": 0.5235, "step": 397700 }, { "epoch": 5.480697693643052, "grad_norm": 40.40836715698242, "learning_rate": 3.93001702545051e-05, "loss": 0.6171, "step": 397800 }, { "epoch": 5.482075445702791, "grad_norm": 11.910573959350586, "learning_rate": 3.9292929738508833e-05, "loss": 0.6197, "step": 397900 }, { "epoch": 5.483453197762531, "grad_norm": 71.02980041503906, "learning_rate": 3.9285688220642856e-05, "loss": 0.5833, "step": 398000 }, { "epoch": 5.48483094982227, "grad_norm": 16.957067489624023, "learning_rate": 3.9278445701522366e-05, "loss": 0.638, "step": 398100 }, { "epoch": 5.486208701882009, "grad_norm": 8.490316390991211, "learning_rate": 3.9271202181762655e-05, "loss": 0.5639, "step": 398200 }, { "epoch": 5.4875864539417485, "grad_norm": 3.2989583015441895, "learning_rate": 3.92639576619791e-05, "loss": 0.6038, "step": 398300 }, { "epoch": 5.488964206001488, "grad_norm": 2.340512752532959, "learning_rate": 3.925671214278716e-05, "loss": 0.6419, "step": 398400 }, { "epoch": 5.490341958061228, 
"grad_norm": 4.294698715209961, "learning_rate": 3.924946562480237e-05, "loss": 0.5905, "step": 398500 }, { "epoch": 5.491719710120966, "grad_norm": 6.179967403411865, "learning_rate": 3.9242218108640376e-05, "loss": 0.57, "step": 398600 }, { "epoch": 5.493097462180706, "grad_norm": 6.88069486618042, "learning_rate": 3.923496959491688e-05, "loss": 0.5298, "step": 398700 }, { "epoch": 5.494475214240445, "grad_norm": 10.764232635498047, "learning_rate": 3.922772008424767e-05, "loss": 0.5, "step": 398800 }, { "epoch": 5.495852966300185, "grad_norm": 8.831521034240723, "learning_rate": 3.922046957724865e-05, "loss": 0.6417, "step": 398900 }, { "epoch": 5.497230718359924, "grad_norm": 11.632200241088867, "learning_rate": 3.921321807453577e-05, "loss": 0.5413, "step": 399000 }, { "epoch": 5.498608470419663, "grad_norm": 15.019420623779297, "learning_rate": 3.92059655767251e-05, "loss": 0.6104, "step": 399100 }, { "epoch": 5.499986222479403, "grad_norm": 6.926292896270752, "learning_rate": 3.919871208443275e-05, "loss": 0.6515, "step": 399200 }, { "epoch": 5.501363974539142, "grad_norm": 21.12950325012207, "learning_rate": 3.9191457598274964e-05, "loss": 0.5821, "step": 399300 }, { "epoch": 5.502741726598881, "grad_norm": 3.0918517112731934, "learning_rate": 3.918420211886806e-05, "loss": 0.5749, "step": 399400 }, { "epoch": 5.5041194786586205, "grad_norm": 3.0059430599212646, "learning_rate": 3.91769456468284e-05, "loss": 0.5806, "step": 399500 }, { "epoch": 5.50549723071836, "grad_norm": 16.383102416992188, "learning_rate": 3.916968818277246e-05, "loss": 0.5962, "step": 399600 }, { "epoch": 5.5068749827781, "grad_norm": 4.6504340171813965, "learning_rate": 3.916242972731681e-05, "loss": 0.6096, "step": 399700 }, { "epoch": 5.508252734837838, "grad_norm": 1.9566203355789185, "learning_rate": 3.91551702810781e-05, "loss": 0.5579, "step": 399800 }, { "epoch": 5.509630486897578, "grad_norm": 13.979101181030273, "learning_rate": 3.9147909844673035e-05, "loss": 0.6306, 
"step": 399900 }, { "epoch": 5.511008238957317, "grad_norm": 1.4463425874710083, "learning_rate": 3.914072103787423e-05, "loss": 0.5993, "step": 400000 }, { "epoch": 5.512385991017057, "grad_norm": 4.543508052825928, "learning_rate": 3.913345863287328e-05, "loss": 0.6163, "step": 400100 }, { "epoch": 5.513763743076796, "grad_norm": 6.191413879394531, "learning_rate": 3.91261952395505e-05, "loss": 0.5561, "step": 400200 }, { "epoch": 5.515141495136535, "grad_norm": 18.279468536376953, "learning_rate": 3.911893085852295e-05, "loss": 0.5004, "step": 400300 }, { "epoch": 5.516519247196275, "grad_norm": 2.0381975173950195, "learning_rate": 3.911166549040778e-05, "loss": 0.5924, "step": 400400 }, { "epoch": 5.517896999256013, "grad_norm": 5.469998836517334, "learning_rate": 3.910439913582223e-05, "loss": 0.5727, "step": 400500 }, { "epoch": 5.519274751315753, "grad_norm": 5.091738700866699, "learning_rate": 3.9097131795383596e-05, "loss": 0.6142, "step": 400600 }, { "epoch": 5.5206525033754925, "grad_norm": 26.629655838012695, "learning_rate": 3.9089863469709285e-05, "loss": 0.658, "step": 400700 }, { "epoch": 5.522030255435232, "grad_norm": 11.263328552246094, "learning_rate": 3.908259415941679e-05, "loss": 0.534, "step": 400800 }, { "epoch": 5.523408007494972, "grad_norm": 5.415755271911621, "learning_rate": 3.9075323865123665e-05, "loss": 0.5332, "step": 400900 }, { "epoch": 5.52478575955471, "grad_norm": 193.8293914794922, "learning_rate": 3.906805258744755e-05, "loss": 0.5489, "step": 401000 }, { "epoch": 5.52616351161445, "grad_norm": 5.2032904624938965, "learning_rate": 3.906078032700619e-05, "loss": 0.5734, "step": 401100 }, { "epoch": 5.527541263674189, "grad_norm": 4.388431072235107, "learning_rate": 3.90535070844174e-05, "loss": 0.5996, "step": 401200 }, { "epoch": 5.528919015733928, "grad_norm": 3.1175475120544434, "learning_rate": 3.9046232860299075e-05, "loss": 0.5605, "step": 401300 }, { "epoch": 5.530296767793668, "grad_norm": 96.86162567138672, 
"learning_rate": 3.903895765526919e-05, "loss": 0.5437, "step": 401400 }, { "epoch": 5.531674519853407, "grad_norm": 15.472757339477539, "learning_rate": 3.903168146994582e-05, "loss": 0.5706, "step": 401500 }, { "epoch": 5.533052271913147, "grad_norm": 3.812843084335327, "learning_rate": 3.9024404304947124e-05, "loss": 0.6388, "step": 401600 }, { "epoch": 5.534430023972886, "grad_norm": 8.083906173706055, "learning_rate": 3.9017126160891314e-05, "loss": 0.6355, "step": 401700 }, { "epoch": 5.535807776032625, "grad_norm": 26.95009422302246, "learning_rate": 3.900984703839672e-05, "loss": 0.5922, "step": 401800 }, { "epoch": 5.5371855280923645, "grad_norm": 19.113903045654297, "learning_rate": 3.900256693808174e-05, "loss": 0.6205, "step": 401900 }, { "epoch": 5.538563280152104, "grad_norm": 9.053086280822754, "learning_rate": 3.899535867617513e-05, "loss": 0.5659, "step": 402000 }, { "epoch": 5.539941032211843, "grad_norm": 37.57135009765625, "learning_rate": 3.8988149457113135e-05, "loss": 0.5775, "step": 402100 }, { "epoch": 5.541318784271582, "grad_norm": 5.8613152503967285, "learning_rate": 3.898086644656142e-05, "loss": 0.5821, "step": 402200 }, { "epoch": 5.542696536331322, "grad_norm": 6.441689968109131, "learning_rate": 3.897358246065138e-05, "loss": 0.6273, "step": 402300 }, { "epoch": 5.544074288391061, "grad_norm": 4.96812629699707, "learning_rate": 3.896629750000181e-05, "loss": 0.6514, "step": 402400 }, { "epoch": 5.5454520404508, "grad_norm": 4.849573612213135, "learning_rate": 3.895901156523161e-05, "loss": 0.5573, "step": 402500 }, { "epoch": 5.54682979251054, "grad_norm": 19.24586296081543, "learning_rate": 3.895172465695975e-05, "loss": 0.6427, "step": 402600 }, { "epoch": 5.548207544570279, "grad_norm": 6.632706642150879, "learning_rate": 3.8944436775805294e-05, "loss": 0.4727, "step": 402700 }, { "epoch": 5.549585296630019, "grad_norm": 14.708337783813477, "learning_rate": 3.893714792238739e-05, "loss": 0.5611, "step": 402800 }, { "epoch": 
5.550963048689757, "grad_norm": 9.718215942382812, "learning_rate": 3.892985809732525e-05, "loss": 0.6052, "step": 402900 }, { "epoch": 5.552340800749497, "grad_norm": 11.996245384216309, "learning_rate": 3.892256730123819e-05, "loss": 0.5922, "step": 403000 }, { "epoch": 5.5537185528092365, "grad_norm": 4.267444133758545, "learning_rate": 3.89152755347456e-05, "loss": 0.5713, "step": 403100 }, { "epoch": 5.555096304868976, "grad_norm": 9.997715950012207, "learning_rate": 3.8907982798466956e-05, "loss": 0.594, "step": 403200 }, { "epoch": 5.556474056928715, "grad_norm": 8.01296329498291, "learning_rate": 3.89007620348716e-05, "loss": 0.552, "step": 403300 }, { "epoch": 5.557851808988454, "grad_norm": 114.51327514648438, "learning_rate": 3.889346737056199e-05, "loss": 0.5488, "step": 403400 }, { "epoch": 5.559229561048194, "grad_norm": 1.3845990896224976, "learning_rate": 3.888617173831905e-05, "loss": 0.5585, "step": 403500 }, { "epoch": 5.560607313107933, "grad_norm": 13.90626335144043, "learning_rate": 3.887887513876255e-05, "loss": 0.5187, "step": 403600 }, { "epoch": 5.561985065167672, "grad_norm": 4.643988132476807, "learning_rate": 3.88715775725124e-05, "loss": 0.5841, "step": 403700 }, { "epoch": 5.563362817227412, "grad_norm": 7.148656845092773, "learning_rate": 3.8864279040188565e-05, "loss": 0.6647, "step": 403800 }, { "epoch": 5.564740569287151, "grad_norm": 8.253439903259277, "learning_rate": 3.885697954241108e-05, "loss": 0.5794, "step": 403900 }, { "epoch": 5.566118321346891, "grad_norm": 39.806983947753906, "learning_rate": 3.884967907980009e-05, "loss": 0.5727, "step": 404000 }, { "epoch": 5.567496073406629, "grad_norm": 12.74814224243164, "learning_rate": 3.8842377652975784e-05, "loss": 0.6619, "step": 404100 }, { "epoch": 5.568873825466369, "grad_norm": 7.739386081695557, "learning_rate": 3.883507526255848e-05, "loss": 0.5363, "step": 404200 }, { "epoch": 5.5702515775261086, "grad_norm": 3.6189370155334473, "learning_rate": 3.8827771909168535e-05, 
"loss": 0.5867, "step": 404300 }, { "epoch": 5.571629329585848, "grad_norm": 7.51109504699707, "learning_rate": 3.882046759342641e-05, "loss": 0.5445, "step": 404400 }, { "epoch": 5.573007081645587, "grad_norm": 8.570854187011719, "learning_rate": 3.881316231595266e-05, "loss": 0.6674, "step": 404500 }, { "epoch": 5.574384833705326, "grad_norm": 23.80831527709961, "learning_rate": 3.880585607736789e-05, "loss": 0.5858, "step": 404600 }, { "epoch": 5.575762585765066, "grad_norm": 4.698869705200195, "learning_rate": 3.87985488782928e-05, "loss": 0.5427, "step": 404700 }, { "epoch": 5.577140337824805, "grad_norm": 2.22591233253479, "learning_rate": 3.8791240719348184e-05, "loss": 0.5897, "step": 404800 }, { "epoch": 5.578518089884544, "grad_norm": 2.73796010017395, "learning_rate": 3.878393160115491e-05, "loss": 0.5859, "step": 404900 }, { "epoch": 5.579895841944284, "grad_norm": 5.244845390319824, "learning_rate": 3.877662152433391e-05, "loss": 0.5895, "step": 405000 }, { "epoch": 5.581273594004023, "grad_norm": 21.79768180847168, "learning_rate": 3.8769310489506224e-05, "loss": 0.5254, "step": 405100 }, { "epoch": 5.582651346063763, "grad_norm": 3.9913852214813232, "learning_rate": 3.876199849729295e-05, "loss": 0.5458, "step": 405200 }, { "epoch": 5.5840290981235015, "grad_norm": 6.309573650360107, "learning_rate": 3.87546855483153e-05, "loss": 0.5521, "step": 405300 }, { "epoch": 5.585406850183241, "grad_norm": 3.8957104682922363, "learning_rate": 3.874737164319453e-05, "loss": 0.5393, "step": 405400 }, { "epoch": 5.586784602242981, "grad_norm": 14.508999824523926, "learning_rate": 3.8740056782552e-05, "loss": 0.5773, "step": 405500 }, { "epoch": 5.588162354302719, "grad_norm": 20.58478546142578, "learning_rate": 3.873274096700913e-05, "loss": 0.6063, "step": 405600 }, { "epoch": 5.589540106362459, "grad_norm": 7.5302019119262695, "learning_rate": 3.872542419718746e-05, "loss": 0.6154, "step": 405700 }, { "epoch": 5.590917858422198, "grad_norm": 8.081854820251465, 
"learning_rate": 3.871810647370858e-05, "loss": 0.6599, "step": 405800 }, { "epoch": 5.592295610481938, "grad_norm": 18.692920684814453, "learning_rate": 3.8710787797194145e-05, "loss": 0.4728, "step": 405900 }, { "epoch": 5.5936733625416775, "grad_norm": 26.04582977294922, "learning_rate": 3.870346816826595e-05, "loss": 0.5816, "step": 406000 }, { "epoch": 5.595051114601416, "grad_norm": 7.022157192230225, "learning_rate": 3.869614758754581e-05, "loss": 0.6078, "step": 406100 }, { "epoch": 5.596428866661156, "grad_norm": 10.696815490722656, "learning_rate": 3.868882605565565e-05, "loss": 0.668, "step": 406200 }, { "epoch": 5.597806618720895, "grad_norm": 4.490745544433594, "learning_rate": 3.8681503573217486e-05, "loss": 0.589, "step": 406300 }, { "epoch": 5.599184370780634, "grad_norm": 4.078994274139404, "learning_rate": 3.867418014085338e-05, "loss": 0.5682, "step": 406400 }, { "epoch": 5.6005621228403735, "grad_norm": 16.163284301757812, "learning_rate": 3.866685575918551e-05, "loss": 0.5954, "step": 406500 }, { "epoch": 5.601939874900113, "grad_norm": 13.411498069763184, "learning_rate": 3.865953042883612e-05, "loss": 0.571, "step": 406600 }, { "epoch": 5.603317626959853, "grad_norm": 25.4854679107666, "learning_rate": 3.865220415042751e-05, "loss": 0.6416, "step": 406700 }, { "epoch": 5.604695379019591, "grad_norm": 21.939712524414062, "learning_rate": 3.8644876924582115e-05, "loss": 0.5763, "step": 406800 }, { "epoch": 5.606073131079331, "grad_norm": 11.836036682128906, "learning_rate": 3.8637548751922405e-05, "loss": 0.6635, "step": 406900 }, { "epoch": 5.60745088313907, "grad_norm": 10.029311180114746, "learning_rate": 3.863021963307095e-05, "loss": 0.5576, "step": 407000 }, { "epoch": 5.60882863519881, "grad_norm": 15.481919288635254, "learning_rate": 3.86228895686504e-05, "loss": 0.5838, "step": 407100 }, { "epoch": 5.610206387258549, "grad_norm": 4.08052396774292, "learning_rate": 3.861555855928347e-05, "loss": 0.6174, "step": 407200 }, { "epoch": 
5.611584139318288, "grad_norm": 8.861710548400879, "learning_rate": 3.860822660559298e-05, "loss": 0.6717, "step": 407300 }, { "epoch": 5.612961891378028, "grad_norm": 3.937213659286499, "learning_rate": 3.86008937082018e-05, "loss": 0.5986, "step": 407400 }, { "epoch": 5.614339643437767, "grad_norm": 6.842425346374512, "learning_rate": 3.859355986773292e-05, "loss": 0.5749, "step": 407500 }, { "epoch": 5.615717395497506, "grad_norm": 11.892126083374023, "learning_rate": 3.8586225084809365e-05, "loss": 0.5783, "step": 407600 }, { "epoch": 5.6170951475572455, "grad_norm": 3.5909173488616943, "learning_rate": 3.857888936005428e-05, "loss": 0.5823, "step": 407700 }, { "epoch": 5.618472899616985, "grad_norm": 15.181912422180176, "learning_rate": 3.857155269409086e-05, "loss": 0.6049, "step": 407800 }, { "epoch": 5.619850651676725, "grad_norm": 13.354763984680176, "learning_rate": 3.85642150875424e-05, "loss": 0.6094, "step": 407900 }, { "epoch": 5.621228403736463, "grad_norm": 23.534334182739258, "learning_rate": 3.8556876541032264e-05, "loss": 0.6279, "step": 408000 }, { "epoch": 5.622606155796203, "grad_norm": 10.670170783996582, "learning_rate": 3.8549537055183894e-05, "loss": 0.6505, "step": 408100 }, { "epoch": 5.623983907855942, "grad_norm": 10.856382369995117, "learning_rate": 3.854219663062083e-05, "loss": 0.6927, "step": 408200 }, { "epoch": 5.625361659915682, "grad_norm": 19.299663543701172, "learning_rate": 3.8534855267966665e-05, "loss": 0.5689, "step": 408300 }, { "epoch": 5.626739411975421, "grad_norm": 153.18516540527344, "learning_rate": 3.852751296784509e-05, "loss": 0.6135, "step": 408400 }, { "epoch": 5.62811716403516, "grad_norm": 6.1429524421691895, "learning_rate": 3.852016973087988e-05, "loss": 0.5947, "step": 408500 }, { "epoch": 5.6294949160949, "grad_norm": 11.997945785522461, "learning_rate": 3.8512825557694865e-05, "loss": 0.5577, "step": 408600 }, { "epoch": 5.630872668154639, "grad_norm": 16.177392959594727, "learning_rate": 
3.850548044891398e-05, "loss": 0.5729, "step": 408700 }, { "epoch": 5.632250420214378, "grad_norm": 4.524254322052002, "learning_rate": 3.849813440516123e-05, "loss": 0.6511, "step": 408800 }, { "epoch": 5.6336281722741175, "grad_norm": 27.117694854736328, "learning_rate": 3.849078742706069e-05, "loss": 0.6178, "step": 408900 }, { "epoch": 5.635005924333857, "grad_norm": 9.303934097290039, "learning_rate": 3.848343951523653e-05, "loss": 0.6131, "step": 409000 }, { "epoch": 5.636383676393596, "grad_norm": 3.180523633956909, "learning_rate": 3.8476090670312996e-05, "loss": 0.5776, "step": 409100 }, { "epoch": 5.637761428453335, "grad_norm": 71.8797836303711, "learning_rate": 3.8468740892914414e-05, "loss": 0.6215, "step": 409200 }, { "epoch": 5.639139180513075, "grad_norm": 4.7273478507995605, "learning_rate": 3.8461390183665166e-05, "loss": 0.6278, "step": 409300 }, { "epoch": 5.640516932572814, "grad_norm": 28.959108352661133, "learning_rate": 3.8454038543189746e-05, "loss": 0.6483, "step": 409400 }, { "epoch": 5.641894684632554, "grad_norm": 6.889366149902344, "learning_rate": 3.844668597211271e-05, "loss": 0.5617, "step": 409500 }, { "epoch": 5.643272436692293, "grad_norm": 8.85036849975586, "learning_rate": 3.84393324710587e-05, "loss": 0.6637, "step": 409600 }, { "epoch": 5.644650188752032, "grad_norm": 9.753440856933594, "learning_rate": 3.843197804065244e-05, "loss": 0.6394, "step": 409700 }, { "epoch": 5.646027940811772, "grad_norm": 97.58231353759766, "learning_rate": 3.84246226815187e-05, "loss": 0.6126, "step": 409800 }, { "epoch": 5.64740569287151, "grad_norm": 1.8015433549880981, "learning_rate": 3.841726639428239e-05, "loss": 0.5896, "step": 409900 }, { "epoch": 5.64878344493125, "grad_norm": 1.3764885663986206, "learning_rate": 3.840990917956843e-05, "loss": 0.5802, "step": 410000 }, { "epoch": 5.6501611969909895, "grad_norm": 21.17970848083496, "learning_rate": 3.840255103800188e-05, "loss": 0.5663, "step": 410100 }, { "epoch": 5.651538949050729, 
"grad_norm": 7.723023891448975, "learning_rate": 3.839519197020785e-05, "loss": 0.5891, "step": 410200 }, { "epoch": 5.652916701110469, "grad_norm": 6.013448238372803, "learning_rate": 3.8387831976811504e-05, "loss": 0.6004, "step": 410300 }, { "epoch": 5.654294453170207, "grad_norm": 11.646791458129883, "learning_rate": 3.8380471058438155e-05, "loss": 0.5385, "step": 410400 }, { "epoch": 5.655672205229947, "grad_norm": 421.31719970703125, "learning_rate": 3.83731092157131e-05, "loss": 0.5981, "step": 410500 }, { "epoch": 5.657049957289686, "grad_norm": 11.343670845031738, "learning_rate": 3.8365746449261806e-05, "loss": 0.5304, "step": 410600 }, { "epoch": 5.658427709349425, "grad_norm": 7.379881381988525, "learning_rate": 3.8358382759709765e-05, "loss": 0.6346, "step": 410700 }, { "epoch": 5.659805461409165, "grad_norm": 11.422622680664062, "learning_rate": 3.835101814768255e-05, "loss": 0.6584, "step": 410800 }, { "epoch": 5.661183213468904, "grad_norm": 2.0989139080047607, "learning_rate": 3.8343652613805826e-05, "loss": 0.634, "step": 410900 }, { "epoch": 5.662560965528644, "grad_norm": 2.7042441368103027, "learning_rate": 3.833628615870534e-05, "loss": 0.5042, "step": 411000 }, { "epoch": 5.663938717588382, "grad_norm": 6.44378662109375, "learning_rate": 3.8328918783006913e-05, "loss": 0.5971, "step": 411100 }, { "epoch": 5.665316469648122, "grad_norm": 2.921806573867798, "learning_rate": 3.832155048733643e-05, "loss": 0.5016, "step": 411200 }, { "epoch": 5.6666942217078615, "grad_norm": 5.057344913482666, "learning_rate": 3.831425496901874e-05, "loss": 0.6401, "step": 411300 }, { "epoch": 5.668071973767601, "grad_norm": 3.4876370429992676, "learning_rate": 3.8306884844466255e-05, "loss": 0.5588, "step": 411400 }, { "epoch": 5.66944972582734, "grad_norm": 6.4181413650512695, "learning_rate": 3.829951380181362e-05, "loss": 0.5348, "step": 411500 }, { "epoch": 5.670827477887079, "grad_norm": 16.41754722595215, "learning_rate": 3.829214184168704e-05, "loss": 
0.5263, "step": 411600 }, { "epoch": 5.672205229946819, "grad_norm": 25.24420928955078, "learning_rate": 3.828476896471279e-05, "loss": 0.578, "step": 411700 }, { "epoch": 5.673582982006558, "grad_norm": 9.882627487182617, "learning_rate": 3.8277395171517253e-05, "loss": 0.4634, "step": 411800 }, { "epoch": 5.674960734066297, "grad_norm": 16.55098533630371, "learning_rate": 3.827002046272684e-05, "loss": 0.5444, "step": 411900 }, { "epoch": 5.676338486126037, "grad_norm": 8.782390594482422, "learning_rate": 3.8262718599732725e-05, "loss": 0.5511, "step": 412000 }, { "epoch": 5.677716238185776, "grad_norm": 5.046112060546875, "learning_rate": 3.8255342070772565e-05, "loss": 0.5717, "step": 412100 }, { "epoch": 5.679093990245516, "grad_norm": 4.593100070953369, "learning_rate": 3.824796462809107e-05, "loss": 0.614, "step": 412200 }, { "epoch": 5.680471742305254, "grad_norm": 4.590670108795166, "learning_rate": 3.8240586272315e-05, "loss": 0.6133, "step": 412300 }, { "epoch": 5.681849494364994, "grad_norm": 10.165679931640625, "learning_rate": 3.823320700407116e-05, "loss": 0.5372, "step": 412400 }, { "epoch": 5.6832272464247335, "grad_norm": 4.057440280914307, "learning_rate": 3.822582682398649e-05, "loss": 0.7187, "step": 412500 }, { "epoch": 5.684604998484473, "grad_norm": 13.169560432434082, "learning_rate": 3.821844573268795e-05, "loss": 0.5909, "step": 412600 }, { "epoch": 5.685982750544212, "grad_norm": 9.047806739807129, "learning_rate": 3.821106373080261e-05, "loss": 0.5086, "step": 412700 }, { "epoch": 5.687360502603951, "grad_norm": 9.026451110839844, "learning_rate": 3.820368081895761e-05, "loss": 0.64, "step": 412800 }, { "epoch": 5.688738254663691, "grad_norm": 20.203289031982422, "learning_rate": 3.819629699778017e-05, "loss": 0.5369, "step": 412900 }, { "epoch": 5.69011600672343, "grad_norm": 13.962346076965332, "learning_rate": 3.818891226789757e-05, "loss": 0.5912, "step": 413000 }, { "epoch": 5.691493758783169, "grad_norm": 6.664554119110107, 
"learning_rate": 3.818152662993719e-05, "loss": 0.6237, "step": 413100 }, { "epoch": 5.692871510842909, "grad_norm": 38.70198440551758, "learning_rate": 3.817414008452648e-05, "loss": 0.6179, "step": 413200 }, { "epoch": 5.694249262902648, "grad_norm": 3.2160747051239014, "learning_rate": 3.816675263229296e-05, "loss": 0.5593, "step": 413300 }, { "epoch": 5.695627014962387, "grad_norm": 4.834950923919678, "learning_rate": 3.815936427386424e-05, "loss": 0.6315, "step": 413400 }, { "epoch": 5.697004767022126, "grad_norm": 8.595553398132324, "learning_rate": 3.815197500986799e-05, "loss": 0.5227, "step": 413500 }, { "epoch": 5.698382519081866, "grad_norm": 20.681102752685547, "learning_rate": 3.8144584840931956e-05, "loss": 0.5941, "step": 413600 }, { "epoch": 5.6997602711416055, "grad_norm": 7.695576190948486, "learning_rate": 3.813719376768399e-05, "loss": 0.5486, "step": 413700 }, { "epoch": 5.701138023201345, "grad_norm": 1.5854418277740479, "learning_rate": 3.812980179075199e-05, "loss": 0.609, "step": 413800 }, { "epoch": 5.702515775261084, "grad_norm": 4.077051162719727, "learning_rate": 3.812240891076395e-05, "loss": 0.6083, "step": 413900 }, { "epoch": 5.703893527320823, "grad_norm": 3.825028657913208, "learning_rate": 3.811501512834793e-05, "loss": 0.6136, "step": 414000 }, { "epoch": 5.705271279380563, "grad_norm": 3.4526240825653076, "learning_rate": 3.810762044413207e-05, "loss": 0.5935, "step": 414100 }, { "epoch": 5.706649031440302, "grad_norm": 9.169675827026367, "learning_rate": 3.810022485874458e-05, "loss": 0.635, "step": 414200 }, { "epoch": 5.708026783500041, "grad_norm": 5.189697265625, "learning_rate": 3.809282837281376e-05, "loss": 0.6432, "step": 414300 }, { "epoch": 5.709404535559781, "grad_norm": 18.285369873046875, "learning_rate": 3.808543098696798e-05, "loss": 0.5431, "step": 414400 }, { "epoch": 5.71078228761952, "grad_norm": 6.748597145080566, "learning_rate": 3.807803270183568e-05, "loss": 0.5442, "step": 414500 }, { "epoch": 
5.71216003967926, "grad_norm": 3.252908945083618, "learning_rate": 3.807063351804539e-05, "loss": 0.5767, "step": 414600 }, { "epoch": 5.7135377917389985, "grad_norm": 9.5114164352417, "learning_rate": 3.806323343622569e-05, "loss": 0.6152, "step": 414700 }, { "epoch": 5.714915543798738, "grad_norm": 1.6165428161621094, "learning_rate": 3.8055832457005287e-05, "loss": 0.5425, "step": 414800 }, { "epoch": 5.716293295858478, "grad_norm": 7.752899646759033, "learning_rate": 3.804843058101291e-05, "loss": 0.5609, "step": 414900 }, { "epoch": 5.717671047918216, "grad_norm": 3.8368735313415527, "learning_rate": 3.804102780887738e-05, "loss": 0.6638, "step": 415000 }, { "epoch": 5.719048799977956, "grad_norm": 16.15721321105957, "learning_rate": 3.803362414122761e-05, "loss": 0.5415, "step": 415100 }, { "epoch": 5.720426552037695, "grad_norm": 399.3175048828125, "learning_rate": 3.802621957869259e-05, "loss": 0.577, "step": 415200 }, { "epoch": 5.721804304097435, "grad_norm": 6.12244176864624, "learning_rate": 3.801881412190135e-05, "loss": 0.6148, "step": 415300 }, { "epoch": 5.723182056157174, "grad_norm": 3.3543612957000732, "learning_rate": 3.8011407771483056e-05, "loss": 0.6247, "step": 415400 }, { "epoch": 5.724559808216913, "grad_norm": 6.6794538497924805, "learning_rate": 3.800400052806688e-05, "loss": 0.6077, "step": 415500 }, { "epoch": 5.725937560276653, "grad_norm": 4.871297359466553, "learning_rate": 3.799659239228212e-05, "loss": 0.6212, "step": 415600 }, { "epoch": 5.727315312336392, "grad_norm": 15.35666275024414, "learning_rate": 3.798918336475815e-05, "loss": 0.5743, "step": 415700 }, { "epoch": 5.728693064396131, "grad_norm": 10.151765823364258, "learning_rate": 3.798177344612438e-05, "loss": 0.5293, "step": 415800 }, { "epoch": 5.7300708164558705, "grad_norm": 1.2828086614608765, "learning_rate": 3.797436263701034e-05, "loss": 0.6429, "step": 415900 }, { "epoch": 5.73144856851561, "grad_norm": 7.951080322265625, "learning_rate": 3.79669509380456e-05, 
"loss": 0.5442, "step": 416000 }, { "epoch": 5.73282632057535, "grad_norm": 10.235562324523926, "learning_rate": 3.795953834985983e-05, "loss": 0.599, "step": 416100 }, { "epoch": 5.734204072635088, "grad_norm": 5.476729869842529, "learning_rate": 3.7952124873082766e-05, "loss": 0.5723, "step": 416200 }, { "epoch": 5.735581824694828, "grad_norm": 6.906072616577148, "learning_rate": 3.7944710508344225e-05, "loss": 0.6486, "step": 416300 }, { "epoch": 5.736959576754567, "grad_norm": 7.615192413330078, "learning_rate": 3.793729525627409e-05, "loss": 0.5485, "step": 416400 }, { "epoch": 5.738337328814307, "grad_norm": 6.2774810791015625, "learning_rate": 3.792987911750233e-05, "loss": 0.498, "step": 416500 }, { "epoch": 5.739715080874046, "grad_norm": 11.927968978881836, "learning_rate": 3.792246209265897e-05, "loss": 0.6452, "step": 416600 }, { "epoch": 5.741092832933785, "grad_norm": 3.4691221714019775, "learning_rate": 3.791504418237414e-05, "loss": 0.5523, "step": 416700 }, { "epoch": 5.742470584993525, "grad_norm": 33.873504638671875, "learning_rate": 3.7907625387278023e-05, "loss": 0.607, "step": 416800 }, { "epoch": 5.743848337053264, "grad_norm": 14.57044792175293, "learning_rate": 3.790020570800088e-05, "loss": 0.6101, "step": 416900 }, { "epoch": 5.745226089113003, "grad_norm": 0.9322608709335327, "learning_rate": 3.789278514517307e-05, "loss": 0.6007, "step": 417000 }, { "epoch": 5.7466038411727425, "grad_norm": 6.976534843444824, "learning_rate": 3.788536369942498e-05, "loss": 0.5511, "step": 417100 }, { "epoch": 5.747981593232482, "grad_norm": 5.832825183868408, "learning_rate": 3.787794137138711e-05, "loss": 0.5612, "step": 417200 }, { "epoch": 5.749359345292222, "grad_norm": 5.226287841796875, "learning_rate": 3.787051816169004e-05, "loss": 0.6086, "step": 417300 }, { "epoch": 5.75073709735196, "grad_norm": 4.8902974128723145, "learning_rate": 3.786309407096439e-05, "loss": 0.5356, "step": 417400 }, { "epoch": 5.7521148494117, "grad_norm": 
5.891170024871826, "learning_rate": 3.785566909984088e-05, "loss": 0.5896, "step": 417500 }, { "epoch": 5.753492601471439, "grad_norm": 4.325453281402588, "learning_rate": 3.78482432489503e-05, "loss": 0.5538, "step": 417600 }, { "epoch": 5.754870353531178, "grad_norm": 10.180216789245605, "learning_rate": 3.7840816518923516e-05, "loss": 0.5688, "step": 417700 }, { "epoch": 5.756248105590918, "grad_norm": 6.114187240600586, "learning_rate": 3.783338891039146e-05, "loss": 0.6346, "step": 417800 }, { "epoch": 5.757625857650657, "grad_norm": 4.8008036613464355, "learning_rate": 3.7825960423985165e-05, "loss": 0.5841, "step": 417900 }, { "epoch": 5.759003609710397, "grad_norm": 22.57602882385254, "learning_rate": 3.781853106033569e-05, "loss": 0.5017, "step": 418000 }, { "epoch": 5.760381361770136, "grad_norm": 5.566181659698486, "learning_rate": 3.781110082007423e-05, "loss": 0.6026, "step": 418100 }, { "epoch": 5.761759113829875, "grad_norm": 16.178930282592773, "learning_rate": 3.7803669703831986e-05, "loss": 0.641, "step": 418200 }, { "epoch": 5.7631368658896145, "grad_norm": 12.21933650970459, "learning_rate": 3.7796237712240295e-05, "loss": 0.5489, "step": 418300 }, { "epoch": 5.764514617949354, "grad_norm": 5.758997440338135, "learning_rate": 3.7788804845930535e-05, "loss": 0.5397, "step": 418400 }, { "epoch": 5.765892370009093, "grad_norm": 6.082283973693848, "learning_rate": 3.778137110553417e-05, "loss": 0.6224, "step": 418500 }, { "epoch": 5.767270122068832, "grad_norm": 11.053380966186523, "learning_rate": 3.7773936491682723e-05, "loss": 0.5798, "step": 418600 }, { "epoch": 5.768647874128572, "grad_norm": 4.975136756896973, "learning_rate": 3.7766501005007816e-05, "loss": 0.6402, "step": 418700 }, { "epoch": 5.770025626188311, "grad_norm": 29.005327224731445, "learning_rate": 3.775906464614112e-05, "loss": 0.5531, "step": 418800 }, { "epoch": 5.771403378248051, "grad_norm": 28.622915267944336, "learning_rate": 3.7751627415714406e-05, "loss": 0.617, "step": 
418900 }, { "epoch": 5.77278113030779, "grad_norm": 31.343488693237305, "learning_rate": 3.774418931435949e-05, "loss": 0.5954, "step": 419000 }, { "epoch": 5.774158882367529, "grad_norm": 6.084015369415283, "learning_rate": 3.773682473673068e-05, "loss": 0.6142, "step": 419100 }, { "epoch": 5.775536634427269, "grad_norm": 17.0856990814209, "learning_rate": 3.772938490410868e-05, "loss": 0.5966, "step": 419200 }, { "epoch": 5.776914386487007, "grad_norm": 19.9857234954834, "learning_rate": 3.772194420244809e-05, "loss": 0.6046, "step": 419300 }, { "epoch": 5.778292138546747, "grad_norm": 2.3278746604919434, "learning_rate": 3.771450263238105e-05, "loss": 0.5507, "step": 419400 }, { "epoch": 5.7796698906064865, "grad_norm": 2.851388931274414, "learning_rate": 3.770706019453976e-05, "loss": 0.6338, "step": 419500 }, { "epoch": 5.781047642666226, "grad_norm": 6.879894256591797, "learning_rate": 3.769961688955647e-05, "loss": 0.6018, "step": 419600 }, { "epoch": 5.782425394725965, "grad_norm": 12.186042785644531, "learning_rate": 3.769224716406563e-05, "loss": 0.6212, "step": 419700 }, { "epoch": 5.783803146785704, "grad_norm": 17.784088134765625, "learning_rate": 3.7684876589922334e-05, "loss": 0.6063, "step": 419800 }, { "epoch": 5.785180898845444, "grad_norm": 392.12445068359375, "learning_rate": 3.767743070460618e-05, "loss": 0.5581, "step": 419900 }, { "epoch": 5.786558650905183, "grad_norm": 4.219203948974609, "learning_rate": 3.766998395466522e-05, "loss": 0.567, "step": 420000 }, { "epoch": 5.787936402964922, "grad_norm": 8.193329811096191, "learning_rate": 3.7662536340732094e-05, "loss": 0.5591, "step": 420100 }, { "epoch": 5.789314155024662, "grad_norm": 9.114564895629883, "learning_rate": 3.765508786343951e-05, "loss": 0.608, "step": 420200 }, { "epoch": 5.790691907084401, "grad_norm": 15.776185035705566, "learning_rate": 3.764763852342026e-05, "loss": 0.5622, "step": 420300 }, { "epoch": 5.792069659144141, "grad_norm": 14.720494270324707, "learning_rate": 
3.764018832130721e-05, "loss": 0.6298, "step": 420400 }, { "epoch": 5.793447411203879, "grad_norm": 59.077117919921875, "learning_rate": 3.763273725773328e-05, "loss": 0.5939, "step": 420500 }, { "epoch": 5.794825163263619, "grad_norm": 18.532920837402344, "learning_rate": 3.762528533333148e-05, "loss": 0.6347, "step": 420600 }, { "epoch": 5.7962029153233585, "grad_norm": 116.89422607421875, "learning_rate": 3.76178325487349e-05, "loss": 0.6525, "step": 420700 }, { "epoch": 5.797580667383098, "grad_norm": 178.13636779785156, "learning_rate": 3.761037890457666e-05, "loss": 0.693, "step": 420800 }, { "epoch": 5.798958419442837, "grad_norm": 5.303542137145996, "learning_rate": 3.760292440149002e-05, "loss": 0.6456, "step": 420900 }, { "epoch": 5.800336171502576, "grad_norm": 9.722251892089844, "learning_rate": 3.7595469040108275e-05, "loss": 0.6326, "step": 421000 }, { "epoch": 5.801713923562316, "grad_norm": 12.68631649017334, "learning_rate": 3.758801282106477e-05, "loss": 0.5819, "step": 421100 }, { "epoch": 5.803091675622055, "grad_norm": 36.83993148803711, "learning_rate": 3.758055574499298e-05, "loss": 0.597, "step": 421200 }, { "epoch": 5.804469427681794, "grad_norm": 13.51420783996582, "learning_rate": 3.7573097812526403e-05, "loss": 0.5683, "step": 421300 }, { "epoch": 5.805847179741534, "grad_norm": 23.411298751831055, "learning_rate": 3.7565639024298636e-05, "loss": 0.5431, "step": 421400 }, { "epoch": 5.807224931801273, "grad_norm": 6.391308784484863, "learning_rate": 3.7558179380943335e-05, "loss": 0.6411, "step": 421500 }, { "epoch": 5.808602683861013, "grad_norm": 36.90744400024414, "learning_rate": 3.755071888309423e-05, "loss": 0.6675, "step": 421600 }, { "epoch": 5.809980435920751, "grad_norm": 27.961341857910156, "learning_rate": 3.7543257531385156e-05, "loss": 0.7391, "step": 421700 }, { "epoch": 5.811358187980491, "grad_norm": 8.203859329223633, "learning_rate": 3.753579532644997e-05, "loss": 0.6663, "step": 421800 }, { "epoch": 
5.8127359400402305, "grad_norm": 29.1221923828125, "learning_rate": 3.752833226892261e-05, "loss": 0.5969, "step": 421900 }, { "epoch": 5.814113692099969, "grad_norm": 17.87327003479004, "learning_rate": 3.752086835943714e-05, "loss": 0.5829, "step": 422000 }, { "epoch": 5.815491444159709, "grad_norm": 9.801362991333008, "learning_rate": 3.7513403598627614e-05, "loss": 0.6016, "step": 422100 }, { "epoch": 5.816869196219448, "grad_norm": 4.09297513961792, "learning_rate": 3.750593798712824e-05, "loss": 0.6083, "step": 422200 }, { "epoch": 5.818246948279188, "grad_norm": 26.39430809020996, "learning_rate": 3.749847152557325e-05, "loss": 0.555, "step": 422300 }, { "epoch": 5.819624700338927, "grad_norm": 20.324661254882812, "learning_rate": 3.749100421459694e-05, "loss": 0.6792, "step": 422400 }, { "epoch": 5.821002452398666, "grad_norm": 5.561123371124268, "learning_rate": 3.7483536054833714e-05, "loss": 0.6992, "step": 422500 }, { "epoch": 5.822380204458406, "grad_norm": 14.129959106445312, "learning_rate": 3.747606704691801e-05, "loss": 0.6028, "step": 422600 }, { "epoch": 5.823757956518145, "grad_norm": 9.802308082580566, "learning_rate": 3.746859719148439e-05, "loss": 0.6703, "step": 422700 }, { "epoch": 5.825135708577884, "grad_norm": 11.730709075927734, "learning_rate": 3.746112648916745e-05, "loss": 0.601, "step": 422800 }, { "epoch": 5.826513460637623, "grad_norm": 3.9638137817382812, "learning_rate": 3.745365494060183e-05, "loss": 0.5589, "step": 422900 }, { "epoch": 5.827891212697363, "grad_norm": 5.916142463684082, "learning_rate": 3.744618254642231e-05, "loss": 0.6173, "step": 423000 }, { "epoch": 5.8292689647571025, "grad_norm": 4.996461868286133, "learning_rate": 3.743870930726369e-05, "loss": 0.6419, "step": 423100 }, { "epoch": 5.830646716816842, "grad_norm": 94.03470611572266, "learning_rate": 3.743123522376088e-05, "loss": 0.6925, "step": 423200 }, { "epoch": 5.832024468876581, "grad_norm": 10.281254768371582, "learning_rate": 
3.7423760296548826e-05, "loss": 0.6221, "step": 423300 }, { "epoch": 5.83340222093632, "grad_norm": 33.95487976074219, "learning_rate": 3.7416284526262565e-05, "loss": 0.6585, "step": 423400 }, { "epoch": 5.83477997299606, "grad_norm": 3.6431691646575928, "learning_rate": 3.74088079135372e-05, "loss": 0.6664, "step": 423500 }, { "epoch": 5.8361577250557986, "grad_norm": 3.5131843090057373, "learning_rate": 3.740133045900791e-05, "loss": 0.6495, "step": 423600 }, { "epoch": 5.837535477115538, "grad_norm": 17.174955368041992, "learning_rate": 3.739385216330995e-05, "loss": 0.6284, "step": 423700 }, { "epoch": 5.838913229175278, "grad_norm": 174.90777587890625, "learning_rate": 3.7386373027078625e-05, "loss": 0.6971, "step": 423800 }, { "epoch": 5.840290981235017, "grad_norm": 51.398136138916016, "learning_rate": 3.7378967854866036e-05, "loss": 0.6496, "step": 423900 }, { "epoch": 5.841668733294756, "grad_norm": 9.858229637145996, "learning_rate": 3.7371487047863715e-05, "loss": 0.6288, "step": 424000 }, { "epoch": 5.8430464853544954, "grad_norm": 3.9244627952575684, "learning_rate": 3.736400540222808e-05, "loss": 0.6168, "step": 424100 }, { "epoch": 5.844424237414235, "grad_norm": 28.409439086914062, "learning_rate": 3.7356522918594716e-05, "loss": 0.6231, "step": 424200 }, { "epoch": 5.845801989473975, "grad_norm": 30.201217651367188, "learning_rate": 3.734903959759929e-05, "loss": 0.6951, "step": 424300 }, { "epoch": 5.847179741533713, "grad_norm": 7.8538312911987305, "learning_rate": 3.734155543987758e-05, "loss": 0.6718, "step": 424400 }, { "epoch": 5.848557493593453, "grad_norm": 46.438236236572266, "learning_rate": 3.733407044606538e-05, "loss": 0.6381, "step": 424500 }, { "epoch": 5.849935245653192, "grad_norm": 19.912607192993164, "learning_rate": 3.732658461679859e-05, "loss": 0.6957, "step": 424600 }, { "epoch": 5.851312997712932, "grad_norm": 9.96639633178711, "learning_rate": 3.731917282348428e-05, "loss": 0.6889, "step": 424700 }, { "epoch": 
5.852690749772671, "grad_norm": 1.5888310670852661, "learning_rate": 3.731168533355492e-05, "loss": 0.6641, "step": 424800 }, { "epoch": 5.85406850183241, "grad_norm": 76.31163787841797, "learning_rate": 3.73041970100727e-05, "loss": 0.6884, "step": 424900 }, { "epoch": 5.85544625389215, "grad_norm": 7.269099712371826, "learning_rate": 3.729670785367379e-05, "loss": 0.7136, "step": 425000 }, { "epoch": 5.856824005951889, "grad_norm": 8.136226654052734, "learning_rate": 3.728921786499442e-05, "loss": 0.566, "step": 425100 }, { "epoch": 5.858201758011628, "grad_norm": 14.660557746887207, "learning_rate": 3.7281727044670905e-05, "loss": 0.6157, "step": 425200 }, { "epoch": 5.8595795100713675, "grad_norm": 40.86747360229492, "learning_rate": 3.727423539333965e-05, "loss": 0.6475, "step": 425300 }, { "epoch": 5.860957262131107, "grad_norm": 374.0465087890625, "learning_rate": 3.726674291163709e-05, "loss": 0.662, "step": 425400 }, { "epoch": 5.862335014190847, "grad_norm": 7.229824066162109, "learning_rate": 3.7259249600199745e-05, "loss": 0.7103, "step": 425500 }, { "epoch": 5.863712766250585, "grad_norm": 25.638059616088867, "learning_rate": 3.7251755459664226e-05, "loss": 0.6467, "step": 425600 }, { "epoch": 5.865090518310325, "grad_norm": 353.8828430175781, "learning_rate": 3.724426049066718e-05, "loss": 0.7214, "step": 425700 }, { "epoch": 5.866468270370064, "grad_norm": 35.603641510009766, "learning_rate": 3.7236764693845376e-05, "loss": 0.7342, "step": 425800 }, { "epoch": 5.867846022429804, "grad_norm": 4.34743595123291, "learning_rate": 3.7229268069835575e-05, "loss": 0.724, "step": 425900 }, { "epoch": 5.869223774489543, "grad_norm": 11.061627388000488, "learning_rate": 3.7221770619274686e-05, "loss": 0.7567, "step": 426000 }, { "epoch": 5.870601526549282, "grad_norm": 37.35090637207031, "learning_rate": 3.721427234279965e-05, "loss": 0.6375, "step": 426100 }, { "epoch": 5.871979278609022, "grad_norm": 14.603950500488281, "learning_rate": 
3.720677324104748e-05, "loss": 0.6786, "step": 426200 }, { "epoch": 5.87335703066876, "grad_norm": 1196.724853515625, "learning_rate": 3.719927331465526e-05, "loss": 0.595, "step": 426300 }, { "epoch": 5.8747347827285, "grad_norm": 5.4786577224731445, "learning_rate": 3.7191772564260155e-05, "loss": 0.6756, "step": 426400 }, { "epoch": 5.8761125347882395, "grad_norm": 24.734024047851562, "learning_rate": 3.7184270990499383e-05, "loss": 0.6679, "step": 426500 }, { "epoch": 5.877490286847979, "grad_norm": 18.665855407714844, "learning_rate": 3.7176768594010245e-05, "loss": 0.6471, "step": 426600 }, { "epoch": 5.878868038907719, "grad_norm": 8.035333633422852, "learning_rate": 3.7169265375430116e-05, "loss": 0.6225, "step": 426700 }, { "epoch": 5.880245790967457, "grad_norm": 12.994061470031738, "learning_rate": 3.7161761335396425e-05, "loss": 0.7333, "step": 426800 }, { "epoch": 5.881623543027197, "grad_norm": 32.16779327392578, "learning_rate": 3.715425647454667e-05, "loss": 0.7809, "step": 426900 }, { "epoch": 5.883001295086936, "grad_norm": 6.404416084289551, "learning_rate": 3.714675079351844e-05, "loss": 0.6168, "step": 427000 }, { "epoch": 5.884379047146675, "grad_norm": 4.859826564788818, "learning_rate": 3.7139244292949386e-05, "loss": 0.7034, "step": 427100 }, { "epoch": 5.885756799206415, "grad_norm": 7.830059051513672, "learning_rate": 3.7131736973477206e-05, "loss": 0.6149, "step": 427200 }, { "epoch": 5.887134551266154, "grad_norm": 5.535546779632568, "learning_rate": 3.7124228835739704e-05, "loss": 0.5354, "step": 427300 }, { "epoch": 5.888512303325894, "grad_norm": 2.0457115173339844, "learning_rate": 3.7116719880374714e-05, "loss": 0.6405, "step": 427400 }, { "epoch": 5.889890055385633, "grad_norm": 47.52168655395508, "learning_rate": 3.710921010802018e-05, "loss": 0.5588, "step": 427500 }, { "epoch": 5.891267807445372, "grad_norm": 2.555983304977417, "learning_rate": 3.7101699519314085e-05, "loss": 0.6799, "step": 427600 }, { "epoch": 
5.8926455595051115, "grad_norm": 48.739097595214844, "learning_rate": 3.709418811489449e-05, "loss": 0.6886, "step": 427700 }, { "epoch": 5.894023311564851, "grad_norm": 14.525940895080566, "learning_rate": 3.7086675895399535e-05, "loss": 0.6192, "step": 427800 }, { "epoch": 5.89540106362459, "grad_norm": 5.6463141441345215, "learning_rate": 3.707916286146741e-05, "loss": 0.7687, "step": 427900 }, { "epoch": 5.896778815684329, "grad_norm": 7.920635223388672, "learning_rate": 3.70716490137364e-05, "loss": 0.6485, "step": 428000 }, { "epoch": 5.898156567744069, "grad_norm": 5.616747856140137, "learning_rate": 3.706413435284484e-05, "loss": 0.7273, "step": 428100 }, { "epoch": 5.899534319803808, "grad_norm": 3.455004930496216, "learning_rate": 3.7056618879431134e-05, "loss": 0.6811, "step": 428200 }, { "epoch": 5.900912071863547, "grad_norm": 7.980852127075195, "learning_rate": 3.704910259413377e-05, "loss": 0.639, "step": 428300 }, { "epoch": 5.902289823923287, "grad_norm": 6.100063323974609, "learning_rate": 3.7041585497591274e-05, "loss": 0.5664, "step": 428400 }, { "epoch": 5.903667575983026, "grad_norm": 6.610986232757568, "learning_rate": 3.703406759044228e-05, "loss": 0.6016, "step": 428500 }, { "epoch": 5.905045328042766, "grad_norm": 4.744160175323486, "learning_rate": 3.702654887332547e-05, "loss": 0.6221, "step": 428600 }, { "epoch": 5.906423080102504, "grad_norm": 13.851295471191406, "learning_rate": 3.70190293468796e-05, "loss": 0.7111, "step": 428700 }, { "epoch": 5.907800832162244, "grad_norm": 33.78352737426758, "learning_rate": 3.701150901174348e-05, "loss": 0.6226, "step": 428800 }, { "epoch": 5.9091785842219835, "grad_norm": 6.320300579071045, "learning_rate": 3.700398786855602e-05, "loss": 0.5655, "step": 428900 }, { "epoch": 5.910556336281723, "grad_norm": 3.313681125640869, "learning_rate": 3.699646591795616e-05, "loss": 0.5985, "step": 429000 }, { "epoch": 5.911934088341462, "grad_norm": 10.409984588623047, "learning_rate": 
3.698894316058294e-05, "loss": 0.6006, "step": 429100 }, { "epoch": 5.913311840401201, "grad_norm": 12.074596405029297, "learning_rate": 3.698141959707546e-05, "loss": 0.5957, "step": 429200 }, { "epoch": 5.914689592460941, "grad_norm": 8.207657814025879, "learning_rate": 3.6973895228072865e-05, "loss": 0.6958, "step": 429300 }, { "epoch": 5.91606734452068, "grad_norm": 5.099185943603516, "learning_rate": 3.69663700542144e-05, "loss": 0.5632, "step": 429400 }, { "epoch": 5.917445096580419, "grad_norm": 3.278627395629883, "learning_rate": 3.695884407613938e-05, "loss": 0.6279, "step": 429500 }, { "epoch": 5.918822848640159, "grad_norm": 2.1374051570892334, "learning_rate": 3.6951317294487165e-05, "loss": 0.5715, "step": 429600 }, { "epoch": 5.920200600699898, "grad_norm": 2.794398784637451, "learning_rate": 3.6943789709897195e-05, "loss": 0.5368, "step": 429700 }, { "epoch": 5.921578352759638, "grad_norm": 448.9095458984375, "learning_rate": 3.693626132300896e-05, "loss": 0.5971, "step": 429800 }, { "epoch": 5.922956104819376, "grad_norm": 8.30272102355957, "learning_rate": 3.692873213446206e-05, "loss": 0.5373, "step": 429900 }, { "epoch": 5.924333856879116, "grad_norm": 3.7461960315704346, "learning_rate": 3.692120214489613e-05, "loss": 0.5221, "step": 430000 }, { "epoch": 5.9257116089388555, "grad_norm": 4.689380168914795, "learning_rate": 3.691367135495088e-05, "loss": 0.6562, "step": 430100 }, { "epoch": 5.927089360998595, "grad_norm": 8.454404830932617, "learning_rate": 3.690613976526608e-05, "loss": 0.5559, "step": 430200 }, { "epoch": 5.928467113058334, "grad_norm": 5.8494768142700195, "learning_rate": 3.689860737648159e-05, "loss": 0.6162, "step": 430300 }, { "epoch": 5.929844865118073, "grad_norm": 3.5328571796417236, "learning_rate": 3.6891074189237324e-05, "loss": 0.576, "step": 430400 }, { "epoch": 5.931222617177813, "grad_norm": 3.603300094604492, "learning_rate": 3.688354020417326e-05, "loss": 0.5384, "step": 430500 }, { "epoch": 5.9326003692375515, 
"grad_norm": 10.173060417175293, "learning_rate": 3.687600542192945e-05, "loss": 0.5893, "step": 430600 }, { "epoch": 5.933978121297291, "grad_norm": 8.62248706817627, "learning_rate": 3.686846984314601e-05, "loss": 0.6259, "step": 430700 }, { "epoch": 5.935355873357031, "grad_norm": 8.846124649047852, "learning_rate": 3.686093346846313e-05, "loss": 0.521, "step": 430800 }, { "epoch": 5.93673362541677, "grad_norm": 1.8469078540802002, "learning_rate": 3.685339629852106e-05, "loss": 0.5536, "step": 430900 }, { "epoch": 5.93811137747651, "grad_norm": 6.402103424072266, "learning_rate": 3.6845858333960125e-05, "loss": 0.5646, "step": 431000 }, { "epoch": 5.939489129536248, "grad_norm": 7.487903594970703, "learning_rate": 3.683831957542071e-05, "loss": 0.5571, "step": 431100 }, { "epoch": 5.940866881595988, "grad_norm": 3.5843000411987305, "learning_rate": 3.6830780023543266e-05, "loss": 0.5761, "step": 431200 }, { "epoch": 5.9422446336557275, "grad_norm": 4.272304058074951, "learning_rate": 3.6823239678968326e-05, "loss": 0.5372, "step": 431300 }, { "epoch": 5.943622385715466, "grad_norm": 12.894792556762695, "learning_rate": 3.6815773957621386e-05, "loss": 0.5875, "step": 431400 }, { "epoch": 5.945000137775206, "grad_norm": 132.09768676757812, "learning_rate": 3.6808232037484276e-05, "loss": 0.6061, "step": 431500 }, { "epoch": 5.946377889834945, "grad_norm": 24.59657859802246, "learning_rate": 3.680068932656524e-05, "loss": 0.5844, "step": 431600 }, { "epoch": 5.947755641894685, "grad_norm": 5.0189080238342285, "learning_rate": 3.6793145825505064e-05, "loss": 0.5308, "step": 431700 }, { "epoch": 5.949133393954424, "grad_norm": 1.8234995603561401, "learning_rate": 3.6785601534944605e-05, "loss": 0.6098, "step": 431800 }, { "epoch": 5.950511146014163, "grad_norm": 9.298529624938965, "learning_rate": 3.67780564555248e-05, "loss": 0.5942, "step": 431900 }, { "epoch": 5.951888898073903, "grad_norm": 10.730475425720215, "learning_rate": 3.677051058788662e-05, "loss": 
0.596, "step": 432000 }, { "epoch": 5.953266650133642, "grad_norm": 11.711047172546387, "learning_rate": 3.676296393267115e-05, "loss": 0.523, "step": 432100 }, { "epoch": 5.954644402193381, "grad_norm": 8.184954643249512, "learning_rate": 3.67554164905195e-05, "loss": 0.5469, "step": 432200 }, { "epoch": 5.95602215425312, "grad_norm": 3.974088430404663, "learning_rate": 3.674786826207286e-05, "loss": 0.5885, "step": 432300 }, { "epoch": 5.95739990631286, "grad_norm": 5.20788049697876, "learning_rate": 3.674031924797251e-05, "loss": 0.5494, "step": 432400 }, { "epoch": 5.9587776583725995, "grad_norm": 5.296034336090088, "learning_rate": 3.673276944885976e-05, "loss": 0.5949, "step": 432500 }, { "epoch": 5.960155410432339, "grad_norm": 5.673565864562988, "learning_rate": 3.672521886537601e-05, "loss": 0.5615, "step": 432600 }, { "epoch": 5.961533162492078, "grad_norm": 10.703372955322266, "learning_rate": 3.671766749816273e-05, "loss": 0.5773, "step": 432700 }, { "epoch": 5.962910914551817, "grad_norm": 7.864493370056152, "learning_rate": 3.6710115347861436e-05, "loss": 0.5727, "step": 432800 }, { "epoch": 5.964288666611557, "grad_norm": 3.806501626968384, "learning_rate": 3.670256241511372e-05, "loss": 0.5684, "step": 432900 }, { "epoch": 5.9656664186712955, "grad_norm": 12.322609901428223, "learning_rate": 3.6695008700561254e-05, "loss": 0.5526, "step": 433000 }, { "epoch": 5.967044170731035, "grad_norm": 25.0034122467041, "learning_rate": 3.668745420484574e-05, "loss": 0.6101, "step": 433100 }, { "epoch": 5.968421922790775, "grad_norm": 3.787853717803955, "learning_rate": 3.6679898928609005e-05, "loss": 0.5057, "step": 433200 }, { "epoch": 5.969799674850514, "grad_norm": 11.54350757598877, "learning_rate": 3.6672342872492894e-05, "loss": 0.5054, "step": 433300 }, { "epoch": 5.971177426910253, "grad_norm": 7.111335754394531, "learning_rate": 3.666478603713932e-05, "loss": 0.5422, "step": 433400 }, { "epoch": 5.972555178969992, "grad_norm": 5.190135955810547, 
"learning_rate": 3.6657228423190287e-05, "loss": 0.5514, "step": 433500 }, { "epoch": 5.973932931029732, "grad_norm": 1.7636561393737793, "learning_rate": 3.6649670031287866e-05, "loss": 0.5779, "step": 433600 }, { "epoch": 5.9753106830894716, "grad_norm": 11.792951583862305, "learning_rate": 3.664211086207415e-05, "loss": 0.5175, "step": 433700 }, { "epoch": 5.97668843514921, "grad_norm": 18.334692001342773, "learning_rate": 3.6634550916191344e-05, "loss": 0.5519, "step": 433800 }, { "epoch": 5.97806618720895, "grad_norm": 9.421588897705078, "learning_rate": 3.662699019428172e-05, "loss": 0.6008, "step": 433900 }, { "epoch": 5.979443939268689, "grad_norm": 0.761475145816803, "learning_rate": 3.6619428696987566e-05, "loss": 0.5441, "step": 434000 }, { "epoch": 5.980821691328429, "grad_norm": 2.420869827270508, "learning_rate": 3.661186642495131e-05, "loss": 0.5136, "step": 434100 }, { "epoch": 5.982199443388168, "grad_norm": 56.12458038330078, "learning_rate": 3.660430337881538e-05, "loss": 0.565, "step": 434200 }, { "epoch": 5.983577195447907, "grad_norm": 3.856255292892456, "learning_rate": 3.659673955922229e-05, "loss": 0.6038, "step": 434300 }, { "epoch": 5.984954947507647, "grad_norm": 4.925717353820801, "learning_rate": 3.658917496681465e-05, "loss": 0.5903, "step": 434400 }, { "epoch": 5.986332699567386, "grad_norm": 5.484774589538574, "learning_rate": 3.6581609602235086e-05, "loss": 0.5318, "step": 434500 }, { "epoch": 5.987710451627125, "grad_norm": 5.931771755218506, "learning_rate": 3.657404346612633e-05, "loss": 0.6118, "step": 434600 }, { "epoch": 5.9890882036868645, "grad_norm": 2.350668430328369, "learning_rate": 3.656647655913116e-05, "loss": 0.4815, "step": 434700 }, { "epoch": 5.990465955746604, "grad_norm": 3.3989789485931396, "learning_rate": 3.655890888189242e-05, "loss": 0.5539, "step": 434800 }, { "epoch": 5.991843707806343, "grad_norm": 3.3227851390838623, "learning_rate": 3.655134043505302e-05, "loss": 0.5306, "step": 434900 }, { "epoch": 
5.993221459866082, "grad_norm": 23.648752212524414, "learning_rate": 3.654377121925595e-05, "loss": 0.5452, "step": 435000 }, { "epoch": 5.994599211925822, "grad_norm": 434.21368408203125, "learning_rate": 3.653620123514425e-05, "loss": 0.5292, "step": 435100 }, { "epoch": 5.995976963985561, "grad_norm": 7.6446404457092285, "learning_rate": 3.652863048336102e-05, "loss": 0.5151, "step": 435200 }, { "epoch": 5.997354716045301, "grad_norm": 8.195099830627441, "learning_rate": 3.652105896454945e-05, "loss": 0.5419, "step": 435300 }, { "epoch": 5.99873246810504, "grad_norm": 5.087191581726074, "learning_rate": 3.651348667935276e-05, "loss": 0.5407, "step": 435400 }, { "epoch": 6.000110220164779, "grad_norm": 3.932257890701294, "learning_rate": 3.650591362841426e-05, "loss": 0.5739, "step": 435500 }, { "epoch": 6.001487972224519, "grad_norm": 6.996191501617432, "learning_rate": 3.6498339812377335e-05, "loss": 0.4838, "step": 435600 }, { "epoch": 6.002865724284258, "grad_norm": 4.764026165008545, "learning_rate": 3.6490765231885404e-05, "loss": 0.5172, "step": 435700 }, { "epoch": 6.004243476343997, "grad_norm": 11.481037139892578, "learning_rate": 3.6483189887581966e-05, "loss": 0.5502, "step": 435800 }, { "epoch": 6.0056212284037365, "grad_norm": 15.15224838256836, "learning_rate": 3.647561378011059e-05, "loss": 0.4605, "step": 435900 }, { "epoch": 6.006998980463476, "grad_norm": 4.166591644287109, "learning_rate": 3.6468036910114915e-05, "loss": 0.5327, "step": 436000 }, { "epoch": 6.008376732523215, "grad_norm": 4.7048821449279785, "learning_rate": 3.6460459278238616e-05, "loss": 0.5222, "step": 436100 }, { "epoch": 6.009754484582954, "grad_norm": 10.500849723815918, "learning_rate": 3.645288088512546e-05, "loss": 0.5579, "step": 436200 }, { "epoch": 6.011132236642694, "grad_norm": 10.437847137451172, "learning_rate": 3.644530173141926e-05, "loss": 0.5227, "step": 436300 }, { "epoch": 6.012509988702433, "grad_norm": 243.36138916015625, "learning_rate": 
3.643772181776393e-05, "loss": 0.566, "step": 436400 }, { "epoch": 6.013887740762172, "grad_norm": 5.659040927886963, "learning_rate": 3.643021695528945e-05, "loss": 0.5845, "step": 436500 }, { "epoch": 6.015265492821912, "grad_norm": 7.633718967437744, "learning_rate": 3.642263553125117e-05, "loss": 0.487, "step": 436600 }, { "epoch": 6.016643244881651, "grad_norm": 7.7264227867126465, "learning_rate": 3.6415053349189354e-05, "loss": 0.5635, "step": 436700 }, { "epoch": 6.018020996941391, "grad_norm": 3.5479843616485596, "learning_rate": 3.640747040974815e-05, "loss": 0.6268, "step": 436800 }, { "epoch": 6.019398749001129, "grad_norm": 2.445791482925415, "learning_rate": 3.6399962554277246e-05, "loss": 0.5395, "step": 436900 }, { "epoch": 6.020776501060869, "grad_norm": 3.3734915256500244, "learning_rate": 3.639237810956767e-05, "loss": 0.5019, "step": 437000 }, { "epoch": 6.0221542531206085, "grad_norm": 2.492452621459961, "learning_rate": 3.638479290940507e-05, "loss": 0.5262, "step": 437100 }, { "epoch": 6.023532005180348, "grad_norm": 3.395153284072876, "learning_rate": 3.637720695443387e-05, "loss": 0.5754, "step": 437200 }, { "epoch": 6.024909757240087, "grad_norm": 7.3772969245910645, "learning_rate": 3.636962024529851e-05, "loss": 0.5231, "step": 437300 }, { "epoch": 6.026287509299826, "grad_norm": 3.5467405319213867, "learning_rate": 3.636203278264355e-05, "loss": 0.5894, "step": 437400 }, { "epoch": 6.027665261359566, "grad_norm": 3.104058027267456, "learning_rate": 3.6354444567113555e-05, "loss": 0.4561, "step": 437500 }, { "epoch": 6.029043013419305, "grad_norm": 4.787964344024658, "learning_rate": 3.6346855599353195e-05, "loss": 0.5876, "step": 437600 }, { "epoch": 6.030420765479044, "grad_norm": 12.891325950622559, "learning_rate": 3.6339265880007205e-05, "loss": 0.4888, "step": 437700 }, { "epoch": 6.031798517538784, "grad_norm": 9.917616844177246, "learning_rate": 3.633167540972034e-05, "loss": 0.5959, "step": 437800 }, { "epoch": 
6.033176269598523, "grad_norm": 2.963008403778076, "learning_rate": 3.632408418913747e-05, "loss": 0.4673, "step": 437900 }, { "epoch": 6.034554021658263, "grad_norm": 8.004764556884766, "learning_rate": 3.6316492218903524e-05, "loss": 0.5567, "step": 438000 }, { "epoch": 6.035931773718001, "grad_norm": 2.5110106468200684, "learning_rate": 3.630889949966344e-05, "loss": 0.4351, "step": 438100 }, { "epoch": 6.037309525777741, "grad_norm": 2.3889193534851074, "learning_rate": 3.6301306032062285e-05, "loss": 0.5374, "step": 438200 }, { "epoch": 6.0386872778374805, "grad_norm": 5.569151878356934, "learning_rate": 3.629371181674515e-05, "loss": 0.4994, "step": 438300 }, { "epoch": 6.04006502989722, "grad_norm": 3.1107232570648193, "learning_rate": 3.628611685435721e-05, "loss": 0.4785, "step": 438400 }, { "epoch": 6.041442781956959, "grad_norm": 11.284457206726074, "learning_rate": 3.627852114554369e-05, "loss": 0.5848, "step": 438500 }, { "epoch": 6.042820534016698, "grad_norm": 24.089033126831055, "learning_rate": 3.627092469094989e-05, "loss": 0.6009, "step": 438600 }, { "epoch": 6.044198286076438, "grad_norm": 9.23412799835205, "learning_rate": 3.626332749122117e-05, "loss": 0.5247, "step": 438700 }, { "epoch": 6.045576038136177, "grad_norm": 1.5072345733642578, "learning_rate": 3.6255729547002946e-05, "loss": 0.4833, "step": 438800 }, { "epoch": 6.046953790195916, "grad_norm": 4.464546203613281, "learning_rate": 3.62481308589407e-05, "loss": 0.5437, "step": 438900 }, { "epoch": 6.048331542255656, "grad_norm": 1.1243488788604736, "learning_rate": 3.6240531427679984e-05, "loss": 0.5059, "step": 439000 }, { "epoch": 6.049709294315395, "grad_norm": 4.448837757110596, "learning_rate": 3.623293125386641e-05, "loss": 0.608, "step": 439100 }, { "epoch": 6.051087046375135, "grad_norm": 25.50322723388672, "learning_rate": 3.622540635097317e-05, "loss": 0.5858, "step": 439200 }, { "epoch": 6.052464798434873, "grad_norm": 2.336292266845703, "learning_rate": 
3.621780470140037e-05, "loss": 0.5671, "step": 439300 }, { "epoch": 6.053842550494613, "grad_norm": 9.667577743530273, "learning_rate": 3.6210202311205475e-05, "loss": 0.5473, "step": 439400 }, { "epoch": 6.0552203025543525, "grad_norm": 5.689519882202148, "learning_rate": 3.6202599181034314e-05, "loss": 0.5299, "step": 439500 }, { "epoch": 6.056598054614092, "grad_norm": 2.0845119953155518, "learning_rate": 3.6194995311532844e-05, "loss": 0.4396, "step": 439600 }, { "epoch": 6.057975806673831, "grad_norm": 3.8977465629577637, "learning_rate": 3.618739070334702e-05, "loss": 0.5506, "step": 439700 }, { "epoch": 6.05935355873357, "grad_norm": 5.027807235717773, "learning_rate": 3.617978535712292e-05, "loss": 0.4433, "step": 439800 }, { "epoch": 6.06073131079331, "grad_norm": 1.3472216129302979, "learning_rate": 3.617217927350666e-05, "loss": 0.4872, "step": 439900 }, { "epoch": 6.062109062853049, "grad_norm": 4.526773452758789, "learning_rate": 3.616457245314439e-05, "loss": 0.4335, "step": 440000 }, { "epoch": 6.063486814912788, "grad_norm": 6.204596996307373, "learning_rate": 3.6156964896682374e-05, "loss": 0.5058, "step": 440100 }, { "epoch": 6.064864566972528, "grad_norm": 2.962155818939209, "learning_rate": 3.61493566047669e-05, "loss": 0.5986, "step": 440200 }, { "epoch": 6.066242319032267, "grad_norm": 3.6378650665283203, "learning_rate": 3.614174757804432e-05, "loss": 0.4689, "step": 440300 }, { "epoch": 6.067620071092006, "grad_norm": 3.064312219619751, "learning_rate": 3.613413781716109e-05, "loss": 0.5226, "step": 440400 }, { "epoch": 6.068997823151745, "grad_norm": 1.3371057510375977, "learning_rate": 3.612652732276367e-05, "loss": 0.5057, "step": 440500 }, { "epoch": 6.070375575211485, "grad_norm": 7.146762371063232, "learning_rate": 3.6118916095498616e-05, "loss": 0.5122, "step": 440600 }, { "epoch": 6.0717533272712245, "grad_norm": 3.738504409790039, "learning_rate": 3.611130413601255e-05, "loss": 0.5273, "step": 440700 }, { "epoch": 6.073131079330963, 
"grad_norm": 7.647688865661621, "learning_rate": 3.6103691444952153e-05, "loss": 0.5517, "step": 440800 }, { "epoch": 6.074508831390703, "grad_norm": 6.716989517211914, "learning_rate": 3.6096078022964135e-05, "loss": 0.5269, "step": 440900 }, { "epoch": 6.075886583450442, "grad_norm": 2.3925464153289795, "learning_rate": 3.6088463870695325e-05, "loss": 0.4833, "step": 441000 }, { "epoch": 6.077264335510182, "grad_norm": 5.472541809082031, "learning_rate": 3.6080848988792564e-05, "loss": 0.5763, "step": 441100 }, { "epoch": 6.0786420875699205, "grad_norm": 10.458732604980469, "learning_rate": 3.607323337790278e-05, "loss": 0.4669, "step": 441200 }, { "epoch": 6.08001983962966, "grad_norm": 2.846069812774658, "learning_rate": 3.6065617038672965e-05, "loss": 0.5704, "step": 441300 }, { "epoch": 6.0813975916894, "grad_norm": 5.048154830932617, "learning_rate": 3.605799997175016e-05, "loss": 0.5151, "step": 441400 }, { "epoch": 6.082775343749139, "grad_norm": 7.898248195648193, "learning_rate": 3.6050382177781476e-05, "loss": 0.5817, "step": 441500 }, { "epoch": 6.084153095808878, "grad_norm": 2.1498281955718994, "learning_rate": 3.6042763657414084e-05, "loss": 0.5335, "step": 441600 }, { "epoch": 6.085530847868617, "grad_norm": 11.37043285369873, "learning_rate": 3.6035144411295215e-05, "loss": 0.4992, "step": 441700 }, { "epoch": 6.086908599928357, "grad_norm": 5.462133407592773, "learning_rate": 3.602752444007215e-05, "loss": 0.4969, "step": 441800 }, { "epoch": 6.0882863519880965, "grad_norm": 5.392831802368164, "learning_rate": 3.601990374439227e-05, "loss": 0.5282, "step": 441900 }, { "epoch": 6.089664104047835, "grad_norm": 6.102015018463135, "learning_rate": 3.6012282324902985e-05, "loss": 0.4772, "step": 442000 }, { "epoch": 6.091041856107575, "grad_norm": 5.128676891326904, "learning_rate": 3.600466018225176e-05, "loss": 0.4876, "step": 442100 }, { "epoch": 6.092419608167314, "grad_norm": 1.8960788249969482, "learning_rate": 3.599703731708614e-05, "loss": 
0.5109, "step": 442200 }, { "epoch": 6.093797360227054, "grad_norm": 3.5986814498901367, "learning_rate": 3.5989413730053727e-05, "loss": 0.4788, "step": 442300 }, { "epoch": 6.0951751122867925, "grad_norm": 5.08997106552124, "learning_rate": 3.598178942180219e-05, "loss": 0.5027, "step": 442400 }, { "epoch": 6.096552864346532, "grad_norm": 2.4945812225341797, "learning_rate": 3.5974164392979255e-05, "loss": 0.4899, "step": 442500 }, { "epoch": 6.097930616406272, "grad_norm": 3.864180326461792, "learning_rate": 3.596653864423269e-05, "loss": 0.4705, "step": 442600 }, { "epoch": 6.099308368466011, "grad_norm": 3.879204511642456, "learning_rate": 3.595891217621037e-05, "loss": 0.5654, "step": 442700 }, { "epoch": 6.10068612052575, "grad_norm": 4.761159420013428, "learning_rate": 3.5951284989560165e-05, "loss": 0.4777, "step": 442800 }, { "epoch": 6.102063872585489, "grad_norm": 1.4389569759368896, "learning_rate": 3.594365708493008e-05, "loss": 0.4663, "step": 442900 }, { "epoch": 6.103441624645229, "grad_norm": 21.642112731933594, "learning_rate": 3.5936028462968124e-05, "loss": 0.5343, "step": 443000 }, { "epoch": 6.1048193767049685, "grad_norm": 10.866023063659668, "learning_rate": 3.592839912432238e-05, "loss": 0.5379, "step": 443100 }, { "epoch": 6.106197128764707, "grad_norm": 6.113267421722412, "learning_rate": 3.592076906964102e-05, "loss": 0.5235, "step": 443200 }, { "epoch": 6.107574880824447, "grad_norm": 7.626857280731201, "learning_rate": 3.591313829957225e-05, "loss": 0.4854, "step": 443300 }, { "epoch": 6.108952632884186, "grad_norm": 70.18658447265625, "learning_rate": 3.590550681476433e-05, "loss": 0.4979, "step": 443400 }, { "epoch": 6.110330384943926, "grad_norm": 17.8839054107666, "learning_rate": 3.589787461586562e-05, "loss": 0.5445, "step": 443500 }, { "epoch": 6.1117081370036646, "grad_norm": 37.61186218261719, "learning_rate": 3.589024170352448e-05, "loss": 0.4973, "step": 443600 }, { "epoch": 6.113085889063404, "grad_norm": 
327.0575256347656, "learning_rate": 3.588260807838939e-05, "loss": 0.4814, "step": 443700 }, { "epoch": 6.114463641123144, "grad_norm": 5.977665901184082, "learning_rate": 3.5874973741108866e-05, "loss": 0.5655, "step": 443800 }, { "epoch": 6.115841393182883, "grad_norm": 1.4254932403564453, "learning_rate": 3.586733869233147e-05, "loss": 0.5399, "step": 443900 }, { "epoch": 6.117219145242622, "grad_norm": 8.30862045288086, "learning_rate": 3.585970293270585e-05, "loss": 0.4537, "step": 444000 }, { "epoch": 6.1185968973023614, "grad_norm": 2.7500791549682617, "learning_rate": 3.585206646288069e-05, "loss": 0.5098, "step": 444100 }, { "epoch": 6.119974649362101, "grad_norm": 5.256394386291504, "learning_rate": 3.5844429283504765e-05, "loss": 0.5182, "step": 444200 }, { "epoch": 6.121352401421841, "grad_norm": 4.8288350105285645, "learning_rate": 3.583679139522687e-05, "loss": 0.4725, "step": 444300 }, { "epoch": 6.122730153481579, "grad_norm": 26.06218910217285, "learning_rate": 3.582915279869591e-05, "loss": 0.4934, "step": 444400 }, { "epoch": 6.124107905541319, "grad_norm": 4.325410842895508, "learning_rate": 3.58215134945608e-05, "loss": 0.5636, "step": 444500 }, { "epoch": 6.125485657601058, "grad_norm": 15.727971076965332, "learning_rate": 3.581387348347054e-05, "loss": 0.5729, "step": 444600 }, { "epoch": 6.126863409660798, "grad_norm": 4.680234432220459, "learning_rate": 3.58062327660742e-05, "loss": 0.4681, "step": 444700 }, { "epoch": 6.128241161720537, "grad_norm": 3.360046863555908, "learning_rate": 3.57985913430209e-05, "loss": 0.5741, "step": 444800 }, { "epoch": 6.129618913780276, "grad_norm": 9.207067489624023, "learning_rate": 3.5790949214959795e-05, "loss": 0.5031, "step": 444900 }, { "epoch": 6.130996665840016, "grad_norm": 11.302955627441406, "learning_rate": 3.578330638254015e-05, "loss": 0.5874, "step": 445000 }, { "epoch": 6.132374417899754, "grad_norm": 7.2201738357543945, "learning_rate": 3.577566284641124e-05, "loss": 0.4837, "step": 445100 
}, { "epoch": 6.133752169959494, "grad_norm": 1.8146162033081055, "learning_rate": 3.576801860722245e-05, "loss": 0.5133, "step": 445200 }, { "epoch": 6.1351299220192335, "grad_norm": 7.022237777709961, "learning_rate": 3.5760373665623166e-05, "loss": 0.4991, "step": 445300 }, { "epoch": 6.136507674078973, "grad_norm": 10.87583065032959, "learning_rate": 3.5752728022262876e-05, "loss": 0.5313, "step": 445400 }, { "epoch": 6.137885426138712, "grad_norm": 4.28066873550415, "learning_rate": 3.5745081677791135e-05, "loss": 0.5797, "step": 445500 }, { "epoch": 6.139263178198451, "grad_norm": 12.023751258850098, "learning_rate": 3.5737511106772004e-05, "loss": 0.5142, "step": 445600 }, { "epoch": 6.140640930258191, "grad_norm": 43.1213493347168, "learning_rate": 3.5729863369021074e-05, "loss": 0.5477, "step": 445700 }, { "epoch": 6.14201868231793, "grad_norm": 3.95560884475708, "learning_rate": 3.572221493210115e-05, "loss": 0.5364, "step": 445800 }, { "epoch": 6.143396434377669, "grad_norm": 7.4035444259643555, "learning_rate": 3.5714642291471925e-05, "loss": 0.5676, "step": 445900 }, { "epoch": 6.144774186437409, "grad_norm": 86.10315704345703, "learning_rate": 3.570699246513887e-05, "loss": 0.5262, "step": 446000 }, { "epoch": 6.146151938497148, "grad_norm": 3.5248169898986816, "learning_rate": 3.569941845026454e-05, "loss": 0.4538, "step": 446100 }, { "epoch": 6.147529690556888, "grad_norm": 28.532821655273438, "learning_rate": 3.569176723709198e-05, "loss": 0.4902, "step": 446200 }, { "epoch": 6.148907442616626, "grad_norm": 34.63907241821289, "learning_rate": 3.5684115327986885e-05, "loss": 0.5433, "step": 446300 }, { "epoch": 6.150285194676366, "grad_norm": 1.4430421590805054, "learning_rate": 3.567646272359932e-05, "loss": 0.57, "step": 446400 }, { "epoch": 6.1516629467361055, "grad_norm": 2.3084681034088135, "learning_rate": 3.566880942457942e-05, "loss": 0.4334, "step": 446500 }, { "epoch": 6.153040698795845, "grad_norm": 3.7998335361480713, "learning_rate": 
3.5661155431577375e-05, "loss": 0.5677, "step": 446600 }, { "epoch": 6.154418450855584, "grad_norm": 6.744919776916504, "learning_rate": 3.56535772955366e-05, "loss": 0.4305, "step": 446700 }, { "epoch": 6.155796202915323, "grad_norm": 109.22599029541016, "learning_rate": 3.5645921923444644e-05, "loss": 0.5926, "step": 446800 }, { "epoch": 6.157173954975063, "grad_norm": 46.428497314453125, "learning_rate": 3.5638265859314944e-05, "loss": 0.4879, "step": 446900 }, { "epoch": 6.158551707034802, "grad_norm": 10.422097206115723, "learning_rate": 3.563060910379791e-05, "loss": 0.5638, "step": 447000 }, { "epoch": 6.159929459094541, "grad_norm": 46.52714538574219, "learning_rate": 3.562295165754405e-05, "loss": 0.551, "step": 447100 }, { "epoch": 6.161307211154281, "grad_norm": 32.69171142578125, "learning_rate": 3.561529352120389e-05, "loss": 0.5483, "step": 447200 }, { "epoch": 6.16268496321402, "grad_norm": 8.378368377685547, "learning_rate": 3.5607634695428016e-05, "loss": 0.5729, "step": 447300 }, { "epoch": 6.16406271527376, "grad_norm": 68.1645736694336, "learning_rate": 3.559997518086711e-05, "loss": 0.5117, "step": 447400 }, { "epoch": 6.165440467333498, "grad_norm": 40.23468017578125, "learning_rate": 3.559231497817187e-05, "loss": 0.5395, "step": 447500 }, { "epoch": 6.166818219393238, "grad_norm": 11.316801071166992, "learning_rate": 3.558465408799307e-05, "loss": 0.5486, "step": 447600 }, { "epoch": 6.1681959714529775, "grad_norm": 4.000333309173584, "learning_rate": 3.557699251098154e-05, "loss": 0.5308, "step": 447700 }, { "epoch": 6.169573723512717, "grad_norm": 11.885077476501465, "learning_rate": 3.5569330247788186e-05, "loss": 0.5615, "step": 447800 }, { "epoch": 6.170951475572456, "grad_norm": 8.210104942321777, "learning_rate": 3.5561667299063934e-05, "loss": 0.5312, "step": 447900 }, { "epoch": 6.172329227632195, "grad_norm": 2.7470293045043945, "learning_rate": 3.555400366545981e-05, "loss": 0.5195, "step": 448000 }, { "epoch": 6.173706979691935, 
"grad_norm": 5.2759108543396, "learning_rate": 3.5546339347626864e-05, "loss": 0.4933, "step": 448100 }, { "epoch": 6.175084731751674, "grad_norm": 17.13570213317871, "learning_rate": 3.553867434621622e-05, "loss": 0.5528, "step": 448200 }, { "epoch": 6.176462483811413, "grad_norm": 67.52412414550781, "learning_rate": 3.553100866187908e-05, "loss": 0.5231, "step": 448300 }, { "epoch": 6.177840235871153, "grad_norm": 3.0787155628204346, "learning_rate": 3.5523342295266653e-05, "loss": 0.537, "step": 448400 }, { "epoch": 6.179217987930892, "grad_norm": 2.602759838104248, "learning_rate": 3.551567524703026e-05, "loss": 0.5399, "step": 448500 }, { "epoch": 6.180595739990631, "grad_norm": 22.202714920043945, "learning_rate": 3.550800751782125e-05, "loss": 0.5407, "step": 448600 }, { "epoch": 6.18197349205037, "grad_norm": 1.3842966556549072, "learning_rate": 3.5500339108291024e-05, "loss": 0.571, "step": 448700 }, { "epoch": 6.18335124411011, "grad_norm": 11.04072380065918, "learning_rate": 3.549267001909107e-05, "loss": 0.5185, "step": 448800 }, { "epoch": 6.1847289961698495, "grad_norm": 5.193368911743164, "learning_rate": 3.5485000250872893e-05, "loss": 0.5623, "step": 448900 }, { "epoch": 6.186106748229589, "grad_norm": 1.3192013502120972, "learning_rate": 3.547732980428811e-05, "loss": 0.4875, "step": 449000 }, { "epoch": 6.187484500289328, "grad_norm": 9.564032554626465, "learning_rate": 3.546965867998834e-05, "loss": 0.5589, "step": 449100 }, { "epoch": 6.188862252349067, "grad_norm": 1.0402576923370361, "learning_rate": 3.546198687862529e-05, "loss": 0.5751, "step": 449200 }, { "epoch": 6.190240004408807, "grad_norm": 17.56105613708496, "learning_rate": 3.5454314400850735e-05, "loss": 0.5944, "step": 449300 }, { "epoch": 6.1916177564685455, "grad_norm": 4.507169246673584, "learning_rate": 3.544664124731646e-05, "loss": 0.5029, "step": 449400 }, { "epoch": 6.192995508528285, "grad_norm": 7.020826816558838, "learning_rate": 3.5438967418674366e-05, "loss": 0.5375, 
"step": 449500 }, { "epoch": 6.194373260588025, "grad_norm": 10.615067481994629, "learning_rate": 3.5431292915576384e-05, "loss": 0.537, "step": 449600 }, { "epoch": 6.195751012647764, "grad_norm": 18.18393325805664, "learning_rate": 3.542361773867447e-05, "loss": 0.5349, "step": 449700 }, { "epoch": 6.197128764707503, "grad_norm": 12.330747604370117, "learning_rate": 3.541594188862071e-05, "loss": 0.4997, "step": 449800 }, { "epoch": 6.198506516767242, "grad_norm": 8.2967529296875, "learning_rate": 3.540826536606718e-05, "loss": 0.5679, "step": 449900 }, { "epoch": 6.199884268826982, "grad_norm": 0.19264723360538483, "learning_rate": 3.540058817166605e-05, "loss": 0.5127, "step": 450000 }, { "epoch": 6.2012620208867215, "grad_norm": 29.366424560546875, "learning_rate": 3.539291030606954e-05, "loss": 0.6451, "step": 450100 }, { "epoch": 6.20263977294646, "grad_norm": 11.469525337219238, "learning_rate": 3.5385231769929916e-05, "loss": 0.5369, "step": 450200 }, { "epoch": 6.2040175250062, "grad_norm": 4.140508651733398, "learning_rate": 3.537755256389951e-05, "loss": 0.5039, "step": 450300 }, { "epoch": 6.205395277065939, "grad_norm": 3.899909496307373, "learning_rate": 3.536987268863072e-05, "loss": 0.5257, "step": 450400 }, { "epoch": 6.206773029125679, "grad_norm": 7.393928050994873, "learning_rate": 3.536219214477598e-05, "loss": 0.5299, "step": 450500 }, { "epoch": 6.2081507811854175, "grad_norm": 4.502346992492676, "learning_rate": 3.535451093298779e-05, "loss": 0.5149, "step": 450600 }, { "epoch": 6.209528533245157, "grad_norm": 2.4106757640838623, "learning_rate": 3.534682905391872e-05, "loss": 0.6581, "step": 450700 }, { "epoch": 6.210906285304897, "grad_norm": 9.251667976379395, "learning_rate": 3.5339146508221375e-05, "loss": 0.5916, "step": 450800 }, { "epoch": 6.212284037364636, "grad_norm": 5.105830192565918, "learning_rate": 3.533146329654843e-05, "loss": 0.6143, "step": 450900 }, { "epoch": 6.213661789424375, "grad_norm": 4.3433685302734375, 
"learning_rate": 3.532377941955261e-05, "loss": 0.5629, "step": 451000 }, { "epoch": 6.215039541484114, "grad_norm": 7.390102863311768, "learning_rate": 3.53160948778867e-05, "loss": 0.6408, "step": 451100 }, { "epoch": 6.216417293543854, "grad_norm": 6.294495105743408, "learning_rate": 3.5308409672203545e-05, "loss": 0.4777, "step": 451200 }, { "epoch": 6.2177950456035935, "grad_norm": 14.88788890838623, "learning_rate": 3.5300723803156046e-05, "loss": 0.4797, "step": 451300 }, { "epoch": 6.219172797663332, "grad_norm": 13.858607292175293, "learning_rate": 3.529311413999301e-05, "loss": 0.5567, "step": 451400 }, { "epoch": 6.220550549723072, "grad_norm": 4.591149806976318, "learning_rate": 3.5285426952793085e-05, "loss": 0.5709, "step": 451500 }, { "epoch": 6.221928301782811, "grad_norm": 2.202214002609253, "learning_rate": 3.5277739104181294e-05, "loss": 0.5549, "step": 451600 }, { "epoch": 6.223306053842551, "grad_norm": 6.87120246887207, "learning_rate": 3.527005059481079e-05, "loss": 0.5243, "step": 451700 }, { "epoch": 6.2246838059022895, "grad_norm": 8.729130744934082, "learning_rate": 3.526236142533473e-05, "loss": 0.5646, "step": 451800 }, { "epoch": 6.226061557962029, "grad_norm": 10.625425338745117, "learning_rate": 3.5254671596406354e-05, "loss": 0.5251, "step": 451900 }, { "epoch": 6.227439310021769, "grad_norm": 2.1450891494750977, "learning_rate": 3.524698110867896e-05, "loss": 0.5895, "step": 452000 }, { "epoch": 6.228817062081508, "grad_norm": 5.816488742828369, "learning_rate": 3.523928996280588e-05, "loss": 0.5044, "step": 452100 }, { "epoch": 6.230194814141247, "grad_norm": 4.585272789001465, "learning_rate": 3.5231598159440526e-05, "loss": 0.5171, "step": 452200 }, { "epoch": 6.231572566200986, "grad_norm": 6.558437347412109, "learning_rate": 3.522390569923636e-05, "loss": 0.5942, "step": 452300 }, { "epoch": 6.232950318260726, "grad_norm": 8.557476997375488, "learning_rate": 3.521621258284688e-05, "loss": 0.4391, "step": 452400 }, { "epoch": 
6.2343280703204655, "grad_norm": 2.2243638038635254, "learning_rate": 3.520851881092567e-05, "loss": 0.6063, "step": 452500 }, { "epoch": 6.235705822380204, "grad_norm": 3.2788567543029785, "learning_rate": 3.520082438412635e-05, "loss": 0.5307, "step": 452600 }, { "epoch": 6.237083574439944, "grad_norm": 95.05602264404297, "learning_rate": 3.51931293031026e-05, "loss": 0.5932, "step": 452700 }, { "epoch": 6.238461326499683, "grad_norm": 6.088125705718994, "learning_rate": 3.518543356850816e-05, "loss": 0.5323, "step": 452800 }, { "epoch": 6.239839078559422, "grad_norm": 3.636418581008911, "learning_rate": 3.517773718099682e-05, "loss": 0.5586, "step": 452900 }, { "epoch": 6.2412168306191615, "grad_norm": 4.503596305847168, "learning_rate": 3.517004014122243e-05, "loss": 0.4682, "step": 453000 }, { "epoch": 6.242594582678901, "grad_norm": 4.1418538093566895, "learning_rate": 3.5162342449838885e-05, "loss": 0.5484, "step": 453100 }, { "epoch": 6.243972334738641, "grad_norm": 5.79539680480957, "learning_rate": 3.515464410750015e-05, "loss": 0.6073, "step": 453200 }, { "epoch": 6.24535008679838, "grad_norm": 41.32741165161133, "learning_rate": 3.514694511486024e-05, "loss": 0.5863, "step": 453300 }, { "epoch": 6.246727838858119, "grad_norm": 2.4046976566314697, "learning_rate": 3.513924547257322e-05, "loss": 0.484, "step": 453400 }, { "epoch": 6.248105590917858, "grad_norm": 3.5096065998077393, "learning_rate": 3.513154518129321e-05, "loss": 0.5985, "step": 453500 }, { "epoch": 6.249483342977598, "grad_norm": 2.9447455406188965, "learning_rate": 3.5123844241674395e-05, "loss": 0.5178, "step": 453600 }, { "epoch": 6.250861095037337, "grad_norm": 1.9499253034591675, "learning_rate": 3.511621967344794e-05, "loss": 0.5323, "step": 453700 }, { "epoch": 6.252238847097076, "grad_norm": 4.988081932067871, "learning_rate": 3.5108517445581334e-05, "loss": 0.5229, "step": 453800 }, { "epoch": 6.253616599156816, "grad_norm": 13.288809776306152, "learning_rate": 
3.510081457133225e-05, "loss": 0.5018, "step": 453900 }, { "epoch": 6.254994351216555, "grad_norm": 2.431906223297119, "learning_rate": 3.509311105135509e-05, "loss": 0.5333, "step": 454000 }, { "epoch": 6.256372103276294, "grad_norm": 2.840204954147339, "learning_rate": 3.508540688630428e-05, "loss": 0.4758, "step": 454100 }, { "epoch": 6.257749855336034, "grad_norm": 5.37916374206543, "learning_rate": 3.507770207683436e-05, "loss": 0.5634, "step": 454200 }, { "epoch": 6.259127607395773, "grad_norm": 13.765035629272461, "learning_rate": 3.506999662359988e-05, "loss": 0.6393, "step": 454300 }, { "epoch": 6.260505359455513, "grad_norm": 3.532738208770752, "learning_rate": 3.506229052725547e-05, "loss": 0.5693, "step": 454400 }, { "epoch": 6.261883111515251, "grad_norm": 0.5706472992897034, "learning_rate": 3.505458378845578e-05, "loss": 0.5661, "step": 454500 }, { "epoch": 6.263260863574991, "grad_norm": 34.04207229614258, "learning_rate": 3.504687640785555e-05, "loss": 0.5841, "step": 454600 }, { "epoch": 6.2646386156347305, "grad_norm": 6.770870685577393, "learning_rate": 3.503916838610956e-05, "loss": 0.5061, "step": 454700 }, { "epoch": 6.26601636769447, "grad_norm": 4.09011173248291, "learning_rate": 3.503145972387265e-05, "loss": 0.5853, "step": 454800 }, { "epoch": 6.267394119754209, "grad_norm": 5.528512477874756, "learning_rate": 3.50237504217997e-05, "loss": 0.5813, "step": 454900 }, { "epoch": 6.268771871813948, "grad_norm": 5.804129600524902, "learning_rate": 3.5016040480545665e-05, "loss": 0.5565, "step": 455000 }, { "epoch": 6.270149623873688, "grad_norm": 2.95120906829834, "learning_rate": 3.5008329900765533e-05, "loss": 0.567, "step": 455100 }, { "epoch": 6.271527375933427, "grad_norm": 21.578901290893555, "learning_rate": 3.5000618683114366e-05, "loss": 0.4828, "step": 455200 }, { "epoch": 6.272905127993166, "grad_norm": 0.12653601169586182, "learning_rate": 3.4992906828247266e-05, "loss": 0.5381, "step": 455300 }, { "epoch": 6.274282880052906, 
"grad_norm": 5.963935375213623, "learning_rate": 3.498519433681941e-05, "loss": 0.5394, "step": 455400 }, { "epoch": 6.275660632112645, "grad_norm": 36.67667770385742, "learning_rate": 3.497748120948599e-05, "loss": 0.503, "step": 455500 }, { "epoch": 6.277038384172385, "grad_norm": 4.799067497253418, "learning_rate": 3.496976744690229e-05, "loss": 0.5629, "step": 455600 }, { "epoch": 6.278416136232123, "grad_norm": 2.2575008869171143, "learning_rate": 3.496205304972363e-05, "loss": 0.583, "step": 455700 }, { "epoch": 6.279793888291863, "grad_norm": 4.8926897048950195, "learning_rate": 3.495433801860538e-05, "loss": 0.5549, "step": 455800 }, { "epoch": 6.2811716403516025, "grad_norm": 2.7254655361175537, "learning_rate": 3.494662235420299e-05, "loss": 0.5376, "step": 455900 }, { "epoch": 6.282549392411342, "grad_norm": 0.46621039509773254, "learning_rate": 3.4938906057171934e-05, "loss": 0.5434, "step": 456000 }, { "epoch": 6.283927144471081, "grad_norm": 14.060787200927734, "learning_rate": 3.4931189128167745e-05, "loss": 0.5894, "step": 456100 }, { "epoch": 6.28530489653082, "grad_norm": 3.1941330432891846, "learning_rate": 3.492347156784603e-05, "loss": 0.689, "step": 456200 }, { "epoch": 6.28668264859056, "grad_norm": 38.39863586425781, "learning_rate": 3.4915753376862414e-05, "loss": 0.5464, "step": 456300 }, { "epoch": 6.288060400650299, "grad_norm": 120.04096984863281, "learning_rate": 3.490803455587262e-05, "loss": 0.5847, "step": 456400 }, { "epoch": 6.289438152710038, "grad_norm": 13.030728340148926, "learning_rate": 3.4900315105532394e-05, "loss": 0.5741, "step": 456500 }, { "epoch": 6.290815904769778, "grad_norm": 2.070629358291626, "learning_rate": 3.489259502649753e-05, "loss": 0.4826, "step": 456600 }, { "epoch": 6.292193656829517, "grad_norm": 3.9621589183807373, "learning_rate": 3.488487431942391e-05, "loss": 0.5095, "step": 456700 }, { "epoch": 6.293571408889257, "grad_norm": 92.44857788085938, "learning_rate": 3.487715298496742e-05, "loss": 
0.4997, "step": 456800 }, { "epoch": 6.294949160948995, "grad_norm": 6.448740005493164, "learning_rate": 3.4869431023784055e-05, "loss": 0.507, "step": 456900 }, { "epoch": 6.296326913008735, "grad_norm": 3.7643239498138428, "learning_rate": 3.486170843652983e-05, "loss": 0.5531, "step": 457000 }, { "epoch": 6.2977046650684745, "grad_norm": 1.9391893148422241, "learning_rate": 3.485398522386079e-05, "loss": 0.555, "step": 457100 }, { "epoch": 6.299082417128213, "grad_norm": 8.732494354248047, "learning_rate": 3.484626138643309e-05, "loss": 0.5667, "step": 457200 }, { "epoch": 6.300460169187953, "grad_norm": 4.021539688110352, "learning_rate": 3.48385369249029e-05, "loss": 0.6163, "step": 457300 }, { "epoch": 6.301837921247692, "grad_norm": 1.5873167514801025, "learning_rate": 3.4830811839926465e-05, "loss": 0.5065, "step": 457400 }, { "epoch": 6.303215673307432, "grad_norm": 69.21484375, "learning_rate": 3.4823086132160054e-05, "loss": 0.5174, "step": 457500 }, { "epoch": 6.304593425367171, "grad_norm": 7.337741851806641, "learning_rate": 3.481535980226001e-05, "loss": 0.5122, "step": 457600 }, { "epoch": 6.30597117742691, "grad_norm": 5.86789608001709, "learning_rate": 3.4807632850882735e-05, "loss": 0.557, "step": 457700 }, { "epoch": 6.30734892948665, "grad_norm": 3.5734307765960693, "learning_rate": 3.479990527868466e-05, "loss": 0.519, "step": 457800 }, { "epoch": 6.308726681546389, "grad_norm": 0.16667194664478302, "learning_rate": 3.479217708632227e-05, "loss": 0.4791, "step": 457900 }, { "epoch": 6.310104433606128, "grad_norm": 7.866695404052734, "learning_rate": 3.478444827445215e-05, "loss": 0.5163, "step": 458000 }, { "epoch": 6.311482185665867, "grad_norm": 8.209754943847656, "learning_rate": 3.477671884373087e-05, "loss": 0.4649, "step": 458100 }, { "epoch": 6.312859937725607, "grad_norm": 2.742061138153076, "learning_rate": 3.476898879481511e-05, "loss": 0.5171, "step": 458200 }, { "epoch": 6.3142376897853465, "grad_norm": 3.355088472366333, 
"learning_rate": 3.476125812836155e-05, "loss": 0.5451, "step": 458300 }, { "epoch": 6.315615441845085, "grad_norm": 1.8031038045883179, "learning_rate": 3.475352684502697e-05, "loss": 0.6196, "step": 458400 }, { "epoch": 6.316993193904825, "grad_norm": 38.720279693603516, "learning_rate": 3.474587226751191e-05, "loss": 0.5181, "step": 458500 }, { "epoch": 6.318370945964564, "grad_norm": 13.696335792541504, "learning_rate": 3.473813975853819e-05, "loss": 0.4956, "step": 458600 }, { "epoch": 6.319748698024304, "grad_norm": 6.031965732574463, "learning_rate": 3.4730406634647464e-05, "loss": 0.4961, "step": 458700 }, { "epoch": 6.3211264500840425, "grad_norm": 4.09878396987915, "learning_rate": 3.47226728964967e-05, "loss": 0.5304, "step": 458800 }, { "epoch": 6.322504202143782, "grad_norm": 4.406709671020508, "learning_rate": 3.4714938544742934e-05, "loss": 0.5477, "step": 458900 }, { "epoch": 6.323881954203522, "grad_norm": 15.194746971130371, "learning_rate": 3.470728093272214e-05, "loss": 0.5639, "step": 459000 }, { "epoch": 6.325259706263261, "grad_norm": 6.894687652587891, "learning_rate": 3.469954536185324e-05, "loss": 0.5385, "step": 459100 }, { "epoch": 6.326637458323, "grad_norm": 4.836931228637695, "learning_rate": 3.469180917934614e-05, "loss": 0.5193, "step": 459200 }, { "epoch": 6.328015210382739, "grad_norm": 37.81837844848633, "learning_rate": 3.4684072385858045e-05, "loss": 0.5059, "step": 459300 }, { "epoch": 6.329392962442479, "grad_norm": 4.395566940307617, "learning_rate": 3.467633498204625e-05, "loss": 0.6069, "step": 459400 }, { "epoch": 6.3307707145022185, "grad_norm": 7.261143207550049, "learning_rate": 3.466859696856809e-05, "loss": 0.5075, "step": 459500 }, { "epoch": 6.332148466561957, "grad_norm": 2.6545705795288086, "learning_rate": 3.4660858346080936e-05, "loss": 0.5434, "step": 459600 }, { "epoch": 6.333526218621697, "grad_norm": 58.85383605957031, "learning_rate": 3.465311911524224e-05, "loss": 0.5759, "step": 459700 }, { "epoch": 
6.334903970681436, "grad_norm": 14.57435131072998, "learning_rate": 3.464537927670948e-05, "loss": 0.5165, "step": 459800 }, { "epoch": 6.336281722741176, "grad_norm": 11.907734870910645, "learning_rate": 3.463763883114018e-05, "loss": 0.4545, "step": 459900 }, { "epoch": 6.3376594748009145, "grad_norm": 5.383899688720703, "learning_rate": 3.462989777919197e-05, "loss": 0.5175, "step": 460000 }, { "epoch": 6.339037226860654, "grad_norm": 3.570643663406372, "learning_rate": 3.462215612152244e-05, "loss": 0.471, "step": 460100 }, { "epoch": 6.340414978920394, "grad_norm": 4.963911056518555, "learning_rate": 3.4614413858789325e-05, "loss": 0.4757, "step": 460200 }, { "epoch": 6.341792730980133, "grad_norm": 14.90195369720459, "learning_rate": 3.460667099165036e-05, "loss": 0.5004, "step": 460300 }, { "epoch": 6.343170483039872, "grad_norm": 5.259658336639404, "learning_rate": 3.459892752076333e-05, "loss": 0.5133, "step": 460400 }, { "epoch": 6.344548235099611, "grad_norm": 8.458319664001465, "learning_rate": 3.459118344678609e-05, "loss": 0.5886, "step": 460500 }, { "epoch": 6.345925987159351, "grad_norm": 13.110440254211426, "learning_rate": 3.4583438770376536e-05, "loss": 0.6334, "step": 460600 }, { "epoch": 6.3473037392190905, "grad_norm": 20.464799880981445, "learning_rate": 3.457569349219262e-05, "loss": 0.6236, "step": 460700 }, { "epoch": 6.348681491278829, "grad_norm": 7.437765598297119, "learning_rate": 3.456794761289235e-05, "loss": 0.5151, "step": 460800 }, { "epoch": 6.350059243338569, "grad_norm": 4.299627304077148, "learning_rate": 3.456020113313376e-05, "loss": 0.5655, "step": 460900 }, { "epoch": 6.351436995398308, "grad_norm": 5.7341203689575195, "learning_rate": 3.4552454053574975e-05, "loss": 0.4883, "step": 461000 }, { "epoch": 6.352814747458048, "grad_norm": 9.073986053466797, "learning_rate": 3.4544706374874134e-05, "loss": 0.623, "step": 461100 }, { "epoch": 6.3541924995177865, "grad_norm": 6.882385730743408, "learning_rate": 
3.453695809768944e-05, "loss": 0.5603, "step": 461200 }, { "epoch": 6.355570251577526, "grad_norm": 2.997835159301758, "learning_rate": 3.452920922267917e-05, "loss": 0.5211, "step": 461300 }, { "epoch": 6.356948003637266, "grad_norm": 23.735191345214844, "learning_rate": 3.45214597505016e-05, "loss": 0.5414, "step": 461400 }, { "epoch": 6.358325755697004, "grad_norm": 1.2504786252975464, "learning_rate": 3.451370968181511e-05, "loss": 0.5107, "step": 461500 }, { "epoch": 6.359703507756744, "grad_norm": 46.827781677246094, "learning_rate": 3.4505959017278096e-05, "loss": 0.5035, "step": 461600 }, { "epoch": 6.361081259816483, "grad_norm": 4.429317951202393, "learning_rate": 3.4498207757549026e-05, "loss": 0.5114, "step": 461700 }, { "epoch": 6.362459011876223, "grad_norm": 9.95505142211914, "learning_rate": 3.44904559032864e-05, "loss": 0.5657, "step": 461800 }, { "epoch": 6.3638367639359625, "grad_norm": 42.97420120239258, "learning_rate": 3.4482703455148786e-05, "loss": 0.5305, "step": 461900 }, { "epoch": 6.365214515995701, "grad_norm": 4.830787181854248, "learning_rate": 3.4474950413794786e-05, "loss": 0.5017, "step": 462000 }, { "epoch": 6.366592268055441, "grad_norm": 3.6229782104492188, "learning_rate": 3.4467196779883065e-05, "loss": 0.522, "step": 462100 }, { "epoch": 6.36797002011518, "grad_norm": 3.63191294670105, "learning_rate": 3.445944255407233e-05, "loss": 0.5688, "step": 462200 }, { "epoch": 6.369347772174919, "grad_norm": 20.11029624938965, "learning_rate": 3.4451687737021364e-05, "loss": 0.5197, "step": 462300 }, { "epoch": 6.3707255242346585, "grad_norm": 2.780252456665039, "learning_rate": 3.4443932329388946e-05, "loss": 0.4674, "step": 462400 }, { "epoch": 6.372103276294398, "grad_norm": 9.972923278808594, "learning_rate": 3.4436176331833954e-05, "loss": 0.5934, "step": 462500 }, { "epoch": 6.373481028354138, "grad_norm": 3.0733118057250977, "learning_rate": 3.44284197450153e-05, "loss": 0.5463, "step": 462600 }, { "epoch": 6.374858780413876, 
"grad_norm": 12.761159896850586, "learning_rate": 3.442066256959193e-05, "loss": 0.5579, "step": 462700 }, { "epoch": 6.376236532473616, "grad_norm": 4.008204936981201, "learning_rate": 3.441290480622289e-05, "loss": 0.5283, "step": 462800 }, { "epoch": 6.377614284533355, "grad_norm": 3.52803373336792, "learning_rate": 3.440522404197866e-05, "loss": 0.5378, "step": 462900 }, { "epoch": 6.378992036593095, "grad_norm": 18.950180053710938, "learning_rate": 3.4397465110558476e-05, "loss": 0.5544, "step": 463000 }, { "epoch": 6.380369788652834, "grad_norm": 4.705127716064453, "learning_rate": 3.438970559316334e-05, "loss": 0.5723, "step": 463100 }, { "epoch": 6.381747540712573, "grad_norm": 54.34796905517578, "learning_rate": 3.4381945490452466e-05, "loss": 0.495, "step": 463200 }, { "epoch": 6.383125292772313, "grad_norm": 13.651488304138184, "learning_rate": 3.437418480308512e-05, "loss": 0.6149, "step": 463300 }, { "epoch": 6.384503044832052, "grad_norm": 5.200097560882568, "learning_rate": 3.436642353172061e-05, "loss": 0.5014, "step": 463400 }, { "epoch": 6.385880796891791, "grad_norm": 5.135319709777832, "learning_rate": 3.4358661677018276e-05, "loss": 0.5211, "step": 463500 }, { "epoch": 6.387258548951531, "grad_norm": 6.770837783813477, "learning_rate": 3.4350899239637554e-05, "loss": 0.5376, "step": 463600 }, { "epoch": 6.38863630101127, "grad_norm": 3.2738966941833496, "learning_rate": 3.4343136220237896e-05, "loss": 0.4791, "step": 463700 }, { "epoch": 6.39001405307101, "grad_norm": 35.05431365966797, "learning_rate": 3.43353726194788e-05, "loss": 0.5216, "step": 463800 }, { "epoch": 6.391391805130748, "grad_norm": 5.473015785217285, "learning_rate": 3.4327608438019834e-05, "loss": 0.5852, "step": 463900 }, { "epoch": 6.392769557190488, "grad_norm": 4.0664448738098145, "learning_rate": 3.43198436765206e-05, "loss": 0.568, "step": 464000 }, { "epoch": 6.3941473092502275, "grad_norm": 20.636764526367188, "learning_rate": 3.4312078335640755e-05, "loss": 0.464, 
"step": 464100 }, { "epoch": 6.395525061309967, "grad_norm": 4.607789039611816, "learning_rate": 3.4304312416040014e-05, "loss": 0.5298, "step": 464200 }, { "epoch": 6.396902813369706, "grad_norm": 4.129605770111084, "learning_rate": 3.4296545918378115e-05, "loss": 0.5272, "step": 464300 }, { "epoch": 6.398280565429445, "grad_norm": 11.238272666931152, "learning_rate": 3.428877884331486e-05, "loss": 0.4859, "step": 464400 }, { "epoch": 6.399658317489185, "grad_norm": 2.492154836654663, "learning_rate": 3.4281011191510134e-05, "loss": 0.5288, "step": 464500 }, { "epoch": 6.401036069548924, "grad_norm": 4.2271599769592285, "learning_rate": 3.427324296362379e-05, "loss": 0.5255, "step": 464600 }, { "epoch": 6.402413821608663, "grad_norm": 17.49359130859375, "learning_rate": 3.426547416031583e-05, "loss": 0.5483, "step": 464700 }, { "epoch": 6.403791573668403, "grad_norm": 26.748050689697266, "learning_rate": 3.4257704782246216e-05, "loss": 0.4936, "step": 464800 }, { "epoch": 6.405169325728142, "grad_norm": 23.832195281982422, "learning_rate": 3.4249934830075e-05, "loss": 0.5243, "step": 464900 }, { "epoch": 6.406547077787882, "grad_norm": 3.791219711303711, "learning_rate": 3.4242164304462304e-05, "loss": 0.5899, "step": 465000 }, { "epoch": 6.40792482984762, "grad_norm": 3.6568827629089355, "learning_rate": 3.423439320606824e-05, "loss": 0.566, "step": 465100 }, { "epoch": 6.40930258190736, "grad_norm": 17.30653953552246, "learning_rate": 3.4226621535553026e-05, "loss": 0.5962, "step": 465200 }, { "epoch": 6.4106803339670995, "grad_norm": 10.36335563659668, "learning_rate": 3.4218849293576916e-05, "loss": 0.5667, "step": 465300 }, { "epoch": 6.412058086026839, "grad_norm": 2.072679042816162, "learning_rate": 3.421107648080016e-05, "loss": 0.5336, "step": 465400 }, { "epoch": 6.413435838086578, "grad_norm": 3.226314067840576, "learning_rate": 3.420330309788314e-05, "loss": 0.5372, "step": 465500 }, { "epoch": 6.414813590146317, "grad_norm": 50.06383514404297, 
"learning_rate": 3.4195529145486224e-05, "loss": 0.5435, "step": 465600 }, { "epoch": 6.416191342206057, "grad_norm": 4.390735149383545, "learning_rate": 3.418775462426985e-05, "loss": 0.4922, "step": 465700 }, { "epoch": 6.4175690942657955, "grad_norm": 5.059786796569824, "learning_rate": 3.4179979534894514e-05, "loss": 0.557, "step": 465800 }, { "epoch": 6.418946846325535, "grad_norm": 3.7022817134857178, "learning_rate": 3.417220387802073e-05, "loss": 0.5162, "step": 465900 }, { "epoch": 6.420324598385275, "grad_norm": 4.939906597137451, "learning_rate": 3.41644276543091e-05, "loss": 0.4894, "step": 466000 }, { "epoch": 6.421702350445014, "grad_norm": 4.897161483764648, "learning_rate": 3.4156650864420244e-05, "loss": 0.5312, "step": 466100 }, { "epoch": 6.423080102504754, "grad_norm": 8.984227180480957, "learning_rate": 3.414887350901485e-05, "loss": 0.4259, "step": 466200 }, { "epoch": 6.424457854564492, "grad_norm": 1.7079793214797974, "learning_rate": 3.41411733707501e-05, "loss": 0.4979, "step": 466300 }, { "epoch": 6.425835606624232, "grad_norm": 5.8413615226745605, "learning_rate": 3.4133394891932515e-05, "loss": 0.5137, "step": 466400 }, { "epoch": 6.4272133586839715, "grad_norm": 9.191996574401855, "learning_rate": 3.412569364278504e-05, "loss": 0.5454, "step": 466500 }, { "epoch": 6.42859111074371, "grad_norm": 3.10945463180542, "learning_rate": 3.411791404317218e-05, "loss": 0.5298, "step": 466600 }, { "epoch": 6.42996886280345, "grad_norm": 4.685968399047852, "learning_rate": 3.411013388133367e-05, "loss": 0.4721, "step": 466700 }, { "epoch": 6.431346614863189, "grad_norm": 103.82674407958984, "learning_rate": 3.4102353157930464e-05, "loss": 0.4824, "step": 466800 }, { "epoch": 6.432724366922929, "grad_norm": 14.167476654052734, "learning_rate": 3.4094571873623586e-05, "loss": 0.5477, "step": 466900 }, { "epoch": 6.4341021189826675, "grad_norm": 10.491928100585938, "learning_rate": 3.4086790029074094e-05, "loss": 0.4711, "step": 467000 }, { "epoch": 
6.435479871042407, "grad_norm": 6.281071662902832, "learning_rate": 3.4079007624943086e-05, "loss": 0.5018, "step": 467100 }, { "epoch": 6.436857623102147, "grad_norm": 14.812719345092773, "learning_rate": 3.4071224661891724e-05, "loss": 0.5254, "step": 467200 }, { "epoch": 6.438235375161886, "grad_norm": 1.3977235555648804, "learning_rate": 3.4063441140581206e-05, "loss": 0.5088, "step": 467300 }, { "epoch": 6.439613127221625, "grad_norm": 5.854102611541748, "learning_rate": 3.4055657061672776e-05, "loss": 0.4982, "step": 467400 }, { "epoch": 6.440990879281364, "grad_norm": 13.613001823425293, "learning_rate": 3.404787242582776e-05, "loss": 0.5188, "step": 467500 }, { "epoch": 6.442368631341104, "grad_norm": 3.4949545860290527, "learning_rate": 3.4040087233707473e-05, "loss": 0.5313, "step": 467600 }, { "epoch": 6.4437463834008435, "grad_norm": 10.11595344543457, "learning_rate": 3.4032301485973314e-05, "loss": 0.4718, "step": 467700 }, { "epoch": 6.445124135460582, "grad_norm": 4.291743755340576, "learning_rate": 3.402451518328673e-05, "loss": 0.4898, "step": 467800 }, { "epoch": 6.446501887520322, "grad_norm": 4.582491397857666, "learning_rate": 3.4016728326309196e-05, "loss": 0.549, "step": 467900 }, { "epoch": 6.447879639580061, "grad_norm": 2.074584722518921, "learning_rate": 3.4008940915702256e-05, "loss": 0.4883, "step": 468000 }, { "epoch": 6.449257391639801, "grad_norm": 5.900487899780273, "learning_rate": 3.400115295212749e-05, "loss": 0.5176, "step": 468100 }, { "epoch": 6.4506351436995395, "grad_norm": 7.9630937576293945, "learning_rate": 3.3993364436246505e-05, "loss": 0.5398, "step": 468200 }, { "epoch": 6.452012895759279, "grad_norm": 3.6541032791137695, "learning_rate": 3.3985575368721005e-05, "loss": 0.4832, "step": 468300 }, { "epoch": 6.453390647819019, "grad_norm": 3.677736759185791, "learning_rate": 3.3977785750212695e-05, "loss": 0.463, "step": 468400 }, { "epoch": 6.454768399878758, "grad_norm": 4.9105544090271, "learning_rate": 
3.396999558138334e-05, "loss": 0.5095, "step": 468500 }, { "epoch": 6.456146151938497, "grad_norm": 9.214556694030762, "learning_rate": 3.396220486289477e-05, "loss": 0.5653, "step": 468600 }, { "epoch": 6.457523903998236, "grad_norm": 6.541526794433594, "learning_rate": 3.395441359540883e-05, "loss": 0.567, "step": 468700 }, { "epoch": 6.458901656057976, "grad_norm": 6.857761383056641, "learning_rate": 3.394662177958743e-05, "loss": 0.488, "step": 468800 }, { "epoch": 6.4602794081177155, "grad_norm": 4.510544300079346, "learning_rate": 3.3938829416092535e-05, "loss": 0.5462, "step": 468900 }, { "epoch": 6.461657160177454, "grad_norm": 2.774104356765747, "learning_rate": 3.393103650558614e-05, "loss": 0.4651, "step": 469000 }, { "epoch": 6.463034912237194, "grad_norm": 7.511016845703125, "learning_rate": 3.39232430487303e-05, "loss": 0.5954, "step": 469100 }, { "epoch": 6.464412664296933, "grad_norm": 2.5176055431365967, "learning_rate": 3.3915449046187085e-05, "loss": 0.4857, "step": 469200 }, { "epoch": 6.465790416356673, "grad_norm": 3.5668587684631348, "learning_rate": 3.3907654498618665e-05, "loss": 0.4752, "step": 469300 }, { "epoch": 6.4671681684164115, "grad_norm": 2.3010289669036865, "learning_rate": 3.389985940668721e-05, "loss": 0.508, "step": 469400 }, { "epoch": 6.468545920476151, "grad_norm": 585.8804321289062, "learning_rate": 3.389206377105496e-05, "loss": 0.525, "step": 469500 }, { "epoch": 6.469923672535891, "grad_norm": 2.168923854827881, "learning_rate": 3.3884267592384194e-05, "loss": 0.5275, "step": 469600 }, { "epoch": 6.47130142459563, "grad_norm": 6.4968438148498535, "learning_rate": 3.387647087133723e-05, "loss": 0.4872, "step": 469700 }, { "epoch": 6.472679176655369, "grad_norm": 4.322395324707031, "learning_rate": 3.386867360857644e-05, "loss": 0.5246, "step": 469800 }, { "epoch": 6.474056928715108, "grad_norm": 1.8616739511489868, "learning_rate": 3.3860875804764246e-05, "loss": 0.5923, "step": 469900 }, { "epoch": 6.475434680774848, 
"grad_norm": 4.919229030609131, "learning_rate": 3.385307746056311e-05, "loss": 0.57, "step": 470000 }, { "epoch": 6.476812432834587, "grad_norm": 26.47780418395996, "learning_rate": 3.3845278576635554e-05, "loss": 0.5623, "step": 470100 }, { "epoch": 6.478190184894326, "grad_norm": 15.909097671508789, "learning_rate": 3.3837479153644104e-05, "loss": 0.6046, "step": 470200 }, { "epoch": 6.479567936954066, "grad_norm": 4.691531658172607, "learning_rate": 3.382967919225139e-05, "loss": 0.4628, "step": 470300 }, { "epoch": 6.480945689013805, "grad_norm": 17.47124481201172, "learning_rate": 3.3821878693120036e-05, "loss": 0.4563, "step": 470400 }, { "epoch": 6.482323441073545, "grad_norm": 3.935624599456787, "learning_rate": 3.381407765691275e-05, "loss": 0.5361, "step": 470500 }, { "epoch": 6.4837011931332835, "grad_norm": 2.7093863487243652, "learning_rate": 3.380627608429226e-05, "loss": 0.5025, "step": 470600 }, { "epoch": 6.485078945193023, "grad_norm": 4.365957736968994, "learning_rate": 3.379847397592136e-05, "loss": 0.4969, "step": 470700 }, { "epoch": 6.486456697252763, "grad_norm": 0.8611201643943787, "learning_rate": 3.3790671332462874e-05, "loss": 0.4982, "step": 470800 }, { "epoch": 6.487834449312501, "grad_norm": 5.220925331115723, "learning_rate": 3.378286815457967e-05, "loss": 0.535, "step": 470900 }, { "epoch": 6.489212201372241, "grad_norm": 12.679610252380371, "learning_rate": 3.3775064442934666e-05, "loss": 0.4894, "step": 471000 }, { "epoch": 6.49058995343198, "grad_norm": 0.8816003203392029, "learning_rate": 3.376726019819085e-05, "loss": 0.4809, "step": 471100 }, { "epoch": 6.49196770549172, "grad_norm": 13.836054801940918, "learning_rate": 3.375945542101121e-05, "loss": 0.5055, "step": 471200 }, { "epoch": 6.493345457551459, "grad_norm": 88.91038513183594, "learning_rate": 3.375165011205881e-05, "loss": 0.539, "step": 471300 }, { "epoch": 6.494723209611198, "grad_norm": 8.592035293579102, "learning_rate": 3.3743844271996754e-05, "loss": 0.5181, 
"step": 471400 }, { "epoch": 6.496100961670938, "grad_norm": 1.4551607370376587, "learning_rate": 3.373603790148817e-05, "loss": 0.5676, "step": 471500 }, { "epoch": 6.497478713730677, "grad_norm": 2.1242740154266357, "learning_rate": 3.372823100119628e-05, "loss": 0.5473, "step": 471600 }, { "epoch": 6.498856465790416, "grad_norm": 3.7006642818450928, "learning_rate": 3.372042357178429e-05, "loss": 0.5132, "step": 471700 }, { "epoch": 6.5002342178501555, "grad_norm": 3.8233206272125244, "learning_rate": 3.37126156139155e-05, "loss": 0.6044, "step": 471800 }, { "epoch": 6.501611969909895, "grad_norm": 7.363094329833984, "learning_rate": 3.370496330313456e-05, "loss": 0.4758, "step": 471900 }, { "epoch": 6.502989721969635, "grad_norm": 4.851779937744141, "learning_rate": 3.369715430087828e-05, "loss": 0.5753, "step": 472000 }, { "epoch": 6.504367474029373, "grad_norm": 7.83793830871582, "learning_rate": 3.3689344772142045e-05, "loss": 0.5238, "step": 472100 }, { "epoch": 6.505745226089113, "grad_norm": 2.358006477355957, "learning_rate": 3.36815347175893e-05, "loss": 0.5111, "step": 472200 }, { "epoch": 6.507122978148852, "grad_norm": 37.04079055786133, "learning_rate": 3.3673724137883555e-05, "loss": 0.4996, "step": 472300 }, { "epoch": 6.508500730208592, "grad_norm": 13.890275955200195, "learning_rate": 3.3665913033688365e-05, "loss": 0.5195, "step": 472400 }, { "epoch": 6.509878482268331, "grad_norm": 4.729398727416992, "learning_rate": 3.365810140566731e-05, "loss": 0.4996, "step": 472500 }, { "epoch": 6.51125623432807, "grad_norm": 1.6167852878570557, "learning_rate": 3.3650289254484044e-05, "loss": 0.5998, "step": 472600 }, { "epoch": 6.51263398638781, "grad_norm": 21.66167449951172, "learning_rate": 3.364247658080224e-05, "loss": 0.5722, "step": 472700 }, { "epoch": 6.514011738447549, "grad_norm": 5.861439228057861, "learning_rate": 3.363466338528562e-05, "loss": 0.5368, "step": 472800 }, { "epoch": 6.515389490507288, "grad_norm": 10.542923927307129, 
"learning_rate": 3.3626849668597964e-05, "loss": 0.5995, "step": 472900 }, { "epoch": 6.5167672425670276, "grad_norm": 8.882357597351074, "learning_rate": 3.361903543140309e-05, "loss": 0.5322, "step": 473000 }, { "epoch": 6.518144994626767, "grad_norm": 6.442994117736816, "learning_rate": 3.361122067436484e-05, "loss": 0.5019, "step": 473100 }, { "epoch": 6.519522746686507, "grad_norm": 1.5847370624542236, "learning_rate": 3.360340539814714e-05, "loss": 0.4784, "step": 473200 }, { "epoch": 6.520900498746245, "grad_norm": 3.729393482208252, "learning_rate": 3.359558960341392e-05, "loss": 0.5788, "step": 473300 }, { "epoch": 6.522278250805985, "grad_norm": 22.281179428100586, "learning_rate": 3.358777329082918e-05, "loss": 0.4523, "step": 473400 }, { "epoch": 6.5236560028657244, "grad_norm": 10.255950927734375, "learning_rate": 3.3579956461056954e-05, "loss": 0.5112, "step": 473500 }, { "epoch": 6.525033754925463, "grad_norm": 21.808143615722656, "learning_rate": 3.357213911476132e-05, "loss": 0.5197, "step": 473600 }, { "epoch": 6.526411506985203, "grad_norm": 18.429672241210938, "learning_rate": 3.3564321252606405e-05, "loss": 0.4994, "step": 473700 }, { "epoch": 6.527789259044942, "grad_norm": 4.4939470291137695, "learning_rate": 3.355650287525637e-05, "loss": 0.4903, "step": 473800 }, { "epoch": 6.529167011104682, "grad_norm": 6.932443618774414, "learning_rate": 3.354868398337544e-05, "loss": 0.4575, "step": 473900 }, { "epoch": 6.530544763164421, "grad_norm": 6.450246334075928, "learning_rate": 3.354086457762786e-05, "loss": 0.5593, "step": 474000 }, { "epoch": 6.53192251522416, "grad_norm": 5.7159318923950195, "learning_rate": 3.3533044658677925e-05, "loss": 0.4866, "step": 474100 }, { "epoch": 6.5333002672839, "grad_norm": 2.7029058933258057, "learning_rate": 3.3525224227189986e-05, "loss": 0.4919, "step": 474200 }, { "epoch": 6.534678019343639, "grad_norm": 3.650818109512329, "learning_rate": 3.3517403283828415e-05, "loss": 0.5296, "step": 474300 }, { 
"epoch": 6.536055771403378, "grad_norm": 9.655908584594727, "learning_rate": 3.350958182925766e-05, "loss": 0.5935, "step": 474400 }, { "epoch": 6.537433523463117, "grad_norm": 53.43095397949219, "learning_rate": 3.350175986414218e-05, "loss": 0.4687, "step": 474500 }, { "epoch": 6.538811275522857, "grad_norm": 4.060654640197754, "learning_rate": 3.349393738914649e-05, "loss": 0.5496, "step": 474600 }, { "epoch": 6.5401890275825965, "grad_norm": 3.6551802158355713, "learning_rate": 3.3486114404935155e-05, "loss": 0.5736, "step": 474700 }, { "epoch": 6.541566779642336, "grad_norm": 21.855300903320312, "learning_rate": 3.3478290912172776e-05, "loss": 0.5015, "step": 474800 }, { "epoch": 6.542944531702075, "grad_norm": 2.1812844276428223, "learning_rate": 3.3470466911524014e-05, "loss": 0.5123, "step": 474900 }, { "epoch": 6.544322283761814, "grad_norm": 1.5420588254928589, "learning_rate": 3.3462642403653526e-05, "loss": 0.4814, "step": 475000 }, { "epoch": 6.545700035821554, "grad_norm": 12.950900077819824, "learning_rate": 3.345481738922606e-05, "loss": 0.5513, "step": 475100 }, { "epoch": 6.5470777878812925, "grad_norm": 47.913818359375, "learning_rate": 3.344699186890641e-05, "loss": 0.4856, "step": 475200 }, { "epoch": 6.548455539941032, "grad_norm": 9.541970252990723, "learning_rate": 3.343916584335936e-05, "loss": 0.5612, "step": 475300 }, { "epoch": 6.549833292000772, "grad_norm": 1.2268084287643433, "learning_rate": 3.343133931324979e-05, "loss": 0.5352, "step": 475400 }, { "epoch": 6.551211044060511, "grad_norm": 20.780214309692383, "learning_rate": 3.3423512279242614e-05, "loss": 0.4917, "step": 475500 }, { "epoch": 6.552588796120251, "grad_norm": 1.8556984663009644, "learning_rate": 3.341568474200274e-05, "loss": 0.529, "step": 475600 }, { "epoch": 6.553966548179989, "grad_norm": 2.5443620681762695, "learning_rate": 3.340785670219521e-05, "loss": 0.5013, "step": 475700 }, { "epoch": 6.555344300239729, "grad_norm": 5.767784118652344, "learning_rate": 
3.340002816048501e-05, "loss": 0.592, "step": 475800 }, { "epoch": 6.5567220522994685, "grad_norm": 5.811985969543457, "learning_rate": 3.3392199117537245e-05, "loss": 0.5086, "step": 475900 }, { "epoch": 6.558099804359207, "grad_norm": 39.33713150024414, "learning_rate": 3.338436957401702e-05, "loss": 0.4897, "step": 476000 }, { "epoch": 6.559477556418947, "grad_norm": 5.036971092224121, "learning_rate": 3.337653953058948e-05, "loss": 0.4779, "step": 476100 }, { "epoch": 6.560855308478686, "grad_norm": 7.764959812164307, "learning_rate": 3.336870898791985e-05, "loss": 0.5157, "step": 476200 }, { "epoch": 6.562233060538426, "grad_norm": 13.46796703338623, "learning_rate": 3.3360877946673375e-05, "loss": 0.5134, "step": 476300 }, { "epoch": 6.5636108125981645, "grad_norm": 2.08974289894104, "learning_rate": 3.335304640751533e-05, "loss": 0.4405, "step": 476400 }, { "epoch": 6.564988564657904, "grad_norm": 82.97801208496094, "learning_rate": 3.334521437111105e-05, "loss": 0.4729, "step": 476500 }, { "epoch": 6.566366316717644, "grad_norm": 21.792648315429688, "learning_rate": 3.33373818381259e-05, "loss": 0.5104, "step": 476600 }, { "epoch": 6.567744068777383, "grad_norm": 2.1306300163269043, "learning_rate": 3.332954880922531e-05, "loss": 0.616, "step": 476700 }, { "epoch": 6.569121820837122, "grad_norm": 3.446024179458618, "learning_rate": 3.3321715285074716e-05, "loss": 0.4872, "step": 476800 }, { "epoch": 6.570499572896861, "grad_norm": 4.544395446777344, "learning_rate": 3.3313881266339625e-05, "loss": 0.4667, "step": 476900 }, { "epoch": 6.571877324956601, "grad_norm": 6.018442153930664, "learning_rate": 3.330604675368558e-05, "loss": 0.4431, "step": 477000 }, { "epoch": 6.5732550770163405, "grad_norm": 8.804194450378418, "learning_rate": 3.329821174777816e-05, "loss": 0.4841, "step": 477100 }, { "epoch": 6.574632829076079, "grad_norm": 2.9686954021453857, "learning_rate": 3.3290376249282986e-05, "loss": 0.5417, "step": 477200 }, { "epoch": 6.576010581135819, 
"grad_norm": 15.990909576416016, "learning_rate": 3.3282540258865725e-05, "loss": 0.4967, "step": 477300 }, { "epoch": 6.577388333195558, "grad_norm": 4.4878034591674805, "learning_rate": 3.327470377719209e-05, "loss": 0.6021, "step": 477400 }, { "epoch": 6.578766085255298, "grad_norm": 3.6620984077453613, "learning_rate": 3.3266866804927826e-05, "loss": 0.5129, "step": 477500 }, { "epoch": 6.5801438373150365, "grad_norm": 14.283812522888184, "learning_rate": 3.3259029342738716e-05, "loss": 0.5296, "step": 477600 }, { "epoch": 6.581521589374776, "grad_norm": 4.429503917694092, "learning_rate": 3.325119139129061e-05, "loss": 0.5294, "step": 477700 }, { "epoch": 6.582899341434516, "grad_norm": 39.468849182128906, "learning_rate": 3.324335295124937e-05, "loss": 0.5457, "step": 477800 }, { "epoch": 6.584277093494255, "grad_norm": 4.502475738525391, "learning_rate": 3.3235514023280906e-05, "loss": 0.5436, "step": 477900 }, { "epoch": 6.585654845553994, "grad_norm": 33.94144058227539, "learning_rate": 3.3227674608051196e-05, "loss": 0.5207, "step": 478000 }, { "epoch": 6.587032597613733, "grad_norm": 7.2556471824646, "learning_rate": 3.321983470622622e-05, "loss": 0.5627, "step": 478100 }, { "epoch": 6.588410349673473, "grad_norm": 18.895158767700195, "learning_rate": 3.321199431847201e-05, "loss": 0.5371, "step": 478200 }, { "epoch": 6.5897881017332125, "grad_norm": 1.9143149852752686, "learning_rate": 3.320415344545468e-05, "loss": 0.4969, "step": 478300 }, { "epoch": 6.591165853792951, "grad_norm": 11.343670845031738, "learning_rate": 3.3196312087840324e-05, "loss": 0.5592, "step": 478400 }, { "epoch": 6.592543605852691, "grad_norm": 3.6983842849731445, "learning_rate": 3.3188470246295125e-05, "loss": 0.5377, "step": 478500 }, { "epoch": 6.59392135791243, "grad_norm": 11.56848430633545, "learning_rate": 3.318062792148526e-05, "loss": 0.459, "step": 478600 }, { "epoch": 6.595299109972169, "grad_norm": 4.3684000968933105, "learning_rate": 3.317278511407699e-05, "loss": 
0.5941, "step": 478700 }, { "epoch": 6.5966768620319085, "grad_norm": 3.685560703277588, "learning_rate": 3.316494182473662e-05, "loss": 0.4777, "step": 478800 }, { "epoch": 6.598054614091648, "grad_norm": 4.0189738273620605, "learning_rate": 3.315709805413044e-05, "loss": 0.4787, "step": 478900 }, { "epoch": 6.599432366151388, "grad_norm": 2.0849928855895996, "learning_rate": 3.314933224781369e-05, "loss": 0.537, "step": 479000 }, { "epoch": 6.600810118211127, "grad_norm": 3.9438023567199707, "learning_rate": 3.314156597110806e-05, "loss": 0.4727, "step": 479100 }, { "epoch": 6.602187870270866, "grad_norm": 7.533387660980225, "learning_rate": 3.313372077028169e-05, "loss": 0.5242, "step": 479200 }, { "epoch": 6.603565622330605, "grad_norm": 23.13103485107422, "learning_rate": 3.312587509084193e-05, "loss": 0.518, "step": 479300 }, { "epoch": 6.604943374390345, "grad_norm": 4.273341178894043, "learning_rate": 3.311802893345529e-05, "loss": 0.4648, "step": 479400 }, { "epoch": 6.606321126450084, "grad_norm": 10.277251243591309, "learning_rate": 3.3110182298788356e-05, "loss": 0.5075, "step": 479500 }, { "epoch": 6.607698878509823, "grad_norm": 3.7137677669525146, "learning_rate": 3.310233518750773e-05, "loss": 0.5354, "step": 479600 }, { "epoch": 6.609076630569563, "grad_norm": 2.8192920684814453, "learning_rate": 3.309448760028008e-05, "loss": 0.5696, "step": 479700 }, { "epoch": 6.610454382629302, "grad_norm": 21.763931274414062, "learning_rate": 3.308663953777207e-05, "loss": 0.4731, "step": 479800 }, { "epoch": 6.611832134689042, "grad_norm": 14.947737693786621, "learning_rate": 3.3078791000650455e-05, "loss": 0.4877, "step": 479900 }, { "epoch": 6.6132098867487805, "grad_norm": 2.8194727897644043, "learning_rate": 3.3070941989582013e-05, "loss": 0.5482, "step": 480000 }, { "epoch": 6.61458763880852, "grad_norm": 11.757162094116211, "learning_rate": 3.306309250523354e-05, "loss": 0.5233, "step": 480100 }, { "epoch": 6.61596539086826, "grad_norm": 
1.4464075565338135, "learning_rate": 3.30552425482719e-05, "loss": 0.4637, "step": 480200 }, { "epoch": 6.617343142927998, "grad_norm": 18.960248947143555, "learning_rate": 3.304739211936398e-05, "loss": 0.5695, "step": 480300 }, { "epoch": 6.618720894987738, "grad_norm": 1.988845705986023, "learning_rate": 3.303954121917673e-05, "loss": 0.4336, "step": 480400 }, { "epoch": 6.620098647047477, "grad_norm": 1.8592430353164673, "learning_rate": 3.303168984837711e-05, "loss": 0.5139, "step": 480500 }, { "epoch": 6.621476399107217, "grad_norm": 1.5606125593185425, "learning_rate": 3.302383800763213e-05, "loss": 0.5365, "step": 480600 }, { "epoch": 6.622854151166956, "grad_norm": 5.78675651550293, "learning_rate": 3.301598569760886e-05, "loss": 0.5418, "step": 480700 }, { "epoch": 6.624231903226695, "grad_norm": 3.0970494747161865, "learning_rate": 3.3008132918974384e-05, "loss": 0.4604, "step": 480800 }, { "epoch": 6.625609655286435, "grad_norm": 7.67454719543457, "learning_rate": 3.3000279672395833e-05, "loss": 0.5215, "step": 480900 }, { "epoch": 6.626987407346174, "grad_norm": 8.267184257507324, "learning_rate": 3.2992425958540384e-05, "loss": 0.4412, "step": 481000 }, { "epoch": 6.628365159405913, "grad_norm": 7.434189796447754, "learning_rate": 3.298457177807525e-05, "loss": 0.4746, "step": 481100 }, { "epoch": 6.6297429114656525, "grad_norm": 4.048120021820068, "learning_rate": 3.297671713166769e-05, "loss": 0.4731, "step": 481200 }, { "epoch": 6.631120663525392, "grad_norm": 7.0578508377075195, "learning_rate": 3.2968862019984995e-05, "loss": 0.46, "step": 481300 }, { "epoch": 6.632498415585132, "grad_norm": 6.383736610412598, "learning_rate": 3.296100644369448e-05, "loss": 0.5021, "step": 481400 }, { "epoch": 6.63387616764487, "grad_norm": 10.01197624206543, "learning_rate": 3.2953150403463536e-05, "loss": 0.5876, "step": 481500 }, { "epoch": 6.63525391970461, "grad_norm": 22.271011352539062, "learning_rate": 3.294529389995957e-05, "loss": 0.3953, "step": 481600 
}, { "epoch": 6.636631671764349, "grad_norm": 3.4908883571624756, "learning_rate": 3.293743693385002e-05, "loss": 0.4916, "step": 481700 }, { "epoch": 6.638009423824089, "grad_norm": 6.715983867645264, "learning_rate": 3.29295795058024e-05, "loss": 0.5247, "step": 481800 }, { "epoch": 6.639387175883828, "grad_norm": 4.584227085113525, "learning_rate": 3.292172161648421e-05, "loss": 0.4767, "step": 481900 }, { "epoch": 6.640764927943567, "grad_norm": 6.77118444442749, "learning_rate": 3.291386326656303e-05, "loss": 0.5416, "step": 482000 }, { "epoch": 6.642142680003307, "grad_norm": 2.8333487510681152, "learning_rate": 3.290600445670649e-05, "loss": 0.4468, "step": 482100 }, { "epoch": 6.643520432063046, "grad_norm": 2.227372169494629, "learning_rate": 3.289814518758219e-05, "loss": 0.4468, "step": 482200 }, { "epoch": 6.644898184122785, "grad_norm": 1.9961074590682983, "learning_rate": 3.289028545985785e-05, "loss": 0.473, "step": 482300 }, { "epoch": 6.6462759361825245, "grad_norm": 2.170229196548462, "learning_rate": 3.2882425274201184e-05, "loss": 0.4622, "step": 482400 }, { "epoch": 6.647653688242264, "grad_norm": 23.161144256591797, "learning_rate": 3.287456463127995e-05, "loss": 0.563, "step": 482500 }, { "epoch": 6.649031440302004, "grad_norm": 54.195125579833984, "learning_rate": 3.2866703531761955e-05, "loss": 0.532, "step": 482600 }, { "epoch": 6.650409192361742, "grad_norm": 3.0172743797302246, "learning_rate": 3.285884197631504e-05, "loss": 0.5573, "step": 482700 }, { "epoch": 6.651786944421482, "grad_norm": 2.6992311477661133, "learning_rate": 3.285097996560709e-05, "loss": 0.4854, "step": 482800 }, { "epoch": 6.653164696481221, "grad_norm": 8.62267017364502, "learning_rate": 3.284311750030601e-05, "loss": 0.5506, "step": 482900 }, { "epoch": 6.65454244854096, "grad_norm": 5.223740577697754, "learning_rate": 3.283525458107976e-05, "loss": 0.472, "step": 483000 }, { "epoch": 6.6559202006007, "grad_norm": 4.635311603546143, "learning_rate": 
3.282739120859634e-05, "loss": 0.4923, "step": 483100 }, { "epoch": 6.657297952660439, "grad_norm": 5.964470386505127, "learning_rate": 3.2819527383523786e-05, "loss": 0.531, "step": 483200 }, { "epoch": 6.658675704720179, "grad_norm": 97.39588928222656, "learning_rate": 3.281166310653017e-05, "loss": 0.466, "step": 483300 }, { "epoch": 6.660053456779918, "grad_norm": 11.894049644470215, "learning_rate": 3.280379837828359e-05, "loss": 0.5036, "step": 483400 }, { "epoch": 6.661431208839657, "grad_norm": 5.156511306762695, "learning_rate": 3.279593319945221e-05, "loss": 0.502, "step": 483500 }, { "epoch": 6.662808960899397, "grad_norm": 19.589689254760742, "learning_rate": 3.278806757070421e-05, "loss": 0.5605, "step": 483600 }, { "epoch": 6.664186712959136, "grad_norm": 4.9715352058410645, "learning_rate": 3.2780201492707815e-05, "loss": 0.5309, "step": 483700 }, { "epoch": 6.665564465018875, "grad_norm": 5.03038215637207, "learning_rate": 3.2772334966131286e-05, "loss": 0.5306, "step": 483800 }, { "epoch": 6.666942217078614, "grad_norm": 1.5268027782440186, "learning_rate": 3.2764467991642936e-05, "loss": 0.4856, "step": 483900 }, { "epoch": 6.668319969138354, "grad_norm": 14.442225456237793, "learning_rate": 3.275660056991109e-05, "loss": 0.531, "step": 484000 }, { "epoch": 6.6696977211980935, "grad_norm": 4.5085768699646, "learning_rate": 3.274873270160414e-05, "loss": 0.4886, "step": 484100 }, { "epoch": 6.671075473257833, "grad_norm": 4.754798412322998, "learning_rate": 3.2740864387390496e-05, "loss": 0.5464, "step": 484200 }, { "epoch": 6.672453225317572, "grad_norm": 5.900638103485107, "learning_rate": 3.27329956279386e-05, "loss": 0.5197, "step": 484300 }, { "epoch": 6.673830977377311, "grad_norm": 14.146143913269043, "learning_rate": 3.272512642391696e-05, "loss": 0.4744, "step": 484400 }, { "epoch": 6.675208729437051, "grad_norm": 6.912065029144287, "learning_rate": 3.2717256775994095e-05, "loss": 0.5196, "step": 484500 }, { "epoch": 6.6765864814967895, 
"grad_norm": 30.106733322143555, "learning_rate": 3.270938668483858e-05, "loss": 0.4377, "step": 484600 }, { "epoch": 6.677964233556529, "grad_norm": 7.502816677093506, "learning_rate": 3.2701516151119015e-05, "loss": 0.4493, "step": 484700 }, { "epoch": 6.679341985616269, "grad_norm": 31.605297088623047, "learning_rate": 3.269364517550403e-05, "loss": 0.4129, "step": 484800 }, { "epoch": 6.680719737676008, "grad_norm": 4.691681861877441, "learning_rate": 3.2685773758662336e-05, "loss": 0.5738, "step": 484900 }, { "epoch": 6.682097489735747, "grad_norm": 8.39633560180664, "learning_rate": 3.267790190126262e-05, "loss": 0.56, "step": 485000 }, { "epoch": 6.683475241795486, "grad_norm": 1.8914600610733032, "learning_rate": 3.267002960397365e-05, "loss": 0.5342, "step": 485100 }, { "epoch": 6.684852993855226, "grad_norm": 2.823514223098755, "learning_rate": 3.2662156867464215e-05, "loss": 0.4553, "step": 485200 }, { "epoch": 6.6862307459149655, "grad_norm": 3.148958206176758, "learning_rate": 3.265428369240314e-05, "loss": 0.4745, "step": 485300 }, { "epoch": 6.687608497974704, "grad_norm": 5.808370590209961, "learning_rate": 3.264648881775407e-05, "loss": 0.4824, "step": 485400 }, { "epoch": 6.688986250034444, "grad_norm": 3.830756425857544, "learning_rate": 3.263861477196519e-05, "loss": 0.4953, "step": 485500 }, { "epoch": 6.690364002094183, "grad_norm": 19.666732788085938, "learning_rate": 3.26307402896247e-05, "loss": 0.4816, "step": 485600 }, { "epoch": 6.691741754153923, "grad_norm": 12.761123657226562, "learning_rate": 3.262286537140157e-05, "loss": 0.5555, "step": 485700 }, { "epoch": 6.6931195062136615, "grad_norm": 5.290180683135986, "learning_rate": 3.261499001796483e-05, "loss": 0.4334, "step": 485800 }, { "epoch": 6.694497258273401, "grad_norm": 2.622549533843994, "learning_rate": 3.2607114229983506e-05, "loss": 0.4774, "step": 485900 }, { "epoch": 6.695875010333141, "grad_norm": 2.8366758823394775, "learning_rate": 3.259923800812671e-05, "loss": 0.5055, 
"step": 486000 }, { "epoch": 6.69725276239288, "grad_norm": 8.155316352844238, "learning_rate": 3.2591440121756355e-05, "loss": 0.5687, "step": 486100 }, { "epoch": 6.698630514452619, "grad_norm": 33.312744140625, "learning_rate": 3.258356303847807e-05, "loss": 0.4846, "step": 486200 }, { "epoch": 6.700008266512358, "grad_norm": 3.404679775238037, "learning_rate": 3.257568552332508e-05, "loss": 0.5001, "step": 486300 }, { "epoch": 6.701386018572098, "grad_norm": 3.142570734024048, "learning_rate": 3.256780757696665e-05, "loss": 0.468, "step": 486400 }, { "epoch": 6.7027637706318375, "grad_norm": 3.5822596549987793, "learning_rate": 3.255992920007205e-05, "loss": 0.5092, "step": 486500 }, { "epoch": 6.704141522691576, "grad_norm": 2.6791110038757324, "learning_rate": 3.255205039331056e-05, "loss": 0.441, "step": 486600 }, { "epoch": 6.705519274751316, "grad_norm": 1.9100193977355957, "learning_rate": 3.254417115735155e-05, "loss": 0.53, "step": 486700 }, { "epoch": 6.706897026811055, "grad_norm": 24.22588539123535, "learning_rate": 3.2536291492864393e-05, "loss": 0.4945, "step": 486800 }, { "epoch": 6.708274778870795, "grad_norm": 3.6156580448150635, "learning_rate": 3.252841140051851e-05, "loss": 0.4579, "step": 486900 }, { "epoch": 6.7096525309305335, "grad_norm": 1.2529386281967163, "learning_rate": 3.252053088098335e-05, "loss": 0.5118, "step": 487000 }, { "epoch": 6.711030282990273, "grad_norm": 1.7775460481643677, "learning_rate": 3.2512649934928416e-05, "loss": 0.5469, "step": 487100 }, { "epoch": 6.712408035050013, "grad_norm": 5.117815017700195, "learning_rate": 3.2504768563023213e-05, "loss": 0.4713, "step": 487200 }, { "epoch": 6.713785787109751, "grad_norm": 0.360550194978714, "learning_rate": 3.249688676593733e-05, "loss": 0.6273, "step": 487300 }, { "epoch": 6.715163539169491, "grad_norm": 4.095431327819824, "learning_rate": 3.248900454434034e-05, "loss": 0.4707, "step": 487400 }, { "epoch": 6.71654129122923, "grad_norm": 0.3416297733783722, 
"learning_rate": 3.24811218989019e-05, "loss": 0.4315, "step": 487500 }, { "epoch": 6.71791904328897, "grad_norm": 2.7688403129577637, "learning_rate": 3.247323883029166e-05, "loss": 0.4854, "step": 487600 }, { "epoch": 6.7192967953487095, "grad_norm": 3.3573837280273438, "learning_rate": 3.246535533917935e-05, "loss": 0.5234, "step": 487700 }, { "epoch": 6.720674547408448, "grad_norm": 4.6890177726745605, "learning_rate": 3.24574714262347e-05, "loss": 0.4517, "step": 487800 }, { "epoch": 6.722052299468188, "grad_norm": 2.5689754486083984, "learning_rate": 3.2449587092127484e-05, "loss": 0.5398, "step": 487900 }, { "epoch": 6.723430051527927, "grad_norm": 9.631098747253418, "learning_rate": 3.2441702337527515e-05, "loss": 0.5326, "step": 488000 }, { "epoch": 6.724807803587666, "grad_norm": 5.43438196182251, "learning_rate": 3.243381716310466e-05, "loss": 0.5599, "step": 488100 }, { "epoch": 6.7261855556474055, "grad_norm": 7.3053364753723145, "learning_rate": 3.24259315695288e-05, "loss": 0.5374, "step": 488200 }, { "epoch": 6.727563307707145, "grad_norm": 1.1146855354309082, "learning_rate": 3.241804555746985e-05, "loss": 0.4776, "step": 488300 }, { "epoch": 6.728941059766885, "grad_norm": 11.632970809936523, "learning_rate": 3.2410159127597766e-05, "loss": 0.5525, "step": 488400 }, { "epoch": 6.730318811826624, "grad_norm": 2.38018536567688, "learning_rate": 3.240227228058255e-05, "loss": 0.5466, "step": 488500 }, { "epoch": 6.731696563886363, "grad_norm": 6.1933817863464355, "learning_rate": 3.239438501709422e-05, "loss": 0.5129, "step": 488600 }, { "epoch": 6.733074315946102, "grad_norm": 2.9498612880706787, "learning_rate": 3.238649733780284e-05, "loss": 0.5218, "step": 488700 }, { "epoch": 6.734452068005842, "grad_norm": 5.941333770751953, "learning_rate": 3.237860924337852e-05, "loss": 0.5081, "step": 488800 }, { "epoch": 6.735829820065581, "grad_norm": 3.954420328140259, "learning_rate": 3.237072073449137e-05, "loss": 0.5357, "step": 488900 }, { "epoch": 
6.73720757212532, "grad_norm": 8.380908966064453, "learning_rate": 3.2362831811811595e-05, "loss": 0.4416, "step": 489000 }, { "epoch": 6.73858532418506, "grad_norm": 1.4821916818618774, "learning_rate": 3.235494247600937e-05, "loss": 0.5783, "step": 489100 }, { "epoch": 6.739963076244799, "grad_norm": 4.570451259613037, "learning_rate": 3.234705272775494e-05, "loss": 0.5671, "step": 489200 }, { "epoch": 6.741340828304538, "grad_norm": 5.681606292724609, "learning_rate": 3.2339162567718595e-05, "loss": 0.4956, "step": 489300 }, { "epoch": 6.7427185803642775, "grad_norm": 5.003773212432861, "learning_rate": 3.233135090431491e-05, "loss": 0.5427, "step": 489400 }, { "epoch": 6.744096332424017, "grad_norm": 19.467884063720703, "learning_rate": 3.2323459926826755e-05, "loss": 0.4236, "step": 489500 }, { "epoch": 6.745474084483757, "grad_norm": 7.783166885375977, "learning_rate": 3.2315568539561016e-05, "loss": 0.5016, "step": 489600 }, { "epoch": 6.746851836543495, "grad_norm": 6.3391432762146, "learning_rate": 3.230767674318809e-05, "loss": 0.5005, "step": 489700 }, { "epoch": 6.748229588603235, "grad_norm": 4.056929588317871, "learning_rate": 3.229978453837842e-05, "loss": 0.4315, "step": 489800 }, { "epoch": 6.749607340662974, "grad_norm": 3.4065561294555664, "learning_rate": 3.229189192580251e-05, "loss": 0.5535, "step": 489900 }, { "epoch": 6.750985092722714, "grad_norm": 2.0479066371917725, "learning_rate": 3.2283998906130855e-05, "loss": 0.4908, "step": 490000 }, { "epoch": 6.752362844782453, "grad_norm": 9.683295249938965, "learning_rate": 3.227610548003402e-05, "loss": 0.5337, "step": 490100 }, { "epoch": 6.753740596842192, "grad_norm": 8.605391502380371, "learning_rate": 3.2268211648182605e-05, "loss": 0.522, "step": 490200 }, { "epoch": 6.755118348901932, "grad_norm": 6.824690341949463, "learning_rate": 3.2260317411247195e-05, "loss": 0.4674, "step": 490300 }, { "epoch": 6.756496100961671, "grad_norm": 8.162220001220703, "learning_rate": 
3.225242276989848e-05, "loss": 0.5158, "step": 490400 }, { "epoch": 6.75787385302141, "grad_norm": 1.6816174983978271, "learning_rate": 3.224452772480713e-05, "loss": 0.4538, "step": 490500 }, { "epoch": 6.7592516050811495, "grad_norm": 41.01235580444336, "learning_rate": 3.2236632276643884e-05, "loss": 0.4143, "step": 490600 }, { "epoch": 6.760629357140889, "grad_norm": 6.8301100730896, "learning_rate": 3.222873642607949e-05, "loss": 0.5168, "step": 490700 }, { "epoch": 6.762007109200629, "grad_norm": 5.045297622680664, "learning_rate": 3.2220840173784745e-05, "loss": 0.4729, "step": 490800 }, { "epoch": 6.763384861260367, "grad_norm": 34.562564849853516, "learning_rate": 3.221294352043048e-05, "loss": 0.5204, "step": 490900 }, { "epoch": 6.764762613320107, "grad_norm": 1.4532976150512695, "learning_rate": 3.220504646668755e-05, "loss": 0.4942, "step": 491000 }, { "epoch": 6.766140365379846, "grad_norm": 3.980806827545166, "learning_rate": 3.2197149013226844e-05, "loss": 0.4614, "step": 491100 }, { "epoch": 6.767518117439586, "grad_norm": 6.934349536895752, "learning_rate": 3.218925116071931e-05, "loss": 0.4826, "step": 491200 }, { "epoch": 6.768895869499325, "grad_norm": 4.31529426574707, "learning_rate": 3.218135290983589e-05, "loss": 0.4803, "step": 491300 }, { "epoch": 6.770273621559064, "grad_norm": 7.444957256317139, "learning_rate": 3.2173454261247594e-05, "loss": 0.4711, "step": 491400 }, { "epoch": 6.771651373618804, "grad_norm": 2.31036114692688, "learning_rate": 3.2165555215625457e-05, "loss": 0.4993, "step": 491500 }, { "epoch": 6.773029125678542, "grad_norm": 2.6928467750549316, "learning_rate": 3.2157655773640526e-05, "loss": 0.5387, "step": 491600 }, { "epoch": 6.774406877738282, "grad_norm": 5.866816997528076, "learning_rate": 3.214975593596391e-05, "loss": 0.4708, "step": 491700 }, { "epoch": 6.7757846297980215, "grad_norm": 1.9427406787872314, "learning_rate": 3.2141855703266745e-05, "loss": 0.5387, "step": 491800 }, { "epoch": 6.777162381857761, 
"grad_norm": 4.013765335083008, "learning_rate": 3.213395507622018e-05, "loss": 0.4648, "step": 491900 }, { "epoch": 6.778540133917501, "grad_norm": 7.95597505569458, "learning_rate": 3.212605405549543e-05, "loss": 0.4618, "step": 492000 }, { "epoch": 6.779917885977239, "grad_norm": 5.17407751083374, "learning_rate": 3.2118152641763724e-05, "loss": 0.4951, "step": 492100 }, { "epoch": 6.781295638036979, "grad_norm": 4.269516944885254, "learning_rate": 3.2110250835696326e-05, "loss": 0.491, "step": 492200 }, { "epoch": 6.782673390096718, "grad_norm": 4.915322780609131, "learning_rate": 3.2102348637964525e-05, "loss": 0.5258, "step": 492300 }, { "epoch": 6.784051142156457, "grad_norm": 2.7525646686553955, "learning_rate": 3.2094446049239666e-05, "loss": 0.5491, "step": 492400 }, { "epoch": 6.785428894216197, "grad_norm": 5.786526203155518, "learning_rate": 3.2086543070193115e-05, "loss": 0.4285, "step": 492500 }, { "epoch": 6.786806646275936, "grad_norm": 12.089977264404297, "learning_rate": 3.207863970149626e-05, "loss": 0.4866, "step": 492600 }, { "epoch": 6.788184398335676, "grad_norm": 27.45583152770996, "learning_rate": 3.207073594382054e-05, "loss": 0.4653, "step": 492700 }, { "epoch": 6.789562150395415, "grad_norm": 6.595381736755371, "learning_rate": 3.2062831797837415e-05, "loss": 0.4734, "step": 492800 }, { "epoch": 6.790939902455154, "grad_norm": 2.7230958938598633, "learning_rate": 3.205492726421839e-05, "loss": 0.4776, "step": 492900 }, { "epoch": 6.7923176545148936, "grad_norm": 3.9158592224121094, "learning_rate": 3.204702234363499e-05, "loss": 0.4275, "step": 493000 }, { "epoch": 6.793695406574633, "grad_norm": 4.048093795776367, "learning_rate": 3.203911703675877e-05, "loss": 0.5006, "step": 493100 }, { "epoch": 6.795073158634372, "grad_norm": 3.271937131881714, "learning_rate": 3.203121134426136e-05, "loss": 0.5368, "step": 493200 }, { "epoch": 6.796450910694111, "grad_norm": 3.249682664871216, "learning_rate": 3.2023305266814344e-05, "loss": 0.532, 
"step": 493300 }, { "epoch": 6.797828662753851, "grad_norm": 1.9720748662948608, "learning_rate": 3.201539880508941e-05, "loss": 0.4613, "step": 493400 }, { "epoch": 6.7992064148135904, "grad_norm": 0.5858930349349976, "learning_rate": 3.200749195975825e-05, "loss": 0.4689, "step": 493500 }, { "epoch": 6.800584166873329, "grad_norm": 13.444096565246582, "learning_rate": 3.199958473149258e-05, "loss": 0.4849, "step": 493600 }, { "epoch": 6.801961918933069, "grad_norm": 0.05797514319419861, "learning_rate": 3.199167712096417e-05, "loss": 0.4725, "step": 493700 }, { "epoch": 6.803339670992808, "grad_norm": 3.8541224002838135, "learning_rate": 3.198384821065268e-05, "loss": 0.4986, "step": 493800 }, { "epoch": 6.804717423052548, "grad_norm": 0.6057089567184448, "learning_rate": 3.197593984142005e-05, "loss": 0.4329, "step": 493900 }, { "epoch": 6.8060951751122865, "grad_norm": 5.346893310546875, "learning_rate": 3.196803109193345e-05, "loss": 0.493, "step": 494000 }, { "epoch": 6.807472927172026, "grad_norm": 4.722138404846191, "learning_rate": 3.196012196286474e-05, "loss": 0.4863, "step": 494100 }, { "epoch": 6.808850679231766, "grad_norm": 7.602149963378906, "learning_rate": 3.195221245488585e-05, "loss": 0.4951, "step": 494200 }, { "epoch": 6.810228431291505, "grad_norm": 1.5293307304382324, "learning_rate": 3.1944302568668746e-05, "loss": 0.445, "step": 494300 }, { "epoch": 6.811606183351244, "grad_norm": 7.256683349609375, "learning_rate": 3.1936392304885396e-05, "loss": 0.5257, "step": 494400 }, { "epoch": 6.812983935410983, "grad_norm": 5.243493556976318, "learning_rate": 3.192848166420782e-05, "loss": 0.505, "step": 494500 }, { "epoch": 6.814361687470723, "grad_norm": 2.9806153774261475, "learning_rate": 3.192057064730807e-05, "loss": 0.5492, "step": 494600 }, { "epoch": 6.8157394395304625, "grad_norm": 5.731026649475098, "learning_rate": 3.1912738370639494e-05, "loss": 0.5791, "step": 494700 }, { "epoch": 6.817117191590201, "grad_norm": 4.773301124572754, 
"learning_rate": 3.190482660705713e-05, "loss": 0.5285, "step": 494800 }, { "epoch": 6.818494943649941, "grad_norm": 1.94752037525177, "learning_rate": 3.1896914469262203e-05, "loss": 0.4838, "step": 494900 }, { "epoch": 6.81987269570968, "grad_norm": 1.5738259553909302, "learning_rate": 3.18890019579269e-05, "loss": 0.4624, "step": 495000 }, { "epoch": 6.82125044776942, "grad_norm": 13.824067115783691, "learning_rate": 3.1881089073723436e-05, "loss": 0.4716, "step": 495100 }, { "epoch": 6.8226281998291585, "grad_norm": 1.0518079996109009, "learning_rate": 3.187317581732404e-05, "loss": 0.4049, "step": 495200 }, { "epoch": 6.824005951888898, "grad_norm": 6.62912130355835, "learning_rate": 3.186526218940098e-05, "loss": 0.492, "step": 495300 }, { "epoch": 6.825383703948638, "grad_norm": 4.1702880859375, "learning_rate": 3.185734819062656e-05, "loss": 0.4524, "step": 495400 }, { "epoch": 6.826761456008377, "grad_norm": 2.934587001800537, "learning_rate": 3.184943382167313e-05, "loss": 0.5201, "step": 495500 }, { "epoch": 6.828139208068116, "grad_norm": 7.047045707702637, "learning_rate": 3.184151908321304e-05, "loss": 0.4311, "step": 495600 }, { "epoch": 6.829516960127855, "grad_norm": 3.5394976139068604, "learning_rate": 3.1833603975918695e-05, "loss": 0.506, "step": 495700 }, { "epoch": 6.830894712187595, "grad_norm": 6.301085472106934, "learning_rate": 3.182568850046252e-05, "loss": 0.5721, "step": 495800 }, { "epoch": 6.832272464247334, "grad_norm": 10.820953369140625, "learning_rate": 3.181777265751698e-05, "loss": 0.5412, "step": 495900 }, { "epoch": 6.833650216307073, "grad_norm": 2.890017032623291, "learning_rate": 3.180985644775455e-05, "loss": 0.5362, "step": 496000 }, { "epoch": 6.835027968366813, "grad_norm": 6.602724075317383, "learning_rate": 3.180193987184778e-05, "loss": 0.5041, "step": 496100 }, { "epoch": 6.836405720426552, "grad_norm": 5.462466716766357, "learning_rate": 3.1794022930469204e-05, "loss": 0.4821, "step": 496200 }, { "epoch": 
6.837783472486292, "grad_norm": 14.555152893066406, "learning_rate": 3.1786105624291415e-05, "loss": 0.5095, "step": 496300 }, { "epoch": 6.8391612245460305, "grad_norm": 3.682447671890259, "learning_rate": 3.1778187953987016e-05, "loss": 0.4157, "step": 496400 }, { "epoch": 6.84053897660577, "grad_norm": 1.9888455867767334, "learning_rate": 3.177026992022868e-05, "loss": 0.5603, "step": 496500 }, { "epoch": 6.84191672866551, "grad_norm": 2.1429059505462646, "learning_rate": 3.1762351523689055e-05, "loss": 0.4713, "step": 496600 }, { "epoch": 6.843294480725248, "grad_norm": 4.387076377868652, "learning_rate": 3.1754432765040866e-05, "loss": 0.5193, "step": 496700 }, { "epoch": 6.844672232784988, "grad_norm": 10.121049880981445, "learning_rate": 3.174651364495685e-05, "loss": 0.5285, "step": 496800 }, { "epoch": 6.846049984844727, "grad_norm": 8.326859474182129, "learning_rate": 3.1738594164109765e-05, "loss": 0.4851, "step": 496900 }, { "epoch": 6.847427736904467, "grad_norm": 4.124664306640625, "learning_rate": 3.173067432317244e-05, "loss": 0.4572, "step": 497000 }, { "epoch": 6.8488054889642065, "grad_norm": 6.335623264312744, "learning_rate": 3.1722754122817665e-05, "loss": 0.4621, "step": 497100 }, { "epoch": 6.850183241023945, "grad_norm": 4.623098850250244, "learning_rate": 3.1714833563718336e-05, "loss": 0.503, "step": 497200 }, { "epoch": 6.851560993083685, "grad_norm": 12.144587516784668, "learning_rate": 3.170691264654733e-05, "loss": 0.4547, "step": 497300 }, { "epoch": 6.852938745143424, "grad_norm": 1.749037742614746, "learning_rate": 3.169899137197757e-05, "loss": 0.4806, "step": 497400 }, { "epoch": 6.854316497203163, "grad_norm": 3.978651523590088, "learning_rate": 3.169106974068201e-05, "loss": 0.4687, "step": 497500 }, { "epoch": 6.8556942492629025, "grad_norm": 0.904420793056488, "learning_rate": 3.168314775333365e-05, "loss": 0.5043, "step": 497600 }, { "epoch": 6.857072001322642, "grad_norm": 3.9423272609710693, "learning_rate": 
3.167522541060547e-05, "loss": 0.489, "step": 497700 }, { "epoch": 6.858449753382382, "grad_norm": 31.07257843017578, "learning_rate": 3.166730271317054e-05, "loss": 0.417, "step": 497800 }, { "epoch": 6.85982750544212, "grad_norm": 9.66943645477295, "learning_rate": 3.165937966170192e-05, "loss": 0.5451, "step": 497900 }, { "epoch": 6.86120525750186, "grad_norm": 1.4526536464691162, "learning_rate": 3.165145625687272e-05, "loss": 0.4968, "step": 498000 }, { "epoch": 6.862583009561599, "grad_norm": 20.55359649658203, "learning_rate": 3.164353249935608e-05, "loss": 0.5431, "step": 498100 }, { "epoch": 6.863960761621339, "grad_norm": 2.7309725284576416, "learning_rate": 3.163560838982515e-05, "loss": 0.4878, "step": 498200 }, { "epoch": 6.865338513681078, "grad_norm": 1.5482150316238403, "learning_rate": 3.1627683928953124e-05, "loss": 0.561, "step": 498300 }, { "epoch": 6.866716265740817, "grad_norm": 10.03954029083252, "learning_rate": 3.161975911741324e-05, "loss": 0.5143, "step": 498400 }, { "epoch": 6.868094017800557, "grad_norm": 4.985734939575195, "learning_rate": 3.161183395587874e-05, "loss": 0.5662, "step": 498500 }, { "epoch": 6.869471769860296, "grad_norm": 2.313674211502075, "learning_rate": 3.1603908445022904e-05, "loss": 0.4884, "step": 498600 }, { "epoch": 6.870849521920035, "grad_norm": 6.781418800354004, "learning_rate": 3.159606184583769e-05, "loss": 0.5005, "step": 498700 }, { "epoch": 6.8722272739797745, "grad_norm": 2.4639346599578857, "learning_rate": 3.1588135641835576e-05, "loss": 0.5789, "step": 498800 }, { "epoch": 6.873605026039514, "grad_norm": 0.7739230990409851, "learning_rate": 3.158020909052543e-05, "loss": 0.4535, "step": 498900 }, { "epoch": 6.874982778099254, "grad_norm": 5.304497241973877, "learning_rate": 3.157228219258064e-05, "loss": 0.4985, "step": 499000 }, { "epoch": 6.876360530158992, "grad_norm": 6.326128005981445, "learning_rate": 3.1564354948674644e-05, "loss": 0.4601, "step": 499100 }, { "epoch": 6.877738282218732, 
"grad_norm": 6.563399791717529, "learning_rate": 3.1556427359480914e-05, "loss": 0.5091, "step": 499200 }, { "epoch": 6.879116034278471, "grad_norm": 4.808310508728027, "learning_rate": 3.154849942567291e-05, "loss": 0.4562, "step": 499300 }, { "epoch": 6.880493786338211, "grad_norm": 4.943713665008545, "learning_rate": 3.154057114792418e-05, "loss": 0.4924, "step": 499400 }, { "epoch": 6.88187153839795, "grad_norm": 3.5838000774383545, "learning_rate": 3.153264252690826e-05, "loss": 0.3959, "step": 499500 }, { "epoch": 6.883249290457689, "grad_norm": 1.174947738647461, "learning_rate": 3.152471356329872e-05, "loss": 0.4396, "step": 499600 }, { "epoch": 6.884627042517429, "grad_norm": 4.0670390129089355, "learning_rate": 3.151678425776918e-05, "loss": 0.5085, "step": 499700 }, { "epoch": 6.886004794577168, "grad_norm": 3.499330759048462, "learning_rate": 3.150885461099326e-05, "loss": 0.46, "step": 499800 }, { "epoch": 6.887382546636907, "grad_norm": 4.000747203826904, "learning_rate": 3.150092462364464e-05, "loss": 0.4708, "step": 499900 }, { "epoch": 6.8887602986966465, "grad_norm": 9.867107391357422, "learning_rate": 3.1492994296396996e-05, "loss": 0.4882, "step": 500000 }, { "epoch": 6.890138050756386, "grad_norm": 90.97855377197266, "learning_rate": 3.148506362992406e-05, "loss": 0.4481, "step": 500100 }, { "epoch": 6.891515802816125, "grad_norm": 1.7543073892593384, "learning_rate": 3.147713262489958e-05, "loss": 0.564, "step": 500200 }, { "epoch": 6.892893554875864, "grad_norm": 4.410828590393066, "learning_rate": 3.146920128199734e-05, "loss": 0.4306, "step": 500300 }, { "epoch": 6.894271306935604, "grad_norm": 16.284709930419922, "learning_rate": 3.146126960189114e-05, "loss": 0.4977, "step": 500400 }, { "epoch": 6.895649058995343, "grad_norm": 4.696292400360107, "learning_rate": 3.1453337585254834e-05, "loss": 0.4558, "step": 500500 }, { "epoch": 6.897026811055083, "grad_norm": 0.5019664764404297, "learning_rate": 3.144540523276227e-05, "loss": 0.4428, 
"step": 500600 }, { "epoch": 6.898404563114822, "grad_norm": 2.202000856399536, "learning_rate": 3.143747254508734e-05, "loss": 0.5189, "step": 500700 }, { "epoch": 6.899782315174561, "grad_norm": 4.785269260406494, "learning_rate": 3.1429539522903973e-05, "loss": 0.4897, "step": 500800 }, { "epoch": 6.901160067234301, "grad_norm": 3.115980386734009, "learning_rate": 3.1421606166886125e-05, "loss": 0.4327, "step": 500900 }, { "epoch": 6.902537819294039, "grad_norm": 2.6305105686187744, "learning_rate": 3.1413672477707775e-05, "loss": 0.5207, "step": 501000 }, { "epoch": 6.903915571353779, "grad_norm": 3.773603677749634, "learning_rate": 3.140573845604292e-05, "loss": 0.4767, "step": 501100 }, { "epoch": 6.9052933234135185, "grad_norm": 3.1492981910705566, "learning_rate": 3.13978041025656e-05, "loss": 0.5809, "step": 501200 }, { "epoch": 6.906671075473258, "grad_norm": 24.479820251464844, "learning_rate": 3.138986941794988e-05, "loss": 0.4322, "step": 501300 }, { "epoch": 6.908048827532998, "grad_norm": 1.3926104307174683, "learning_rate": 3.138193440286986e-05, "loss": 0.4288, "step": 501400 }, { "epoch": 6.909426579592736, "grad_norm": 10.369736671447754, "learning_rate": 3.1373999057999645e-05, "loss": 0.49, "step": 501500 }, { "epoch": 6.910804331652476, "grad_norm": 1.4693257808685303, "learning_rate": 3.13660633840134e-05, "loss": 0.543, "step": 501600 }, { "epoch": 6.912182083712215, "grad_norm": 3.880431652069092, "learning_rate": 3.135812738158528e-05, "loss": 0.5376, "step": 501700 }, { "epoch": 6.913559835771954, "grad_norm": 6.648316860198975, "learning_rate": 3.13502704163117e-05, "loss": 0.4842, "step": 501800 }, { "epoch": 6.914937587831694, "grad_norm": 5.535737037658691, "learning_rate": 3.1342333762290096e-05, "loss": 0.5468, "step": 501900 }, { "epoch": 6.916315339891433, "grad_norm": 8.411781311035156, "learning_rate": 3.133439678184258e-05, "loss": 0.4651, "step": 502000 }, { "epoch": 6.917693091951173, "grad_norm": 4.188355445861816, 
"learning_rate": 3.1326459475643444e-05, "loss": 0.4832, "step": 502100 }, { "epoch": 6.919070844010911, "grad_norm": 4.7604217529296875, "learning_rate": 3.1318521844366995e-05, "loss": 0.4266, "step": 502200 }, { "epoch": 6.920448596070651, "grad_norm": 2.426968574523926, "learning_rate": 3.1310583888687586e-05, "loss": 0.4613, "step": 502300 }, { "epoch": 6.9218263481303905, "grad_norm": 4.971142768859863, "learning_rate": 3.130264560927958e-05, "loss": 0.4702, "step": 502400 }, { "epoch": 6.92320410019013, "grad_norm": 31.65337562561035, "learning_rate": 3.1294707006817365e-05, "loss": 0.5103, "step": 502500 }, { "epoch": 6.924581852249869, "grad_norm": 5.310464859008789, "learning_rate": 3.128676808197539e-05, "loss": 0.4752, "step": 502600 }, { "epoch": 6.925959604309608, "grad_norm": 3.0934910774230957, "learning_rate": 3.127882883542809e-05, "loss": 0.518, "step": 502700 }, { "epoch": 6.927337356369348, "grad_norm": 3.8131024837493896, "learning_rate": 3.127088926784994e-05, "loss": 0.4081, "step": 502800 }, { "epoch": 6.928715108429087, "grad_norm": 6.654551029205322, "learning_rate": 3.126294937991546e-05, "loss": 0.5059, "step": 502900 }, { "epoch": 6.930092860488826, "grad_norm": 9.5676908493042, "learning_rate": 3.125500917229918e-05, "loss": 0.5122, "step": 503000 }, { "epoch": 6.931470612548566, "grad_norm": 8.509909629821777, "learning_rate": 3.124706864567566e-05, "loss": 0.5514, "step": 503100 }, { "epoch": 6.932848364608305, "grad_norm": 2.3952219486236572, "learning_rate": 3.123912780071949e-05, "loss": 0.4621, "step": 503200 }, { "epoch": 6.934226116668045, "grad_norm": 31.232297897338867, "learning_rate": 3.123118663810527e-05, "loss": 0.4865, "step": 503300 }, { "epoch": 6.9356038687277834, "grad_norm": 4.742006301879883, "learning_rate": 3.122324515850768e-05, "loss": 0.4909, "step": 503400 }, { "epoch": 6.936981620787523, "grad_norm": 2.189765453338623, "learning_rate": 3.1215303362601355e-05, "loss": 0.5231, "step": 503500 }, { "epoch": 
6.938359372847263, "grad_norm": 3.0376875400543213, "learning_rate": 3.1207361251061e-05, "loss": 0.474, "step": 503600 }, { "epoch": 6.939737124907002, "grad_norm": 4.776489734649658, "learning_rate": 3.119941882456134e-05, "loss": 0.4679, "step": 503700 }, { "epoch": 6.941114876966741, "grad_norm": 44.82393264770508, "learning_rate": 3.1191476083777124e-05, "loss": 0.4496, "step": 503800 }, { "epoch": 6.94249262902648, "grad_norm": 0.723544716835022, "learning_rate": 3.118361246147723e-05, "loss": 0.4407, "step": 503900 }, { "epoch": 6.94387038108622, "grad_norm": 3.5784547328948975, "learning_rate": 3.117566909727427e-05, "loss": 0.4546, "step": 504000 }, { "epoch": 6.9452481331459595, "grad_norm": 3.118750810623169, "learning_rate": 3.116772542080442e-05, "loss": 0.4263, "step": 504100 }, { "epoch": 6.946625885205698, "grad_norm": 2.2398905754089355, "learning_rate": 3.115978143274253e-05, "loss": 0.4868, "step": 504200 }, { "epoch": 6.948003637265438, "grad_norm": 2.22751522064209, "learning_rate": 3.115183713376348e-05, "loss": 0.4383, "step": 504300 }, { "epoch": 6.949381389325177, "grad_norm": 2.2035436630249023, "learning_rate": 3.114397197216788e-05, "loss": 0.4788, "step": 504400 }, { "epoch": 6.950759141384916, "grad_norm": 3.2892355918884277, "learning_rate": 3.11360270564716e-05, "loss": 0.479, "step": 504500 }, { "epoch": 6.9521368934446555, "grad_norm": 2.4126551151275635, "learning_rate": 3.112808183187623e-05, "loss": 0.4948, "step": 504600 }, { "epoch": 6.953514645504395, "grad_norm": 11.513557434082031, "learning_rate": 3.1120136299056744e-05, "loss": 0.4821, "step": 504700 }, { "epoch": 6.954892397564135, "grad_norm": 2.8434507846832275, "learning_rate": 3.111219045868816e-05, "loss": 0.4704, "step": 504800 }, { "epoch": 6.956270149623874, "grad_norm": 9.525400161743164, "learning_rate": 3.110424431144551e-05, "loss": 0.4625, "step": 504900 }, { "epoch": 6.957647901683613, "grad_norm": 0.6736272573471069, "learning_rate": 3.109629785800387e-05, 
"loss": 0.4282, "step": 505000 }, { "epoch": 6.959025653743352, "grad_norm": 1.9305440187454224, "learning_rate": 3.108835109903833e-05, "loss": 0.4173, "step": 505100 }, { "epoch": 6.960403405803092, "grad_norm": 1.8894736766815186, "learning_rate": 3.1080404035224006e-05, "loss": 0.4974, "step": 505200 }, { "epoch": 6.961781157862831, "grad_norm": 2.7600796222686768, "learning_rate": 3.107245666723604e-05, "loss": 0.5096, "step": 505300 }, { "epoch": 6.96315890992257, "grad_norm": 3.5628931522369385, "learning_rate": 3.106450899574961e-05, "loss": 0.4847, "step": 505400 }, { "epoch": 6.96453666198231, "grad_norm": 6.958978652954102, "learning_rate": 3.105656102143989e-05, "loss": 0.5269, "step": 505500 }, { "epoch": 6.965914414042049, "grad_norm": 2.5903046131134033, "learning_rate": 3.1048612744982125e-05, "loss": 0.3984, "step": 505600 }, { "epoch": 6.967292166101789, "grad_norm": 4.880198955535889, "learning_rate": 3.104066416705156e-05, "loss": 0.4831, "step": 505700 }, { "epoch": 6.9686699181615275, "grad_norm": 1.1121947765350342, "learning_rate": 3.1032715288323455e-05, "loss": 0.4608, "step": 505800 }, { "epoch": 6.970047670221267, "grad_norm": 6.725651741027832, "learning_rate": 3.102476610947311e-05, "loss": 0.443, "step": 505900 }, { "epoch": 6.971425422281007, "grad_norm": 3.4882566928863525, "learning_rate": 3.101681663117585e-05, "loss": 0.4318, "step": 506000 }, { "epoch": 6.972803174340745, "grad_norm": 5.540103435516357, "learning_rate": 3.100886685410703e-05, "loss": 0.4187, "step": 506100 }, { "epoch": 6.974180926400485, "grad_norm": 3.5153932571411133, "learning_rate": 3.100091677894202e-05, "loss": 0.4448, "step": 506200 }, { "epoch": 6.975558678460224, "grad_norm": 9.365389823913574, "learning_rate": 3.0992966406356216e-05, "loss": 0.4937, "step": 506300 }, { "epoch": 6.976936430519964, "grad_norm": 2.807455539703369, "learning_rate": 3.098501573702505e-05, "loss": 0.4133, "step": 506400 }, { "epoch": 6.978314182579703, "grad_norm": 
2.1231842041015625, "learning_rate": 3.097706477162396e-05, "loss": 0.4902, "step": 506500 }, { "epoch": 6.979691934639442, "grad_norm": 1.3850544691085815, "learning_rate": 3.0969113510828423e-05, "loss": 0.4493, "step": 506600 }, { "epoch": 6.981069686699182, "grad_norm": 8.940996170043945, "learning_rate": 3.096116195531397e-05, "loss": 0.4689, "step": 506700 }, { "epoch": 6.982447438758921, "grad_norm": 2.3018627166748047, "learning_rate": 3.095321010575608e-05, "loss": 0.5327, "step": 506800 }, { "epoch": 6.98382519081866, "grad_norm": 5.0102643966674805, "learning_rate": 3.094525796283034e-05, "loss": 0.4557, "step": 506900 }, { "epoch": 6.9852029428783995, "grad_norm": 6.3492536544799805, "learning_rate": 3.09373055272123e-05, "loss": 0.5399, "step": 507000 }, { "epoch": 6.986580694938139, "grad_norm": 3.183195114135742, "learning_rate": 3.092935279957757e-05, "loss": 0.4557, "step": 507100 }, { "epoch": 6.987958446997879, "grad_norm": 16.109149932861328, "learning_rate": 3.092139978060178e-05, "loss": 0.484, "step": 507200 }, { "epoch": 6.989336199057617, "grad_norm": 3.767611026763916, "learning_rate": 3.091352600549356e-05, "loss": 0.4853, "step": 507300 }, { "epoch": 6.990713951117357, "grad_norm": 5.4513068199157715, "learning_rate": 3.090557240875917e-05, "loss": 0.5395, "step": 507400 }, { "epoch": 6.992091703177096, "grad_norm": 5.304195404052734, "learning_rate": 3.0897618522703974e-05, "loss": 0.5016, "step": 507500 }, { "epoch": 6.993469455236836, "grad_norm": 9.135114669799805, "learning_rate": 3.088966434800371e-05, "loss": 0.4338, "step": 507600 }, { "epoch": 6.994847207296575, "grad_norm": 2.1700408458709717, "learning_rate": 3.088170988533411e-05, "loss": 0.4509, "step": 507700 }, { "epoch": 6.996224959356314, "grad_norm": 3.658083200454712, "learning_rate": 3.087375513537096e-05, "loss": 0.4806, "step": 507800 }, { "epoch": 6.997602711416054, "grad_norm": 1.2274701595306396, "learning_rate": 3.086580009879005e-05, "loss": 0.5319, "step": 
507900 }, { "epoch": 6.998980463475793, "grad_norm": 1.8919082880020142, "learning_rate": 3.0857844776267195e-05, "loss": 0.4602, "step": 508000 }, { "epoch": 7.000358215535532, "grad_norm": 3.0730230808258057, "learning_rate": 3.0849889168478256e-05, "loss": 0.4708, "step": 508100 }, { "epoch": 7.0017359675952715, "grad_norm": 3.043363094329834, "learning_rate": 3.0841933276099094e-05, "loss": 0.4178, "step": 508200 }, { "epoch": 7.003113719655011, "grad_norm": 3.6803717613220215, "learning_rate": 3.0833977099805594e-05, "loss": 0.4457, "step": 508300 }, { "epoch": 7.004491471714751, "grad_norm": 3.0459094047546387, "learning_rate": 3.0826020640273696e-05, "loss": 0.4497, "step": 508400 }, { "epoch": 7.005869223774489, "grad_norm": 3.3015036582946777, "learning_rate": 3.081806389817931e-05, "loss": 0.5178, "step": 508500 }, { "epoch": 7.007246975834229, "grad_norm": 4.332216262817383, "learning_rate": 3.081010687419845e-05, "loss": 0.4321, "step": 508600 }, { "epoch": 7.008624727893968, "grad_norm": 2.6249611377716064, "learning_rate": 3.0802149569007056e-05, "loss": 0.4651, "step": 508700 }, { "epoch": 7.010002479953708, "grad_norm": 11.402806282043457, "learning_rate": 3.079419198328116e-05, "loss": 0.4067, "step": 508800 }, { "epoch": 7.011380232013447, "grad_norm": 5.703737258911133, "learning_rate": 3.0786234117696813e-05, "loss": 0.397, "step": 508900 }, { "epoch": 7.012757984073186, "grad_norm": 13.51340389251709, "learning_rate": 3.077827597293006e-05, "loss": 0.4753, "step": 509000 }, { "epoch": 7.014135736132926, "grad_norm": 2.3686861991882324, "learning_rate": 3.0770317549657e-05, "loss": 0.4537, "step": 509100 }, { "epoch": 7.015513488192665, "grad_norm": 0.9951190948486328, "learning_rate": 3.0762358848553724e-05, "loss": 0.4088, "step": 509200 }, { "epoch": 7.016891240252404, "grad_norm": 7.1610517501831055, "learning_rate": 3.075439987029637e-05, "loss": 0.4219, "step": 509300 }, { "epoch": 7.0182689923121435, "grad_norm": 7.076640605926514, 
"learning_rate": 3.0746440615561107e-05, "loss": 0.421, "step": 509400 }, { "epoch": 7.019646744371883, "grad_norm": 3.9407331943511963, "learning_rate": 3.0738481085024095e-05, "loss": 0.4539, "step": 509500 }, { "epoch": 7.021024496431622, "grad_norm": 4.092820167541504, "learning_rate": 3.073052127936155e-05, "loss": 0.4536, "step": 509600 }, { "epoch": 7.022402248491361, "grad_norm": 14.47374439239502, "learning_rate": 3.072256119924971e-05, "loss": 0.4627, "step": 509700 }, { "epoch": 7.023780000551101, "grad_norm": 5.377509117126465, "learning_rate": 3.071460084536479e-05, "loss": 0.4174, "step": 509800 }, { "epoch": 7.02515775261084, "grad_norm": 6.398963928222656, "learning_rate": 3.0706640218383096e-05, "loss": 0.4497, "step": 509900 }, { "epoch": 7.026535504670579, "grad_norm": 1.2777804136276245, "learning_rate": 3.06986793189809e-05, "loss": 0.4446, "step": 510000 }, { "epoch": 7.027913256730319, "grad_norm": 3.766404390335083, "learning_rate": 3.0690718147834534e-05, "loss": 0.3517, "step": 510100 }, { "epoch": 7.029291008790058, "grad_norm": 5.022616386413574, "learning_rate": 3.0682756705620346e-05, "loss": 0.4452, "step": 510200 }, { "epoch": 7.030668760849798, "grad_norm": 3.8178772926330566, "learning_rate": 3.067479499301468e-05, "loss": 0.3791, "step": 510300 }, { "epoch": 7.032046512909536, "grad_norm": 2.398712158203125, "learning_rate": 3.0666912631850016e-05, "loss": 0.3962, "step": 510400 }, { "epoch": 7.033424264969276, "grad_norm": 1.932659387588501, "learning_rate": 3.0658950383177663e-05, "loss": 0.436, "step": 510500 }, { "epoch": 7.0348020170290155, "grad_norm": 1.6256223917007446, "learning_rate": 3.0650987866136304e-05, "loss": 0.4505, "step": 510600 }, { "epoch": 7.036179769088755, "grad_norm": 3.6010141372680664, "learning_rate": 3.064302508140239e-05, "loss": 0.3881, "step": 510700 }, { "epoch": 7.037557521148494, "grad_norm": 21.118865966796875, "learning_rate": 3.0635062029652445e-05, "loss": 0.457, "step": 510800 }, { "epoch": 
7.038935273208233, "grad_norm": 3.9681448936462402, "learning_rate": 3.062709871156292e-05, "loss": 0.4641, "step": 510900 }, { "epoch": 7.040313025267973, "grad_norm": 3.96144962310791, "learning_rate": 3.0619135127810376e-05, "loss": 0.4246, "step": 511000 }, { "epoch": 7.041690777327712, "grad_norm": 7.626237392425537, "learning_rate": 3.061117127907134e-05, "loss": 0.4033, "step": 511100 }, { "epoch": 7.043068529387451, "grad_norm": 1.7304155826568604, "learning_rate": 3.060320716602238e-05, "loss": 0.4325, "step": 511200 }, { "epoch": 7.044446281447191, "grad_norm": 3.747910976409912, "learning_rate": 3.0595242789340106e-05, "loss": 0.3817, "step": 511300 }, { "epoch": 7.04582403350693, "grad_norm": 2.054321050643921, "learning_rate": 3.058727814970111e-05, "loss": 0.4156, "step": 511400 }, { "epoch": 7.04720178556667, "grad_norm": 12.061721801757812, "learning_rate": 3.057931324778205e-05, "loss": 0.4536, "step": 511500 }, { "epoch": 7.048579537626408, "grad_norm": 2.83654522895813, "learning_rate": 3.057134808425958e-05, "loss": 0.3963, "step": 511600 }, { "epoch": 7.049957289686148, "grad_norm": 1.5100481510162354, "learning_rate": 3.0563382659810365e-05, "loss": 0.4602, "step": 511700 }, { "epoch": 7.0513350417458875, "grad_norm": 3.5936434268951416, "learning_rate": 3.0555416975111125e-05, "loss": 0.4491, "step": 511800 }, { "epoch": 7.052712793805627, "grad_norm": 2.6990315914154053, "learning_rate": 3.0547451030838584e-05, "loss": 0.4064, "step": 511900 }, { "epoch": 7.054090545865366, "grad_norm": 4.937404632568359, "learning_rate": 3.053948482766949e-05, "loss": 0.4176, "step": 512000 }, { "epoch": 7.055468297925105, "grad_norm": 1.8632651567459106, "learning_rate": 3.05315183662806e-05, "loss": 0.4221, "step": 512100 }, { "epoch": 7.056846049984845, "grad_norm": 3.9911539554595947, "learning_rate": 3.0523551647348724e-05, "loss": 0.4019, "step": 512200 }, { "epoch": 7.058223802044584, "grad_norm": 61.9040641784668, "learning_rate": 
3.0515584671550668e-05, "loss": 0.4738, "step": 512300 }, { "epoch": 7.059601554104323, "grad_norm": 3.3290135860443115, "learning_rate": 3.050761743956326e-05, "loss": 0.4035, "step": 512400 }, { "epoch": 7.060979306164063, "grad_norm": 2.886779308319092, "learning_rate": 3.049964995206337e-05, "loss": 0.4099, "step": 512500 }, { "epoch": 7.062357058223802, "grad_norm": 3.3119235038757324, "learning_rate": 3.0491682209727864e-05, "loss": 0.4687, "step": 512600 }, { "epoch": 7.063734810283542, "grad_norm": 2.0905604362487793, "learning_rate": 3.048371421323366e-05, "loss": 0.4001, "step": 512700 }, { "epoch": 7.0651125623432804, "grad_norm": 4.203014850616455, "learning_rate": 3.0475745963257655e-05, "loss": 0.4361, "step": 512800 }, { "epoch": 7.06649031440302, "grad_norm": 3.405407667160034, "learning_rate": 3.0467777460476818e-05, "loss": 0.4542, "step": 512900 }, { "epoch": 7.06786806646276, "grad_norm": 0.03760630637407303, "learning_rate": 3.04598087055681e-05, "loss": 0.4496, "step": 513000 }, { "epoch": 7.069245818522499, "grad_norm": 3.6018900871276855, "learning_rate": 3.0451839699208485e-05, "loss": 0.4487, "step": 513100 }, { "epoch": 7.070623570582238, "grad_norm": 15.080385208129883, "learning_rate": 3.044387044207499e-05, "loss": 0.3994, "step": 513200 }, { "epoch": 7.072001322641977, "grad_norm": 8.081743240356445, "learning_rate": 3.0435900934844637e-05, "loss": 0.4801, "step": 513300 }, { "epoch": 7.073379074701717, "grad_norm": 6.781630992889404, "learning_rate": 3.0427931178194484e-05, "loss": 0.474, "step": 513400 }, { "epoch": 7.0747568267614565, "grad_norm": 4.604904651641846, "learning_rate": 3.0419961172801592e-05, "loss": 0.4059, "step": 513500 }, { "epoch": 7.076134578821195, "grad_norm": 5.984791278839111, "learning_rate": 3.0411990919343068e-05, "loss": 0.4695, "step": 513600 }, { "epoch": 7.077512330880935, "grad_norm": 9.89235782623291, "learning_rate": 3.0404020418496015e-05, "loss": 0.4242, "step": 513700 }, { "epoch": 
7.078890082940674, "grad_norm": 1.737723708152771, "learning_rate": 3.039604967093757e-05, "loss": 0.492, "step": 513800 }, { "epoch": 7.080267835000413, "grad_norm": 3.33735990524292, "learning_rate": 3.0388078677344887e-05, "loss": 0.4114, "step": 513900 }, { "epoch": 7.0816455870601525, "grad_norm": 2.9308416843414307, "learning_rate": 3.0380107438395146e-05, "loss": 0.4311, "step": 514000 }, { "epoch": 7.083023339119892, "grad_norm": 12.007293701171875, "learning_rate": 3.037213595476555e-05, "loss": 0.3914, "step": 514100 }, { "epoch": 7.084401091179632, "grad_norm": 2.809816360473633, "learning_rate": 3.0364164227133306e-05, "loss": 0.4261, "step": 514200 }, { "epoch": 7.08577884323937, "grad_norm": 10.038213729858398, "learning_rate": 3.035619225617565e-05, "loss": 0.4254, "step": 514300 }, { "epoch": 7.08715659529911, "grad_norm": 3.3328492641448975, "learning_rate": 3.0348220042569863e-05, "loss": 0.4583, "step": 514400 }, { "epoch": 7.088534347358849, "grad_norm": 1.0093694925308228, "learning_rate": 3.0340247586993202e-05, "loss": 0.4324, "step": 514500 }, { "epoch": 7.089912099418589, "grad_norm": 5.072347640991211, "learning_rate": 3.0332354618283865e-05, "loss": 0.4712, "step": 514600 }, { "epoch": 7.091289851478328, "grad_norm": 4.36110258102417, "learning_rate": 3.0324381683200216e-05, "loss": 0.5111, "step": 514700 }, { "epoch": 7.092667603538067, "grad_norm": 7.218010902404785, "learning_rate": 3.0316408508170893e-05, "loss": 0.4284, "step": 514800 }, { "epoch": 7.094045355597807, "grad_norm": 10.516966819763184, "learning_rate": 3.030843509387325e-05, "loss": 0.4565, "step": 514900 }, { "epoch": 7.095423107657546, "grad_norm": 2.4566447734832764, "learning_rate": 3.0300461440984687e-05, "loss": 0.4645, "step": 515000 }, { "epoch": 7.096800859717285, "grad_norm": 2.3622419834136963, "learning_rate": 3.0292487550182597e-05, "loss": 0.4296, "step": 515100 }, { "epoch": 7.0981786117770245, "grad_norm": 4.899537563323975, "learning_rate": 
3.0284593164596874e-05, "loss": 0.4866, "step": 515200 }, { "epoch": 7.099556363836764, "grad_norm": 8.881457328796387, "learning_rate": 3.027661880236225e-05, "loss": 0.3999, "step": 515300 }, { "epoch": 7.100934115896504, "grad_norm": 6.3609747886657715, "learning_rate": 3.0268644204239648e-05, "loss": 0.4221, "step": 515400 }, { "epoch": 7.102311867956242, "grad_norm": 2.240654230117798, "learning_rate": 3.0260669370906568e-05, "loss": 0.4363, "step": 515500 }, { "epoch": 7.103689620015982, "grad_norm": 9.152902603149414, "learning_rate": 3.0252694303040497e-05, "loss": 0.4064, "step": 515600 }, { "epoch": 7.105067372075721, "grad_norm": 2.1131410598754883, "learning_rate": 3.0244719001318956e-05, "loss": 0.4849, "step": 515700 }, { "epoch": 7.106445124135461, "grad_norm": 6.97761344909668, "learning_rate": 3.02367434664195e-05, "loss": 0.4491, "step": 515800 }, { "epoch": 7.1078228761952, "grad_norm": 9.90329647064209, "learning_rate": 3.0228767699019687e-05, "loss": 0.4905, "step": 515900 }, { "epoch": 7.109200628254939, "grad_norm": 12.74622631072998, "learning_rate": 3.02207916997971e-05, "loss": 0.4048, "step": 516000 }, { "epoch": 7.110578380314679, "grad_norm": 14.02765941619873, "learning_rate": 3.0212815469429326e-05, "loss": 0.421, "step": 516100 }, { "epoch": 7.111956132374418, "grad_norm": 2.7014546394348145, "learning_rate": 3.0204839008594012e-05, "loss": 0.365, "step": 516200 }, { "epoch": 7.113333884434157, "grad_norm": 2.474792003631592, "learning_rate": 3.019686231796878e-05, "loss": 0.4447, "step": 516300 }, { "epoch": 7.1147116364938965, "grad_norm": 3.366610288619995, "learning_rate": 3.01888853982313e-05, "loss": 0.4626, "step": 516400 }, { "epoch": 7.116089388553636, "grad_norm": 1.4523404836654663, "learning_rate": 3.0180908250059247e-05, "loss": 0.412, "step": 516500 }, { "epoch": 7.117467140613376, "grad_norm": 1.7142333984375, "learning_rate": 3.0172930874130323e-05, "loss": 0.4165, "step": 516600 }, { "epoch": 7.118844892673114, 
"grad_norm": 4.076796531677246, "learning_rate": 3.0164953271122248e-05, "loss": 0.4068, "step": 516700 }, { "epoch": 7.120222644732854, "grad_norm": 3.382675886154175, "learning_rate": 3.0156975441712762e-05, "loss": 0.4397, "step": 516800 }, { "epoch": 7.121600396792593, "grad_norm": 3.0515313148498535, "learning_rate": 3.014899738657961e-05, "loss": 0.4513, "step": 516900 }, { "epoch": 7.122978148852333, "grad_norm": 7.7009711265563965, "learning_rate": 3.0141019106400586e-05, "loss": 0.4646, "step": 517000 }, { "epoch": 7.124355900912072, "grad_norm": 2.972747325897217, "learning_rate": 3.0133040601853478e-05, "loss": 0.4453, "step": 517100 }, { "epoch": 7.125733652971811, "grad_norm": 4.326071739196777, "learning_rate": 3.0125061873616094e-05, "loss": 0.4013, "step": 517200 }, { "epoch": 7.127111405031551, "grad_norm": 4.487992763519287, "learning_rate": 3.0117082922366266e-05, "loss": 0.4091, "step": 517300 }, { "epoch": 7.12848915709129, "grad_norm": 2.09557843208313, "learning_rate": 3.0109103748781877e-05, "loss": 0.4303, "step": 517400 }, { "epoch": 7.129866909151029, "grad_norm": 12.754274368286133, "learning_rate": 3.0101124353540762e-05, "loss": 0.4457, "step": 517500 }, { "epoch": 7.1312446612107685, "grad_norm": 3.8575031757354736, "learning_rate": 3.0093144737320827e-05, "loss": 0.4703, "step": 517600 }, { "epoch": 7.132622413270508, "grad_norm": 4.321773052215576, "learning_rate": 3.0085164900799986e-05, "loss": 0.456, "step": 517700 }, { "epoch": 7.134000165330248, "grad_norm": 8.63968563079834, "learning_rate": 3.0077184844656153e-05, "loss": 0.4482, "step": 517800 }, { "epoch": 7.135377917389986, "grad_norm": 2.6466023921966553, "learning_rate": 3.0069204569567286e-05, "loss": 0.402, "step": 517900 }, { "epoch": 7.136755669449726, "grad_norm": 8.302736282348633, "learning_rate": 3.0061224076211354e-05, "loss": 0.4454, "step": 518000 }, { "epoch": 7.138133421509465, "grad_norm": 0.9995610117912292, "learning_rate": 3.0053243365266326e-05, "loss": 
0.3898, "step": 518100 }, { "epoch": 7.139511173569204, "grad_norm": 55.964229583740234, "learning_rate": 3.0045262437410217e-05, "loss": 0.471, "step": 518200 }, { "epoch": 7.140888925628944, "grad_norm": 3.818483352661133, "learning_rate": 3.0037361105830053e-05, "loss": 0.4831, "step": 518300 }, { "epoch": 7.142266677688683, "grad_norm": 3.3601295948028564, "learning_rate": 3.0029379748338044e-05, "loss": 0.4791, "step": 518400 }, { "epoch": 7.143644429748423, "grad_norm": 3.047697067260742, "learning_rate": 3.0021398175962284e-05, "loss": 0.4197, "step": 518500 }, { "epoch": 7.145022181808161, "grad_norm": 1.284662127494812, "learning_rate": 3.0013416389380846e-05, "loss": 0.3649, "step": 518600 }, { "epoch": 7.146399933867901, "grad_norm": 5.258458614349365, "learning_rate": 3.0005434389271828e-05, "loss": 0.3916, "step": 518700 }, { "epoch": 7.1477776859276405, "grad_norm": 30.18373680114746, "learning_rate": 2.999745217631335e-05, "loss": 0.4054, "step": 518800 }, { "epoch": 7.14915543798738, "grad_norm": 7.279288291931152, "learning_rate": 2.9989469751183526e-05, "loss": 0.485, "step": 518900 }, { "epoch": 7.150533190047119, "grad_norm": 1.3396530151367188, "learning_rate": 2.998148711456051e-05, "loss": 0.4461, "step": 519000 }, { "epoch": 7.151910942106858, "grad_norm": 34.63507080078125, "learning_rate": 2.997350426712247e-05, "loss": 0.4164, "step": 519100 }, { "epoch": 7.153288694166598, "grad_norm": 5.330453872680664, "learning_rate": 2.9965521209547576e-05, "loss": 0.4527, "step": 519200 }, { "epoch": 7.154666446226337, "grad_norm": 1.974777102470398, "learning_rate": 2.995753794251406e-05, "loss": 0.4552, "step": 519300 }, { "epoch": 7.156044198286076, "grad_norm": 3.270253896713257, "learning_rate": 2.9949554466700117e-05, "loss": 0.4305, "step": 519400 }, { "epoch": 7.157421950345816, "grad_norm": 27.618839263916016, "learning_rate": 2.9941570782783983e-05, "loss": 0.4638, "step": 519500 }, { "epoch": 7.158799702405555, "grad_norm": 
3.824273109436035, "learning_rate": 2.993358689144393e-05, "loss": 0.4341, "step": 519600 }, { "epoch": 7.160177454465295, "grad_norm": 1.5950987339019775, "learning_rate": 2.9925602793358213e-05, "loss": 0.4206, "step": 519700 }, { "epoch": 7.161555206525033, "grad_norm": 151.4424285888672, "learning_rate": 2.991761848920513e-05, "loss": 0.37, "step": 519800 }, { "epoch": 7.162932958584773, "grad_norm": 5.675760746002197, "learning_rate": 2.9909633979662995e-05, "loss": 0.3889, "step": 519900 }, { "epoch": 7.1643107106445125, "grad_norm": 2.904588460922241, "learning_rate": 2.990164926541012e-05, "loss": 0.517, "step": 520000 }, { "epoch": 7.165688462704252, "grad_norm": 1.7478234767913818, "learning_rate": 2.9893664347124852e-05, "loss": 0.4609, "step": 520100 }, { "epoch": 7.167066214763991, "grad_norm": 4.9433274269104, "learning_rate": 2.9885679225485554e-05, "loss": 0.4542, "step": 520200 }, { "epoch": 7.16844396682373, "grad_norm": 1.3795069456100464, "learning_rate": 2.9877693901170602e-05, "loss": 0.4164, "step": 520300 }, { "epoch": 7.16982171888347, "grad_norm": 3.183743953704834, "learning_rate": 2.9869708374858393e-05, "loss": 0.4676, "step": 520400 }, { "epoch": 7.171199470943209, "grad_norm": 5.469613075256348, "learning_rate": 2.9861722647227324e-05, "loss": 0.4687, "step": 520500 }, { "epoch": 7.172577223002948, "grad_norm": 6.8705339431762695, "learning_rate": 2.985373671895584e-05, "loss": 0.4248, "step": 520600 }, { "epoch": 7.173954975062688, "grad_norm": 2.3069798946380615, "learning_rate": 2.9845750590722378e-05, "loss": 0.4291, "step": 520700 }, { "epoch": 7.175332727122427, "grad_norm": 3.884575128555298, "learning_rate": 2.9837764263205408e-05, "loss": 0.4307, "step": 520800 }, { "epoch": 7.176710479182167, "grad_norm": 10.956753730773926, "learning_rate": 2.98297777370834e-05, "loss": 0.4638, "step": 520900 }, { "epoch": 7.178088231241905, "grad_norm": 3.845417022705078, "learning_rate": 2.9821791013034853e-05, "loss": 0.391, "step": 
521000 }, { "epoch": 7.179465983301645, "grad_norm": 3.302147388458252, "learning_rate": 2.981380409173828e-05, "loss": 0.4312, "step": 521100 }, { "epoch": 7.1808437353613845, "grad_norm": 2.3778023719787598, "learning_rate": 2.9805816973872217e-05, "loss": 0.4814, "step": 521200 }, { "epoch": 7.182221487421124, "grad_norm": 1.9851925373077393, "learning_rate": 2.97978296601152e-05, "loss": 0.366, "step": 521300 }, { "epoch": 7.183599239480863, "grad_norm": 3.7024972438812256, "learning_rate": 2.97898421511458e-05, "loss": 0.4017, "step": 521400 }, { "epoch": 7.184976991540602, "grad_norm": 5.1820149421691895, "learning_rate": 2.97818544476426e-05, "loss": 0.4547, "step": 521500 }, { "epoch": 7.186354743600342, "grad_norm": 1.1525753736495972, "learning_rate": 2.977386655028418e-05, "loss": 0.4352, "step": 521600 }, { "epoch": 7.187732495660081, "grad_norm": 0.24104857444763184, "learning_rate": 2.9765878459749174e-05, "loss": 0.4042, "step": 521700 }, { "epoch": 7.18911024771982, "grad_norm": 5.256969451904297, "learning_rate": 2.9757890176716194e-05, "loss": 0.4602, "step": 521800 }, { "epoch": 7.19048799977956, "grad_norm": 2.743558168411255, "learning_rate": 2.974990170186389e-05, "loss": 0.4433, "step": 521900 }, { "epoch": 7.191865751839299, "grad_norm": 6.436760902404785, "learning_rate": 2.974199292347478e-05, "loss": 0.4772, "step": 522000 }, { "epoch": 7.193243503899039, "grad_norm": 0.617780327796936, "learning_rate": 2.9734004068921097e-05, "loss": 0.4326, "step": 522100 }, { "epoch": 7.194621255958777, "grad_norm": 2.4693448543548584, "learning_rate": 2.9726015024577336e-05, "loss": 0.4073, "step": 522200 }, { "epoch": 7.195999008018517, "grad_norm": 6.432834625244141, "learning_rate": 2.9718025791122218e-05, "loss": 0.4747, "step": 522300 }, { "epoch": 7.1973767600782566, "grad_norm": 2.1386616230010986, "learning_rate": 2.971011626438385e-05, "loss": 0.4159, "step": 522400 }, { "epoch": 7.198754512137995, "grad_norm": 4.946809768676758, 
"learning_rate": 2.970212665661637e-05, "loss": 0.4102, "step": 522500 }, { "epoch": 7.200132264197735, "grad_norm": 0.419688880443573, "learning_rate": 2.9694136861766973e-05, "loss": 0.4213, "step": 522600 }, { "epoch": 7.201510016257474, "grad_norm": 5.979204177856445, "learning_rate": 2.968614688051442e-05, "loss": 0.4521, "step": 522700 }, { "epoch": 7.202887768317214, "grad_norm": 14.80136489868164, "learning_rate": 2.9678156713537505e-05, "loss": 0.4867, "step": 522800 }, { "epoch": 7.204265520376953, "grad_norm": 3.5998952388763428, "learning_rate": 2.9670166361515034e-05, "loss": 0.4268, "step": 522900 }, { "epoch": 7.205643272436692, "grad_norm": 7.841350555419922, "learning_rate": 2.9662175825125823e-05, "loss": 0.4179, "step": 523000 }, { "epoch": 7.207021024496432, "grad_norm": 3.468855619430542, "learning_rate": 2.9654185105048718e-05, "loss": 0.495, "step": 523100 }, { "epoch": 7.208398776556171, "grad_norm": 3.9456002712249756, "learning_rate": 2.964619420196258e-05, "loss": 0.4701, "step": 523200 }, { "epoch": 7.20977652861591, "grad_norm": 18.172672271728516, "learning_rate": 2.9638203116546247e-05, "loss": 0.4695, "step": 523300 }, { "epoch": 7.2111542806756495, "grad_norm": 21.43109893798828, "learning_rate": 2.9630211849478623e-05, "loss": 0.4594, "step": 523400 }, { "epoch": 7.212532032735389, "grad_norm": 13.627384185791016, "learning_rate": 2.962222040143861e-05, "loss": 0.3979, "step": 523500 }, { "epoch": 7.213909784795129, "grad_norm": 3.7063870429992676, "learning_rate": 2.9614228773105113e-05, "loss": 0.4841, "step": 523600 }, { "epoch": 7.215287536854867, "grad_norm": 4.228190898895264, "learning_rate": 2.9606236965157075e-05, "loss": 0.4706, "step": 523700 }, { "epoch": 7.216665288914607, "grad_norm": 2.6452341079711914, "learning_rate": 2.959824497827342e-05, "loss": 0.4628, "step": 523800 }, { "epoch": 7.218043040974346, "grad_norm": 2.161000967025757, "learning_rate": 2.9590332735664657e-05, "loss": 0.4914, "step": 523900 }, { 
"epoch": 7.219420793034086, "grad_norm": 2.0252413749694824, "learning_rate": 2.958234039471911e-05, "loss": 0.4206, "step": 524000 }, { "epoch": 7.220798545093825, "grad_norm": 2.671046495437622, "learning_rate": 2.9574347876868095e-05, "loss": 0.4021, "step": 524100 }, { "epoch": 7.222176297153564, "grad_norm": 2.3486526012420654, "learning_rate": 2.9566355182790603e-05, "loss": 0.4204, "step": 524200 }, { "epoch": 7.223554049213304, "grad_norm": 5.901416778564453, "learning_rate": 2.955836231316568e-05, "loss": 0.412, "step": 524300 }, { "epoch": 7.224931801273043, "grad_norm": 9.44016170501709, "learning_rate": 2.955036926867233e-05, "loss": 0.4258, "step": 524400 }, { "epoch": 7.226309553332782, "grad_norm": 2.9871771335601807, "learning_rate": 2.9542376049989646e-05, "loss": 0.4158, "step": 524500 }, { "epoch": 7.2276873053925215, "grad_norm": 14.95018196105957, "learning_rate": 2.9534382657796653e-05, "loss": 0.4722, "step": 524600 }, { "epoch": 7.229065057452261, "grad_norm": 1.974859595298767, "learning_rate": 2.9526389092772434e-05, "loss": 0.4452, "step": 524700 }, { "epoch": 7.230442809512001, "grad_norm": 2.981731653213501, "learning_rate": 2.9518395355596115e-05, "loss": 0.4352, "step": 524800 }, { "epoch": 7.231820561571739, "grad_norm": 4.314277648925781, "learning_rate": 2.9510401446946774e-05, "loss": 0.4807, "step": 524900 }, { "epoch": 7.233198313631479, "grad_norm": 9.199114799499512, "learning_rate": 2.950240736750355e-05, "loss": 0.4366, "step": 525000 }, { "epoch": 7.234576065691218, "grad_norm": 1.1397278308868408, "learning_rate": 2.9494413117945576e-05, "loss": 0.4193, "step": 525100 }, { "epoch": 7.235953817750958, "grad_norm": 5.191973686218262, "learning_rate": 2.9486418698951997e-05, "loss": 0.4511, "step": 525200 }, { "epoch": 7.237331569810697, "grad_norm": 2.0410349369049072, "learning_rate": 2.9478424111201993e-05, "loss": 0.3834, "step": 525300 }, { "epoch": 7.238709321870436, "grad_norm": 3.705223321914673, "learning_rate": 
2.9470429355374737e-05, "loss": 0.4109, "step": 525400 }, { "epoch": 7.240087073930176, "grad_norm": 4.902582168579102, "learning_rate": 2.946243443214943e-05, "loss": 0.416, "step": 525500 }, { "epoch": 7.241464825989915, "grad_norm": 3.0717673301696777, "learning_rate": 2.9454439342205272e-05, "loss": 0.4985, "step": 525600 }, { "epoch": 7.242842578049654, "grad_norm": 2.995602607727051, "learning_rate": 2.94464440862215e-05, "loss": 0.4075, "step": 525700 }, { "epoch": 7.2442203301093935, "grad_norm": 4.364591598510742, "learning_rate": 2.943844866487734e-05, "loss": 0.4497, "step": 525800 }, { "epoch": 7.245598082169133, "grad_norm": 11.302666664123535, "learning_rate": 2.9430453078852052e-05, "loss": 0.4524, "step": 525900 }, { "epoch": 7.246975834228873, "grad_norm": 3.318530321121216, "learning_rate": 2.9422457328824896e-05, "loss": 0.4194, "step": 526000 }, { "epoch": 7.248353586288611, "grad_norm": 4.413473129272461, "learning_rate": 2.9414461415475154e-05, "loss": 0.5009, "step": 526100 }, { "epoch": 7.249731338348351, "grad_norm": 3.0524516105651855, "learning_rate": 2.9406465339482126e-05, "loss": 0.389, "step": 526200 }, { "epoch": 7.25110909040809, "grad_norm": 2.052962064743042, "learning_rate": 2.9398469101525107e-05, "loss": 0.4102, "step": 526300 }, { "epoch": 7.25248684246783, "grad_norm": 5.107116222381592, "learning_rate": 2.9390472702283432e-05, "loss": 0.4026, "step": 526400 }, { "epoch": 7.253864594527569, "grad_norm": 15.659510612487793, "learning_rate": 2.9382476142436423e-05, "loss": 0.4649, "step": 526500 }, { "epoch": 7.255242346587308, "grad_norm": 3.75313401222229, "learning_rate": 2.937447942266344e-05, "loss": 0.4052, "step": 526600 }, { "epoch": 7.256620098647048, "grad_norm": 3.125775098800659, "learning_rate": 2.936648254364384e-05, "loss": 0.4134, "step": 526700 }, { "epoch": 7.257997850706786, "grad_norm": 3.8388991355895996, "learning_rate": 2.9358485506056994e-05, "loss": 0.4775, "step": 526800 }, { "epoch": 
7.259375602766526, "grad_norm": 4.062264919281006, "learning_rate": 2.9350488310582303e-05, "loss": 0.4405, "step": 526900 }, { "epoch": 7.2607533548262655, "grad_norm": 3.9900310039520264, "learning_rate": 2.9342490957899157e-05, "loss": 0.4336, "step": 527000 }, { "epoch": 7.262131106886005, "grad_norm": 4.169195175170898, "learning_rate": 2.9334493448686982e-05, "loss": 0.4213, "step": 527100 }, { "epoch": 7.263508858945744, "grad_norm": 0.6121081709861755, "learning_rate": 2.9326495783625203e-05, "loss": 0.4636, "step": 527200 }, { "epoch": 7.264886611005483, "grad_norm": 6.136238098144531, "learning_rate": 2.9318497963393264e-05, "loss": 0.462, "step": 527300 }, { "epoch": 7.266264363065223, "grad_norm": 5.410613059997559, "learning_rate": 2.931049998867062e-05, "loss": 0.4871, "step": 527400 }, { "epoch": 7.267642115124962, "grad_norm": 5.456384181976318, "learning_rate": 2.9302501860136725e-05, "loss": 0.4255, "step": 527500 }, { "epoch": 7.269019867184701, "grad_norm": 15.0463228225708, "learning_rate": 2.9294503578471096e-05, "loss": 0.4193, "step": 527600 }, { "epoch": 7.270397619244441, "grad_norm": 5.020733833312988, "learning_rate": 2.92865051443532e-05, "loss": 0.4931, "step": 527700 }, { "epoch": 7.27177537130418, "grad_norm": 7.640762805938721, "learning_rate": 2.9278506558462548e-05, "loss": 0.3628, "step": 527800 }, { "epoch": 7.27315312336392, "grad_norm": 6.148104190826416, "learning_rate": 2.927050782147867e-05, "loss": 0.4479, "step": 527900 }, { "epoch": 7.274530875423658, "grad_norm": 3.0570690631866455, "learning_rate": 2.9262508934081092e-05, "loss": 0.4184, "step": 528000 }, { "epoch": 7.275908627483398, "grad_norm": 0.2889871597290039, "learning_rate": 2.9254509896949365e-05, "loss": 0.4049, "step": 528100 }, { "epoch": 7.2772863795431375, "grad_norm": 3.989398717880249, "learning_rate": 2.924651071076305e-05, "loss": 0.4074, "step": 528200 }, { "epoch": 7.278664131602877, "grad_norm": 14.667363166809082, "learning_rate": 
2.92385113762017e-05, "loss": 0.4336, "step": 528300 }, { "epoch": 7.280041883662616, "grad_norm": 11.679713249206543, "learning_rate": 2.9230511893944936e-05, "loss": 0.4514, "step": 528400 }, { "epoch": 7.281419635722355, "grad_norm": 1.3945144414901733, "learning_rate": 2.9222512264672317e-05, "loss": 0.4507, "step": 528500 }, { "epoch": 7.282797387782095, "grad_norm": 2.6839096546173096, "learning_rate": 2.9214512489063474e-05, "loss": 0.3817, "step": 528600 }, { "epoch": 7.284175139841834, "grad_norm": 5.41428279876709, "learning_rate": 2.9206512567798028e-05, "loss": 0.4908, "step": 528700 }, { "epoch": 7.285552891901573, "grad_norm": 1.5273067951202393, "learning_rate": 2.91985125015556e-05, "loss": 0.3675, "step": 528800 }, { "epoch": 7.286930643961313, "grad_norm": 3.85188889503479, "learning_rate": 2.9190512291015852e-05, "loss": 0.4272, "step": 528900 }, { "epoch": 7.288308396021052, "grad_norm": 4.808576583862305, "learning_rate": 2.918251193685843e-05, "loss": 0.4369, "step": 529000 }, { "epoch": 7.289686148080792, "grad_norm": 5.606349945068359, "learning_rate": 2.917451143976302e-05, "loss": 0.4907, "step": 529100 }, { "epoch": 7.29106390014053, "grad_norm": 6.450354099273682, "learning_rate": 2.916651080040929e-05, "loss": 0.511, "step": 529200 }, { "epoch": 7.29244165220027, "grad_norm": 13.357542037963867, "learning_rate": 2.915851001947694e-05, "loss": 0.3752, "step": 529300 }, { "epoch": 7.2938194042600095, "grad_norm": 7.168420791625977, "learning_rate": 2.9150509097645687e-05, "loss": 0.4906, "step": 529400 }, { "epoch": 7.295197156319749, "grad_norm": 2.673644781112671, "learning_rate": 2.9142508035595238e-05, "loss": 0.461, "step": 529500 }, { "epoch": 7.296574908379488, "grad_norm": 4.309229850769043, "learning_rate": 2.9134506834005326e-05, "loss": 0.4162, "step": 529600 }, { "epoch": 7.297952660439227, "grad_norm": 0.18738001585006714, "learning_rate": 2.9126505493555704e-05, "loss": 0.479, "step": 529700 }, { "epoch": 7.299330412498967, 
"grad_norm": 14.323974609375, "learning_rate": 2.9118504014926113e-05, "loss": 0.4673, "step": 529800 }, { "epoch": 7.300708164558706, "grad_norm": 1.7647420167922974, "learning_rate": 2.9110502398796323e-05, "loss": 0.4056, "step": 529900 }, { "epoch": 7.302085916618445, "grad_norm": 15.972511291503906, "learning_rate": 2.910250064584612e-05, "loss": 0.3977, "step": 530000 }, { "epoch": 7.303463668678185, "grad_norm": 2.2886714935302734, "learning_rate": 2.9094498756755287e-05, "loss": 0.4233, "step": 530100 }, { "epoch": 7.304841420737924, "grad_norm": 3.633195400238037, "learning_rate": 2.908649673220363e-05, "loss": 0.4259, "step": 530200 }, { "epoch": 7.306219172797663, "grad_norm": 13.278536796569824, "learning_rate": 2.9078494572870957e-05, "loss": 0.4427, "step": 530300 }, { "epoch": 7.307596924857402, "grad_norm": 9.276966094970703, "learning_rate": 2.9070492279437095e-05, "loss": 0.4599, "step": 530400 }, { "epoch": 7.308974676917142, "grad_norm": 7.163000106811523, "learning_rate": 2.906248985258188e-05, "loss": 0.386, "step": 530500 }, { "epoch": 7.3103524289768815, "grad_norm": 8.477914810180664, "learning_rate": 2.9054487292985148e-05, "loss": 0.3981, "step": 530600 }, { "epoch": 7.311730181036621, "grad_norm": 7.509415149688721, "learning_rate": 2.9046484601326782e-05, "loss": 0.4666, "step": 530700 }, { "epoch": 7.31310793309636, "grad_norm": 5.5561723709106445, "learning_rate": 2.903848177828663e-05, "loss": 0.4509, "step": 530800 }, { "epoch": 7.314485685156099, "grad_norm": 2.2281110286712646, "learning_rate": 2.9030478824544584e-05, "loss": 0.5091, "step": 530900 }, { "epoch": 7.315863437215839, "grad_norm": 3.82918119430542, "learning_rate": 2.902247574078053e-05, "loss": 0.3843, "step": 531000 }, { "epoch": 7.3172411892755775, "grad_norm": 2.1636435985565186, "learning_rate": 2.9014472527674365e-05, "loss": 0.41, "step": 531100 }, { "epoch": 7.318618941335317, "grad_norm": 2.1685776710510254, "learning_rate": 2.9006469185906032e-05, "loss": 
0.3967, "step": 531200 }, { "epoch": 7.319996693395057, "grad_norm": 6.576162815093994, "learning_rate": 2.8998465716155414e-05, "loss": 0.402, "step": 531300 }, { "epoch": 7.321374445454796, "grad_norm": 0.6103913187980652, "learning_rate": 2.8990462119102477e-05, "loss": 0.3635, "step": 531400 }, { "epoch": 7.322752197514535, "grad_norm": 1.9723066091537476, "learning_rate": 2.8982458395427158e-05, "loss": 0.5149, "step": 531500 }, { "epoch": 7.324129949574274, "grad_norm": 7.15331506729126, "learning_rate": 2.8974454545809406e-05, "loss": 0.3947, "step": 531600 }, { "epoch": 7.325507701634014, "grad_norm": 0.49902448058128357, "learning_rate": 2.8966450570929203e-05, "loss": 0.415, "step": 531700 }, { "epoch": 7.3268854536937535, "grad_norm": 2.9225857257843018, "learning_rate": 2.895844647146653e-05, "loss": 0.4571, "step": 531800 }, { "epoch": 7.328263205753492, "grad_norm": 2.2372071743011475, "learning_rate": 2.895044224810135e-05, "loss": 0.4725, "step": 531900 }, { "epoch": 7.329640957813232, "grad_norm": 13.383376121520996, "learning_rate": 2.8942437901513694e-05, "loss": 0.4283, "step": 532000 }, { "epoch": 7.331018709872971, "grad_norm": 7.015833377838135, "learning_rate": 2.893451347767921e-05, "loss": 0.4322, "step": 532100 }, { "epoch": 7.332396461932711, "grad_norm": 2.4232017993927, "learning_rate": 2.892650888790187e-05, "loss": 0.3958, "step": 532200 }, { "epoch": 7.3337742139924496, "grad_norm": 9.83591365814209, "learning_rate": 2.89185041769353e-05, "loss": 0.506, "step": 532300 }, { "epoch": 7.335151966052189, "grad_norm": 5.600688934326172, "learning_rate": 2.8910499345459546e-05, "loss": 0.4758, "step": 532400 }, { "epoch": 7.336529718111929, "grad_norm": 7.676602363586426, "learning_rate": 2.8902494394154653e-05, "loss": 0.4548, "step": 532500 }, { "epoch": 7.337907470171668, "grad_norm": 5.2926859855651855, "learning_rate": 2.8894489323700694e-05, "loss": 0.4022, "step": 532600 }, { "epoch": 7.339285222231407, "grad_norm": 
1.8817243576049805, "learning_rate": 2.888648413477773e-05, "loss": 0.4084, "step": 532700 }, { "epoch": 7.3406629742911464, "grad_norm": 3.6371889114379883, "learning_rate": 2.8878478828065834e-05, "loss": 0.4387, "step": 532800 }, { "epoch": 7.342040726350886, "grad_norm": 8.56273365020752, "learning_rate": 2.8870473404245126e-05, "loss": 0.4313, "step": 532900 }, { "epoch": 7.343418478410626, "grad_norm": 1.4176666736602783, "learning_rate": 2.8862467863995676e-05, "loss": 0.4533, "step": 533000 }, { "epoch": 7.344796230470364, "grad_norm": 5.151734352111816, "learning_rate": 2.885446220799763e-05, "loss": 0.407, "step": 533100 }, { "epoch": 7.346173982530104, "grad_norm": 1.582909107208252, "learning_rate": 2.8846456436931075e-05, "loss": 0.4715, "step": 533200 }, { "epoch": 7.347551734589843, "grad_norm": 3.3835840225219727, "learning_rate": 2.8838450551476157e-05, "loss": 0.4339, "step": 533300 }, { "epoch": 7.348929486649583, "grad_norm": 3.6356005668640137, "learning_rate": 2.883044455231303e-05, "loss": 0.4139, "step": 533400 }, { "epoch": 7.350307238709322, "grad_norm": 2.878119468688965, "learning_rate": 2.8822438440121817e-05, "loss": 0.3802, "step": 533500 }, { "epoch": 7.351684990769061, "grad_norm": 1.8025470972061157, "learning_rate": 2.8814432215582696e-05, "loss": 0.4256, "step": 533600 }, { "epoch": 7.353062742828801, "grad_norm": 4.973130702972412, "learning_rate": 2.8806425879375844e-05, "loss": 0.4103, "step": 533700 }, { "epoch": 7.35444049488854, "grad_norm": 3.463181495666504, "learning_rate": 2.8798419432181415e-05, "loss": 0.4916, "step": 533800 }, { "epoch": 7.355818246948279, "grad_norm": 6.862941741943359, "learning_rate": 2.8790412874679622e-05, "loss": 0.4747, "step": 533900 }, { "epoch": 7.3571959990080185, "grad_norm": 4.2563323974609375, "learning_rate": 2.8782406207550652e-05, "loss": 0.4257, "step": 534000 }, { "epoch": 7.358573751067758, "grad_norm": 2.533911943435669, "learning_rate": 2.877439943147471e-05, "loss": 0.4251, 
"step": 534100 }, { "epoch": 7.359951503127498, "grad_norm": 6.186253070831299, "learning_rate": 2.876639254713203e-05, "loss": 0.4444, "step": 534200 }, { "epoch": 7.361329255187236, "grad_norm": 75.2963638305664, "learning_rate": 2.8758385555202807e-05, "loss": 0.4514, "step": 534300 }, { "epoch": 7.362707007246976, "grad_norm": 2.9996325969696045, "learning_rate": 2.87503784563673e-05, "loss": 0.4059, "step": 534400 }, { "epoch": 7.364084759306715, "grad_norm": 1.9278546571731567, "learning_rate": 2.874245132387995e-05, "loss": 0.4366, "step": 534500 }, { "epoch": 7.365462511366454, "grad_norm": 8.845470428466797, "learning_rate": 2.8734444014324697e-05, "loss": 0.4597, "step": 534600 }, { "epoch": 7.366840263426194, "grad_norm": 6.135219097137451, "learning_rate": 2.8726436599897097e-05, "loss": 0.3821, "step": 534700 }, { "epoch": 7.368218015485933, "grad_norm": 7.168758392333984, "learning_rate": 2.871842908127745e-05, "loss": 0.4434, "step": 534800 }, { "epoch": 7.369595767545673, "grad_norm": 2.1625781059265137, "learning_rate": 2.8710421459146006e-05, "loss": 0.4486, "step": 534900 }, { "epoch": 7.370973519605412, "grad_norm": 5.022161960601807, "learning_rate": 2.8702413734183077e-05, "loss": 0.4862, "step": 535000 }, { "epoch": 7.372351271665151, "grad_norm": 4.327476978302002, "learning_rate": 2.8694405907068946e-05, "loss": 0.4424, "step": 535100 }, { "epoch": 7.3737290237248905, "grad_norm": 4.025018215179443, "learning_rate": 2.8686397978483918e-05, "loss": 0.5015, "step": 535200 }, { "epoch": 7.37510677578463, "grad_norm": 19.758026123046875, "learning_rate": 2.8678389949108325e-05, "loss": 0.4413, "step": 535300 }, { "epoch": 7.376484527844369, "grad_norm": 79.03694152832031, "learning_rate": 2.8670381819622464e-05, "loss": 0.4203, "step": 535400 }, { "epoch": 7.377862279904108, "grad_norm": 15.999736785888672, "learning_rate": 2.8662373590706677e-05, "loss": 0.4213, "step": 535500 }, { "epoch": 7.379240031963848, "grad_norm": 4.96759033203125, 
"learning_rate": 2.865444534680455e-05, "loss": 0.4026, "step": 535600 }, { "epoch": 7.380617784023587, "grad_norm": 5.163212299346924, "learning_rate": 2.8646436922047275e-05, "loss": 0.4621, "step": 535700 }, { "epoch": 7.381995536083326, "grad_norm": 2.5682151317596436, "learning_rate": 2.8638428399894317e-05, "loss": 0.4248, "step": 535800 }, { "epoch": 7.383373288143066, "grad_norm": 5.439509391784668, "learning_rate": 2.8630419781026053e-05, "loss": 0.401, "step": 535900 }, { "epoch": 7.384751040202805, "grad_norm": 3.276240110397339, "learning_rate": 2.8622411066122833e-05, "loss": 0.3952, "step": 536000 }, { "epoch": 7.386128792262545, "grad_norm": 3.118412971496582, "learning_rate": 2.861440225586506e-05, "loss": 0.4748, "step": 536100 }, { "epoch": 7.387506544322283, "grad_norm": 2.562843084335327, "learning_rate": 2.8606393350933108e-05, "loss": 0.4624, "step": 536200 }, { "epoch": 7.388884296382023, "grad_norm": 9.925040245056152, "learning_rate": 2.8598384352007377e-05, "loss": 0.4602, "step": 536300 }, { "epoch": 7.3902620484417625, "grad_norm": 5.952777862548828, "learning_rate": 2.8590375259768286e-05, "loss": 0.44, "step": 536400 }, { "epoch": 7.391639800501502, "grad_norm": 3.4073009490966797, "learning_rate": 2.858236607489624e-05, "loss": 0.4874, "step": 536500 }, { "epoch": 7.393017552561241, "grad_norm": 2.9499261379241943, "learning_rate": 2.857435679807164e-05, "loss": 0.4362, "step": 536600 }, { "epoch": 7.39439530462098, "grad_norm": 27.421464920043945, "learning_rate": 2.8566347429974954e-05, "loss": 0.4466, "step": 536700 }, { "epoch": 7.39577305668072, "grad_norm": 3.419938802719116, "learning_rate": 2.855833797128658e-05, "loss": 0.4359, "step": 536800 }, { "epoch": 7.397150808740459, "grad_norm": 0.5551576614379883, "learning_rate": 2.8550328422686995e-05, "loss": 0.4982, "step": 536900 }, { "epoch": 7.398528560800198, "grad_norm": 2.9478471279144287, "learning_rate": 2.8542318784856638e-05, "loss": 0.4494, "step": 537000 }, { 
"epoch": 7.399906312859938, "grad_norm": 2.561605453491211, "learning_rate": 2.8534309058475953e-05, "loss": 0.477, "step": 537100 }, { "epoch": 7.401284064919677, "grad_norm": 9.127660751342773, "learning_rate": 2.8526299244225435e-05, "loss": 0.3674, "step": 537200 }, { "epoch": 7.402661816979417, "grad_norm": 31.75274085998535, "learning_rate": 2.8518289342785544e-05, "loss": 0.3494, "step": 537300 }, { "epoch": 7.404039569039155, "grad_norm": 4.384542465209961, "learning_rate": 2.8510279354836762e-05, "loss": 0.4763, "step": 537400 }, { "epoch": 7.405417321098895, "grad_norm": 5.3455915451049805, "learning_rate": 2.8502269281059588e-05, "loss": 0.4089, "step": 537500 }, { "epoch": 7.4067950731586345, "grad_norm": 2.7535457611083984, "learning_rate": 2.8494259122134498e-05, "loss": 0.3736, "step": 537600 }, { "epoch": 7.408172825218374, "grad_norm": 14.361881256103516, "learning_rate": 2.848624887874201e-05, "loss": 0.4311, "step": 537700 }, { "epoch": 7.409550577278113, "grad_norm": 8.127216339111328, "learning_rate": 2.8478238551562643e-05, "loss": 0.4814, "step": 537800 }, { "epoch": 7.410928329337852, "grad_norm": 5.568881034851074, "learning_rate": 2.8470228141276904e-05, "loss": 0.4091, "step": 537900 }, { "epoch": 7.412306081397592, "grad_norm": 4.130171298980713, "learning_rate": 2.8462217648565315e-05, "loss": 0.4916, "step": 538000 }, { "epoch": 7.413683833457331, "grad_norm": 3.00921368598938, "learning_rate": 2.845420707410842e-05, "loss": 0.4329, "step": 538100 }, { "epoch": 7.41506158551707, "grad_norm": 6.275873184204102, "learning_rate": 2.8446196418586746e-05, "loss": 0.4459, "step": 538200 }, { "epoch": 7.41643933757681, "grad_norm": 2.6935315132141113, "learning_rate": 2.8438185682680855e-05, "loss": 0.4002, "step": 538300 }, { "epoch": 7.417817089636549, "grad_norm": 1.236141562461853, "learning_rate": 2.8430174867071287e-05, "loss": 0.4465, "step": 538400 }, { "epoch": 7.419194841696289, "grad_norm": 7.577119827270508, "learning_rate": 
2.8422163972438604e-05, "loss": 0.4508, "step": 538500 }, { "epoch": 7.420572593756027, "grad_norm": 3.757509231567383, "learning_rate": 2.841415299946337e-05, "loss": 0.4487, "step": 538600 }, { "epoch": 7.421950345815767, "grad_norm": 2.673419952392578, "learning_rate": 2.8406141948826166e-05, "loss": 0.4198, "step": 538700 }, { "epoch": 7.4233280978755065, "grad_norm": 5.737542152404785, "learning_rate": 2.8398130821207567e-05, "loss": 0.3899, "step": 538800 }, { "epoch": 7.424705849935245, "grad_norm": 0.56877201795578, "learning_rate": 2.839011961728816e-05, "loss": 0.4721, "step": 538900 }, { "epoch": 7.426083601994985, "grad_norm": 2.6419119834899902, "learning_rate": 2.8382108337748535e-05, "loss": 0.3868, "step": 539000 }, { "epoch": 7.427461354054724, "grad_norm": 5.866122245788574, "learning_rate": 2.83740969832693e-05, "loss": 0.4438, "step": 539100 }, { "epoch": 7.428839106114464, "grad_norm": 5.372983932495117, "learning_rate": 2.8366085554531052e-05, "loss": 0.4033, "step": 539200 }, { "epoch": 7.430216858174203, "grad_norm": 1.023581624031067, "learning_rate": 2.8358154167599546e-05, "loss": 0.4075, "step": 539300 }, { "epoch": 7.431594610233942, "grad_norm": 7.562337398529053, "learning_rate": 2.835014259311072e-05, "loss": 0.4169, "step": 539400 }, { "epoch": 7.432972362293682, "grad_norm": 6.716611862182617, "learning_rate": 2.8342130946397945e-05, "loss": 0.4291, "step": 539500 }, { "epoch": 7.434350114353421, "grad_norm": 2.795417070388794, "learning_rate": 2.833411922814183e-05, "loss": 0.4276, "step": 539600 }, { "epoch": 7.43572786641316, "grad_norm": 2.315000295639038, "learning_rate": 2.8326107439023017e-05, "loss": 0.3747, "step": 539700 }, { "epoch": 7.437105618472899, "grad_norm": 3.9713361263275146, "learning_rate": 2.8318095579722155e-05, "loss": 0.422, "step": 539800 }, { "epoch": 7.438483370532639, "grad_norm": 7.583224773406982, "learning_rate": 2.831008365091989e-05, "loss": 0.3877, "step": 539900 }, { "epoch": 7.4398611225923785, 
"grad_norm": 2.159616231918335, "learning_rate": 2.8302071653296866e-05, "loss": 0.4354, "step": 540000 }, { "epoch": 7.441238874652117, "grad_norm": 12.720037460327148, "learning_rate": 2.8294059587533757e-05, "loss": 0.432, "step": 540100 }, { "epoch": 7.442616626711857, "grad_norm": 5.1632304191589355, "learning_rate": 2.8286047454311226e-05, "loss": 0.4922, "step": 540200 }, { "epoch": 7.443994378771596, "grad_norm": 10.606891632080078, "learning_rate": 2.827803525430994e-05, "loss": 0.456, "step": 540300 }, { "epoch": 7.445372130831336, "grad_norm": 5.939263820648193, "learning_rate": 2.827002298821058e-05, "loss": 0.4572, "step": 540400 }, { "epoch": 7.4467498828910745, "grad_norm": 1.9471814632415771, "learning_rate": 2.8262010656693828e-05, "loss": 0.4526, "step": 540500 }, { "epoch": 7.448127634950814, "grad_norm": 18.333885192871094, "learning_rate": 2.825399826044037e-05, "loss": 0.4129, "step": 540600 }, { "epoch": 7.449505387010554, "grad_norm": 3.623263120651245, "learning_rate": 2.82459858001309e-05, "loss": 0.4514, "step": 540700 }, { "epoch": 7.450883139070293, "grad_norm": 4.437058925628662, "learning_rate": 2.8237973276446122e-05, "loss": 0.4565, "step": 540800 }, { "epoch": 7.452260891130032, "grad_norm": 2.5760324001312256, "learning_rate": 2.822996069006674e-05, "loss": 0.4344, "step": 540900 }, { "epoch": 7.453638643189771, "grad_norm": 3.3212900161743164, "learning_rate": 2.822194804167346e-05, "loss": 0.4333, "step": 541000 }, { "epoch": 7.455016395249511, "grad_norm": 5.009664535522461, "learning_rate": 2.8213935331947008e-05, "loss": 0.3425, "step": 541100 }, { "epoch": 7.4563941473092505, "grad_norm": 1.490462064743042, "learning_rate": 2.820592256156809e-05, "loss": 0.5042, "step": 541200 }, { "epoch": 7.457771899368989, "grad_norm": 2.1359567642211914, "learning_rate": 2.8197909731217436e-05, "loss": 0.3638, "step": 541300 }, { "epoch": 7.459149651428729, "grad_norm": 1.5745075941085815, "learning_rate": 2.818989684157579e-05, "loss": 
0.5117, "step": 541400 }, { "epoch": 7.460527403488468, "grad_norm": 4.589914321899414, "learning_rate": 2.8181883893323876e-05, "loss": 0.4293, "step": 541500 }, { "epoch": 7.461905155548208, "grad_norm": 6.481187343597412, "learning_rate": 2.8173870887142427e-05, "loss": 0.4427, "step": 541600 }, { "epoch": 7.4632829076079465, "grad_norm": 0.042053285986185074, "learning_rate": 2.816585782371221e-05, "loss": 0.3851, "step": 541700 }, { "epoch": 7.464660659667686, "grad_norm": 8.815479278564453, "learning_rate": 2.8157844703713953e-05, "loss": 0.4387, "step": 541800 }, { "epoch": 7.466038411727426, "grad_norm": 2.8633878231048584, "learning_rate": 2.8149911659861695e-05, "loss": 0.4418, "step": 541900 }, { "epoch": 7.467416163787165, "grad_norm": 6.054144859313965, "learning_rate": 2.8141898429318355e-05, "loss": 0.4278, "step": 542000 }, { "epoch": 7.468793915846904, "grad_norm": 3.6870791912078857, "learning_rate": 2.8133885144242454e-05, "loss": 0.4411, "step": 542100 }, { "epoch": 7.470171667906643, "grad_norm": 28.393657684326172, "learning_rate": 2.8125871805314777e-05, "loss": 0.3942, "step": 542200 }, { "epoch": 7.471549419966383, "grad_norm": 8.25516128540039, "learning_rate": 2.811785841321608e-05, "loss": 0.4861, "step": 542300 }, { "epoch": 7.4729271720261226, "grad_norm": 4.084705352783203, "learning_rate": 2.8109844968627156e-05, "loss": 0.404, "step": 542400 }, { "epoch": 7.474304924085861, "grad_norm": 3.2064032554626465, "learning_rate": 2.8101831472228777e-05, "loss": 0.4117, "step": 542500 }, { "epoch": 7.475682676145601, "grad_norm": 0.1507849395275116, "learning_rate": 2.8093817924701737e-05, "loss": 0.4073, "step": 542600 }, { "epoch": 7.47706042820534, "grad_norm": 10.756246566772461, "learning_rate": 2.8085804326726814e-05, "loss": 0.4548, "step": 542700 }, { "epoch": 7.47843818026508, "grad_norm": 2.5515196323394775, "learning_rate": 2.807779067898484e-05, "loss": 0.4752, "step": 542800 }, { "epoch": 7.479815932324819, "grad_norm": 
6.677034378051758, "learning_rate": 2.8069776982156577e-05, "loss": 0.4321, "step": 542900 }, { "epoch": 7.481193684384558, "grad_norm": 2.3016717433929443, "learning_rate": 2.8061763236922845e-05, "loss": 0.38, "step": 543000 }, { "epoch": 7.482571436444298, "grad_norm": 3.4783923625946045, "learning_rate": 2.805374944396446e-05, "loss": 0.4317, "step": 543100 }, { "epoch": 7.483949188504036, "grad_norm": 2.9948785305023193, "learning_rate": 2.8045735603962214e-05, "loss": 0.3922, "step": 543200 }, { "epoch": 7.485326940563776, "grad_norm": 6.320486545562744, "learning_rate": 2.8037721717596953e-05, "loss": 0.3554, "step": 543300 }, { "epoch": 7.4867046926235155, "grad_norm": 7.366561412811279, "learning_rate": 2.8029707785549475e-05, "loss": 0.4667, "step": 543400 }, { "epoch": 7.488082444683255, "grad_norm": 0.17292086780071259, "learning_rate": 2.8021693808500603e-05, "loss": 0.4258, "step": 543500 }, { "epoch": 7.489460196742995, "grad_norm": 3.4930570125579834, "learning_rate": 2.801367978713118e-05, "loss": 0.3629, "step": 543600 }, { "epoch": 7.490837948802733, "grad_norm": 3.487156629562378, "learning_rate": 2.800566572212202e-05, "loss": 0.4246, "step": 543700 }, { "epoch": 7.492215700862473, "grad_norm": 12.597655296325684, "learning_rate": 2.7997651614153985e-05, "loss": 0.4273, "step": 543800 }, { "epoch": 7.493593452922212, "grad_norm": 11.945016860961914, "learning_rate": 2.79896374639079e-05, "loss": 0.4542, "step": 543900 }, { "epoch": 7.494971204981951, "grad_norm": 0.7353599071502686, "learning_rate": 2.798162327206459e-05, "loss": 0.4616, "step": 544000 }, { "epoch": 7.496348957041691, "grad_norm": 3.4027493000030518, "learning_rate": 2.797360903930493e-05, "loss": 0.4658, "step": 544100 }, { "epoch": 7.49772670910143, "grad_norm": 5.9105544090271, "learning_rate": 2.796559476630976e-05, "loss": 0.4275, "step": 544200 }, { "epoch": 7.49910446116117, "grad_norm": 15.132706642150879, "learning_rate": 2.7957580453759934e-05, "loss": 0.4558, "step": 
544300 }, { "epoch": 7.500482213220909, "grad_norm": 9.754582405090332, "learning_rate": 2.794956610233631e-05, "loss": 0.4595, "step": 544400 }, { "epoch": 7.501859965280648, "grad_norm": 3.4021215438842773, "learning_rate": 2.7941551712719735e-05, "loss": 0.453, "step": 544500 }, { "epoch": 7.5032377173403875, "grad_norm": 3.115478038787842, "learning_rate": 2.7933537285591087e-05, "loss": 0.3945, "step": 544600 }, { "epoch": 7.504615469400127, "grad_norm": 5.966651916503906, "learning_rate": 2.7925522821631235e-05, "loss": 0.4195, "step": 544700 }, { "epoch": 7.505993221459866, "grad_norm": 5.1514105796813965, "learning_rate": 2.7917508321521035e-05, "loss": 0.4192, "step": 544800 }, { "epoch": 7.507370973519605, "grad_norm": 4.739969253540039, "learning_rate": 2.7909573931470508e-05, "loss": 0.4486, "step": 544900 }, { "epoch": 7.508748725579345, "grad_norm": 7.577914237976074, "learning_rate": 2.790155936144677e-05, "loss": 0.4292, "step": 545000 }, { "epoch": 7.510126477639084, "grad_norm": 3.4212565422058105, "learning_rate": 2.7893544757308502e-05, "loss": 0.4494, "step": 545100 }, { "epoch": 7.511504229698823, "grad_norm": 4.344147205352783, "learning_rate": 2.7885530119736596e-05, "loss": 0.4346, "step": 545200 }, { "epoch": 7.512881981758563, "grad_norm": 13.158196449279785, "learning_rate": 2.787751544941194e-05, "loss": 0.4817, "step": 545300 }, { "epoch": 7.514259733818302, "grad_norm": 9.003913879394531, "learning_rate": 2.786950074701541e-05, "loss": 0.4278, "step": 545400 }, { "epoch": 7.515637485878042, "grad_norm": 9.059292793273926, "learning_rate": 2.7861486013227906e-05, "loss": 0.4371, "step": 545500 }, { "epoch": 7.51701523793778, "grad_norm": 0.09797698259353638, "learning_rate": 2.7853471248730304e-05, "loss": 0.452, "step": 545600 }, { "epoch": 7.51839298999752, "grad_norm": 5.361985683441162, "learning_rate": 2.784545645420352e-05, "loss": 0.3916, "step": 545700 }, { "epoch": 7.5197707420572595, "grad_norm": 4.8792643547058105, 
"learning_rate": 2.783744163032845e-05, "loss": 0.4467, "step": 545800 }, { "epoch": 7.521148494116999, "grad_norm": 6.651999473571777, "learning_rate": 2.7829426777785974e-05, "loss": 0.4608, "step": 545900 }, { "epoch": 7.522526246176738, "grad_norm": 4.2536492347717285, "learning_rate": 2.782141189725701e-05, "loss": 0.4687, "step": 546000 }, { "epoch": 7.523903998236477, "grad_norm": 2.4149169921875, "learning_rate": 2.7813396989422468e-05, "loss": 0.4177, "step": 546100 }, { "epoch": 7.525281750296217, "grad_norm": 4.404573440551758, "learning_rate": 2.7805382054963247e-05, "loss": 0.3984, "step": 546200 }, { "epoch": 7.526659502355956, "grad_norm": 2.8749806880950928, "learning_rate": 2.7797367094560257e-05, "loss": 0.4649, "step": 546300 }, { "epoch": 7.528037254415695, "grad_norm": 4.871800899505615, "learning_rate": 2.7789352108894404e-05, "loss": 0.4709, "step": 546400 }, { "epoch": 7.529415006475435, "grad_norm": 22.180395126342773, "learning_rate": 2.7781337098646615e-05, "loss": 0.4227, "step": 546500 }, { "epoch": 7.530792758535174, "grad_norm": 0.8249420523643494, "learning_rate": 2.7773322064497802e-05, "loss": 0.4538, "step": 546600 }, { "epoch": 7.532170510594914, "grad_norm": 10.610916137695312, "learning_rate": 2.776530700712888e-05, "loss": 0.4457, "step": 546700 }, { "epoch": 7.533548262654652, "grad_norm": 2.190502643585205, "learning_rate": 2.775729192722077e-05, "loss": 0.4445, "step": 546800 }, { "epoch": 7.534926014714392, "grad_norm": 9.598668098449707, "learning_rate": 2.774927682545439e-05, "loss": 0.5067, "step": 546900 }, { "epoch": 7.5363037667741315, "grad_norm": 0.7223147749900818, "learning_rate": 2.7741261702510673e-05, "loss": 0.4449, "step": 547000 }, { "epoch": 7.537681518833871, "grad_norm": 3.7646493911743164, "learning_rate": 2.773324655907054e-05, "loss": 0.5046, "step": 547100 }, { "epoch": 7.53905927089361, "grad_norm": 2.554262161254883, "learning_rate": 2.772523139581492e-05, "loss": 0.3902, "step": 547200 }, { 
"epoch": 7.540437022953349, "grad_norm": 3.9231226444244385, "learning_rate": 2.7717216213424737e-05, "loss": 0.4376, "step": 547300 }, { "epoch": 7.541814775013089, "grad_norm": 3.263807535171509, "learning_rate": 2.770920101258093e-05, "loss": 0.3812, "step": 547400 }, { "epoch": 7.5431925270728275, "grad_norm": 4.426365375518799, "learning_rate": 2.7701185793964423e-05, "loss": 0.4195, "step": 547500 }, { "epoch": 7.544570279132567, "grad_norm": 0.9327697157859802, "learning_rate": 2.769317055825616e-05, "loss": 0.4011, "step": 547600 }, { "epoch": 7.545948031192307, "grad_norm": 0.11842532455921173, "learning_rate": 2.7685155306137054e-05, "loss": 0.559, "step": 547700 }, { "epoch": 7.547325783252046, "grad_norm": 2.767132043838501, "learning_rate": 2.7677140038288074e-05, "loss": 0.4519, "step": 547800 }, { "epoch": 7.548703535311786, "grad_norm": 4.038454532623291, "learning_rate": 2.7669124755390133e-05, "loss": 0.4346, "step": 547900 }, { "epoch": 7.550081287371524, "grad_norm": 2.855855703353882, "learning_rate": 2.7661109458124186e-05, "loss": 0.4632, "step": 548000 }, { "epoch": 7.551459039431264, "grad_norm": 2.4040982723236084, "learning_rate": 2.7653094147171163e-05, "loss": 0.4184, "step": 548100 }, { "epoch": 7.5528367914910035, "grad_norm": 4.255384922027588, "learning_rate": 2.7645078823211996e-05, "loss": 0.3831, "step": 548200 }, { "epoch": 7.554214543550742, "grad_norm": 6.381831169128418, "learning_rate": 2.763706348692766e-05, "loss": 0.485, "step": 548300 }, { "epoch": 7.555592295610482, "grad_norm": 2.5923430919647217, "learning_rate": 2.762904813899907e-05, "loss": 0.43, "step": 548400 }, { "epoch": 7.556970047670221, "grad_norm": 1.9660134315490723, "learning_rate": 2.7621032780107176e-05, "loss": 0.3912, "step": 548500 }, { "epoch": 7.558347799729961, "grad_norm": 21.31551742553711, "learning_rate": 2.7613017410932943e-05, "loss": 0.4232, "step": 548600 }, { "epoch": 7.5597255517897, "grad_norm": 1.1674977540969849, "learning_rate": 
2.760508218599034e-05, "loss": 0.4445, "step": 548700 }, { "epoch": 7.561103303849439, "grad_norm": 1.6215500831604004, "learning_rate": 2.7597066798380063e-05, "loss": 0.4453, "step": 548800 }, { "epoch": 7.562481055909179, "grad_norm": 2.0366902351379395, "learning_rate": 2.7589051402523462e-05, "loss": 0.3568, "step": 548900 }, { "epoch": 7.563858807968918, "grad_norm": 3.8983054161071777, "learning_rate": 2.75810359991015e-05, "loss": 0.4205, "step": 549000 }, { "epoch": 7.565236560028657, "grad_norm": 4.745150089263916, "learning_rate": 2.7573020588795105e-05, "loss": 0.3978, "step": 549100 }, { "epoch": 7.566614312088396, "grad_norm": 1.7292938232421875, "learning_rate": 2.7565005172285243e-05, "loss": 0.3805, "step": 549200 }, { "epoch": 7.567992064148136, "grad_norm": 4.7617573738098145, "learning_rate": 2.7556989750252857e-05, "loss": 0.4486, "step": 549300 }, { "epoch": 7.5693698162078755, "grad_norm": 4.335742473602295, "learning_rate": 2.7548974323378903e-05, "loss": 0.4752, "step": 549400 }, { "epoch": 7.570747568267614, "grad_norm": 7.225325107574463, "learning_rate": 2.754095889234433e-05, "loss": 0.4665, "step": 549500 }, { "epoch": 7.572125320327354, "grad_norm": 7.217161178588867, "learning_rate": 2.7532943457830076e-05, "loss": 0.4916, "step": 549600 }, { "epoch": 7.573503072387093, "grad_norm": 6.872833251953125, "learning_rate": 2.7524928020517125e-05, "loss": 0.4356, "step": 549700 }, { "epoch": 7.574880824446833, "grad_norm": 3.390831232070923, "learning_rate": 2.7516912581086405e-05, "loss": 0.3905, "step": 549800 }, { "epoch": 7.5762585765065715, "grad_norm": 1.4677950143814087, "learning_rate": 2.7508897140218866e-05, "loss": 0.3942, "step": 549900 }, { "epoch": 7.577636328566311, "grad_norm": 5.779827117919922, "learning_rate": 2.7500881698595486e-05, "loss": 0.4291, "step": 550000 }, { "epoch": 7.579014080626051, "grad_norm": 5.80511474609375, "learning_rate": 2.749286625689719e-05, "loss": 0.45, "step": 550100 }, { "epoch": 
7.58039183268579, "grad_norm": 3.532496213912964, "learning_rate": 2.7484850815804945e-05, "loss": 0.4133, "step": 550200 }, { "epoch": 7.581769584745529, "grad_norm": 4.331103324890137, "learning_rate": 2.7476835375999697e-05, "loss": 0.4377, "step": 550300 }, { "epoch": 7.583147336805268, "grad_norm": 3.1047475337982178, "learning_rate": 2.7468819938162402e-05, "loss": 0.4224, "step": 550400 }, { "epoch": 7.584525088865008, "grad_norm": 3.1765358448028564, "learning_rate": 2.7460804502974013e-05, "loss": 0.4322, "step": 550500 }, { "epoch": 7.5859028409247475, "grad_norm": 15.844441413879395, "learning_rate": 2.7452789071115486e-05, "loss": 0.4464, "step": 550600 }, { "epoch": 7.587280592984486, "grad_norm": 3.4814414978027344, "learning_rate": 2.7444773643267775e-05, "loss": 0.4075, "step": 550700 }, { "epoch": 7.588658345044226, "grad_norm": 2.5962369441986084, "learning_rate": 2.7436758220111818e-05, "loss": 0.4031, "step": 550800 }, { "epoch": 7.590036097103965, "grad_norm": 2.559882879257202, "learning_rate": 2.742882295647757e-05, "loss": 0.4743, "step": 550900 }, { "epoch": 7.591413849163705, "grad_norm": 0.9008790254592896, "learning_rate": 2.742080754468408e-05, "loss": 0.4486, "step": 551000 }, { "epoch": 7.5927916012234435, "grad_norm": 8.004463195800781, "learning_rate": 2.7412792139618397e-05, "loss": 0.4016, "step": 551100 }, { "epoch": 7.594169353283183, "grad_norm": 17.060848236083984, "learning_rate": 2.740477674196147e-05, "loss": 0.4366, "step": 551200 }, { "epoch": 7.595547105342923, "grad_norm": 1.7429239749908447, "learning_rate": 2.7396761352394246e-05, "loss": 0.4756, "step": 551300 }, { "epoch": 7.596924857402662, "grad_norm": 5.913734436035156, "learning_rate": 2.738874597159767e-05, "loss": 0.4153, "step": 551400 }, { "epoch": 7.598302609462401, "grad_norm": 3.1381280422210693, "learning_rate": 2.738073060025269e-05, "loss": 0.4191, "step": 551500 }, { "epoch": 7.59968036152214, "grad_norm": 3.892171859741211, "learning_rate": 
2.7372715239040254e-05, "loss": 0.4157, "step": 551600 }, { "epoch": 7.60105811358188, "grad_norm": 4.3432416915893555, "learning_rate": 2.7364699888641323e-05, "loss": 0.401, "step": 551700 }, { "epoch": 7.602435865641619, "grad_norm": 11.388246536254883, "learning_rate": 2.735668454973682e-05, "loss": 0.4001, "step": 551800 }, { "epoch": 7.603813617701358, "grad_norm": 11.4810791015625, "learning_rate": 2.7348669223007698e-05, "loss": 0.4245, "step": 551900 }, { "epoch": 7.605191369761098, "grad_norm": 3.2112677097320557, "learning_rate": 2.7340653909134894e-05, "loss": 0.4443, "step": 552000 }, { "epoch": 7.606569121820837, "grad_norm": 13.221724510192871, "learning_rate": 2.733263860879936e-05, "loss": 0.4185, "step": 552100 }, { "epoch": 7.607946873880577, "grad_norm": 4.487495422363281, "learning_rate": 2.7324623322682035e-05, "loss": 0.371, "step": 552200 }, { "epoch": 7.609324625940316, "grad_norm": 3.0562098026275635, "learning_rate": 2.7316608051463862e-05, "loss": 0.3934, "step": 552300 }, { "epoch": 7.610702378000055, "grad_norm": 0.7446732521057129, "learning_rate": 2.7308592795825763e-05, "loss": 0.4926, "step": 552400 }, { "epoch": 7.612080130059795, "grad_norm": 3.0428073406219482, "learning_rate": 2.7300577556448684e-05, "loss": 0.3953, "step": 552500 }, { "epoch": 7.613457882119533, "grad_norm": 3.3515334129333496, "learning_rate": 2.7292562334013568e-05, "loss": 0.4957, "step": 552600 }, { "epoch": 7.614835634179273, "grad_norm": 2.5777339935302734, "learning_rate": 2.728454712920134e-05, "loss": 0.4384, "step": 552700 }, { "epoch": 7.6162133862390125, "grad_norm": 4.800641059875488, "learning_rate": 2.7276531942692935e-05, "loss": 0.4549, "step": 552800 }, { "epoch": 7.617591138298752, "grad_norm": 20.748266220092773, "learning_rate": 2.726851677516929e-05, "loss": 0.4585, "step": 552900 }, { "epoch": 7.618968890358492, "grad_norm": 3.864189863204956, "learning_rate": 2.7260501627311324e-05, "loss": 0.4663, "step": 553000 }, { "epoch": 
7.62034664241823, "grad_norm": 14.188593864440918, "learning_rate": 2.725248649979997e-05, "loss": 0.376, "step": 553100 }, { "epoch": 7.62172439447797, "grad_norm": 5.0581817626953125, "learning_rate": 2.7244471393316158e-05, "loss": 0.457, "step": 553200 }, { "epoch": 7.623102146537709, "grad_norm": 1.201155424118042, "learning_rate": 2.7236456308540806e-05, "loss": 0.437, "step": 553300 }, { "epoch": 7.624479898597448, "grad_norm": 63.219810485839844, "learning_rate": 2.7228441246154843e-05, "loss": 0.5435, "step": 553400 }, { "epoch": 7.625857650657188, "grad_norm": 10.504203796386719, "learning_rate": 2.7220426206839177e-05, "loss": 0.4972, "step": 553500 }, { "epoch": 7.627235402716927, "grad_norm": 3.643195152282715, "learning_rate": 2.7212411191274738e-05, "loss": 0.4497, "step": 553600 }, { "epoch": 7.628613154776667, "grad_norm": 3.3292970657348633, "learning_rate": 2.7204396200142443e-05, "loss": 0.4045, "step": 553700 }, { "epoch": 7.629990906836405, "grad_norm": 3.394129991531372, "learning_rate": 2.71963812341232e-05, "loss": 0.4129, "step": 553800 }, { "epoch": 7.631368658896145, "grad_norm": 2.786311626434326, "learning_rate": 2.7188446443170265e-05, "loss": 0.4085, "step": 553900 }, { "epoch": 7.6327464109558845, "grad_norm": 11.43811321258545, "learning_rate": 2.7180431529151748e-05, "loss": 0.4132, "step": 554000 }, { "epoch": 7.634124163015624, "grad_norm": 0.5038183927536011, "learning_rate": 2.7172416642282212e-05, "loss": 0.4673, "step": 554100 }, { "epoch": 7.635501915075363, "grad_norm": 8.87948989868164, "learning_rate": 2.7164401783242547e-05, "loss": 0.3787, "step": 554200 }, { "epoch": 7.636879667135102, "grad_norm": 6.885680675506592, "learning_rate": 2.7156386952713675e-05, "loss": 0.4465, "step": 554300 }, { "epoch": 7.638257419194842, "grad_norm": 2.42177677154541, "learning_rate": 2.7148452299243125e-05, "loss": 0.5471, "step": 554400 }, { "epoch": 7.639635171254581, "grad_norm": 1.4351866245269775, "learning_rate": 
2.7140437527476432e-05, "loss": 0.428, "step": 554500 }, { "epoch": 7.64101292331432, "grad_norm": 8.119945526123047, "learning_rate": 2.7132422786256407e-05, "loss": 0.4198, "step": 554600 }, { "epoch": 7.64239067537406, "grad_norm": 2.7974629402160645, "learning_rate": 2.7124408076263946e-05, "loss": 0.3663, "step": 554700 }, { "epoch": 7.643768427433799, "grad_norm": 3.1003544330596924, "learning_rate": 2.7116393398179934e-05, "loss": 0.4467, "step": 554800 }, { "epoch": 7.645146179493539, "grad_norm": 1.658278226852417, "learning_rate": 2.710837875268527e-05, "loss": 0.4093, "step": 554900 }, { "epoch": 7.646523931553277, "grad_norm": 4.178915023803711, "learning_rate": 2.7100364140460845e-05, "loss": 0.4519, "step": 555000 }, { "epoch": 7.647901683613017, "grad_norm": 10.85798168182373, "learning_rate": 2.7092349562187523e-05, "loss": 0.404, "step": 555100 }, { "epoch": 7.6492794356727565, "grad_norm": 3.015878915786743, "learning_rate": 2.708433501854619e-05, "loss": 0.4606, "step": 555200 }, { "epoch": 7.650657187732496, "grad_norm": 4.767267227172852, "learning_rate": 2.7076320510217717e-05, "loss": 0.4036, "step": 555300 }, { "epoch": 7.652034939792235, "grad_norm": 2.2454137802124023, "learning_rate": 2.706830603788299e-05, "loss": 0.4091, "step": 555400 }, { "epoch": 7.653412691851974, "grad_norm": 9.89771842956543, "learning_rate": 2.7060291602222876e-05, "loss": 0.4364, "step": 555500 }, { "epoch": 7.654790443911714, "grad_norm": 37.50621795654297, "learning_rate": 2.7052277203918248e-05, "loss": 0.4235, "step": 555600 }, { "epoch": 7.656168195971453, "grad_norm": 3.441382884979248, "learning_rate": 2.704426284364995e-05, "loss": 0.4475, "step": 555700 }, { "epoch": 7.657545948031192, "grad_norm": 1.5240055322647095, "learning_rate": 2.7036248522098852e-05, "loss": 0.3591, "step": 555800 }, { "epoch": 7.658923700090932, "grad_norm": 3.976015329360962, "learning_rate": 2.7028234239945822e-05, "loss": 0.4359, "step": 555900 }, { "epoch": 
7.660301452150671, "grad_norm": 22.412111282348633, "learning_rate": 2.7020219997871705e-05, "loss": 0.4308, "step": 556000 }, { "epoch": 7.66167920421041, "grad_norm": 1.1921263933181763, "learning_rate": 2.701220579655736e-05, "loss": 0.3562, "step": 556100 }, { "epoch": 7.663056956270149, "grad_norm": 2.0331079959869385, "learning_rate": 2.700419163668362e-05, "loss": 0.4328, "step": 556200 }, { "epoch": 7.664434708329889, "grad_norm": 2.5105929374694824, "learning_rate": 2.6996177518931335e-05, "loss": 0.4011, "step": 556300 }, { "epoch": 7.6658124603896285, "grad_norm": 42.577552795410156, "learning_rate": 2.698816344398135e-05, "loss": 0.4453, "step": 556400 }, { "epoch": 7.667190212449368, "grad_norm": 10.191869735717773, "learning_rate": 2.6980149412514502e-05, "loss": 0.4004, "step": 556500 }, { "epoch": 7.668567964509107, "grad_norm": 2.487185001373291, "learning_rate": 2.6972135425211618e-05, "loss": 0.4382, "step": 556600 }, { "epoch": 7.669945716568846, "grad_norm": 5.342384338378906, "learning_rate": 2.6964121482753535e-05, "loss": 0.4139, "step": 556700 }, { "epoch": 7.671323468628586, "grad_norm": 5.744577884674072, "learning_rate": 2.6956107585821068e-05, "loss": 0.4347, "step": 556800 }, { "epoch": 7.6727012206883245, "grad_norm": 2.852713108062744, "learning_rate": 2.6948093735095044e-05, "loss": 0.4401, "step": 556900 }, { "epoch": 7.674078972748064, "grad_norm": 1.303440809249878, "learning_rate": 2.6940079931256285e-05, "loss": 0.4303, "step": 557000 }, { "epoch": 7.675456724807804, "grad_norm": 4.114048480987549, "learning_rate": 2.69320661749856e-05, "loss": 0.4402, "step": 557100 }, { "epoch": 7.676834476867543, "grad_norm": 19.123132705688477, "learning_rate": 2.692405246696379e-05, "loss": 0.3761, "step": 557200 }, { "epoch": 7.678212228927283, "grad_norm": 3.209638833999634, "learning_rate": 2.691603880787169e-05, "loss": 0.4454, "step": 557300 }, { "epoch": 7.679589980987021, "grad_norm": 10.314963340759277, "learning_rate": 
2.690802519839006e-05, "loss": 0.4055, "step": 557400 }, { "epoch": 7.680967733046761, "grad_norm": 2.643754720687866, "learning_rate": 2.6900011639199726e-05, "loss": 0.453, "step": 557500 }, { "epoch": 7.6823454851065005, "grad_norm": 9.098482131958008, "learning_rate": 2.689199813098147e-05, "loss": 0.3885, "step": 557600 }, { "epoch": 7.683723237166239, "grad_norm": 10.241753578186035, "learning_rate": 2.688398467441608e-05, "loss": 0.4361, "step": 557700 }, { "epoch": 7.685100989225979, "grad_norm": 2.877782106399536, "learning_rate": 2.6875971270184363e-05, "loss": 0.4063, "step": 557800 }, { "epoch": 7.686478741285718, "grad_norm": 7.93145751953125, "learning_rate": 2.6867957918967058e-05, "loss": 0.4785, "step": 557900 }, { "epoch": 7.687856493345458, "grad_norm": 1.1333529949188232, "learning_rate": 2.685994462144497e-05, "loss": 0.3646, "step": 558000 }, { "epoch": 7.6892342454051965, "grad_norm": 1.2457891702651978, "learning_rate": 2.6851931378298857e-05, "loss": 0.4471, "step": 558100 }, { "epoch": 7.690611997464936, "grad_norm": 1.9103825092315674, "learning_rate": 2.6843918190209486e-05, "loss": 0.4184, "step": 558200 }, { "epoch": 7.691989749524676, "grad_norm": 4.85763692855835, "learning_rate": 2.6835905057857624e-05, "loss": 0.4061, "step": 558300 }, { "epoch": 7.693367501584415, "grad_norm": 2.5763587951660156, "learning_rate": 2.6827891981924035e-05, "loss": 0.3913, "step": 558400 }, { "epoch": 7.694745253644154, "grad_norm": 8.854558944702148, "learning_rate": 2.681987896308945e-05, "loss": 0.4125, "step": 558500 }, { "epoch": 7.696123005703893, "grad_norm": 3.6522865295410156, "learning_rate": 2.681186600203463e-05, "loss": 0.4602, "step": 558600 }, { "epoch": 7.697500757763633, "grad_norm": 5.700541019439697, "learning_rate": 2.680385309944031e-05, "loss": 0.4814, "step": 558700 }, { "epoch": 7.6988785098233725, "grad_norm": 3.4218716621398926, "learning_rate": 2.6795840255987232e-05, "loss": 0.4159, "step": 558800 }, { "epoch": 
7.700256261883111, "grad_norm": 3.5009915828704834, "learning_rate": 2.6787827472356134e-05, "loss": 0.4139, "step": 558900 }, { "epoch": 7.701634013942851, "grad_norm": 12.02125072479248, "learning_rate": 2.6779814749227732e-05, "loss": 0.43, "step": 559000 }, { "epoch": 7.70301176600259, "grad_norm": 4.383796215057373, "learning_rate": 2.677180208728275e-05, "loss": 0.3723, "step": 559100 }, { "epoch": 7.70438951806233, "grad_norm": 5.934932231903076, "learning_rate": 2.67637894872019e-05, "loss": 0.4077, "step": 559200 }, { "epoch": 7.7057672701220685, "grad_norm": 4.267889499664307, "learning_rate": 2.6755776949665903e-05, "loss": 0.427, "step": 559300 }, { "epoch": 7.707145022181808, "grad_norm": 9.591529846191406, "learning_rate": 2.6747764475355462e-05, "loss": 0.3654, "step": 559400 }, { "epoch": 7.708522774241548, "grad_norm": 12.786456108093262, "learning_rate": 2.673975206495129e-05, "loss": 0.423, "step": 559500 }, { "epoch": 7.709900526301287, "grad_norm": 12.553844451904297, "learning_rate": 2.6731739719134056e-05, "loss": 0.4766, "step": 559600 }, { "epoch": 7.711278278361026, "grad_norm": 9.227893829345703, "learning_rate": 2.6723727438584458e-05, "loss": 0.439, "step": 559700 }, { "epoch": 7.712656030420765, "grad_norm": 4.103210926055908, "learning_rate": 2.6715715223983192e-05, "loss": 0.4315, "step": 559800 }, { "epoch": 7.714033782480505, "grad_norm": 5.083662033081055, "learning_rate": 2.670770307601093e-05, "loss": 0.498, "step": 559900 }, { "epoch": 7.7154115345402445, "grad_norm": 5.420502185821533, "learning_rate": 2.6699690995348356e-05, "loss": 0.446, "step": 560000 }, { "epoch": 7.716789286599983, "grad_norm": 16.163969039916992, "learning_rate": 2.6691678982676116e-05, "loss": 0.4454, "step": 560100 }, { "epoch": 7.718167038659723, "grad_norm": 2.8711178302764893, "learning_rate": 2.6683667038674877e-05, "loss": 0.4122, "step": 560200 }, { "epoch": 7.719544790719462, "grad_norm": 2.779858350753784, "learning_rate": 
2.6675655164025304e-05, "loss": 0.3653, "step": 560300 }, { "epoch": 7.720922542779201, "grad_norm": 2.6652016639709473, "learning_rate": 2.6667643359408043e-05, "loss": 0.4462, "step": 560400 }, { "epoch": 7.7223002948389405, "grad_norm": 13.430249214172363, "learning_rate": 2.6659711742490513e-05, "loss": 0.4056, "step": 560500 }, { "epoch": 7.72367804689868, "grad_norm": 0.8853716850280762, "learning_rate": 2.6651700079262498e-05, "loss": 0.3815, "step": 560600 }, { "epoch": 7.72505579895842, "grad_norm": 1.9806708097457886, "learning_rate": 2.66436884881019e-05, "loss": 0.4437, "step": 560700 }, { "epoch": 7.726433551018159, "grad_norm": 4.61207914352417, "learning_rate": 2.663567696968934e-05, "loss": 0.4567, "step": 560800 }, { "epoch": 7.727811303077898, "grad_norm": 24.115774154663086, "learning_rate": 2.6627665524705445e-05, "loss": 0.4012, "step": 560900 }, { "epoch": 7.729189055137637, "grad_norm": 7.2109575271606445, "learning_rate": 2.6619654153830825e-05, "loss": 0.4484, "step": 561000 }, { "epoch": 7.730566807197377, "grad_norm": 1.6353265047073364, "learning_rate": 2.66116428577461e-05, "loss": 0.4777, "step": 561100 }, { "epoch": 7.731944559257116, "grad_norm": 2.9727556705474854, "learning_rate": 2.660363163713185e-05, "loss": 0.4616, "step": 561200 }, { "epoch": 7.733322311316855, "grad_norm": 6.219762802124023, "learning_rate": 2.6595700603734124e-05, "loss": 0.3998, "step": 561300 }, { "epoch": 7.734700063376595, "grad_norm": 2.914759874343872, "learning_rate": 2.6587689535330934e-05, "loss": 0.4137, "step": 561400 }, { "epoch": 7.736077815436334, "grad_norm": 2.966567039489746, "learning_rate": 2.6579678544433177e-05, "loss": 0.3974, "step": 561500 }, { "epoch": 7.737455567496074, "grad_norm": 4.464357376098633, "learning_rate": 2.6571667631721446e-05, "loss": 0.3672, "step": 561600 }, { "epoch": 7.7388333195558126, "grad_norm": 3.9366455078125, "learning_rate": 2.6563656797876302e-05, "loss": 0.4507, "step": 561700 }, { "epoch": 
7.740211071615552, "grad_norm": 4.0575480461120605, "learning_rate": 2.6555646043578296e-05, "loss": 0.3838, "step": 561800 }, { "epoch": 7.741588823675292, "grad_norm": 4.34018087387085, "learning_rate": 2.6547635369507995e-05, "loss": 0.4754, "step": 561900 }, { "epoch": 7.74296657573503, "grad_norm": 0.7386291027069092, "learning_rate": 2.6539624776345932e-05, "loss": 0.3844, "step": 562000 }, { "epoch": 7.74434432779477, "grad_norm": 6.642742156982422, "learning_rate": 2.6531614264772664e-05, "loss": 0.4125, "step": 562100 }, { "epoch": 7.7457220798545094, "grad_norm": 6.9536566734313965, "learning_rate": 2.652360383546872e-05, "loss": 0.4391, "step": 562200 }, { "epoch": 7.747099831914249, "grad_norm": 10.2610445022583, "learning_rate": 2.6515593489114627e-05, "loss": 0.418, "step": 562300 }, { "epoch": 7.748477583973988, "grad_norm": 4.365941524505615, "learning_rate": 2.65075832263909e-05, "loss": 0.4079, "step": 562400 }, { "epoch": 7.749855336033727, "grad_norm": 20.58535385131836, "learning_rate": 2.649957304797805e-05, "loss": 0.4213, "step": 562500 }, { "epoch": 7.751233088093467, "grad_norm": 4.588970184326172, "learning_rate": 2.6491562954556592e-05, "loss": 0.4259, "step": 562600 }, { "epoch": 7.752610840153206, "grad_norm": 4.208446979522705, "learning_rate": 2.6483552946807022e-05, "loss": 0.3596, "step": 562700 }, { "epoch": 7.753988592212945, "grad_norm": 52.8625602722168, "learning_rate": 2.647554302540983e-05, "loss": 0.395, "step": 562800 }, { "epoch": 7.755366344272685, "grad_norm": 2.5685863494873047, "learning_rate": 2.64675331910455e-05, "loss": 0.4088, "step": 562900 }, { "epoch": 7.756744096332424, "grad_norm": 3.0562326908111572, "learning_rate": 2.6459523444394497e-05, "loss": 0.4425, "step": 563000 }, { "epoch": 7.758121848392164, "grad_norm": 7.920937538146973, "learning_rate": 2.645151378613731e-05, "loss": 0.4541, "step": 563100 }, { "epoch": 7.759499600451902, "grad_norm": 3.6052544116973877, "learning_rate": 
2.6443504216954387e-05, "loss": 0.3607, "step": 563200 }, { "epoch": 7.760877352511642, "grad_norm": 1.7825300693511963, "learning_rate": 2.6435494737526195e-05, "loss": 0.3671, "step": 563300 }, { "epoch": 7.7622551045713815, "grad_norm": 7.826066970825195, "learning_rate": 2.6427485348533166e-05, "loss": 0.4822, "step": 563400 }, { "epoch": 7.763632856631121, "grad_norm": 1.5091921091079712, "learning_rate": 2.6419476050655732e-05, "loss": 0.434, "step": 563500 }, { "epoch": 7.76501060869086, "grad_norm": 1.339563012123108, "learning_rate": 2.641146684457435e-05, "loss": 0.4014, "step": 563600 }, { "epoch": 7.766388360750599, "grad_norm": 1.4326204061508179, "learning_rate": 2.640345773096942e-05, "loss": 0.4113, "step": 563700 }, { "epoch": 7.767766112810339, "grad_norm": 4.294501304626465, "learning_rate": 2.6395448710521363e-05, "loss": 0.4447, "step": 563800 }, { "epoch": 7.769143864870078, "grad_norm": 2.865030288696289, "learning_rate": 2.6387439783910606e-05, "loss": 0.4677, "step": 563900 }, { "epoch": 7.770521616929817, "grad_norm": 1.9951144456863403, "learning_rate": 2.6379430951817515e-05, "loss": 0.4348, "step": 564000 }, { "epoch": 7.771899368989557, "grad_norm": 5.0759124755859375, "learning_rate": 2.6371422214922504e-05, "loss": 0.4425, "step": 564100 }, { "epoch": 7.773277121049296, "grad_norm": 4.322912216186523, "learning_rate": 2.6363413573905945e-05, "loss": 0.3546, "step": 564200 }, { "epoch": 7.774654873109036, "grad_norm": 1.111991047859192, "learning_rate": 2.6355405029448217e-05, "loss": 0.4067, "step": 564300 }, { "epoch": 7.776032625168774, "grad_norm": 34.97132873535156, "learning_rate": 2.6347396582229703e-05, "loss": 0.4227, "step": 564400 }, { "epoch": 7.777410377228514, "grad_norm": 5.639999866485596, "learning_rate": 2.6339388232930728e-05, "loss": 0.4211, "step": 564500 }, { "epoch": 7.7787881292882535, "grad_norm": 1.3731552362442017, "learning_rate": 2.6331379982231666e-05, "loss": 0.3848, "step": 564600 }, { "epoch": 
7.780165881347992, "grad_norm": 2.370307445526123, "learning_rate": 2.6323371830812856e-05, "loss": 0.3922, "step": 564700 }, { "epoch": 7.781543633407732, "grad_norm": 2.8705644607543945, "learning_rate": 2.631536377935462e-05, "loss": 0.3729, "step": 564800 }, { "epoch": 7.782921385467471, "grad_norm": 2.0288171768188477, "learning_rate": 2.6307355828537297e-05, "loss": 0.3766, "step": 564900 }, { "epoch": 7.784299137527211, "grad_norm": 3.128899574279785, "learning_rate": 2.6299347979041205e-05, "loss": 0.4042, "step": 565000 }, { "epoch": 7.78567688958695, "grad_norm": 0.427706778049469, "learning_rate": 2.6291340231546637e-05, "loss": 0.4259, "step": 565100 }, { "epoch": 7.787054641646689, "grad_norm": 9.575339317321777, "learning_rate": 2.6283332586733902e-05, "loss": 0.4748, "step": 565200 }, { "epoch": 7.788432393706429, "grad_norm": 1.6544184684753418, "learning_rate": 2.6275405120183912e-05, "loss": 0.3711, "step": 565300 }, { "epoch": 7.789810145766168, "grad_norm": 4.526662349700928, "learning_rate": 2.626739768173191e-05, "loss": 0.3737, "step": 565400 }, { "epoch": 7.791187897825907, "grad_norm": 2.490725517272949, "learning_rate": 2.6259390347995786e-05, "loss": 0.4355, "step": 565500 }, { "epoch": 7.792565649885646, "grad_norm": 11.295348167419434, "learning_rate": 2.6251383119655785e-05, "loss": 0.3842, "step": 565600 }, { "epoch": 7.793943401945386, "grad_norm": 14.879592895507812, "learning_rate": 2.6243375997392183e-05, "loss": 0.4518, "step": 565700 }, { "epoch": 7.7953211540051255, "grad_norm": 3.0076754093170166, "learning_rate": 2.623536898188522e-05, "loss": 0.4581, "step": 565800 }, { "epoch": 7.796698906064865, "grad_norm": 2.590406656265259, "learning_rate": 2.6227362073815132e-05, "loss": 0.4213, "step": 565900 }, { "epoch": 7.798076658124604, "grad_norm": 0.42780813574790955, "learning_rate": 2.621935527386214e-05, "loss": 0.3608, "step": 566000 }, { "epoch": 7.799454410184343, "grad_norm": 14.491691589355469, "learning_rate": 
2.6211348582706486e-05, "loss": 0.409, "step": 566100 }, { "epoch": 7.800832162244083, "grad_norm": 5.128462791442871, "learning_rate": 2.6203342001028344e-05, "loss": 0.4283, "step": 566200 }, { "epoch": 7.8022099143038215, "grad_norm": 5.963847637176514, "learning_rate": 2.619533552950794e-05, "loss": 0.3778, "step": 566300 }, { "epoch": 7.803587666363561, "grad_norm": 4.169637203216553, "learning_rate": 2.618732916882546e-05, "loss": 0.3817, "step": 566400 }, { "epoch": 7.804965418423301, "grad_norm": 2.175851345062256, "learning_rate": 2.617932291966107e-05, "loss": 0.3646, "step": 566500 }, { "epoch": 7.80634317048304, "grad_norm": 1.4738775491714478, "learning_rate": 2.617131678269498e-05, "loss": 0.4078, "step": 566600 }, { "epoch": 7.807720922542779, "grad_norm": 2.8327794075012207, "learning_rate": 2.6163310758607303e-05, "loss": 0.4243, "step": 566700 }, { "epoch": 7.809098674602518, "grad_norm": 1.4749609231948853, "learning_rate": 2.6155304848078224e-05, "loss": 0.3565, "step": 566800 }, { "epoch": 7.810476426662258, "grad_norm": 2.7822959423065186, "learning_rate": 2.6147299051787876e-05, "loss": 0.4418, "step": 566900 }, { "epoch": 7.8118541787219975, "grad_norm": 6.20667839050293, "learning_rate": 2.613929337041639e-05, "loss": 0.379, "step": 567000 }, { "epoch": 7.813231930781736, "grad_norm": 2.1793887615203857, "learning_rate": 2.6131287804643898e-05, "loss": 0.4167, "step": 567100 }, { "epoch": 7.814609682841476, "grad_norm": 8.690431594848633, "learning_rate": 2.612336240906764e-05, "loss": 0.397, "step": 567200 }, { "epoch": 7.815987434901215, "grad_norm": 4.355604648590088, "learning_rate": 2.6115357075360494e-05, "loss": 0.4604, "step": 567300 }, { "epoch": 7.817365186960955, "grad_norm": 15.770100593566895, "learning_rate": 2.6107351859285854e-05, "loss": 0.3565, "step": 567400 }, { "epoch": 7.8187429390206935, "grad_norm": 5.782956123352051, "learning_rate": 2.60993467615238e-05, "loss": 0.4724, "step": 567500 }, { "epoch": 
7.820120691080433, "grad_norm": 3.9906046390533447, "learning_rate": 2.6091341782754396e-05, "loss": 0.4499, "step": 567600 }, { "epoch": 7.821498443140173, "grad_norm": 3.61303448677063, "learning_rate": 2.6083336923657736e-05, "loss": 0.3671, "step": 567700 }, { "epoch": 7.822876195199912, "grad_norm": 2.9689767360687256, "learning_rate": 2.6075332184913833e-05, "loss": 0.4194, "step": 567800 }, { "epoch": 7.824253947259651, "grad_norm": 5.303683757781982, "learning_rate": 2.6067327567202753e-05, "loss": 0.4382, "step": 567900 }, { "epoch": 7.82563169931939, "grad_norm": 6.710387706756592, "learning_rate": 2.6059323071204526e-05, "loss": 0.4151, "step": 568000 }, { "epoch": 7.82700945137913, "grad_norm": 3.0356507301330566, "learning_rate": 2.605131869759917e-05, "loss": 0.3545, "step": 568100 }, { "epoch": 7.8283872034388695, "grad_norm": 3.9841818809509277, "learning_rate": 2.6043314447066707e-05, "loss": 0.3851, "step": 568200 }, { "epoch": 7.829764955498608, "grad_norm": 6.8464179039001465, "learning_rate": 2.603531032028714e-05, "loss": 0.3606, "step": 568300 }, { "epoch": 7.831142707558348, "grad_norm": 4.464069843292236, "learning_rate": 2.6027306317940448e-05, "loss": 0.4169, "step": 568400 }, { "epoch": 7.832520459618087, "grad_norm": 0.2582082450389862, "learning_rate": 2.601930244070662e-05, "loss": 0.3763, "step": 568500 }, { "epoch": 7.833898211677827, "grad_norm": 5.311912536621094, "learning_rate": 2.601129868926562e-05, "loss": 0.4158, "step": 568600 }, { "epoch": 7.8352759637375655, "grad_norm": 2.2017080783843994, "learning_rate": 2.600329506429742e-05, "loss": 0.426, "step": 568700 }, { "epoch": 7.836653715797305, "grad_norm": 1.7846885919570923, "learning_rate": 2.599529156648196e-05, "loss": 0.4042, "step": 568800 }, { "epoch": 7.838031467857045, "grad_norm": 2.7411811351776123, "learning_rate": 2.598728819649918e-05, "loss": 0.3834, "step": 568900 }, { "epoch": 7.839409219916783, "grad_norm": 4.691423416137695, "learning_rate": 
2.5979284955029004e-05, "loss": 0.4371, "step": 569000 }, { "epoch": 7.840786971976523, "grad_norm": 1.355125904083252, "learning_rate": 2.5971281842751346e-05, "loss": 0.4117, "step": 569100 }, { "epoch": 7.842164724036262, "grad_norm": 4.330750942230225, "learning_rate": 2.5963278860346122e-05, "loss": 0.3715, "step": 569200 }, { "epoch": 7.843542476096002, "grad_norm": 1.6993210315704346, "learning_rate": 2.5955276008493223e-05, "loss": 0.3687, "step": 569300 }, { "epoch": 7.8449202281557415, "grad_norm": 5.218278884887695, "learning_rate": 2.594727328787253e-05, "loss": 0.445, "step": 569400 }, { "epoch": 7.84629798021548, "grad_norm": 7.775432586669922, "learning_rate": 2.5939270699163914e-05, "loss": 0.2981, "step": 569500 }, { "epoch": 7.84767573227522, "grad_norm": 9.965502738952637, "learning_rate": 2.5931268243047227e-05, "loss": 0.4132, "step": 569600 }, { "epoch": 7.849053484334959, "grad_norm": 2.4802777767181396, "learning_rate": 2.5923265920202337e-05, "loss": 0.3762, "step": 569700 }, { "epoch": 7.850431236394698, "grad_norm": 1.8184071779251099, "learning_rate": 2.591526373130907e-05, "loss": 0.4546, "step": 569800 }, { "epoch": 7.8518089884544375, "grad_norm": 5.638575077056885, "learning_rate": 2.5907261677047264e-05, "loss": 0.4105, "step": 569900 }, { "epoch": 7.853186740514177, "grad_norm": 2.285266399383545, "learning_rate": 2.5899259758096726e-05, "loss": 0.3404, "step": 570000 }, { "epoch": 7.854564492573917, "grad_norm": 2.8196353912353516, "learning_rate": 2.589125797513725e-05, "loss": 0.4075, "step": 570100 }, { "epoch": 7.855942244633656, "grad_norm": 2.015794515609741, "learning_rate": 2.588325632884865e-05, "loss": 0.409, "step": 570200 }, { "epoch": 7.857319996693395, "grad_norm": 2.6167922019958496, "learning_rate": 2.587525481991069e-05, "loss": 0.4386, "step": 570300 }, { "epoch": 7.858697748753134, "grad_norm": 2.875387668609619, "learning_rate": 2.5867253449003146e-05, "loss": 0.413, "step": 570400 }, { "epoch": 
7.860075500812874, "grad_norm": 2.809980869293213, "learning_rate": 2.5859252216805784e-05, "loss": 0.4364, "step": 570500 }, { "epoch": 7.861453252872613, "grad_norm": 3.130037546157837, "learning_rate": 2.585125112399833e-05, "loss": 0.4008, "step": 570600 }, { "epoch": 7.862831004932352, "grad_norm": 4.842129707336426, "learning_rate": 2.5843250171260528e-05, "loss": 0.441, "step": 570700 }, { "epoch": 7.864208756992092, "grad_norm": 2.6613197326660156, "learning_rate": 2.58352493592721e-05, "loss": 0.3766, "step": 570800 }, { "epoch": 7.865586509051831, "grad_norm": 3.213390827178955, "learning_rate": 2.582724868871276e-05, "loss": 0.3882, "step": 570900 }, { "epoch": 7.86696426111157, "grad_norm": 5.0584516525268555, "learning_rate": 2.581924816026219e-05, "loss": 0.4301, "step": 571000 }, { "epoch": 7.8683420131713095, "grad_norm": 3.562164783477783, "learning_rate": 2.5811247774600107e-05, "loss": 0.4083, "step": 571100 }, { "epoch": 7.869719765231049, "grad_norm": 2.4599568843841553, "learning_rate": 2.5803247532406156e-05, "loss": 0.3868, "step": 571200 }, { "epoch": 7.871097517290789, "grad_norm": 2.2250401973724365, "learning_rate": 2.5795247434360005e-05, "loss": 0.3955, "step": 571300 }, { "epoch": 7.872475269350527, "grad_norm": 2.9267094135284424, "learning_rate": 2.5787327479954364e-05, "loss": 0.438, "step": 571400 }, { "epoch": 7.873853021410267, "grad_norm": 0.6323767900466919, "learning_rate": 2.577932767078432e-05, "loss": 0.4212, "step": 571500 }, { "epoch": 7.875230773470006, "grad_norm": 4.856153964996338, "learning_rate": 2.5771328007794202e-05, "loss": 0.3295, "step": 571600 }, { "epoch": 7.876608525529746, "grad_norm": 1.9507824182510376, "learning_rate": 2.5763328491663602e-05, "loss": 0.4042, "step": 571700 }, { "epoch": 7.877986277589485, "grad_norm": 2.466931104660034, "learning_rate": 2.5755329123072132e-05, "loss": 0.4183, "step": 571800 }, { "epoch": 7.879364029649224, "grad_norm": 13.116840362548828, "learning_rate": 
2.574732990269938e-05, "loss": 0.4507, "step": 571900 }, { "epoch": 7.880741781708964, "grad_norm": 4.8487868309021, "learning_rate": 2.5739330831224922e-05, "loss": 0.4412, "step": 572000 }, { "epoch": 7.882119533768703, "grad_norm": 2.353994607925415, "learning_rate": 2.5731411897804643e-05, "loss": 0.4294, "step": 572100 }, { "epoch": 7.883497285828442, "grad_norm": 34.3173713684082, "learning_rate": 2.5723413124659503e-05, "loss": 0.4385, "step": 572200 }, { "epoch": 7.884875037888182, "grad_norm": 2.1839373111724854, "learning_rate": 2.5715414502444507e-05, "loss": 0.4574, "step": 572300 }, { "epoch": 7.886252789947921, "grad_norm": 2.063659906387329, "learning_rate": 2.570741603183917e-05, "loss": 0.3899, "step": 572400 }, { "epoch": 7.887630542007661, "grad_norm": 3.1924850940704346, "learning_rate": 2.569941771352302e-05, "loss": 0.4293, "step": 572500 }, { "epoch": 7.889008294067399, "grad_norm": 3.378161907196045, "learning_rate": 2.5691419548175547e-05, "loss": 0.3575, "step": 572600 }, { "epoch": 7.890386046127139, "grad_norm": 2.4634101390838623, "learning_rate": 2.5683421536476245e-05, "loss": 0.4625, "step": 572700 }, { "epoch": 7.8917637981868785, "grad_norm": 7.387239456176758, "learning_rate": 2.5675423679104567e-05, "loss": 0.4136, "step": 572800 }, { "epoch": 7.893141550246618, "grad_norm": 2.0837934017181396, "learning_rate": 2.5667425976739976e-05, "loss": 0.4187, "step": 572900 }, { "epoch": 7.894519302306357, "grad_norm": 4.208126068115234, "learning_rate": 2.565942843006193e-05, "loss": 0.359, "step": 573000 }, { "epoch": 7.895897054366096, "grad_norm": 1.845621109008789, "learning_rate": 2.5651431039749858e-05, "loss": 0.3958, "step": 573100 }, { "epoch": 7.897274806425836, "grad_norm": 17.89778709411621, "learning_rate": 2.564343380648318e-05, "loss": 0.4281, "step": 573200 }, { "epoch": 7.8986525584855745, "grad_norm": 256.0893249511719, "learning_rate": 2.5635436730941292e-05, "loss": 0.4628, "step": 573300 }, { "epoch": 
7.900030310545314, "grad_norm": 5.519166469573975, "learning_rate": 2.562743981380359e-05, "loss": 0.4253, "step": 573400 }, { "epoch": 7.901408062605054, "grad_norm": 3.711449146270752, "learning_rate": 2.5619443055749457e-05, "loss": 0.3716, "step": 573500 }, { "epoch": 7.902785814664793, "grad_norm": 6.972109794616699, "learning_rate": 2.561144645745826e-05, "loss": 0.4532, "step": 573600 }, { "epoch": 7.904163566724533, "grad_norm": 8.208768844604492, "learning_rate": 2.5603450019609344e-05, "loss": 0.4242, "step": 573700 }, { "epoch": 7.905541318784271, "grad_norm": 1.4055252075195312, "learning_rate": 2.5595453742882065e-05, "loss": 0.4448, "step": 573800 }, { "epoch": 7.906919070844011, "grad_norm": 2.9897499084472656, "learning_rate": 2.5587457627955717e-05, "loss": 0.446, "step": 573900 }, { "epoch": 7.9082968229037505, "grad_norm": 9.12283992767334, "learning_rate": 2.5579461675509634e-05, "loss": 0.4791, "step": 574000 }, { "epoch": 7.909674574963489, "grad_norm": 1.9095302820205688, "learning_rate": 2.5571465886223107e-05, "loss": 0.3901, "step": 574100 }, { "epoch": 7.911052327023229, "grad_norm": 9.948441505432129, "learning_rate": 2.556347026077542e-05, "loss": 0.4153, "step": 574200 }, { "epoch": 7.912430079082968, "grad_norm": 2.4185192584991455, "learning_rate": 2.5555474799845838e-05, "loss": 0.3412, "step": 574300 }, { "epoch": 7.913807831142708, "grad_norm": 17.072826385498047, "learning_rate": 2.5547479504113632e-05, "loss": 0.4045, "step": 574400 }, { "epoch": 7.915185583202447, "grad_norm": 3.982893466949463, "learning_rate": 2.553948437425802e-05, "loss": 0.3965, "step": 574500 }, { "epoch": 7.916563335262186, "grad_norm": 2.8802690505981445, "learning_rate": 2.5531489410958242e-05, "loss": 0.4122, "step": 574600 }, { "epoch": 7.917941087321926, "grad_norm": 10.4449462890625, "learning_rate": 2.5523494614893513e-05, "loss": 0.3842, "step": 574700 }, { "epoch": 7.919318839381665, "grad_norm": 0.7747752666473389, "learning_rate": 
2.551549998674302e-05, "loss": 0.4521, "step": 574800 }, { "epoch": 7.920696591441404, "grad_norm": 19.949691772460938, "learning_rate": 2.5507505527185974e-05, "loss": 0.4399, "step": 574900 }, { "epoch": 7.922074343501143, "grad_norm": 4.304880142211914, "learning_rate": 2.5499511236901507e-05, "loss": 0.408, "step": 575000 }, { "epoch": 7.923452095560883, "grad_norm": 2.953263759613037, "learning_rate": 2.5491517116568804e-05, "loss": 0.4221, "step": 575100 }, { "epoch": 7.9248298476206225, "grad_norm": 0.3570718765258789, "learning_rate": 2.5483523166866995e-05, "loss": 0.415, "step": 575200 }, { "epoch": 7.926207599680361, "grad_norm": 2.9194681644439697, "learning_rate": 2.5475529388475205e-05, "loss": 0.4056, "step": 575300 }, { "epoch": 7.927585351740101, "grad_norm": 2.4002156257629395, "learning_rate": 2.5467535782072556e-05, "loss": 0.3716, "step": 575400 }, { "epoch": 7.92896310379984, "grad_norm": 4.822518348693848, "learning_rate": 2.5459542348338146e-05, "loss": 0.3922, "step": 575500 }, { "epoch": 7.93034085585958, "grad_norm": 1.4808429479599, "learning_rate": 2.5451549087951045e-05, "loss": 0.393, "step": 575600 }, { "epoch": 7.9317186079193185, "grad_norm": 2.558567523956299, "learning_rate": 2.5443556001590333e-05, "loss": 0.4077, "step": 575700 }, { "epoch": 7.933096359979058, "grad_norm": 5.497522354125977, "learning_rate": 2.5435563089935048e-05, "loss": 0.3509, "step": 575800 }, { "epoch": 7.934474112038798, "grad_norm": 7.092094421386719, "learning_rate": 2.5427570353664246e-05, "loss": 0.3809, "step": 575900 }, { "epoch": 7.935851864098537, "grad_norm": 2.1565871238708496, "learning_rate": 2.5419577793456958e-05, "loss": 0.4432, "step": 576000 }, { "epoch": 7.937229616158276, "grad_norm": 4.346516132354736, "learning_rate": 2.5411585409992168e-05, "loss": 0.3974, "step": 576100 }, { "epoch": 7.938607368218015, "grad_norm": 7.287683486938477, "learning_rate": 2.5403593203948884e-05, "loss": 0.3503, "step": 576200 }, { "epoch": 
7.939985120277755, "grad_norm": 2.822115659713745, "learning_rate": 2.5395601176006075e-05, "loss": 0.4218, "step": 576300 }, { "epoch": 7.9413628723374945, "grad_norm": 1.4879916906356812, "learning_rate": 2.5387609326842718e-05, "loss": 0.3648, "step": 576400 }, { "epoch": 7.942740624397233, "grad_norm": 2.8131580352783203, "learning_rate": 2.537961765713776e-05, "loss": 0.3476, "step": 576500 }, { "epoch": 7.944118376456973, "grad_norm": 5.575896739959717, "learning_rate": 2.5371626167570128e-05, "loss": 0.4035, "step": 576600 }, { "epoch": 7.945496128516712, "grad_norm": 1.5800246000289917, "learning_rate": 2.536363485881874e-05, "loss": 0.3972, "step": 576700 }, { "epoch": 7.946873880576452, "grad_norm": 3.6117801666259766, "learning_rate": 2.5355643731562495e-05, "loss": 0.4865, "step": 576800 }, { "epoch": 7.9482516326361905, "grad_norm": 2.568058967590332, "learning_rate": 2.5347652786480293e-05, "loss": 0.3146, "step": 576900 }, { "epoch": 7.94962938469593, "grad_norm": 19.48751449584961, "learning_rate": 2.5339662024250994e-05, "loss": 0.4294, "step": 577000 }, { "epoch": 7.95100713675567, "grad_norm": 2.744723320007324, "learning_rate": 2.5331671445553463e-05, "loss": 0.4209, "step": 577100 }, { "epoch": 7.952384888815409, "grad_norm": 3.562403440475464, "learning_rate": 2.5323681051066536e-05, "loss": 0.4274, "step": 577200 }, { "epoch": 7.953762640875148, "grad_norm": 3.140655279159546, "learning_rate": 2.531569084146903e-05, "loss": 0.3748, "step": 577300 }, { "epoch": 7.955140392934887, "grad_norm": 2.7843120098114014, "learning_rate": 2.5307700817439762e-05, "loss": 0.4226, "step": 577400 }, { "epoch": 7.956518144994627, "grad_norm": 3.3314783573150635, "learning_rate": 2.529971097965753e-05, "loss": 0.4802, "step": 577500 }, { "epoch": 7.957895897054366, "grad_norm": 3.069687604904175, "learning_rate": 2.529180122438216e-05, "loss": 0.4703, "step": 577600 }, { "epoch": 7.959273649114105, "grad_norm": 9.834455490112305, "learning_rate": 
2.5283811759250905e-05, "loss": 0.3704, "step": 577700 }, { "epoch": 7.960651401173845, "grad_norm": 3.5520596504211426, "learning_rate": 2.5275822482396176e-05, "loss": 0.3881, "step": 577800 }, { "epoch": 7.962029153233584, "grad_norm": 3.569101572036743, "learning_rate": 2.52678333944967e-05, "loss": 0.3665, "step": 577900 }, { "epoch": 7.963406905293324, "grad_norm": 1.5755479335784912, "learning_rate": 2.52598444962312e-05, "loss": 0.411, "step": 578000 }, { "epoch": 7.9647846573530625, "grad_norm": 10.073369026184082, "learning_rate": 2.5251855788278373e-05, "loss": 0.4834, "step": 578100 }, { "epoch": 7.966162409412802, "grad_norm": 1.798141360282898, "learning_rate": 2.5243867271316905e-05, "loss": 0.3937, "step": 578200 }, { "epoch": 7.967540161472542, "grad_norm": 7.872628688812256, "learning_rate": 2.523587894602545e-05, "loss": 0.4234, "step": 578300 }, { "epoch": 7.96891791353228, "grad_norm": 41.05575180053711, "learning_rate": 2.522789081308265e-05, "loss": 0.4066, "step": 578400 }, { "epoch": 7.97029566559202, "grad_norm": 1.5082037448883057, "learning_rate": 2.521990287316716e-05, "loss": 0.3792, "step": 578500 }, { "epoch": 7.971673417651759, "grad_norm": 4.020823001861572, "learning_rate": 2.5211915126957587e-05, "loss": 0.4582, "step": 578600 }, { "epoch": 7.973051169711499, "grad_norm": 3.1719093322753906, "learning_rate": 2.5203927575132523e-05, "loss": 0.369, "step": 578700 }, { "epoch": 7.9744289217712385, "grad_norm": 3.79127836227417, "learning_rate": 2.519594021837057e-05, "loss": 0.3865, "step": 578800 }, { "epoch": 7.975806673830977, "grad_norm": 2.3954873085021973, "learning_rate": 2.518795305735027e-05, "loss": 0.4073, "step": 578900 }, { "epoch": 7.977184425890717, "grad_norm": 3.813998222351074, "learning_rate": 2.5179966092750192e-05, "loss": 0.4388, "step": 579000 }, { "epoch": 7.978562177950456, "grad_norm": 2.709373950958252, "learning_rate": 2.5171979325248863e-05, "loss": 0.4524, "step": 579100 }, { "epoch": 7.979939930010195, 
"grad_norm": 0.18899941444396973, "learning_rate": 2.5163992755524803e-05, "loss": 0.4042, "step": 579200 }, { "epoch": 7.9813176820699345, "grad_norm": 2.6865170001983643, "learning_rate": 2.5156006384256514e-05, "loss": 0.354, "step": 579300 }, { "epoch": 7.982695434129674, "grad_norm": 5.534639835357666, "learning_rate": 2.514802021212246e-05, "loss": 0.4178, "step": 579400 }, { "epoch": 7.984073186189414, "grad_norm": 51.13286590576172, "learning_rate": 2.514003423980113e-05, "loss": 0.3789, "step": 579500 }, { "epoch": 7.985450938249152, "grad_norm": 5.714435577392578, "learning_rate": 2.5132048467970962e-05, "loss": 0.3794, "step": 579600 }, { "epoch": 7.986828690308892, "grad_norm": 1.4140841960906982, "learning_rate": 2.5124062897310395e-05, "loss": 0.4031, "step": 579700 }, { "epoch": 7.988206442368631, "grad_norm": 16.32473373413086, "learning_rate": 2.511607752849783e-05, "loss": 0.4109, "step": 579800 }, { "epoch": 7.989584194428371, "grad_norm": 5.832487106323242, "learning_rate": 2.5108092362211687e-05, "loss": 0.3748, "step": 579900 }, { "epoch": 7.99096194648811, "grad_norm": 0.9610350131988525, "learning_rate": 2.5100107399130334e-05, "loss": 0.4126, "step": 580000 }, { "epoch": 7.992339698547849, "grad_norm": 7.916874885559082, "learning_rate": 2.509212263993213e-05, "loss": 0.4233, "step": 580100 }, { "epoch": 7.993717450607589, "grad_norm": 2.643354654312134, "learning_rate": 2.508413808529543e-05, "loss": 0.3827, "step": 580200 }, { "epoch": 7.995095202667328, "grad_norm": 2.7436318397521973, "learning_rate": 2.507615373589855e-05, "loss": 0.4819, "step": 580300 }, { "epoch": 7.996472954727067, "grad_norm": 7.280269622802734, "learning_rate": 2.506816959241983e-05, "loss": 0.4383, "step": 580400 }, { "epoch": 7.9978507067868065, "grad_norm": 4.963311672210693, "learning_rate": 2.5060185655537527e-05, "loss": 0.3718, "step": 580500 }, { "epoch": 7.999228458846546, "grad_norm": 0.560031533241272, "learning_rate": 2.505220192592994e-05, "loss": 
0.4805, "step": 580600 }, { "epoch": 8.000606210906286, "grad_norm": 6.957786560058594, "learning_rate": 2.5044218404275323e-05, "loss": 0.3501, "step": 580700 }, { "epoch": 8.001983962966024, "grad_norm": 2.446098566055298, "learning_rate": 2.5036235091251908e-05, "loss": 0.4419, "step": 580800 }, { "epoch": 8.003361715025765, "grad_norm": 5.296990871429443, "learning_rate": 2.5028251987537934e-05, "loss": 0.2845, "step": 580900 }, { "epoch": 8.004739467085503, "grad_norm": 19.149066925048828, "learning_rate": 2.5020269093811606e-05, "loss": 0.3206, "step": 581000 }, { "epoch": 8.006117219145242, "grad_norm": 19.77134895324707, "learning_rate": 2.5012286410751097e-05, "loss": 0.3589, "step": 581100 }, { "epoch": 8.007494971204983, "grad_norm": 0.6790552139282227, "learning_rate": 2.5004303939034585e-05, "loss": 0.4234, "step": 581200 }, { "epoch": 8.008872723264721, "grad_norm": 3.866525173187256, "learning_rate": 2.4996401500885425e-05, "loss": 0.3598, "step": 581300 }, { "epoch": 8.01025047532446, "grad_norm": 2.6719462871551514, "learning_rate": 2.498841945176098e-05, "loss": 0.3573, "step": 581400 }, { "epoch": 8.0116282273842, "grad_norm": 3.52056622505188, "learning_rate": 2.4980437616008164e-05, "loss": 0.3626, "step": 581500 }, { "epoch": 8.013005979443939, "grad_norm": 1.1236474514007568, "learning_rate": 2.497245599430505e-05, "loss": 0.4041, "step": 581600 }, { "epoch": 8.01438373150368, "grad_norm": 72.87816619873047, "learning_rate": 2.4964474587329735e-05, "loss": 0.4067, "step": 581700 }, { "epoch": 8.015761483563418, "grad_norm": 3.123290538787842, "learning_rate": 2.4956493395760276e-05, "loss": 0.364, "step": 581800 }, { "epoch": 8.017139235623157, "grad_norm": 7.390705108642578, "learning_rate": 2.4948512420274705e-05, "loss": 0.4034, "step": 581900 }, { "epoch": 8.018516987682897, "grad_norm": 5.756507396697998, "learning_rate": 2.4940531661551066e-05, "loss": 0.3837, "step": 582000 }, { "epoch": 8.019894739742636, "grad_norm": 
5.694328308105469, "learning_rate": 2.4932551120267364e-05, "loss": 0.3223, "step": 582100 }, { "epoch": 8.021272491802375, "grad_norm": 5.076456069946289, "learning_rate": 2.492457079710157e-05, "loss": 0.4219, "step": 582200 }, { "epoch": 8.022650243862115, "grad_norm": 3.5752804279327393, "learning_rate": 2.491659069273167e-05, "loss": 0.4212, "step": 582300 }, { "epoch": 8.024027995921854, "grad_norm": 13.525606155395508, "learning_rate": 2.4908610807835594e-05, "loss": 0.3778, "step": 582400 }, { "epoch": 8.025405747981594, "grad_norm": 3.615927219390869, "learning_rate": 2.4900631143091293e-05, "loss": 0.3144, "step": 582500 }, { "epoch": 8.026783500041333, "grad_norm": 3.405439615249634, "learning_rate": 2.4892651699176684e-05, "loss": 0.3821, "step": 582600 }, { "epoch": 8.028161252101071, "grad_norm": 0.8719682693481445, "learning_rate": 2.4884672476769644e-05, "loss": 0.3854, "step": 582700 }, { "epoch": 8.029539004160812, "grad_norm": 2.0392138957977295, "learning_rate": 2.487669347654806e-05, "loss": 0.4037, "step": 582800 }, { "epoch": 8.03091675622055, "grad_norm": 6.125339031219482, "learning_rate": 2.486871469918978e-05, "loss": 0.3863, "step": 582900 }, { "epoch": 8.03229450828029, "grad_norm": 3.3226945400238037, "learning_rate": 2.486073614537265e-05, "loss": 0.4577, "step": 583000 }, { "epoch": 8.03367226034003, "grad_norm": 6.521392822265625, "learning_rate": 2.485275781577449e-05, "loss": 0.3628, "step": 583100 }, { "epoch": 8.035050012399768, "grad_norm": 3.3724944591522217, "learning_rate": 2.4844779711073103e-05, "loss": 0.3712, "step": 583200 }, { "epoch": 8.036427764459507, "grad_norm": 4.434491157531738, "learning_rate": 2.4836801831946256e-05, "loss": 0.4095, "step": 583300 }, { "epoch": 8.037805516519247, "grad_norm": 1.3057156801223755, "learning_rate": 2.482882417907171e-05, "loss": 0.3733, "step": 583400 }, { "epoch": 8.039183268578986, "grad_norm": 3.5886263847351074, "learning_rate": 2.4820846753127226e-05, "loss": 0.3534, "step": 
583500 }, { "epoch": 8.040561020638727, "grad_norm": 4.770686626434326, "learning_rate": 2.4812869554790517e-05, "loss": 0.3746, "step": 583600 }, { "epoch": 8.041938772698465, "grad_norm": 4.717912197113037, "learning_rate": 2.4804892584739283e-05, "loss": 0.3566, "step": 583700 }, { "epoch": 8.043316524758204, "grad_norm": 4.722676753997803, "learning_rate": 2.4796915843651216e-05, "loss": 0.347, "step": 583800 }, { "epoch": 8.044694276817944, "grad_norm": 15.43338680267334, "learning_rate": 2.478893933220397e-05, "loss": 0.4241, "step": 583900 }, { "epoch": 8.046072028877683, "grad_norm": 1.5749702453613281, "learning_rate": 2.4780963051075203e-05, "loss": 0.4263, "step": 584000 }, { "epoch": 8.047449780937422, "grad_norm": 13.211424827575684, "learning_rate": 2.4772987000942537e-05, "loss": 0.3843, "step": 584100 }, { "epoch": 8.048827532997162, "grad_norm": 4.8888959884643555, "learning_rate": 2.476501118248357e-05, "loss": 0.3367, "step": 584200 }, { "epoch": 8.0502052850569, "grad_norm": 1.3789567947387695, "learning_rate": 2.4757035596375908e-05, "loss": 0.3575, "step": 584300 }, { "epoch": 8.051583037116641, "grad_norm": 0.9290852546691895, "learning_rate": 2.4749139995672162e-05, "loss": 0.3297, "step": 584400 }, { "epoch": 8.05296078917638, "grad_norm": 5.468562126159668, "learning_rate": 2.474116487395934e-05, "loss": 0.38, "step": 584500 }, { "epoch": 8.054338541236119, "grad_norm": 2.3821027278900146, "learning_rate": 2.4733189986623667e-05, "loss": 0.353, "step": 584600 }, { "epoch": 8.055716293295859, "grad_norm": 4.483600616455078, "learning_rate": 2.4725215334342664e-05, "loss": 0.3691, "step": 584700 }, { "epoch": 8.057094045355598, "grad_norm": 10.275177001953125, "learning_rate": 2.4717240917793826e-05, "loss": 0.3254, "step": 584800 }, { "epoch": 8.058471797415336, "grad_norm": 7.386829376220703, "learning_rate": 2.4709266737654598e-05, "loss": 0.3882, "step": 584900 }, { "epoch": 8.059849549475077, "grad_norm": 1.0556020736694336, 
"learning_rate": 2.470129279460244e-05, "loss": 0.3875, "step": 585000 }, { "epoch": 8.061227301534815, "grad_norm": 4.6821370124816895, "learning_rate": 2.469331908931477e-05, "loss": 0.4297, "step": 585100 }, { "epoch": 8.062605053594556, "grad_norm": 20.469966888427734, "learning_rate": 2.468534562246901e-05, "loss": 0.3798, "step": 585200 }, { "epoch": 8.063982805654295, "grad_norm": 1.6803171634674072, "learning_rate": 2.4677372394742536e-05, "loss": 0.3917, "step": 585300 }, { "epoch": 8.065360557714033, "grad_norm": 1.3143651485443115, "learning_rate": 2.4669399406812725e-05, "loss": 0.3587, "step": 585400 }, { "epoch": 8.066738309773774, "grad_norm": 2.5498340129852295, "learning_rate": 2.466142665935691e-05, "loss": 0.391, "step": 585500 }, { "epoch": 8.068116061833512, "grad_norm": 3.9095711708068848, "learning_rate": 2.4653454153052417e-05, "loss": 0.3532, "step": 585600 }, { "epoch": 8.069493813893251, "grad_norm": 6.187689781188965, "learning_rate": 2.464548188857656e-05, "loss": 0.455, "step": 585700 }, { "epoch": 8.070871565952991, "grad_norm": 5.546910285949707, "learning_rate": 2.4637509866606622e-05, "loss": 0.3863, "step": 585800 }, { "epoch": 8.07224931801273, "grad_norm": 2.386507511138916, "learning_rate": 2.4629538087819864e-05, "loss": 0.3837, "step": 585900 }, { "epoch": 8.07362707007247, "grad_norm": 12.018856048583984, "learning_rate": 2.4621566552893538e-05, "loss": 0.4, "step": 586000 }, { "epoch": 8.07500482213221, "grad_norm": 3.6749844551086426, "learning_rate": 2.4613674974196055e-05, "loss": 0.4033, "step": 586100 }, { "epoch": 8.076382574191948, "grad_norm": 2.346867799758911, "learning_rate": 2.4605703926566714e-05, "loss": 0.3522, "step": 586200 }, { "epoch": 8.077760326251688, "grad_norm": 3.628887176513672, "learning_rate": 2.4597733124822634e-05, "loss": 0.3212, "step": 586300 }, { "epoch": 8.079138078311427, "grad_norm": 5.22951602935791, "learning_rate": 2.4589762569640976e-05, "loss": 0.4024, "step": 586400 }, { "epoch": 
8.080515830371166, "grad_norm": 21.341642379760742, "learning_rate": 2.4581792261698883e-05, "loss": 0.3919, "step": 586500 }, { "epoch": 8.081893582430906, "grad_norm": 37.0136833190918, "learning_rate": 2.4573822201673458e-05, "loss": 0.3846, "step": 586600 }, { "epoch": 8.083271334490645, "grad_norm": 3.5850675106048584, "learning_rate": 2.4565852390241804e-05, "loss": 0.4317, "step": 586700 }, { "epoch": 8.084649086550385, "grad_norm": 2.767005443572998, "learning_rate": 2.4557882828081003e-05, "loss": 0.3224, "step": 586800 }, { "epoch": 8.086026838610124, "grad_norm": 2.5002527236938477, "learning_rate": 2.454991351586811e-05, "loss": 0.3734, "step": 586900 }, { "epoch": 8.087404590669863, "grad_norm": 3.275029182434082, "learning_rate": 2.4541944454280165e-05, "loss": 0.3285, "step": 587000 }, { "epoch": 8.088782342729603, "grad_norm": 3.186555862426758, "learning_rate": 2.453397564399416e-05, "loss": 0.3757, "step": 587100 }, { "epoch": 8.090160094789342, "grad_norm": 1.81247878074646, "learning_rate": 2.4526007085687098e-05, "loss": 0.3433, "step": 587200 }, { "epoch": 8.09153784684908, "grad_norm": 3.46832275390625, "learning_rate": 2.4518038780035947e-05, "loss": 0.4017, "step": 587300 }, { "epoch": 8.09291559890882, "grad_norm": 2.4992687702178955, "learning_rate": 2.4510070727717666e-05, "loss": 0.4383, "step": 587400 }, { "epoch": 8.09429335096856, "grad_norm": 0.7022088170051575, "learning_rate": 2.4502102929409167e-05, "loss": 0.3616, "step": 587500 }, { "epoch": 8.095671103028298, "grad_norm": 3.1558876037597656, "learning_rate": 2.449413538578737e-05, "loss": 0.3719, "step": 587600 }, { "epoch": 8.097048855088039, "grad_norm": 4.710842132568359, "learning_rate": 2.4486168097529137e-05, "loss": 0.3555, "step": 587700 }, { "epoch": 8.098426607147777, "grad_norm": 2.684572219848633, "learning_rate": 2.447820106531135e-05, "loss": 0.3875, "step": 587800 }, { "epoch": 8.099804359207518, "grad_norm": 2.155186176300049, "learning_rate": 
2.4470234289810844e-05, "loss": 0.3816, "step": 587900 }, { "epoch": 8.101182111267256, "grad_norm": 2.575976610183716, "learning_rate": 2.446226777170443e-05, "loss": 0.3729, "step": 588000 }, { "epoch": 8.102559863326995, "grad_norm": 8.682147026062012, "learning_rate": 2.4454301511668903e-05, "loss": 0.4029, "step": 588100 }, { "epoch": 8.103937615386736, "grad_norm": 2.202414035797119, "learning_rate": 2.444633551038106e-05, "loss": 0.3256, "step": 588200 }, { "epoch": 8.105315367446474, "grad_norm": 2.3452773094177246, "learning_rate": 2.4438369768517633e-05, "loss": 0.3766, "step": 588300 }, { "epoch": 8.106693119506213, "grad_norm": 4.665926933288574, "learning_rate": 2.4430404286755354e-05, "loss": 0.3883, "step": 588400 }, { "epoch": 8.108070871565953, "grad_norm": 0.5225210785865784, "learning_rate": 2.4422439065770938e-05, "loss": 0.3915, "step": 588500 }, { "epoch": 8.109448623625692, "grad_norm": 3.7652533054351807, "learning_rate": 2.4414474106241063e-05, "loss": 0.435, "step": 588600 }, { "epoch": 8.110826375685432, "grad_norm": 10.29556941986084, "learning_rate": 2.4406509408842406e-05, "loss": 0.3668, "step": 588700 }, { "epoch": 8.112204127745171, "grad_norm": 5.862832546234131, "learning_rate": 2.4398544974251598e-05, "loss": 0.4052, "step": 588800 }, { "epoch": 8.11358187980491, "grad_norm": 3.3638527393341064, "learning_rate": 2.4390580803145255e-05, "loss": 0.4398, "step": 588900 }, { "epoch": 8.11495963186465, "grad_norm": 4.01972770690918, "learning_rate": 2.4382616896199987e-05, "loss": 0.3262, "step": 589000 }, { "epoch": 8.116337383924389, "grad_norm": 8.807936668395996, "learning_rate": 2.4374653254092354e-05, "loss": 0.376, "step": 589100 }, { "epoch": 8.117715135984128, "grad_norm": 18.51058578491211, "learning_rate": 2.436668987749892e-05, "loss": 0.4229, "step": 589200 }, { "epoch": 8.119092888043868, "grad_norm": 2.7492315769195557, "learning_rate": 2.435880639688039e-05, "loss": 0.3598, "step": 589300 }, { "epoch": 
8.120470640103607, "grad_norm": 1.5246316194534302, "learning_rate": 2.4350843550672885e-05, "loss": 0.3344, "step": 589400 }, { "epoch": 8.121848392163347, "grad_norm": 7.685235500335693, "learning_rate": 2.434288097200234e-05, "loss": 0.4201, "step": 589500 }, { "epoch": 8.123226144223086, "grad_norm": 3.045987606048584, "learning_rate": 2.433491866154521e-05, "loss": 0.3697, "step": 589600 }, { "epoch": 8.124603896282824, "grad_norm": 3.55100154876709, "learning_rate": 2.4326956619977925e-05, "loss": 0.3516, "step": 589700 }, { "epoch": 8.125981648342565, "grad_norm": 2.05771803855896, "learning_rate": 2.4318994847976924e-05, "loss": 0.3444, "step": 589800 }, { "epoch": 8.127359400402304, "grad_norm": 2.076603412628174, "learning_rate": 2.431103334621857e-05, "loss": 0.3839, "step": 589900 }, { "epoch": 8.128737152462042, "grad_norm": 1.0720906257629395, "learning_rate": 2.4303072115379255e-05, "loss": 0.4236, "step": 590000 }, { "epoch": 8.130114904521783, "grad_norm": 8.220071792602539, "learning_rate": 2.4295111156135315e-05, "loss": 0.3565, "step": 590100 }, { "epoch": 8.131492656581521, "grad_norm": 3.7118887901306152, "learning_rate": 2.4287150469163072e-05, "loss": 0.4404, "step": 590200 }, { "epoch": 8.132870408641262, "grad_norm": 1.0506024360656738, "learning_rate": 2.4279190055138845e-05, "loss": 0.3967, "step": 590300 }, { "epoch": 8.134248160701, "grad_norm": 2.0124876499176025, "learning_rate": 2.42712299147389e-05, "loss": 0.3737, "step": 590400 }, { "epoch": 8.135625912760739, "grad_norm": 2.834197998046875, "learning_rate": 2.426327004863949e-05, "loss": 0.3778, "step": 590500 }, { "epoch": 8.13700366482048, "grad_norm": 2.8162879943847656, "learning_rate": 2.425531045751685e-05, "loss": 0.3665, "step": 590600 }, { "epoch": 8.138381416880218, "grad_norm": 3.4018990993499756, "learning_rate": 2.4247351142047178e-05, "loss": 0.3366, "step": 590700 }, { "epoch": 8.139759168939957, "grad_norm": 1.353232502937317, "learning_rate": 
2.423939210290667e-05, "loss": 0.3567, "step": 590800 }, { "epoch": 8.141136920999697, "grad_norm": 4.212244987487793, "learning_rate": 2.4231433340771495e-05, "loss": 0.3662, "step": 590900 }, { "epoch": 8.142514673059436, "grad_norm": 4.180209159851074, "learning_rate": 2.4223474856317772e-05, "loss": 0.3656, "step": 591000 }, { "epoch": 8.143892425119176, "grad_norm": 10.292398452758789, "learning_rate": 2.421551665022162e-05, "loss": 0.4106, "step": 591100 }, { "epoch": 8.145270177178915, "grad_norm": 3.6615183353424072, "learning_rate": 2.4207558723159126e-05, "loss": 0.3467, "step": 591200 }, { "epoch": 8.146647929238654, "grad_norm": 0.9376118779182434, "learning_rate": 2.4199601075806373e-05, "loss": 0.3996, "step": 591300 }, { "epoch": 8.148025681298394, "grad_norm": 5.277390480041504, "learning_rate": 2.4191643708839388e-05, "loss": 0.3259, "step": 591400 }, { "epoch": 8.149403433358133, "grad_norm": 1.3508918285369873, "learning_rate": 2.4183686622934205e-05, "loss": 0.3492, "step": 591500 }, { "epoch": 8.150781185417872, "grad_norm": 18.680063247680664, "learning_rate": 2.4175729818766803e-05, "loss": 0.3493, "step": 591600 }, { "epoch": 8.152158937477612, "grad_norm": 8.195116996765137, "learning_rate": 2.4167773297013152e-05, "loss": 0.4073, "step": 591700 }, { "epoch": 8.15353668953735, "grad_norm": 9.060653686523438, "learning_rate": 2.4159817058349215e-05, "loss": 0.3619, "step": 591800 }, { "epoch": 8.15491444159709, "grad_norm": 1.9935379028320312, "learning_rate": 2.4151861103450907e-05, "loss": 0.4389, "step": 591900 }, { "epoch": 8.15629219365683, "grad_norm": 3.29073429107666, "learning_rate": 2.414390543299413e-05, "loss": 0.331, "step": 592000 }, { "epoch": 8.157669945716568, "grad_norm": 0.2610456943511963, "learning_rate": 2.4135950047654755e-05, "loss": 0.3759, "step": 592100 }, { "epoch": 8.159047697776309, "grad_norm": 3.4052512645721436, "learning_rate": 2.4127994948108627e-05, "loss": 0.3656, "step": 592200 }, { "epoch": 
8.160425449836048, "grad_norm": 1.7754141092300415, "learning_rate": 2.412004013503158e-05, "loss": 0.4142, "step": 592300 }, { "epoch": 8.161803201895786, "grad_norm": 0.057644303888082504, "learning_rate": 2.4112085609099422e-05, "loss": 0.3774, "step": 592400 }, { "epoch": 8.163180953955527, "grad_norm": 2.374054193496704, "learning_rate": 2.4104131370987922e-05, "loss": 0.4012, "step": 592500 }, { "epoch": 8.164558706015265, "grad_norm": 2.596203088760376, "learning_rate": 2.409617742137284e-05, "loss": 0.2971, "step": 592600 }, { "epoch": 8.165936458075004, "grad_norm": 2.750272035598755, "learning_rate": 2.4088223760929887e-05, "loss": 0.34, "step": 592700 }, { "epoch": 8.167314210134744, "grad_norm": 2.6829633712768555, "learning_rate": 2.4080270390334786e-05, "loss": 0.3827, "step": 592800 }, { "epoch": 8.168691962194483, "grad_norm": 4.634260177612305, "learning_rate": 2.4072317310263208e-05, "loss": 0.3521, "step": 592900 }, { "epoch": 8.170069714254224, "grad_norm": 1.6457654237747192, "learning_rate": 2.4064364521390817e-05, "loss": 0.383, "step": 593000 }, { "epoch": 8.171447466313962, "grad_norm": 2.2018299102783203, "learning_rate": 2.405641202439324e-05, "loss": 0.3409, "step": 593100 }, { "epoch": 8.172825218373701, "grad_norm": 3.3153560161590576, "learning_rate": 2.4048459819946067e-05, "loss": 0.3608, "step": 593200 }, { "epoch": 8.174202970433441, "grad_norm": 6.065871238708496, "learning_rate": 2.4040507908724895e-05, "loss": 0.3746, "step": 593300 }, { "epoch": 8.17558072249318, "grad_norm": 4.061732769012451, "learning_rate": 2.4032556291405274e-05, "loss": 0.38, "step": 593400 }, { "epoch": 8.176958474552919, "grad_norm": 4.170194149017334, "learning_rate": 2.402460496866274e-05, "loss": 0.427, "step": 593500 }, { "epoch": 8.17833622661266, "grad_norm": 4.86626672744751, "learning_rate": 2.4016653941172783e-05, "loss": 0.38, "step": 593600 }, { "epoch": 8.179713978672398, "grad_norm": 115.64444732666016, "learning_rate": 
2.4008703209610908e-05, "loss": 0.3639, "step": 593700 }, { "epoch": 8.181091730732138, "grad_norm": 2.123413562774658, "learning_rate": 2.400075277465255e-05, "loss": 0.3959, "step": 593800 }, { "epoch": 8.182469482791877, "grad_norm": 2.600405216217041, "learning_rate": 2.3992882136876187e-05, "loss": 0.3835, "step": 593900 }, { "epoch": 8.183847234851616, "grad_norm": 2.9496586322784424, "learning_rate": 2.3984932294168256e-05, "loss": 0.4046, "step": 594000 }, { "epoch": 8.185224986911356, "grad_norm": 3.0139377117156982, "learning_rate": 2.3976982750083313e-05, "loss": 0.3847, "step": 594100 }, { "epoch": 8.186602738971095, "grad_norm": 2.8484079837799072, "learning_rate": 2.396903350529671e-05, "loss": 0.3166, "step": 594200 }, { "epoch": 8.187980491030833, "grad_norm": 0.648325502872467, "learning_rate": 2.396108456048377e-05, "loss": 0.304, "step": 594300 }, { "epoch": 8.189358243090574, "grad_norm": 24.33765983581543, "learning_rate": 2.3953135916319794e-05, "loss": 0.3166, "step": 594400 }, { "epoch": 8.190735995150312, "grad_norm": 2.528599977493286, "learning_rate": 2.3945187573480076e-05, "loss": 0.3675, "step": 594500 }, { "epoch": 8.192113747210053, "grad_norm": 3.2952067852020264, "learning_rate": 2.393723953263986e-05, "loss": 0.3656, "step": 594600 }, { "epoch": 8.193491499269792, "grad_norm": 2.1732401847839355, "learning_rate": 2.3929291794474363e-05, "loss": 0.4261, "step": 594700 }, { "epoch": 8.19486925132953, "grad_norm": 4.165266990661621, "learning_rate": 2.392134435965882e-05, "loss": 0.3583, "step": 594800 }, { "epoch": 8.19624700338927, "grad_norm": 3.733318328857422, "learning_rate": 2.391339722886836e-05, "loss": 0.4283, "step": 594900 }, { "epoch": 8.19762475544901, "grad_norm": 0.897555410861969, "learning_rate": 2.390545040277816e-05, "loss": 0.3226, "step": 595000 }, { "epoch": 8.199002507508748, "grad_norm": 3.5876872539520264, "learning_rate": 2.3897503882063338e-05, "loss": 0.3049, "step": 595100 }, { "epoch": 
8.200380259568488, "grad_norm": 11.801627159118652, "learning_rate": 2.388955766739899e-05, "loss": 0.3764, "step": 595200 }, { "epoch": 8.201758011628227, "grad_norm": 0.23541490733623505, "learning_rate": 2.3881611759460198e-05, "loss": 0.3459, "step": 595300 }, { "epoch": 8.203135763687968, "grad_norm": 3.1915454864501953, "learning_rate": 2.3873666158921987e-05, "loss": 0.3515, "step": 595400 }, { "epoch": 8.204513515747706, "grad_norm": 8.96133804321289, "learning_rate": 2.386572086645939e-05, "loss": 0.3607, "step": 595500 }, { "epoch": 8.205891267807445, "grad_norm": 4.7565741539001465, "learning_rate": 2.3857775882747402e-05, "loss": 0.3701, "step": 595600 }, { "epoch": 8.207269019867185, "grad_norm": 2.963245391845703, "learning_rate": 2.384983120846098e-05, "loss": 0.4234, "step": 595700 }, { "epoch": 8.208646771926924, "grad_norm": 2.912383556365967, "learning_rate": 2.3841886844275077e-05, "loss": 0.3806, "step": 595800 }, { "epoch": 8.210024523986663, "grad_norm": 2.8053717613220215, "learning_rate": 2.3833942790864604e-05, "loss": 0.3714, "step": 595900 }, { "epoch": 8.211402276046403, "grad_norm": 2.7469027042388916, "learning_rate": 2.3825999048904446e-05, "loss": 0.4079, "step": 596000 }, { "epoch": 8.212780028106142, "grad_norm": 3.2951292991638184, "learning_rate": 2.3818055619069457e-05, "loss": 0.4025, "step": 596100 }, { "epoch": 8.21415778016588, "grad_norm": 13.589353561401367, "learning_rate": 2.3810112502034483e-05, "loss": 0.4093, "step": 596200 }, { "epoch": 8.215535532225621, "grad_norm": 12.381446838378906, "learning_rate": 2.380216969847433e-05, "loss": 0.3148, "step": 596300 }, { "epoch": 8.21691328428536, "grad_norm": 2.9471869468688965, "learning_rate": 2.3794227209063778e-05, "loss": 0.3429, "step": 596400 }, { "epoch": 8.2182910363451, "grad_norm": 2.6726393699645996, "learning_rate": 2.3786285034477594e-05, "loss": 0.3193, "step": 596500 }, { "epoch": 8.219668788404839, "grad_norm": 1.4599651098251343, "learning_rate": 
2.3778343175390484e-05, "loss": 0.416, "step": 596600 }, { "epoch": 8.221046540464577, "grad_norm": 6.416299343109131, "learning_rate": 2.377040163247716e-05, "loss": 0.3648, "step": 596700 }, { "epoch": 8.222424292524318, "grad_norm": 6.702447414398193, "learning_rate": 2.3762460406412302e-05, "loss": 0.3936, "step": 596800 }, { "epoch": 8.223802044584057, "grad_norm": 3.8319742679595947, "learning_rate": 2.3754519497870553e-05, "loss": 0.4281, "step": 596900 }, { "epoch": 8.225179796643795, "grad_norm": 49.12172317504883, "learning_rate": 2.374657890752654e-05, "loss": 0.4049, "step": 597000 }, { "epoch": 8.226557548703536, "grad_norm": 1.7983771562576294, "learning_rate": 2.373863863605485e-05, "loss": 0.3459, "step": 597100 }, { "epoch": 8.227935300763274, "grad_norm": 6.215121269226074, "learning_rate": 2.3730698684130037e-05, "loss": 0.3489, "step": 597200 }, { "epoch": 8.229313052823015, "grad_norm": 3.391045093536377, "learning_rate": 2.3722759052426664e-05, "loss": 0.3825, "step": 597300 }, { "epoch": 8.230690804882753, "grad_norm": 5.365300178527832, "learning_rate": 2.3714819741619237e-05, "loss": 0.3795, "step": 597400 }, { "epoch": 8.232068556942492, "grad_norm": 1.8313648700714111, "learning_rate": 2.3706880752382223e-05, "loss": 0.4024, "step": 597500 }, { "epoch": 8.233446309002233, "grad_norm": 11.814016342163086, "learning_rate": 2.369894208539012e-05, "loss": 0.3888, "step": 597600 }, { "epoch": 8.234824061061971, "grad_norm": 6.280355453491211, "learning_rate": 2.369108312315738e-05, "loss": 0.3785, "step": 597700 }, { "epoch": 8.23620181312171, "grad_norm": 5.001255035400391, "learning_rate": 2.3683145099439022e-05, "loss": 0.3578, "step": 597800 }, { "epoch": 8.23757956518145, "grad_norm": 4.572300434112549, "learning_rate": 2.3675207399982008e-05, "loss": 0.381, "step": 597900 }, { "epoch": 8.238957317241189, "grad_norm": 6.731862545013428, "learning_rate": 2.3667270025460703e-05, "loss": 0.4538, "step": 598000 }, { "epoch": 8.24033506930093, 
"grad_norm": 2.721407651901245, "learning_rate": 2.3659332976549418e-05, "loss": 0.401, "step": 598100 }, { "epoch": 8.241712821360668, "grad_norm": 3.0862796306610107, "learning_rate": 2.365139625392244e-05, "loss": 0.4178, "step": 598200 }, { "epoch": 8.243090573420407, "grad_norm": 1.7837305068969727, "learning_rate": 2.3643539220590068e-05, "loss": 0.3603, "step": 598300 }, { "epoch": 8.244468325480147, "grad_norm": 3.3449065685272217, "learning_rate": 2.3635603149274818e-05, "loss": 0.389, "step": 598400 }, { "epoch": 8.245846077539886, "grad_norm": 1.9673396348953247, "learning_rate": 2.3627667406259845e-05, "loss": 0.3753, "step": 598500 }, { "epoch": 8.247223829599625, "grad_norm": 6.038936138153076, "learning_rate": 2.3619731992219356e-05, "loss": 0.3798, "step": 598600 }, { "epoch": 8.248601581659365, "grad_norm": 0.7166121006011963, "learning_rate": 2.3611796907827465e-05, "loss": 0.3308, "step": 598700 }, { "epoch": 8.249979333719104, "grad_norm": 1.7720637321472168, "learning_rate": 2.360386215375833e-05, "loss": 0.3657, "step": 598800 }, { "epoch": 8.251357085778844, "grad_norm": 3.3728113174438477, "learning_rate": 2.3595927730686032e-05, "loss": 0.2964, "step": 598900 }, { "epoch": 8.252734837838583, "grad_norm": 1.96244478225708, "learning_rate": 2.358799363928464e-05, "loss": 0.3805, "step": 599000 }, { "epoch": 8.254112589898321, "grad_norm": 2.015028715133667, "learning_rate": 2.3580059880228212e-05, "loss": 0.3648, "step": 599100 }, { "epoch": 8.255490341958062, "grad_norm": 2.4776034355163574, "learning_rate": 2.3572126454190757e-05, "loss": 0.3487, "step": 599200 }, { "epoch": 8.2568680940178, "grad_norm": 3.41325044631958, "learning_rate": 2.3564193361846246e-05, "loss": 0.3302, "step": 599300 }, { "epoch": 8.25824584607754, "grad_norm": 1.0462127923965454, "learning_rate": 2.3556260603868644e-05, "loss": 0.3896, "step": 599400 }, { "epoch": 8.25962359813728, "grad_norm": 10.382911682128906, "learning_rate": 2.3548328180931878e-05, "loss": 
0.3496, "step": 599500 }, { "epoch": 8.261001350197018, "grad_norm": 2.761988639831543, "learning_rate": 2.3540396093709852e-05, "loss": 0.3523, "step": 599600 }, { "epoch": 8.262379102256759, "grad_norm": 2.800248861312866, "learning_rate": 2.3532464342876444e-05, "loss": 0.338, "step": 599700 }, { "epoch": 8.263756854316497, "grad_norm": 23.189224243164062, "learning_rate": 2.352453292910548e-05, "loss": 0.3473, "step": 599800 }, { "epoch": 8.265134606376236, "grad_norm": 5.876138687133789, "learning_rate": 2.351660185307078e-05, "loss": 0.4313, "step": 599900 }, { "epoch": 8.266512358435977, "grad_norm": 3.2400267124176025, "learning_rate": 2.3508671115446127e-05, "loss": 0.3509, "step": 600000 }, { "epoch": 8.266512358435977, "eval_accuracy": 0.8889665328785155, "eval_cer": 0.07646042258881632, "eval_loss": 0.4270029664039612, "eval_runtime": 8833.6312, "eval_samples_per_second": 6.106, "eval_steps_per_second": 0.382, "eval_wer": 0.15148950089177773, "step": 600000 }, { "epoch": 8.267890110495715, "grad_norm": 5.5132293701171875, "learning_rate": 2.350074071690529e-05, "loss": 0.3045, "step": 600100 }, { "epoch": 8.269267862555454, "grad_norm": 5.565347194671631, "learning_rate": 2.3492810658121985e-05, "loss": 0.4019, "step": 600200 }, { "epoch": 8.270645614615194, "grad_norm": 5.544607639312744, "learning_rate": 2.3484880939769924e-05, "loss": 0.3741, "step": 600300 }, { "epoch": 8.272023366674933, "grad_norm": 3.7324399948120117, "learning_rate": 2.3476951562522764e-05, "loss": 0.424, "step": 600400 }, { "epoch": 8.273401118734672, "grad_norm": 4.268611431121826, "learning_rate": 2.3469022527054144e-05, "loss": 0.3721, "step": 600500 }, { "epoch": 8.274778870794412, "grad_norm": 3.2757174968719482, "learning_rate": 2.346109383403769e-05, "loss": 0.3595, "step": 600600 }, { "epoch": 8.27615662285415, "grad_norm": 0.6043170094490051, "learning_rate": 2.3453165484146972e-05, "loss": 0.3292, "step": 600700 }, { "epoch": 8.277534374913891, "grad_norm": 
47.83230972290039, "learning_rate": 2.3445237478055555e-05, "loss": 0.4445, "step": 600800 }, { "epoch": 8.27891212697363, "grad_norm": 1.1040006875991821, "learning_rate": 2.3437309816436963e-05, "loss": 0.3769, "step": 600900 }, { "epoch": 8.280289879033369, "grad_norm": 4.363882064819336, "learning_rate": 2.342938249996467e-05, "loss": 0.3596, "step": 601000 }, { "epoch": 8.281667631093109, "grad_norm": 6.074005126953125, "learning_rate": 2.3421455529312172e-05, "loss": 0.3651, "step": 601100 }, { "epoch": 8.283045383152848, "grad_norm": 3.6676464080810547, "learning_rate": 2.3413528905152887e-05, "loss": 0.4084, "step": 601200 }, { "epoch": 8.284423135212586, "grad_norm": 9.595353126525879, "learning_rate": 2.340560262816023e-05, "loss": 0.4027, "step": 601300 }, { "epoch": 8.285800887272327, "grad_norm": 3.8764138221740723, "learning_rate": 2.3397676699007583e-05, "loss": 0.445, "step": 601400 }, { "epoch": 8.287178639332065, "grad_norm": 2.0737035274505615, "learning_rate": 2.3389751118368274e-05, "loss": 0.3393, "step": 601500 }, { "epoch": 8.288556391391806, "grad_norm": 3.8336541652679443, "learning_rate": 2.3381825886915645e-05, "loss": 0.3761, "step": 601600 }, { "epoch": 8.289934143451545, "grad_norm": 4.329288482666016, "learning_rate": 2.337390100532297e-05, "loss": 0.3364, "step": 601700 }, { "epoch": 8.291311895511283, "grad_norm": 1.8989430665969849, "learning_rate": 2.336597647426352e-05, "loss": 0.4321, "step": 601800 }, { "epoch": 8.292689647571024, "grad_norm": 3.7484686374664307, "learning_rate": 2.3358052294410507e-05, "loss": 0.3013, "step": 601900 }, { "epoch": 8.294067399630762, "grad_norm": 131.56802368164062, "learning_rate": 2.3350128466437156e-05, "loss": 0.3319, "step": 602000 }, { "epoch": 8.295445151690501, "grad_norm": 4.122030735015869, "learning_rate": 2.3342204991016614e-05, "loss": 0.3312, "step": 602100 }, { "epoch": 8.296822903750241, "grad_norm": 2.953845262527466, "learning_rate": 2.3334281868822032e-05, "loss": 0.3149, 
"step": 602200 }, { "epoch": 8.29820065580998, "grad_norm": 2.1116340160369873, "learning_rate": 2.3326359100526516e-05, "loss": 0.3329, "step": 602300 }, { "epoch": 8.29957840786972, "grad_norm": 4.048133373260498, "learning_rate": 2.331843668680314e-05, "loss": 0.4537, "step": 602400 }, { "epoch": 8.30095615992946, "grad_norm": 7.376530170440674, "learning_rate": 2.3310514628324973e-05, "loss": 0.4062, "step": 602500 }, { "epoch": 8.302333911989198, "grad_norm": 2.8816463947296143, "learning_rate": 2.3302592925765013e-05, "loss": 0.3593, "step": 602600 }, { "epoch": 8.303711664048938, "grad_norm": 9.513242721557617, "learning_rate": 2.329467157979626e-05, "loss": 0.3453, "step": 602700 }, { "epoch": 8.305089416108677, "grad_norm": 0.8848949670791626, "learning_rate": 2.328675059109167e-05, "loss": 0.4427, "step": 602800 }, { "epoch": 8.306467168168416, "grad_norm": 1.0727472305297852, "learning_rate": 2.3278829960324167e-05, "loss": 0.3866, "step": 602900 }, { "epoch": 8.307844920228156, "grad_norm": 1.1733559370040894, "learning_rate": 2.3270909688166662e-05, "loss": 0.3644, "step": 603000 }, { "epoch": 8.309222672287895, "grad_norm": 4.333801746368408, "learning_rate": 2.326298977529202e-05, "loss": 0.3817, "step": 603100 }, { "epoch": 8.310600424347635, "grad_norm": 3.5866081714630127, "learning_rate": 2.3255070222373066e-05, "loss": 0.36, "step": 603200 }, { "epoch": 8.311978176407374, "grad_norm": 2.5693233013153076, "learning_rate": 2.324715103008261e-05, "loss": 0.4152, "step": 603300 }, { "epoch": 8.313355928467113, "grad_norm": 3.4720840454101562, "learning_rate": 2.323923219909343e-05, "loss": 0.4331, "step": 603400 }, { "epoch": 8.314733680526853, "grad_norm": 4.121750354766846, "learning_rate": 2.3231313730078275e-05, "loss": 0.3307, "step": 603500 }, { "epoch": 8.316111432586592, "grad_norm": 2.5358846187591553, "learning_rate": 2.3223395623709866e-05, "loss": 0.3363, "step": 603600 }, { "epoch": 8.31748918464633, "grad_norm": 4.643316268920898, 
"learning_rate": 2.321547788066087e-05, "loss": 0.4222, "step": 603700 }, { "epoch": 8.31886693670607, "grad_norm": 15.085054397583008, "learning_rate": 2.3207560501603948e-05, "loss": 0.4, "step": 603800 }, { "epoch": 8.32024468876581, "grad_norm": 1.613295078277588, "learning_rate": 2.319964348721172e-05, "loss": 0.426, "step": 603900 }, { "epoch": 8.32162244082555, "grad_norm": 2.696157693862915, "learning_rate": 2.3191726838156775e-05, "loss": 0.3816, "step": 604000 }, { "epoch": 8.323000192885289, "grad_norm": 4.008109092712402, "learning_rate": 2.3183810555111682e-05, "loss": 0.3973, "step": 604100 }, { "epoch": 8.324377944945027, "grad_norm": 4.119177341461182, "learning_rate": 2.3175894638748968e-05, "loss": 0.3812, "step": 604200 }, { "epoch": 8.325755697004768, "grad_norm": 1.6911532878875732, "learning_rate": 2.3167979089741123e-05, "loss": 0.3343, "step": 604300 }, { "epoch": 8.327133449064506, "grad_norm": 40.78925323486328, "learning_rate": 2.316006390876061e-05, "loss": 0.3955, "step": 604400 }, { "epoch": 8.328511201124245, "grad_norm": 5.041555404663086, "learning_rate": 2.3152149096479877e-05, "loss": 0.4188, "step": 604500 }, { "epoch": 8.329888953183985, "grad_norm": 5.011321067810059, "learning_rate": 2.314423465357132e-05, "loss": 0.3411, "step": 604600 }, { "epoch": 8.331266705243724, "grad_norm": 1.6941614151000977, "learning_rate": 2.313632058070732e-05, "loss": 0.4286, "step": 604700 }, { "epoch": 8.332644457303463, "grad_norm": 1.391391634941101, "learning_rate": 2.3128406878560205e-05, "loss": 0.3986, "step": 604800 }, { "epoch": 8.334022209363203, "grad_norm": 1.0014657974243164, "learning_rate": 2.3120493547802283e-05, "loss": 0.3414, "step": 604900 }, { "epoch": 8.335399961422942, "grad_norm": 4.861555099487305, "learning_rate": 2.311258058910585e-05, "loss": 0.3396, "step": 605000 }, { "epoch": 8.336777713482682, "grad_norm": 1.4590474367141724, "learning_rate": 2.3104668003143134e-05, "loss": 0.3458, "step": 605100 }, { "epoch": 
8.338155465542421, "grad_norm": 14.449016571044922, "learning_rate": 2.309675579058636e-05, "loss": 0.318, "step": 605200 }, { "epoch": 8.33953321760216, "grad_norm": 0.2966105043888092, "learning_rate": 2.3088843952107726e-05, "loss": 0.3762, "step": 605300 }, { "epoch": 8.3409109696619, "grad_norm": 2.887688636779785, "learning_rate": 2.308093248837934e-05, "loss": 0.3545, "step": 605400 }, { "epoch": 8.342288721721639, "grad_norm": 2.524355173110962, "learning_rate": 2.307302140007336e-05, "loss": 0.351, "step": 605500 }, { "epoch": 8.343666473781377, "grad_norm": 2.019190549850464, "learning_rate": 2.3065189793120094e-05, "loss": 0.3583, "step": 605600 }, { "epoch": 8.345044225841118, "grad_norm": 17.822757720947266, "learning_rate": 2.3057279453904136e-05, "loss": 0.4177, "step": 605700 }, { "epoch": 8.346421977900857, "grad_norm": 3.5091047286987305, "learning_rate": 2.3049369492120022e-05, "loss": 0.3025, "step": 605800 }, { "epoch": 8.347799729960597, "grad_norm": 4.332221984863281, "learning_rate": 2.3041459908439737e-05, "loss": 0.3338, "step": 605900 }, { "epoch": 8.349177482020336, "grad_norm": 4.5085577964782715, "learning_rate": 2.303355070353523e-05, "loss": 0.3733, "step": 606000 }, { "epoch": 8.350555234080074, "grad_norm": 3.847532272338867, "learning_rate": 2.302564187807845e-05, "loss": 0.3917, "step": 606100 }, { "epoch": 8.351932986139815, "grad_norm": 3.193223476409912, "learning_rate": 2.3017733432741284e-05, "loss": 0.3147, "step": 606200 }, { "epoch": 8.353310738199553, "grad_norm": 2.7339401245117188, "learning_rate": 2.300982536819559e-05, "loss": 0.3297, "step": 606300 }, { "epoch": 8.354688490259292, "grad_norm": 8.012553215026855, "learning_rate": 2.3001917685113215e-05, "loss": 0.3798, "step": 606400 }, { "epoch": 8.356066242319033, "grad_norm": 3.3491709232330322, "learning_rate": 2.299401038416593e-05, "loss": 0.3884, "step": 606500 }, { "epoch": 8.357443994378771, "grad_norm": 3.142204761505127, "learning_rate": 
2.298610346602552e-05, "loss": 0.3787, "step": 606600 }, { "epoch": 8.358821746438512, "grad_norm": 0.42889827489852905, "learning_rate": 2.2978196931363716e-05, "loss": 0.3027, "step": 606700 }, { "epoch": 8.36019949849825, "grad_norm": 8.121647834777832, "learning_rate": 2.297029078085221e-05, "loss": 0.3942, "step": 606800 }, { "epoch": 8.361577250557989, "grad_norm": 4.54153299331665, "learning_rate": 2.296238501516269e-05, "loss": 0.3688, "step": 606900 }, { "epoch": 8.36295500261773, "grad_norm": 3.75486159324646, "learning_rate": 2.2954479634966764e-05, "loss": 0.417, "step": 607000 }, { "epoch": 8.364332754677468, "grad_norm": 2.48858904838562, "learning_rate": 2.294657464093605e-05, "loss": 0.3655, "step": 607100 }, { "epoch": 8.365710506737207, "grad_norm": 1.6995512247085571, "learning_rate": 2.2938670033742116e-05, "loss": 0.4368, "step": 607200 }, { "epoch": 8.367088258796947, "grad_norm": 1.7572351694107056, "learning_rate": 2.29307658140565e-05, "loss": 0.3861, "step": 607300 }, { "epoch": 8.368466010856686, "grad_norm": 1.17812979221344, "learning_rate": 2.2922861982550694e-05, "loss": 0.3635, "step": 607400 }, { "epoch": 8.369843762916426, "grad_norm": 3.8669700622558594, "learning_rate": 2.2914958539896197e-05, "loss": 0.3535, "step": 607500 }, { "epoch": 8.371221514976165, "grad_norm": 2.132659912109375, "learning_rate": 2.29071345153654e-05, "loss": 0.34, "step": 607600 }, { "epoch": 8.372599267035904, "grad_norm": 9.288592338562012, "learning_rate": 2.2899231848522483e-05, "loss": 0.422, "step": 607700 }, { "epoch": 8.373977019095644, "grad_norm": 2.039140224456787, "learning_rate": 2.289132957253837e-05, "loss": 0.4281, "step": 607800 }, { "epoch": 8.375354771155383, "grad_norm": 4.187887668609619, "learning_rate": 2.2883427688084385e-05, "loss": 0.3237, "step": 607900 }, { "epoch": 8.376732523215122, "grad_norm": 6.897355556488037, "learning_rate": 2.287552619583184e-05, "loss": 0.4401, "step": 608000 }, { "epoch": 8.378110275274862, 
"grad_norm": 10.177162170410156, "learning_rate": 2.2867625096452e-05, "loss": 0.3406, "step": 608100 }, { "epoch": 8.3794880273346, "grad_norm": 1.845786452293396, "learning_rate": 2.2859724390616104e-05, "loss": 0.3308, "step": 608200 }, { "epoch": 8.380865779394341, "grad_norm": 4.943540573120117, "learning_rate": 2.285182407899536e-05, "loss": 0.3152, "step": 608300 }, { "epoch": 8.38224353145408, "grad_norm": 1.8014031648635864, "learning_rate": 2.2843924162260944e-05, "loss": 0.4084, "step": 608400 }, { "epoch": 8.383621283513818, "grad_norm": 6.4546003341674805, "learning_rate": 2.2836024641083983e-05, "loss": 0.3594, "step": 608500 }, { "epoch": 8.384999035573559, "grad_norm": 1.6401779651641846, "learning_rate": 2.2828125516135607e-05, "loss": 0.3357, "step": 608600 }, { "epoch": 8.386376787633298, "grad_norm": 6.374835014343262, "learning_rate": 2.2820226788086854e-05, "loss": 0.3954, "step": 608700 }, { "epoch": 8.387754539693036, "grad_norm": 2.8909759521484375, "learning_rate": 2.2812328457608782e-05, "loss": 0.3428, "step": 608800 }, { "epoch": 8.389132291752777, "grad_norm": 3.7436742782592773, "learning_rate": 2.2804509502721254e-05, "loss": 0.3669, "step": 608900 }, { "epoch": 8.390510043812515, "grad_norm": 3.7521841526031494, "learning_rate": 2.279661196540506e-05, "loss": 0.374, "step": 609000 }, { "epoch": 8.391887795872254, "grad_norm": 3.0969252586364746, "learning_rate": 2.278871482766575e-05, "loss": 0.3472, "step": 609100 }, { "epoch": 8.393265547931994, "grad_norm": 1.7246965169906616, "learning_rate": 2.2780818090174216e-05, "loss": 0.3478, "step": 609200 }, { "epoch": 8.394643299991733, "grad_norm": 2.6951217651367188, "learning_rate": 2.2772921753601318e-05, "loss": 0.3401, "step": 609300 }, { "epoch": 8.396021052051474, "grad_norm": 4.763112545013428, "learning_rate": 2.2765025818617907e-05, "loss": 0.3571, "step": 609400 }, { "epoch": 8.397398804111212, "grad_norm": 1.2914090156555176, "learning_rate": 2.275713028589478e-05, "loss": 
0.3724, "step": 609500 }, { "epoch": 8.39877655617095, "grad_norm": 4.179256439208984, "learning_rate": 2.2749235156102695e-05, "loss": 0.3408, "step": 609600 }, { "epoch": 8.400154308230691, "grad_norm": 2.138127565383911, "learning_rate": 2.2741340429912397e-05, "loss": 0.41, "step": 609700 }, { "epoch": 8.40153206029043, "grad_norm": 3.9688987731933594, "learning_rate": 2.2733446107994564e-05, "loss": 0.3795, "step": 609800 }, { "epoch": 8.402909812350169, "grad_norm": 4.312638759613037, "learning_rate": 2.2725552191019872e-05, "loss": 0.4125, "step": 609900 }, { "epoch": 8.404287564409909, "grad_norm": 3.477618932723999, "learning_rate": 2.2717658679658952e-05, "loss": 0.3811, "step": 610000 }, { "epoch": 8.405665316469648, "grad_norm": 3.677065849304199, "learning_rate": 2.2709765574582386e-05, "loss": 0.3886, "step": 610100 }, { "epoch": 8.407043068529388, "grad_norm": 0.648706316947937, "learning_rate": 2.2701872876460756e-05, "loss": 0.3446, "step": 610200 }, { "epoch": 8.408420820589127, "grad_norm": 4.264386177062988, "learning_rate": 2.2693980585964557e-05, "loss": 0.3438, "step": 610300 }, { "epoch": 8.409798572648866, "grad_norm": 1.4507111310958862, "learning_rate": 2.2686167620563035e-05, "loss": 0.336, "step": 610400 }, { "epoch": 8.411176324708606, "grad_norm": 4.321676254272461, "learning_rate": 2.2678276143236182e-05, "loss": 0.4352, "step": 610500 }, { "epoch": 8.412554076768345, "grad_norm": 3.7936556339263916, "learning_rate": 2.2670385075539443e-05, "loss": 0.3565, "step": 610600 }, { "epoch": 8.413931828828083, "grad_norm": 55.288944244384766, "learning_rate": 2.2662494418143204e-05, "loss": 0.3025, "step": 610700 }, { "epoch": 8.415309580887824, "grad_norm": 1.353340983390808, "learning_rate": 2.2654604171717813e-05, "loss": 0.2957, "step": 610800 }, { "epoch": 8.416687332947562, "grad_norm": 2.9988954067230225, "learning_rate": 2.2646714336933585e-05, "loss": 0.3446, "step": 610900 }, { "epoch": 8.418065085007303, "grad_norm": 
2.7173781394958496, "learning_rate": 2.2638824914460795e-05, "loss": 0.3954, "step": 611000 }, { "epoch": 8.419442837067042, "grad_norm": 3.973426580429077, "learning_rate": 2.2630935904969706e-05, "loss": 0.3835, "step": 611100 }, { "epoch": 8.42082058912678, "grad_norm": 3.0652267932891846, "learning_rate": 2.2623047309130517e-05, "loss": 0.4002, "step": 611200 }, { "epoch": 8.42219834118652, "grad_norm": 14.96908950805664, "learning_rate": 2.261515912761342e-05, "loss": 0.3555, "step": 611300 }, { "epoch": 8.42357609324626, "grad_norm": 2.573018789291382, "learning_rate": 2.260727136108854e-05, "loss": 0.4071, "step": 611400 }, { "epoch": 8.424953845305998, "grad_norm": 2.935473680496216, "learning_rate": 2.259938401022598e-05, "loss": 0.3932, "step": 611500 }, { "epoch": 8.426331597365738, "grad_norm": 12.06583309173584, "learning_rate": 2.2591497075695825e-05, "loss": 0.3949, "step": 611600 }, { "epoch": 8.427709349425477, "grad_norm": 2.336740493774414, "learning_rate": 2.2583610558168108e-05, "loss": 0.3601, "step": 611700 }, { "epoch": 8.429087101485218, "grad_norm": 4.078133583068848, "learning_rate": 2.257572445831282e-05, "loss": 0.3446, "step": 611800 }, { "epoch": 8.430464853544956, "grad_norm": 4.701194763183594, "learning_rate": 2.2567838776799945e-05, "loss": 0.3829, "step": 611900 }, { "epoch": 8.431842605604695, "grad_norm": 0.9086661338806152, "learning_rate": 2.2559953514299387e-05, "loss": 0.4539, "step": 612000 }, { "epoch": 8.433220357664435, "grad_norm": 2.3533055782318115, "learning_rate": 2.2552068671481055e-05, "loss": 0.3443, "step": 612100 }, { "epoch": 8.434598109724174, "grad_norm": 8.864639282226562, "learning_rate": 2.25441842490148e-05, "loss": 0.3849, "step": 612200 }, { "epoch": 8.435975861783913, "grad_norm": 0.09770727902650833, "learning_rate": 2.2536300247570444e-05, "loss": 0.3283, "step": 612300 }, { "epoch": 8.437353613843653, "grad_norm": 4.248231887817383, "learning_rate": 2.2528416667817797e-05, "loss": 0.4212, "step": 
612400 }, { "epoch": 8.438731365903392, "grad_norm": 3.1666977405548096, "learning_rate": 2.2520533510426566e-05, "loss": 0.3853, "step": 612500 }, { "epoch": 8.440109117963132, "grad_norm": 3.1559460163116455, "learning_rate": 2.2512650776066496e-05, "loss": 0.3534, "step": 612600 }, { "epoch": 8.441486870022871, "grad_norm": 17.667865753173828, "learning_rate": 2.2504768465407255e-05, "loss": 0.3432, "step": 612700 }, { "epoch": 8.44286462208261, "grad_norm": 3.9279208183288574, "learning_rate": 2.2496886579118486e-05, "loss": 0.3363, "step": 612800 }, { "epoch": 8.44424237414235, "grad_norm": 1.1642777919769287, "learning_rate": 2.24890051178698e-05, "loss": 0.3377, "step": 612900 }, { "epoch": 8.445620126202089, "grad_norm": 2.8263070583343506, "learning_rate": 2.2481124082330772e-05, "loss": 0.3332, "step": 613000 }, { "epoch": 8.446997878261827, "grad_norm": 2.421978712081909, "learning_rate": 2.2473243473170925e-05, "loss": 0.3642, "step": 613100 }, { "epoch": 8.448375630321568, "grad_norm": 3.852734088897705, "learning_rate": 2.2465363291059764e-05, "loss": 0.3992, "step": 613200 }, { "epoch": 8.449753382381306, "grad_norm": 7.440618991851807, "learning_rate": 2.245748353666674e-05, "loss": 0.4098, "step": 613300 }, { "epoch": 8.451131134441045, "grad_norm": 3.2082061767578125, "learning_rate": 2.2449604210661292e-05, "loss": 0.3467, "step": 613400 }, { "epoch": 8.452508886500786, "grad_norm": 4.258767127990723, "learning_rate": 2.2441725313712805e-05, "loss": 0.3394, "step": 613500 }, { "epoch": 8.453886638560524, "grad_norm": 10.594532012939453, "learning_rate": 2.2433846846490637e-05, "loss": 0.3901, "step": 613600 }, { "epoch": 8.455264390620265, "grad_norm": 1.9518663883209229, "learning_rate": 2.242596880966409e-05, "loss": 0.3417, "step": 613700 }, { "epoch": 8.456642142680003, "grad_norm": 7.226503372192383, "learning_rate": 2.2418091203902447e-05, "loss": 0.3096, "step": 613800 }, { "epoch": 8.458019894739742, "grad_norm": 2.3644461631774902, 
"learning_rate": 2.241021402987496e-05, "loss": 0.3249, "step": 613900 }, { "epoch": 8.459397646799482, "grad_norm": 3.722769021987915, "learning_rate": 2.240233728825083e-05, "loss": 0.3694, "step": 614000 }, { "epoch": 8.460775398859221, "grad_norm": 5.688827037811279, "learning_rate": 2.2394460979699233e-05, "loss": 0.3863, "step": 614100 }, { "epoch": 8.46215315091896, "grad_norm": 5.0374755859375, "learning_rate": 2.238658510488929e-05, "loss": 0.4053, "step": 614200 }, { "epoch": 8.4635309029787, "grad_norm": 1.7014131546020508, "learning_rate": 2.2378709664490098e-05, "loss": 0.3535, "step": 614300 }, { "epoch": 8.464908655038439, "grad_norm": 4.276094436645508, "learning_rate": 2.2370834659170725e-05, "loss": 0.3563, "step": 614400 }, { "epoch": 8.46628640709818, "grad_norm": 58.399173736572266, "learning_rate": 2.2362960089600185e-05, "loss": 0.3891, "step": 614500 }, { "epoch": 8.467664159157918, "grad_norm": 13.329980850219727, "learning_rate": 2.2355085956447467e-05, "loss": 0.4228, "step": 614600 }, { "epoch": 8.469041911217657, "grad_norm": 0.040426105260849, "learning_rate": 2.2347212260381534e-05, "loss": 0.3723, "step": 614700 }, { "epoch": 8.470419663277397, "grad_norm": 1.624005913734436, "learning_rate": 2.2339339002071258e-05, "loss": 0.3514, "step": 614800 }, { "epoch": 8.471797415337136, "grad_norm": 6.812719345092773, "learning_rate": 2.2331466182185547e-05, "loss": 0.3903, "step": 614900 }, { "epoch": 8.473175167396874, "grad_norm": 2.2937488555908203, "learning_rate": 2.232359380139322e-05, "loss": 0.3432, "step": 615000 }, { "epoch": 8.474552919456615, "grad_norm": 3.268092155456543, "learning_rate": 2.2315721860363085e-05, "loss": 0.4469, "step": 615100 }, { "epoch": 8.475930671516354, "grad_norm": 2.450117826461792, "learning_rate": 2.2307850359763912e-05, "loss": 0.2871, "step": 615200 }, { "epoch": 8.477308423576094, "grad_norm": 10.77607536315918, "learning_rate": 2.2299979300264397e-05, "loss": 0.3628, "step": 615300 }, { "epoch": 
8.478686175635833, "grad_norm": 3.631063461303711, "learning_rate": 2.2292108682533248e-05, "loss": 0.3261, "step": 615400 }, { "epoch": 8.480063927695571, "grad_norm": 7.375083923339844, "learning_rate": 2.2284238507239114e-05, "loss": 0.3845, "step": 615500 }, { "epoch": 8.481441679755312, "grad_norm": 18.21994972229004, "learning_rate": 2.2276368775050598e-05, "loss": 0.396, "step": 615600 }, { "epoch": 8.48281943181505, "grad_norm": 8.315932273864746, "learning_rate": 2.2268499486636277e-05, "loss": 0.4236, "step": 615700 }, { "epoch": 8.48419718387479, "grad_norm": 3.5208487510681152, "learning_rate": 2.2260630642664697e-05, "loss": 0.2986, "step": 615800 }, { "epoch": 8.48557493593453, "grad_norm": 4.82790994644165, "learning_rate": 2.2252762243804343e-05, "loss": 0.3374, "step": 615900 }, { "epoch": 8.486952687994268, "grad_norm": 2.6233551502227783, "learning_rate": 2.224489429072368e-05, "loss": 0.3301, "step": 616000 }, { "epoch": 8.488330440054009, "grad_norm": 4.033213138580322, "learning_rate": 2.2237026784091137e-05, "loss": 0.3587, "step": 616100 }, { "epoch": 8.489708192113747, "grad_norm": 0.020865071564912796, "learning_rate": 2.2229159724575085e-05, "loss": 0.4055, "step": 616200 }, { "epoch": 8.491085944173486, "grad_norm": 2.265803575515747, "learning_rate": 2.222129311284389e-05, "loss": 0.2873, "step": 616300 }, { "epoch": 8.492463696233226, "grad_norm": 4.1437296867370605, "learning_rate": 2.2213426949565846e-05, "loss": 0.3674, "step": 616400 }, { "epoch": 8.493841448292965, "grad_norm": 1.2387611865997314, "learning_rate": 2.220556123540923e-05, "loss": 0.3699, "step": 616500 }, { "epoch": 8.495219200352704, "grad_norm": 4.634833812713623, "learning_rate": 2.219769597104227e-05, "loss": 0.341, "step": 616600 }, { "epoch": 8.496596952412444, "grad_norm": 2.196974515914917, "learning_rate": 2.218983115713316e-05, "loss": 0.335, "step": 616700 }, { "epoch": 8.497974704472183, "grad_norm": 6.876392364501953, "learning_rate": 
2.2181966794350065e-05, "loss": 0.3516, "step": 616800 }, { "epoch": 8.499352456531923, "grad_norm": 6.537124156951904, "learning_rate": 2.2174102883361107e-05, "loss": 0.3388, "step": 616900 }, { "epoch": 8.500730208591662, "grad_norm": 3.6307976245880127, "learning_rate": 2.2166239424834346e-05, "loss": 0.3919, "step": 617000 }, { "epoch": 8.5021079606514, "grad_norm": 2.5189926624298096, "learning_rate": 2.2158376419437836e-05, "loss": 0.3275, "step": 617100 }, { "epoch": 8.503485712711141, "grad_norm": 6.779555797576904, "learning_rate": 2.215059249110706e-05, "loss": 0.3508, "step": 617200 }, { "epoch": 8.50486346477088, "grad_norm": 1.5058640241622925, "learning_rate": 2.214273038942704e-05, "loss": 0.3538, "step": 617300 }, { "epoch": 8.506241216830619, "grad_norm": 1.3997087478637695, "learning_rate": 2.2134868742874495e-05, "loss": 0.3169, "step": 617400 }, { "epoch": 8.507618968890359, "grad_norm": 3.7029576301574707, "learning_rate": 2.2127007552117284e-05, "loss": 0.3227, "step": 617500 }, { "epoch": 8.508996720950098, "grad_norm": 0.2418132722377777, "learning_rate": 2.2119146817823276e-05, "loss": 0.3398, "step": 617600 }, { "epoch": 8.510374473009836, "grad_norm": 3.9458911418914795, "learning_rate": 2.2111286540660274e-05, "loss": 0.3432, "step": 617700 }, { "epoch": 8.511752225069577, "grad_norm": 6.2931647300720215, "learning_rate": 2.210342672129604e-05, "loss": 0.3449, "step": 617800 }, { "epoch": 8.513129977129315, "grad_norm": 3.383282423019409, "learning_rate": 2.209556736039832e-05, "loss": 0.3166, "step": 617900 }, { "epoch": 8.514507729189056, "grad_norm": 1.0736831426620483, "learning_rate": 2.2087708458634813e-05, "loss": 0.3434, "step": 618000 }, { "epoch": 8.515885481248795, "grad_norm": 13.528286933898926, "learning_rate": 2.2079850016673154e-05, "loss": 0.359, "step": 618100 }, { "epoch": 8.517263233308533, "grad_norm": 2.126558542251587, "learning_rate": 2.2071992035180964e-05, "loss": 0.3862, "step": 618200 }, { "epoch": 
8.518640985368274, "grad_norm": 10.538199424743652, "learning_rate": 2.2064213087744546e-05, "loss": 0.3365, "step": 618300 }, { "epoch": 8.520018737428012, "grad_norm": 3.284346103668213, "learning_rate": 2.205635602457264e-05, "loss": 0.3891, "step": 618400 }, { "epoch": 8.521396489487751, "grad_norm": 4.785995006561279, "learning_rate": 2.2048499423866137e-05, "loss": 0.3935, "step": 618500 }, { "epoch": 8.522774241547491, "grad_norm": 2.2195351123809814, "learning_rate": 2.2040643286292492e-05, "loss": 0.3867, "step": 618600 }, { "epoch": 8.52415199360723, "grad_norm": 10.841598510742188, "learning_rate": 2.203278761251913e-05, "loss": 0.3986, "step": 618700 }, { "epoch": 8.52552974566697, "grad_norm": 1.6979165077209473, "learning_rate": 2.2024932403213433e-05, "loss": 0.3768, "step": 618800 }, { "epoch": 8.52690749772671, "grad_norm": 2.557459831237793, "learning_rate": 2.201707765904274e-05, "loss": 0.3505, "step": 618900 }, { "epoch": 8.528285249786448, "grad_norm": 5.050466060638428, "learning_rate": 2.2009223380674342e-05, "loss": 0.4186, "step": 619000 }, { "epoch": 8.529663001846188, "grad_norm": 3.518502950668335, "learning_rate": 2.2001369568775524e-05, "loss": 0.3236, "step": 619100 }, { "epoch": 8.531040753905927, "grad_norm": 14.200884819030762, "learning_rate": 2.199351622401348e-05, "loss": 0.333, "step": 619200 }, { "epoch": 8.532418505965666, "grad_norm": 2.772815704345703, "learning_rate": 2.198566334705541e-05, "loss": 0.3971, "step": 619300 }, { "epoch": 8.533796258025406, "grad_norm": 3.06123948097229, "learning_rate": 2.1977810938568446e-05, "loss": 0.3708, "step": 619400 }, { "epoch": 8.535174010085145, "grad_norm": 15.280471801757812, "learning_rate": 2.1970037516288753e-05, "loss": 0.3949, "step": 619500 }, { "epoch": 8.536551762144885, "grad_norm": 10.371110916137695, "learning_rate": 2.196218604204393e-05, "loss": 0.357, "step": 619600 }, { "epoch": 8.537929514204624, "grad_norm": 1.9435280561447144, "learning_rate": 
2.1954335038264723e-05, "loss": 0.4076, "step": 619700 }, { "epoch": 8.539307266264363, "grad_norm": 5.665966987609863, "learning_rate": 2.1946484505618124e-05, "loss": 0.3412, "step": 619800 }, { "epoch": 8.540685018324103, "grad_norm": 4.273377418518066, "learning_rate": 2.1938634444771085e-05, "loss": 0.3348, "step": 619900 }, { "epoch": 8.542062770383842, "grad_norm": 3.8269553184509277, "learning_rate": 2.19307848563905e-05, "loss": 0.4072, "step": 620000 }, { "epoch": 8.54344052244358, "grad_norm": 3.528566837310791, "learning_rate": 2.1922935741143226e-05, "loss": 0.3975, "step": 620100 }, { "epoch": 8.54481827450332, "grad_norm": 2.17942214012146, "learning_rate": 2.191508709969611e-05, "loss": 0.3837, "step": 620200 }, { "epoch": 8.54619602656306, "grad_norm": 1.3713845014572144, "learning_rate": 2.1907238932715903e-05, "loss": 0.3173, "step": 620300 }, { "epoch": 8.547573778622798, "grad_norm": 1.2039626836776733, "learning_rate": 2.189939124086936e-05, "loss": 0.408, "step": 620400 }, { "epoch": 8.548951530682539, "grad_norm": 8.209099769592285, "learning_rate": 2.189154402482319e-05, "loss": 0.3694, "step": 620500 }, { "epoch": 8.550329282742277, "grad_norm": 11.325695037841797, "learning_rate": 2.188369728524404e-05, "loss": 0.3669, "step": 620600 }, { "epoch": 8.551707034802018, "grad_norm": 4.762801647186279, "learning_rate": 2.1875851022798557e-05, "loss": 0.4044, "step": 620700 }, { "epoch": 8.553084786861756, "grad_norm": 4.980169296264648, "learning_rate": 2.1868005238153286e-05, "loss": 0.3757, "step": 620800 }, { "epoch": 8.554462538921495, "grad_norm": 3.2319538593292236, "learning_rate": 2.1860159931974786e-05, "loss": 0.3736, "step": 620900 }, { "epoch": 8.555840290981235, "grad_norm": 3.6315629482269287, "learning_rate": 2.1852315104929556e-05, "loss": 0.4119, "step": 621000 }, { "epoch": 8.557218043040974, "grad_norm": 5.15279483795166, "learning_rate": 2.184447075768404e-05, "loss": 0.4228, "step": 621100 }, { "epoch": 8.558595795100715, 
"grad_norm": 10.44373893737793, "learning_rate": 2.1836626890904678e-05, "loss": 0.3641, "step": 621200 }, { "epoch": 8.559973547160453, "grad_norm": 3.743250846862793, "learning_rate": 2.182878350525784e-05, "loss": 0.3489, "step": 621300 }, { "epoch": 8.561351299220192, "grad_norm": 5.815445899963379, "learning_rate": 2.1820940601409848e-05, "loss": 0.4291, "step": 621400 }, { "epoch": 8.562729051279932, "grad_norm": 15.064278602600098, "learning_rate": 2.1813098180027008e-05, "loss": 0.4041, "step": 621500 }, { "epoch": 8.564106803339671, "grad_norm": 0.438976526260376, "learning_rate": 2.1805256241775566e-05, "loss": 0.3378, "step": 621600 }, { "epoch": 8.56548455539941, "grad_norm": 37.62239456176758, "learning_rate": 2.1797414787321747e-05, "loss": 0.4028, "step": 621700 }, { "epoch": 8.56686230745915, "grad_norm": 3.2186551094055176, "learning_rate": 2.1789573817331722e-05, "loss": 0.3689, "step": 621800 }, { "epoch": 8.568240059518889, "grad_norm": 3.1746091842651367, "learning_rate": 2.1781733332471608e-05, "loss": 0.3712, "step": 621900 }, { "epoch": 8.569617811578627, "grad_norm": 1.5825210809707642, "learning_rate": 2.1773893333407502e-05, "loss": 0.4296, "step": 622000 }, { "epoch": 8.570995563638368, "grad_norm": 2.9672021865844727, "learning_rate": 2.1766053820805446e-05, "loss": 0.3918, "step": 622100 }, { "epoch": 8.572373315698107, "grad_norm": 2.572219133377075, "learning_rate": 2.1758214795331458e-05, "loss": 0.2776, "step": 622200 }, { "epoch": 8.573751067757847, "grad_norm": 3.83341646194458, "learning_rate": 2.17503762576515e-05, "loss": 0.3809, "step": 622300 }, { "epoch": 8.575128819817586, "grad_norm": 2.831357002258301, "learning_rate": 2.17425382084315e-05, "loss": 0.3602, "step": 622400 }, { "epoch": 8.576506571877324, "grad_norm": 3.027714490890503, "learning_rate": 2.1734700648337324e-05, "loss": 0.3878, "step": 622500 }, { "epoch": 8.577884323937065, "grad_norm": 4.66804313659668, "learning_rate": 2.172686357803482e-05, "loss": 
0.3284, "step": 622600 }, { "epoch": 8.579262075996803, "grad_norm": 23.84748077392578, "learning_rate": 2.17190269981898e-05, "loss": 0.3428, "step": 622700 }, { "epoch": 8.580639828056542, "grad_norm": 3.4912636280059814, "learning_rate": 2.1711190909468002e-05, "loss": 0.3501, "step": 622800 }, { "epoch": 8.582017580116283, "grad_norm": 4.160470962524414, "learning_rate": 2.1703355312535167e-05, "loss": 0.3844, "step": 622900 }, { "epoch": 8.583395332176021, "grad_norm": 1.3331727981567383, "learning_rate": 2.1695598556661888e-05, "loss": 0.3289, "step": 623000 }, { "epoch": 8.584773084235762, "grad_norm": 4.610540866851807, "learning_rate": 2.168776394036942e-05, "loss": 0.3148, "step": 623100 }, { "epoch": 8.5861508362955, "grad_norm": 3.8662302494049072, "learning_rate": 2.1679929817856143e-05, "loss": 0.3659, "step": 623200 }, { "epoch": 8.587528588355239, "grad_norm": 2.9380335807800293, "learning_rate": 2.1672096189787592e-05, "loss": 0.3748, "step": 623300 }, { "epoch": 8.58890634041498, "grad_norm": 22.89092254638672, "learning_rate": 2.1664263056829298e-05, "loss": 0.3834, "step": 623400 }, { "epoch": 8.590284092474718, "grad_norm": 2.1078238487243652, "learning_rate": 2.1656430419646716e-05, "loss": 0.3908, "step": 623500 }, { "epoch": 8.591661844534457, "grad_norm": 0.28540176153182983, "learning_rate": 2.164859827890526e-05, "loss": 0.3971, "step": 623600 }, { "epoch": 8.593039596594197, "grad_norm": 2.0287351608276367, "learning_rate": 2.1640766635270314e-05, "loss": 0.3485, "step": 623700 }, { "epoch": 8.594417348653936, "grad_norm": 5.338911056518555, "learning_rate": 2.1632935489407214e-05, "loss": 0.3248, "step": 623800 }, { "epoch": 8.595795100713676, "grad_norm": 8.471325874328613, "learning_rate": 2.1625104841981265e-05, "loss": 0.3621, "step": 623900 }, { "epoch": 8.597172852773415, "grad_norm": 2.9211699962615967, "learning_rate": 2.1617274693657723e-05, "loss": 0.4189, "step": 624000 }, { "epoch": 8.598550604833154, "grad_norm": 
9.724971771240234, "learning_rate": 2.160944504510178e-05, "loss": 0.3334, "step": 624100 }, { "epoch": 8.599928356892894, "grad_norm": 20.8826904296875, "learning_rate": 2.1601615896978625e-05, "loss": 0.3576, "step": 624200 }, { "epoch": 8.601306108952633, "grad_norm": 1.1538742780685425, "learning_rate": 2.1593787249953362e-05, "loss": 0.3389, "step": 624300 }, { "epoch": 8.602683861012371, "grad_norm": 8.331433296203613, "learning_rate": 2.1585959104691098e-05, "loss": 0.3762, "step": 624400 }, { "epoch": 8.604061613072112, "grad_norm": 3.8170533180236816, "learning_rate": 2.1578131461856862e-05, "loss": 0.3505, "step": 624500 }, { "epoch": 8.60543936513185, "grad_norm": 3.952625036239624, "learning_rate": 2.1570304322115663e-05, "loss": 0.3906, "step": 624600 }, { "epoch": 8.60681711719159, "grad_norm": 1.0608445405960083, "learning_rate": 2.1562477686132438e-05, "loss": 0.3441, "step": 624700 }, { "epoch": 8.60819486925133, "grad_norm": 8.85407829284668, "learning_rate": 2.1554651554572106e-05, "loss": 0.3283, "step": 624800 }, { "epoch": 8.609572621311068, "grad_norm": 3.8254077434539795, "learning_rate": 2.154682592809955e-05, "loss": 0.3561, "step": 624900 }, { "epoch": 8.610950373370809, "grad_norm": 0.8852051496505737, "learning_rate": 2.1539000807379583e-05, "loss": 0.4056, "step": 625000 }, { "epoch": 8.612328125430547, "grad_norm": 1.8141647577285767, "learning_rate": 2.1531176193077002e-05, "loss": 0.3857, "step": 625100 }, { "epoch": 8.613705877490286, "grad_norm": 4.894231796264648, "learning_rate": 2.1523352085856533e-05, "loss": 0.3728, "step": 625200 }, { "epoch": 8.615083629550027, "grad_norm": 3.1469619274139404, "learning_rate": 2.1515528486382874e-05, "loss": 0.3991, "step": 625300 }, { "epoch": 8.616461381609765, "grad_norm": 5.235361099243164, "learning_rate": 2.1507705395320693e-05, "loss": 0.3484, "step": 625400 }, { "epoch": 8.617839133669506, "grad_norm": 0.24693188071250916, "learning_rate": 2.1499882813334593e-05, "loss": 0.3861, 
"step": 625500 }, { "epoch": 8.619216885729244, "grad_norm": 3.9979472160339355, "learning_rate": 2.149206074108914e-05, "loss": 0.357, "step": 625600 }, { "epoch": 8.620594637788983, "grad_norm": 2.707336664199829, "learning_rate": 2.1484239179248882e-05, "loss": 0.373, "step": 625700 }, { "epoch": 8.621972389848723, "grad_norm": 2.6941821575164795, "learning_rate": 2.1476418128478265e-05, "loss": 0.3757, "step": 625800 }, { "epoch": 8.623350141908462, "grad_norm": 9.870085716247559, "learning_rate": 2.1468675792296846e-05, "loss": 0.3871, "step": 625900 }, { "epoch": 8.6247278939682, "grad_norm": 1.317989468574524, "learning_rate": 2.1460855760531547e-05, "loss": 0.3981, "step": 626000 }, { "epoch": 8.626105646027941, "grad_norm": 0.3457280993461609, "learning_rate": 2.145303624182245e-05, "loss": 0.3748, "step": 626100 }, { "epoch": 8.62748339808768, "grad_norm": 6.618454933166504, "learning_rate": 2.1445217236833864e-05, "loss": 0.3637, "step": 626200 }, { "epoch": 8.628861150147419, "grad_norm": 0.3231651782989502, "learning_rate": 2.1437398746230043e-05, "loss": 0.3317, "step": 626300 }, { "epoch": 8.630238902207159, "grad_norm": 123.69172668457031, "learning_rate": 2.1429580770675208e-05, "loss": 0.3121, "step": 626400 }, { "epoch": 8.631616654266898, "grad_norm": 13.843061447143555, "learning_rate": 2.1421763310833554e-05, "loss": 0.3702, "step": 626500 }, { "epoch": 8.632994406326638, "grad_norm": 4.132974147796631, "learning_rate": 2.1413946367369198e-05, "loss": 0.3185, "step": 626600 }, { "epoch": 8.634372158386377, "grad_norm": 2.850433111190796, "learning_rate": 2.1406129940946235e-05, "loss": 0.3682, "step": 626700 }, { "epoch": 8.635749910446116, "grad_norm": 3.6372785568237305, "learning_rate": 2.1398392188751067e-05, "loss": 0.3459, "step": 626800 }, { "epoch": 8.637127662505856, "grad_norm": 3.0738308429718018, "learning_rate": 2.1390576793216e-05, "loss": 0.368, "step": 626900 }, { "epoch": 8.638505414565595, "grad_norm": 4.2001190185546875, 
"learning_rate": 2.138276191670769e-05, "loss": 0.3747, "step": 627000 }, { "epoch": 8.639883166625333, "grad_norm": 5.7996649742126465, "learning_rate": 2.1374947559890045e-05, "loss": 0.4183, "step": 627100 }, { "epoch": 8.641260918685074, "grad_norm": 4.514593601226807, "learning_rate": 2.1367133723426945e-05, "loss": 0.3402, "step": 627200 }, { "epoch": 8.642638670744812, "grad_norm": 3.181171417236328, "learning_rate": 2.1359320407982217e-05, "loss": 0.347, "step": 627300 }, { "epoch": 8.644016422804553, "grad_norm": 10.193471908569336, "learning_rate": 2.135150761421963e-05, "loss": 0.3796, "step": 627400 }, { "epoch": 8.645394174864292, "grad_norm": 5.279388427734375, "learning_rate": 2.1343695342802928e-05, "loss": 0.3462, "step": 627500 }, { "epoch": 8.64677192692403, "grad_norm": 1.758195400238037, "learning_rate": 2.1335883594395796e-05, "loss": 0.3909, "step": 627600 }, { "epoch": 8.64814967898377, "grad_norm": 5.303006172180176, "learning_rate": 2.1328072369661892e-05, "loss": 0.3801, "step": 627700 }, { "epoch": 8.64952743104351, "grad_norm": 10.079161643981934, "learning_rate": 2.132026166926482e-05, "loss": 0.3501, "step": 627800 }, { "epoch": 8.650905183103248, "grad_norm": 4.683069705963135, "learning_rate": 2.1312451493868136e-05, "loss": 0.3903, "step": 627900 }, { "epoch": 8.652282935162988, "grad_norm": 2.807865858078003, "learning_rate": 2.1304641844135347e-05, "loss": 0.395, "step": 628000 }, { "epoch": 8.653660687222727, "grad_norm": 291.091064453125, "learning_rate": 2.1296832720729923e-05, "loss": 0.3458, "step": 628100 }, { "epoch": 8.655038439282468, "grad_norm": 5.003805637359619, "learning_rate": 2.1289024124315303e-05, "loss": 0.3934, "step": 628200 }, { "epoch": 8.656416191342206, "grad_norm": 2.5112297534942627, "learning_rate": 2.1281216055554854e-05, "loss": 0.3772, "step": 628300 }, { "epoch": 8.657793943401945, "grad_norm": 3.257223129272461, "learning_rate": 2.127340851511193e-05, "loss": 0.4067, "step": 628400 }, { "epoch": 
8.659171695461685, "grad_norm": 2.134662389755249, "learning_rate": 2.1265601503649793e-05, "loss": 0.3785, "step": 628500 }, { "epoch": 8.660549447521424, "grad_norm": 1.4923478364944458, "learning_rate": 2.1257795021831706e-05, "loss": 0.3921, "step": 628600 }, { "epoch": 8.661927199581163, "grad_norm": 0.9694164991378784, "learning_rate": 2.1249989070320866e-05, "loss": 0.3845, "step": 628700 }, { "epoch": 8.663304951640903, "grad_norm": 3.6847665309906006, "learning_rate": 2.1242183649780432e-05, "loss": 0.3627, "step": 628800 }, { "epoch": 8.664682703700642, "grad_norm": 5.224494457244873, "learning_rate": 2.1234378760873506e-05, "loss": 0.3502, "step": 628900 }, { "epoch": 8.66606045576038, "grad_norm": 2.7181360721588135, "learning_rate": 2.1226574404263174e-05, "loss": 0.3584, "step": 629000 }, { "epoch": 8.66743820782012, "grad_norm": 4.30332612991333, "learning_rate": 2.1218770580612427e-05, "loss": 0.3758, "step": 629100 }, { "epoch": 8.66881595987986, "grad_norm": 10.797175407409668, "learning_rate": 2.1210967290584255e-05, "loss": 0.3691, "step": 629200 }, { "epoch": 8.6701937119396, "grad_norm": 1.8653578758239746, "learning_rate": 2.120316453484159e-05, "loss": 0.3064, "step": 629300 }, { "epoch": 8.671571463999339, "grad_norm": 1.7396917343139648, "learning_rate": 2.1195362314047304e-05, "loss": 0.3993, "step": 629400 }, { "epoch": 8.672949216059077, "grad_norm": 6.089523792266846, "learning_rate": 2.1187560628864264e-05, "loss": 0.3728, "step": 629500 }, { "epoch": 8.674326968118818, "grad_norm": 1.9020843505859375, "learning_rate": 2.1179759479955222e-05, "loss": 0.3878, "step": 629600 }, { "epoch": 8.675704720178556, "grad_norm": 1.65755033493042, "learning_rate": 2.1171958867982957e-05, "loss": 0.3914, "step": 629700 }, { "epoch": 8.677082472238297, "grad_norm": 13.32901382446289, "learning_rate": 2.1164158793610154e-05, "loss": 0.3985, "step": 629800 }, { "epoch": 8.678460224298036, "grad_norm": 2.6107616424560547, "learning_rate": 
2.1156359257499474e-05, "loss": 0.3577, "step": 629900 }, { "epoch": 8.679837976357774, "grad_norm": 4.511290073394775, "learning_rate": 2.1148560260313533e-05, "loss": 0.4, "step": 630000 }, { "epoch": 8.681215728417515, "grad_norm": 2.9307472705841064, "learning_rate": 2.11407618027149e-05, "loss": 0.3736, "step": 630100 }, { "epoch": 8.682593480477253, "grad_norm": 4.101418972015381, "learning_rate": 2.113296388536608e-05, "loss": 0.364, "step": 630200 }, { "epoch": 8.683971232536992, "grad_norm": 2.0894899368286133, "learning_rate": 2.1125166508929544e-05, "loss": 0.3727, "step": 630300 }, { "epoch": 8.685348984596732, "grad_norm": 5.536656856536865, "learning_rate": 2.1117369674067726e-05, "loss": 0.3735, "step": 630400 }, { "epoch": 8.686726736656471, "grad_norm": 3.605229139328003, "learning_rate": 2.1109573381443014e-05, "loss": 0.404, "step": 630500 }, { "epoch": 8.68810448871621, "grad_norm": 12.471243858337402, "learning_rate": 2.1101777631717737e-05, "loss": 0.3092, "step": 630600 }, { "epoch": 8.68948224077595, "grad_norm": 6.4673261642456055, "learning_rate": 2.1093982425554187e-05, "loss": 0.366, "step": 630700 }, { "epoch": 8.690859992835689, "grad_norm": 37.566261291503906, "learning_rate": 2.1086187763614597e-05, "loss": 0.3343, "step": 630800 }, { "epoch": 8.69223774489543, "grad_norm": 73.8747329711914, "learning_rate": 2.1078393646561162e-05, "loss": 0.363, "step": 630900 }, { "epoch": 8.693615496955168, "grad_norm": 4.5704474449157715, "learning_rate": 2.1070600075056048e-05, "loss": 0.4258, "step": 631000 }, { "epoch": 8.694993249014907, "grad_norm": 5.659417629241943, "learning_rate": 2.106280704976135e-05, "loss": 0.3957, "step": 631100 }, { "epoch": 8.696371001074647, "grad_norm": 1.0288254022598267, "learning_rate": 2.105501457133913e-05, "loss": 0.34, "step": 631200 }, { "epoch": 8.697748753134386, "grad_norm": 1.318321704864502, "learning_rate": 2.1047222640451394e-05, "loss": 0.3644, "step": 631300 }, { "epoch": 8.699126505194124, 
"grad_norm": 9.477001190185547, "learning_rate": 2.1039431257760093e-05, "loss": 0.3952, "step": 631400 }, { "epoch": 8.700504257253865, "grad_norm": 2.8804399967193604, "learning_rate": 2.1031640423927173e-05, "loss": 0.3663, "step": 631500 }, { "epoch": 8.701882009313604, "grad_norm": 3.0058481693267822, "learning_rate": 2.1023850139614483e-05, "loss": 0.3694, "step": 631600 }, { "epoch": 8.703259761373344, "grad_norm": 4.2868733406066895, "learning_rate": 2.1016060405483855e-05, "loss": 0.3723, "step": 631700 }, { "epoch": 8.704637513433083, "grad_norm": 3.027644157409668, "learning_rate": 2.1008271222197082e-05, "loss": 0.3173, "step": 631800 }, { "epoch": 8.706015265492821, "grad_norm": 7.656113147735596, "learning_rate": 2.100048259041586e-05, "loss": 0.3912, "step": 631900 }, { "epoch": 8.707393017552562, "grad_norm": 3.0780627727508545, "learning_rate": 2.0992694510801908e-05, "loss": 0.3499, "step": 632000 }, { "epoch": 8.7087707696123, "grad_norm": 2.0451438426971436, "learning_rate": 2.098490698401684e-05, "loss": 0.4192, "step": 632100 }, { "epoch": 8.71014852167204, "grad_norm": 1.1024266481399536, "learning_rate": 2.0977120010722256e-05, "loss": 0.3635, "step": 632200 }, { "epoch": 8.71152627373178, "grad_norm": 5.2056660652160645, "learning_rate": 2.0969333591579706e-05, "loss": 0.3522, "step": 632300 }, { "epoch": 8.712904025791518, "grad_norm": 5.245372772216797, "learning_rate": 2.0961547727250665e-05, "loss": 0.3681, "step": 632400 }, { "epoch": 8.714281777851259, "grad_norm": 3.949108600616455, "learning_rate": 2.09537624183966e-05, "loss": 0.42, "step": 632500 }, { "epoch": 8.715659529910997, "grad_norm": 31.30867576599121, "learning_rate": 2.094597766567891e-05, "loss": 0.3786, "step": 632600 }, { "epoch": 8.717037281970736, "grad_norm": 1.5206201076507568, "learning_rate": 2.0938193469758946e-05, "loss": 0.3889, "step": 632700 }, { "epoch": 8.718415034030476, "grad_norm": 4.9385809898376465, "learning_rate": 2.0930409831298016e-05, "loss": 
0.3642, "step": 632800 }, { "epoch": 8.719792786090215, "grad_norm": 2.5209903717041016, "learning_rate": 2.0922626750957388e-05, "loss": 0.4055, "step": 632900 }, { "epoch": 8.721170538149954, "grad_norm": 102.56981658935547, "learning_rate": 2.0914922051845713e-05, "loss": 0.3687, "step": 633000 }, { "epoch": 8.722548290209694, "grad_norm": 1.3988274335861206, "learning_rate": 2.0907217901037317e-05, "loss": 0.4149, "step": 633100 }, { "epoch": 8.723926042269433, "grad_norm": 4.61720085144043, "learning_rate": 2.0899436487816093e-05, "loss": 0.3269, "step": 633200 }, { "epoch": 8.725303794329172, "grad_norm": 2.69108510017395, "learning_rate": 2.0891655635346514e-05, "loss": 0.3574, "step": 633300 }, { "epoch": 8.726681546388912, "grad_norm": 4.8942766189575195, "learning_rate": 2.08838753442896e-05, "loss": 0.4135, "step": 633400 }, { "epoch": 8.72805929844865, "grad_norm": 2.1186954975128174, "learning_rate": 2.0876095615306317e-05, "loss": 0.3424, "step": 633500 }, { "epoch": 8.729437050508391, "grad_norm": 1.4958524703979492, "learning_rate": 2.0868316449057602e-05, "loss": 0.3547, "step": 633600 }, { "epoch": 8.73081480256813, "grad_norm": 1.7743455171585083, "learning_rate": 2.0860537846204325e-05, "loss": 0.3844, "step": 633700 }, { "epoch": 8.732192554627868, "grad_norm": 4.088932514190674, "learning_rate": 2.085275980740733e-05, "loss": 0.2961, "step": 633800 }, { "epoch": 8.733570306687609, "grad_norm": 4.657464981079102, "learning_rate": 2.0844982333327402e-05, "loss": 0.3539, "step": 633900 }, { "epoch": 8.734948058747348, "grad_norm": 3.937804698944092, "learning_rate": 2.0837205424625268e-05, "loss": 0.3732, "step": 634000 }, { "epoch": 8.736325810807088, "grad_norm": 1.4566566944122314, "learning_rate": 2.082942908196162e-05, "loss": 0.3736, "step": 634100 }, { "epoch": 8.737703562866827, "grad_norm": 4.650998592376709, "learning_rate": 2.082165330599709e-05, "loss": 0.3367, "step": 634200 }, { "epoch": 8.739081314926565, "grad_norm": 
1.7612947225570679, "learning_rate": 2.0813878097392283e-05, "loss": 0.3818, "step": 634300 }, { "epoch": 8.740459066986306, "grad_norm": 6.239738464355469, "learning_rate": 2.080610345680774e-05, "loss": 0.4275, "step": 634400 }, { "epoch": 8.741836819046044, "grad_norm": 2.9712233543395996, "learning_rate": 2.0798329384903958e-05, "loss": 0.3319, "step": 634500 }, { "epoch": 8.743214571105783, "grad_norm": 3.7142488956451416, "learning_rate": 2.0790555882341377e-05, "loss": 0.3258, "step": 634600 }, { "epoch": 8.744592323165524, "grad_norm": 0.22519977390766144, "learning_rate": 2.0782782949780393e-05, "loss": 0.3356, "step": 634700 }, { "epoch": 8.745970075225262, "grad_norm": 2.24440598487854, "learning_rate": 2.0775010587881365e-05, "loss": 0.3566, "step": 634800 }, { "epoch": 8.747347827285001, "grad_norm": 2.5549769401550293, "learning_rate": 2.0767238797304592e-05, "loss": 0.3664, "step": 634900 }, { "epoch": 8.748725579344741, "grad_norm": 3.2478723526000977, "learning_rate": 2.075946757871033e-05, "loss": 0.3617, "step": 635000 }, { "epoch": 8.75010333140448, "grad_norm": 0.5323312282562256, "learning_rate": 2.0751774636381554e-05, "loss": 0.3847, "step": 635100 }, { "epoch": 8.75148108346422, "grad_norm": 4.1103668212890625, "learning_rate": 2.074400455799658e-05, "loss": 0.3774, "step": 635200 }, { "epoch": 8.75285883552396, "grad_norm": 3.3447136878967285, "learning_rate": 2.073623505356798e-05, "loss": 0.3973, "step": 635300 }, { "epoch": 8.754236587583698, "grad_norm": 31.8425350189209, "learning_rate": 2.072846612375581e-05, "loss": 0.3592, "step": 635400 }, { "epoch": 8.755614339643438, "grad_norm": 6.60775089263916, "learning_rate": 2.0720697769220094e-05, "loss": 0.3959, "step": 635500 }, { "epoch": 8.756992091703177, "grad_norm": 3.9991989135742188, "learning_rate": 2.0712929990620785e-05, "loss": 0.3653, "step": 635600 }, { "epoch": 8.758369843762916, "grad_norm": 91.29729461669922, "learning_rate": 2.0705162788617797e-05, "loss": 0.3593, 
"step": 635700 }, { "epoch": 8.759747595822656, "grad_norm": 2.214317560195923, "learning_rate": 2.0697396163870987e-05, "loss": 0.2996, "step": 635800 }, { "epoch": 8.761125347882395, "grad_norm": 5.3555474281311035, "learning_rate": 2.0689630117040175e-05, "loss": 0.3369, "step": 635900 }, { "epoch": 8.762503099942135, "grad_norm": 3.082096815109253, "learning_rate": 2.0681864648785127e-05, "loss": 0.3839, "step": 636000 }, { "epoch": 8.763880852001874, "grad_norm": 3.0181756019592285, "learning_rate": 2.0674099759765565e-05, "loss": 0.3329, "step": 636100 }, { "epoch": 8.765258604061613, "grad_norm": 2.8812708854675293, "learning_rate": 2.0666335450641147e-05, "loss": 0.3516, "step": 636200 }, { "epoch": 8.766636356121353, "grad_norm": 1.071608066558838, "learning_rate": 2.0658571722071487e-05, "loss": 0.408, "step": 636300 }, { "epoch": 8.768014108181092, "grad_norm": 31.29279327392578, "learning_rate": 2.065080857471616e-05, "loss": 0.3685, "step": 636400 }, { "epoch": 8.76939186024083, "grad_norm": 6.259915351867676, "learning_rate": 2.0643046009234683e-05, "loss": 0.4008, "step": 636500 }, { "epoch": 8.77076961230057, "grad_norm": 2.4642298221588135, "learning_rate": 2.0635284026286534e-05, "loss": 0.3663, "step": 636600 }, { "epoch": 8.77214736436031, "grad_norm": 9.889074325561523, "learning_rate": 2.0627522626531134e-05, "loss": 0.3251, "step": 636700 }, { "epoch": 8.77352511642005, "grad_norm": 8.76566219329834, "learning_rate": 2.0619761810627836e-05, "loss": 0.3339, "step": 636800 }, { "epoch": 8.774902868479789, "grad_norm": 4.284940242767334, "learning_rate": 2.0612001579235964e-05, "loss": 0.4172, "step": 636900 }, { "epoch": 8.776280620539527, "grad_norm": 5.477663516998291, "learning_rate": 2.0604241933014808e-05, "loss": 0.3845, "step": 637000 }, { "epoch": 8.777658372599268, "grad_norm": 1.4679712057113647, "learning_rate": 2.0596482872623574e-05, "loss": 0.3244, "step": 637100 }, { "epoch": 8.779036124659006, "grad_norm": 2.950423240661621, 
"learning_rate": 2.0588724398721438e-05, "loss": 0.2932, "step": 637200 }, { "epoch": 8.780413876718745, "grad_norm": 1.8477563858032227, "learning_rate": 2.0580966511967535e-05, "loss": 0.3989, "step": 637300 }, { "epoch": 8.781791628778485, "grad_norm": 2.641486644744873, "learning_rate": 2.0573209213020905e-05, "loss": 0.3496, "step": 637400 }, { "epoch": 8.783169380838224, "grad_norm": 5.36055326461792, "learning_rate": 2.05654525025406e-05, "loss": 0.3255, "step": 637500 }, { "epoch": 8.784547132897963, "grad_norm": 3.5428483486175537, "learning_rate": 2.055769638118558e-05, "loss": 0.3172, "step": 637600 }, { "epoch": 8.785924884957703, "grad_norm": 2.5028932094573975, "learning_rate": 2.0549940849614762e-05, "loss": 0.3935, "step": 637700 }, { "epoch": 8.787302637017442, "grad_norm": 20.1326904296875, "learning_rate": 2.054218590848704e-05, "loss": 0.4184, "step": 637800 }, { "epoch": 8.788680389077182, "grad_norm": 3.7458786964416504, "learning_rate": 2.053443155846121e-05, "loss": 0.3261, "step": 637900 }, { "epoch": 8.790058141136921, "grad_norm": 2.482987403869629, "learning_rate": 2.0526677800196055e-05, "loss": 0.3527, "step": 638000 }, { "epoch": 8.79143589319666, "grad_norm": 4.323294162750244, "learning_rate": 2.05189246343503e-05, "loss": 0.359, "step": 638100 }, { "epoch": 8.7928136452564, "grad_norm": 6.655841827392578, "learning_rate": 2.0511172061582603e-05, "loss": 0.3961, "step": 638200 }, { "epoch": 8.794191397316139, "grad_norm": 3.3421835899353027, "learning_rate": 2.0503420082551603e-05, "loss": 0.3659, "step": 638300 }, { "epoch": 8.79556914937588, "grad_norm": 3.375495672225952, "learning_rate": 2.0495668697915867e-05, "loss": 0.3447, "step": 638400 }, { "epoch": 8.796946901435618, "grad_norm": 2.357154130935669, "learning_rate": 2.04879179083339e-05, "loss": 0.3615, "step": 638500 }, { "epoch": 8.798324653495357, "grad_norm": 0.9620901346206665, "learning_rate": 2.048016771446418e-05, "loss": 0.3136, "step": 638600 }, { "epoch": 
8.799702405555097, "grad_norm": 3.7110488414764404, "learning_rate": 2.0472418116965123e-05, "loss": 0.3705, "step": 638700 }, { "epoch": 8.801080157614836, "grad_norm": 3.417762041091919, "learning_rate": 2.0464669116495107e-05, "loss": 0.3978, "step": 638800 }, { "epoch": 8.802457909674574, "grad_norm": 3.186450719833374, "learning_rate": 2.0456920713712447e-05, "loss": 0.3583, "step": 638900 }, { "epoch": 8.803835661734315, "grad_norm": 8.985251426696777, "learning_rate": 2.04491729092754e-05, "loss": 0.3596, "step": 639000 }, { "epoch": 8.805213413794053, "grad_norm": 2.134378671646118, "learning_rate": 2.0441425703842182e-05, "loss": 0.318, "step": 639100 }, { "epoch": 8.806591165853792, "grad_norm": 4.261523723602295, "learning_rate": 2.0433679098070955e-05, "loss": 0.357, "step": 639200 }, { "epoch": 8.807968917913533, "grad_norm": 1.3818753957748413, "learning_rate": 2.0425933092619848e-05, "loss": 0.3662, "step": 639300 }, { "epoch": 8.809346669973271, "grad_norm": 4.6248674392700195, "learning_rate": 2.041818768814692e-05, "loss": 0.3433, "step": 639400 }, { "epoch": 8.810724422033012, "grad_norm": 3.4297780990600586, "learning_rate": 2.0410442885310178e-05, "loss": 0.3308, "step": 639500 }, { "epoch": 8.81210217409275, "grad_norm": 1.6342157125473022, "learning_rate": 2.0402698684767576e-05, "loss": 0.3548, "step": 639600 }, { "epoch": 8.813479926152489, "grad_norm": 2.110055685043335, "learning_rate": 2.0394955087177028e-05, "loss": 0.348, "step": 639700 }, { "epoch": 8.81485767821223, "grad_norm": 1.732565999031067, "learning_rate": 2.03872120931964e-05, "loss": 0.3287, "step": 639800 }, { "epoch": 8.816235430271968, "grad_norm": 14.913119316101074, "learning_rate": 2.037946970348349e-05, "loss": 0.3657, "step": 639900 }, { "epoch": 8.817613182331707, "grad_norm": 3.0401253700256348, "learning_rate": 2.0371805333547403e-05, "loss": 0.343, "step": 640000 }, { "epoch": 8.818990934391447, "grad_norm": 8.204343795776367, "learning_rate": 
2.0364064148284063e-05, "loss": 0.386, "step": 640100 }, { "epoch": 8.820368686451186, "grad_norm": 2.843414306640625, "learning_rate": 2.0356323569254984e-05, "loss": 0.403, "step": 640200 }, { "epoch": 8.821746438510926, "grad_norm": 1.1177184581756592, "learning_rate": 2.0348583597117767e-05, "loss": 0.354, "step": 640300 }, { "epoch": 8.823124190570665, "grad_norm": 1.9453452825546265, "learning_rate": 2.0340844232529952e-05, "loss": 0.3325, "step": 640400 }, { "epoch": 8.824501942630404, "grad_norm": 1.9291720390319824, "learning_rate": 2.033310547614905e-05, "loss": 0.4021, "step": 640500 }, { "epoch": 8.825879694690144, "grad_norm": 18.277000427246094, "learning_rate": 2.0325367328632514e-05, "loss": 0.3942, "step": 640600 }, { "epoch": 8.827257446749883, "grad_norm": 3.3130598068237305, "learning_rate": 2.031762979063772e-05, "loss": 0.3686, "step": 640700 }, { "epoch": 8.828635198809621, "grad_norm": 2.889070510864258, "learning_rate": 2.030989286282202e-05, "loss": 0.3175, "step": 640800 }, { "epoch": 8.830012950869362, "grad_norm": 3.128248453140259, "learning_rate": 2.0302156545842698e-05, "loss": 0.3889, "step": 640900 }, { "epoch": 8.8313907029291, "grad_norm": 2.4471993446350098, "learning_rate": 2.0294420840357008e-05, "loss": 0.3585, "step": 641000 }, { "epoch": 8.832768454988841, "grad_norm": 2.7810728549957275, "learning_rate": 2.028668574702214e-05, "loss": 0.347, "step": 641100 }, { "epoch": 8.83414620704858, "grad_norm": 3.0514719486236572, "learning_rate": 2.0278951266495213e-05, "loss": 0.3781, "step": 641200 }, { "epoch": 8.835523959108318, "grad_norm": 18.44270896911621, "learning_rate": 2.0271217399433314e-05, "loss": 0.3712, "step": 641300 }, { "epoch": 8.836901711168059, "grad_norm": 6.6126532554626465, "learning_rate": 2.026348414649348e-05, "loss": 0.383, "step": 641400 }, { "epoch": 8.838279463227797, "grad_norm": 2.161134719848633, "learning_rate": 2.0255751508332694e-05, "loss": 0.4232, "step": 641500 }, { "epoch": 
8.839657215287536, "grad_norm": 0.09722273051738739, "learning_rate": 2.0248019485607877e-05, "loss": 0.3768, "step": 641600 }, { "epoch": 8.841034967347277, "grad_norm": 1.4188485145568848, "learning_rate": 2.0240288078975913e-05, "loss": 0.3404, "step": 641700 }, { "epoch": 8.842412719407015, "grad_norm": 2.2287075519561768, "learning_rate": 2.023255728909361e-05, "loss": 0.3726, "step": 641800 }, { "epoch": 8.843790471466754, "grad_norm": 2.5462350845336914, "learning_rate": 2.0224827116617738e-05, "loss": 0.4319, "step": 641900 }, { "epoch": 8.845168223526494, "grad_norm": 17.24924659729004, "learning_rate": 2.021709756220503e-05, "loss": 0.3561, "step": 642000 }, { "epoch": 8.846545975586233, "grad_norm": 2.118520736694336, "learning_rate": 2.020936862651214e-05, "loss": 0.3184, "step": 642100 }, { "epoch": 8.847923727645973, "grad_norm": 2.4389121532440186, "learning_rate": 2.0201640310195696e-05, "loss": 0.3698, "step": 642200 }, { "epoch": 8.849301479705712, "grad_norm": 1.0755431652069092, "learning_rate": 2.0193989887803747e-05, "loss": 0.4169, "step": 642300 }, { "epoch": 8.85067923176545, "grad_norm": 0.5344855785369873, "learning_rate": 2.0186262805999645e-05, "loss": 0.385, "step": 642400 }, { "epoch": 8.852056983825191, "grad_norm": 1.9990497827529907, "learning_rate": 2.0178536345534926e-05, "loss": 0.3362, "step": 642500 }, { "epoch": 8.85343473588493, "grad_norm": 3.2231595516204834, "learning_rate": 2.0170810507066012e-05, "loss": 0.3497, "step": 642600 }, { "epoch": 8.85481248794467, "grad_norm": 69.06319427490234, "learning_rate": 2.0163085291249235e-05, "loss": 0.3334, "step": 642700 }, { "epoch": 8.856190240004409, "grad_norm": 13.277430534362793, "learning_rate": 2.0155360698740905e-05, "loss": 0.3377, "step": 642800 }, { "epoch": 8.857567992064148, "grad_norm": 4.476804733276367, "learning_rate": 2.0147636730197256e-05, "loss": 0.3558, "step": 642900 }, { "epoch": 8.858945744123888, "grad_norm": 9.009331703186035, "learning_rate": 
2.013991338627447e-05, "loss": 0.3262, "step": 643000 }, { "epoch": 8.860323496183627, "grad_norm": 2.8945515155792236, "learning_rate": 2.0132267891717896e-05, "loss": 0.3657, "step": 643100 }, { "epoch": 8.861701248243365, "grad_norm": 3.7373850345611572, "learning_rate": 2.0124545792742638e-05, "loss": 0.355, "step": 643200 }, { "epoch": 8.863079000303106, "grad_norm": 2.566220760345459, "learning_rate": 2.011682432034997e-05, "loss": 0.3767, "step": 643300 }, { "epoch": 8.864456752362845, "grad_norm": 9.89470386505127, "learning_rate": 2.0109103475195833e-05, "loss": 0.407, "step": 643400 }, { "epoch": 8.865834504422583, "grad_norm": 10.400753021240234, "learning_rate": 2.010138325793617e-05, "loss": 0.3621, "step": 643500 }, { "epoch": 8.867212256482324, "grad_norm": 3.108999729156494, "learning_rate": 2.009366366922685e-05, "loss": 0.3685, "step": 643600 }, { "epoch": 8.868590008542062, "grad_norm": 3.425283193588257, "learning_rate": 2.0085944709723686e-05, "loss": 0.3849, "step": 643700 }, { "epoch": 8.869967760601803, "grad_norm": 1.8222777843475342, "learning_rate": 2.007822638008246e-05, "loss": 0.3713, "step": 643800 }, { "epoch": 8.871345512661541, "grad_norm": 2.6649415493011475, "learning_rate": 2.0070508680958872e-05, "loss": 0.353, "step": 643900 }, { "epoch": 8.87272326472128, "grad_norm": 28.03338623046875, "learning_rate": 2.0062791613008575e-05, "loss": 0.4077, "step": 644000 }, { "epoch": 8.87410101678102, "grad_norm": 5.776099681854248, "learning_rate": 2.0055075176887183e-05, "loss": 0.3818, "step": 644100 }, { "epoch": 8.87547876884076, "grad_norm": 4.197676181793213, "learning_rate": 2.0047359373250233e-05, "loss": 0.3454, "step": 644200 }, { "epoch": 8.876856520900498, "grad_norm": 24.748563766479492, "learning_rate": 2.003964420275324e-05, "loss": 0.3467, "step": 644300 }, { "epoch": 8.878234272960238, "grad_norm": 22.444381713867188, "learning_rate": 2.0031929666051646e-05, "loss": 0.3601, "step": 644400 }, { "epoch": 8.879612025019977, 
"grad_norm": 4.238951683044434, "learning_rate": 2.0024215763800825e-05, "loss": 0.3603, "step": 644500 }, { "epoch": 8.880989777079717, "grad_norm": 3.1222634315490723, "learning_rate": 2.001650249665612e-05, "loss": 0.3436, "step": 644600 }, { "epoch": 8.882367529139456, "grad_norm": 4.98201322555542, "learning_rate": 2.000878986527281e-05, "loss": 0.3781, "step": 644700 }, { "epoch": 8.883745281199195, "grad_norm": 4.639926910400391, "learning_rate": 2.000107787030613e-05, "loss": 0.3801, "step": 644800 }, { "epoch": 8.885123033258935, "grad_norm": 4.252802848815918, "learning_rate": 1.9993366512411248e-05, "loss": 0.3545, "step": 644900 }, { "epoch": 8.886500785318674, "grad_norm": 2.558443546295166, "learning_rate": 1.9985655792243293e-05, "loss": 0.405, "step": 645000 }, { "epoch": 8.887878537378413, "grad_norm": 1.008420467376709, "learning_rate": 1.997794571045731e-05, "loss": 0.3637, "step": 645100 }, { "epoch": 8.889256289438153, "grad_norm": 17.324047088623047, "learning_rate": 1.9970236267708326e-05, "loss": 0.3195, "step": 645200 }, { "epoch": 8.890634041497892, "grad_norm": 2.8452277183532715, "learning_rate": 1.9962527464651288e-05, "loss": 0.4106, "step": 645300 }, { "epoch": 8.892011793557632, "grad_norm": 2.7887914180755615, "learning_rate": 1.9954819301941112e-05, "loss": 0.3562, "step": 645400 }, { "epoch": 8.89338954561737, "grad_norm": 11.319156646728516, "learning_rate": 1.9947111780232637e-05, "loss": 0.3825, "step": 645500 }, { "epoch": 8.89476729767711, "grad_norm": 1.8130007982254028, "learning_rate": 1.993940490018065e-05, "loss": 0.3958, "step": 645600 }, { "epoch": 8.89614504973685, "grad_norm": 5.498356819152832, "learning_rate": 1.9931698662439896e-05, "loss": 0.3622, "step": 645700 }, { "epoch": 8.897522801796589, "grad_norm": 2.856766939163208, "learning_rate": 1.9923993067665063e-05, "loss": 0.34, "step": 645800 }, { "epoch": 8.898900553856327, "grad_norm": 1.9779382944107056, "learning_rate": 1.991628811651078e-05, "loss": 
0.3917, "step": 645900 }, { "epoch": 8.900278305916068, "grad_norm": 2.144191265106201, "learning_rate": 1.9908583809631612e-05, "loss": 0.3565, "step": 646000 }, { "epoch": 8.901656057975806, "grad_norm": 4.26161003112793, "learning_rate": 1.99008801476821e-05, "loss": 0.3507, "step": 646100 }, { "epoch": 8.903033810035547, "grad_norm": 0.9047802090644836, "learning_rate": 1.9893177131316685e-05, "loss": 0.3669, "step": 646200 }, { "epoch": 8.904411562095286, "grad_norm": 2.501375675201416, "learning_rate": 1.988547476118979e-05, "loss": 0.3704, "step": 646300 }, { "epoch": 8.905789314155024, "grad_norm": 4.128477573394775, "learning_rate": 1.987777303795577e-05, "loss": 0.3601, "step": 646400 }, { "epoch": 8.907167066214765, "grad_norm": 5.648437023162842, "learning_rate": 1.9870071962268926e-05, "loss": 0.3672, "step": 646500 }, { "epoch": 8.908544818274503, "grad_norm": 3.3617026805877686, "learning_rate": 1.9862371534783517e-05, "loss": 0.3645, "step": 646600 }, { "epoch": 8.909922570334242, "grad_norm": 1.1747711896896362, "learning_rate": 1.985474875072602e-05, "loss": 0.3421, "step": 646700 }, { "epoch": 8.911300322393982, "grad_norm": 10.259430885314941, "learning_rate": 1.9847126603246615e-05, "loss": 0.4167, "step": 646800 }, { "epoch": 8.912678074453721, "grad_norm": 3.7096409797668457, "learning_rate": 1.9839428111280693e-05, "loss": 0.3456, "step": 646900 }, { "epoch": 8.914055826513462, "grad_norm": 4.699954032897949, "learning_rate": 1.983173027011954e-05, "loss": 0.3976, "step": 647000 }, { "epoch": 8.9154335785732, "grad_norm": 1.1032865047454834, "learning_rate": 1.982403308041714e-05, "loss": 0.3781, "step": 647100 }, { "epoch": 8.916811330632939, "grad_norm": 2.5502681732177734, "learning_rate": 1.98163365428274e-05, "loss": 0.3834, "step": 647200 }, { "epoch": 8.91818908269268, "grad_norm": 4.178029537200928, "learning_rate": 1.9808640658004177e-05, "loss": 0.3739, "step": 647300 }, { "epoch": 8.919566834752418, "grad_norm": 2.408040761947632, 
"learning_rate": 1.980094542660128e-05, "loss": 0.3741, "step": 647400 }, { "epoch": 8.920944586812157, "grad_norm": 3.5056746006011963, "learning_rate": 1.979325084927245e-05, "loss": 0.2945, "step": 647500 }, { "epoch": 8.922322338871897, "grad_norm": 15.244461059570312, "learning_rate": 1.9785556926671394e-05, "loss": 0.4315, "step": 647600 }, { "epoch": 8.923700090931636, "grad_norm": 4.473372936248779, "learning_rate": 1.977786365945175e-05, "loss": 0.374, "step": 647700 }, { "epoch": 8.925077842991374, "grad_norm": 2.718208074569702, "learning_rate": 1.9770171048267088e-05, "loss": 0.3505, "step": 647800 }, { "epoch": 8.926455595051115, "grad_norm": 1.6715748310089111, "learning_rate": 1.976247909377094e-05, "loss": 0.3496, "step": 647900 }, { "epoch": 8.927833347110854, "grad_norm": 2.0373291969299316, "learning_rate": 1.9754787796616774e-05, "loss": 0.3915, "step": 648000 }, { "epoch": 8.929211099170594, "grad_norm": 0.7419623136520386, "learning_rate": 1.9747097157458015e-05, "loss": 0.3486, "step": 648100 }, { "epoch": 8.930588851230333, "grad_norm": 1.6946591138839722, "learning_rate": 1.9739407176948014e-05, "loss": 0.3539, "step": 648200 }, { "epoch": 8.931966603290071, "grad_norm": 4.420561790466309, "learning_rate": 1.973171785574008e-05, "loss": 0.3821, "step": 648300 }, { "epoch": 8.933344355349812, "grad_norm": 0.7315015196800232, "learning_rate": 1.9724029194487457e-05, "loss": 0.357, "step": 648400 }, { "epoch": 8.93472210740955, "grad_norm": 2.912536382675171, "learning_rate": 1.9716341193843322e-05, "loss": 0.3878, "step": 648500 }, { "epoch": 8.936099859469289, "grad_norm": 2.9838454723358154, "learning_rate": 1.9708653854460838e-05, "loss": 0.3609, "step": 648600 }, { "epoch": 8.93747761152903, "grad_norm": 16.680017471313477, "learning_rate": 1.9700967176993063e-05, "loss": 0.3641, "step": 648700 }, { "epoch": 8.938855363588768, "grad_norm": 0.606227695941925, "learning_rate": 1.9693281162093034e-05, "loss": 0.3477, "step": 648800 }, { 
"epoch": 8.940233115648509, "grad_norm": 2.8133199214935303, "learning_rate": 1.9685595810413703e-05, "loss": 0.3109, "step": 648900 }, { "epoch": 8.941610867708247, "grad_norm": 2.7305634021759033, "learning_rate": 1.9677911122607982e-05, "loss": 0.4273, "step": 649000 }, { "epoch": 8.942988619767986, "grad_norm": 4.37584114074707, "learning_rate": 1.9670227099328732e-05, "loss": 0.3321, "step": 649100 }, { "epoch": 8.944366371827726, "grad_norm": 3.619100570678711, "learning_rate": 1.966254374122875e-05, "loss": 0.3497, "step": 649200 }, { "epoch": 8.945744123887465, "grad_norm": 1.2310850620269775, "learning_rate": 1.9654861048960758e-05, "loss": 0.339, "step": 649300 }, { "epoch": 8.947121875947204, "grad_norm": 5.259525775909424, "learning_rate": 1.9647179023177484e-05, "loss": 0.3635, "step": 649400 }, { "epoch": 8.948499628006944, "grad_norm": 2.097104787826538, "learning_rate": 1.9639497664531505e-05, "loss": 0.3272, "step": 649500 }, { "epoch": 8.949877380066683, "grad_norm": 25.21429443359375, "learning_rate": 1.963181697367542e-05, "loss": 0.3521, "step": 649600 }, { "epoch": 8.951255132126423, "grad_norm": 4.1588921546936035, "learning_rate": 1.9624136951261736e-05, "loss": 0.3512, "step": 649700 }, { "epoch": 8.952632884186162, "grad_norm": 6.7349982261657715, "learning_rate": 1.9616457597942906e-05, "loss": 0.4332, "step": 649800 }, { "epoch": 8.9540106362459, "grad_norm": 2.080474376678467, "learning_rate": 1.960877891437135e-05, "loss": 0.3255, "step": 649900 }, { "epoch": 8.955388388305641, "grad_norm": 2.8107616901397705, "learning_rate": 1.9601100901199377e-05, "loss": 0.3393, "step": 650000 }, { "epoch": 8.95676614036538, "grad_norm": 4.335209369659424, "learning_rate": 1.9593423559079307e-05, "loss": 0.2934, "step": 650100 }, { "epoch": 8.958143892425118, "grad_norm": 2.77384090423584, "learning_rate": 1.9585746888663346e-05, "loss": 0.3469, "step": 650200 }, { "epoch": 8.959521644484859, "grad_norm": 4.639960289001465, "learning_rate": 
1.9578070890603674e-05, "loss": 0.4201, "step": 650300 }, { "epoch": 8.960899396544598, "grad_norm": 1.8101547956466675, "learning_rate": 1.9570395565552418e-05, "loss": 0.3313, "step": 650400 }, { "epoch": 8.962277148604338, "grad_norm": 4.467845439910889, "learning_rate": 1.956272091416163e-05, "loss": 0.3304, "step": 650500 }, { "epoch": 8.963654900664077, "grad_norm": 0.37650877237319946, "learning_rate": 1.9555046937083298e-05, "loss": 0.349, "step": 650600 }, { "epoch": 8.965032652723815, "grad_norm": 3.261247158050537, "learning_rate": 1.9547373634969377e-05, "loss": 0.3851, "step": 650700 }, { "epoch": 8.966410404783556, "grad_norm": 2.061296224594116, "learning_rate": 1.9539701008471744e-05, "loss": 0.3765, "step": 650800 }, { "epoch": 8.967788156843294, "grad_norm": 2.228896379470825, "learning_rate": 1.953202905824224e-05, "loss": 0.3305, "step": 650900 }, { "epoch": 8.969165908903033, "grad_norm": 8.218269348144531, "learning_rate": 1.952435778493264e-05, "loss": 0.3627, "step": 651000 }, { "epoch": 8.970543660962774, "grad_norm": 2.1343777179718018, "learning_rate": 1.9516687189194645e-05, "loss": 0.3877, "step": 651100 }, { "epoch": 8.971921413022512, "grad_norm": 40.226924896240234, "learning_rate": 1.950901727167991e-05, "loss": 0.3208, "step": 651200 }, { "epoch": 8.973299165082253, "grad_norm": 9.179095268249512, "learning_rate": 1.9501348033040035e-05, "loss": 0.3757, "step": 651300 }, { "epoch": 8.974676917141991, "grad_norm": 1.7589216232299805, "learning_rate": 1.9493679473926574e-05, "loss": 0.338, "step": 651400 }, { "epoch": 8.97605466920173, "grad_norm": 1.6015539169311523, "learning_rate": 1.9486011594991e-05, "loss": 0.3935, "step": 651500 }, { "epoch": 8.97743242126147, "grad_norm": 2.482551336288452, "learning_rate": 1.9478344396884748e-05, "loss": 0.3721, "step": 651600 }, { "epoch": 8.97881017332121, "grad_norm": 4.991109371185303, "learning_rate": 1.947067788025917e-05, "loss": 0.4536, "step": 651700 }, { "epoch": 8.980187925380948, 
"grad_norm": 31.218257904052734, "learning_rate": 1.946301204576558e-05, "loss": 0.3648, "step": 651800 }, { "epoch": 8.981565677440688, "grad_norm": 4.4216437339782715, "learning_rate": 1.9455346894055235e-05, "loss": 0.3503, "step": 651900 }, { "epoch": 8.982943429500427, "grad_norm": 12.041495323181152, "learning_rate": 1.9447682425779333e-05, "loss": 0.4132, "step": 652000 }, { "epoch": 8.984321181560166, "grad_norm": 4.941658973693848, "learning_rate": 1.9440018641589e-05, "loss": 0.322, "step": 652100 }, { "epoch": 8.985698933619906, "grad_norm": 1.1174205541610718, "learning_rate": 1.9432355542135327e-05, "loss": 0.3558, "step": 652200 }, { "epoch": 8.987076685679645, "grad_norm": 9.361457824707031, "learning_rate": 1.9424693128069313e-05, "loss": 0.3712, "step": 652300 }, { "epoch": 8.988454437739385, "grad_norm": 3.7306151390075684, "learning_rate": 1.9417031400041933e-05, "loss": 0.3485, "step": 652400 }, { "epoch": 8.989832189799124, "grad_norm": 0.7190465331077576, "learning_rate": 1.9409370358704093e-05, "loss": 0.3853, "step": 652500 }, { "epoch": 8.991209941858862, "grad_norm": 2.62778639793396, "learning_rate": 1.940171000470663e-05, "loss": 0.343, "step": 652600 }, { "epoch": 8.992587693918603, "grad_norm": 2.3849852085113525, "learning_rate": 1.9394050338700335e-05, "loss": 0.3181, "step": 652700 }, { "epoch": 8.993965445978342, "grad_norm": 3.9958038330078125, "learning_rate": 1.9386391361335924e-05, "loss": 0.3648, "step": 652800 }, { "epoch": 8.99534319803808, "grad_norm": 6.0911431312561035, "learning_rate": 1.937880965273067e-05, "loss": 0.3746, "step": 652900 }, { "epoch": 8.99672095009782, "grad_norm": 3.161404848098755, "learning_rate": 1.9371152047699343e-05, "loss": 0.2874, "step": 653000 }, { "epoch": 8.99809870215756, "grad_norm": 2.9368815422058105, "learning_rate": 1.9363495133255242e-05, "loss": 0.3651, "step": 653100 }, { "epoch": 8.9994764542173, "grad_norm": 3.8970587253570557, "learning_rate": 1.9355838910048867e-05, "loss": 
0.3118, "step": 653200 }, { "epoch": 9.000854206277038, "grad_norm": 2.086366891860962, "learning_rate": 1.9348183378730632e-05, "loss": 0.3165, "step": 653300 }, { "epoch": 9.002231958336777, "grad_norm": 14.260379791259766, "learning_rate": 1.9340528539950922e-05, "loss": 0.3473, "step": 653400 }, { "epoch": 9.003609710396518, "grad_norm": 1.8470979928970337, "learning_rate": 1.9332874394360053e-05, "loss": 0.2843, "step": 653500 }, { "epoch": 9.004987462456256, "grad_norm": 1.7738455533981323, "learning_rate": 1.9325220942608287e-05, "loss": 0.3101, "step": 653600 }, { "epoch": 9.006365214515995, "grad_norm": 2.6851699352264404, "learning_rate": 1.931756818534583e-05, "loss": 0.3478, "step": 653700 }, { "epoch": 9.007742966575735, "grad_norm": 1.751868486404419, "learning_rate": 1.930991612322282e-05, "loss": 0.2857, "step": 653800 }, { "epoch": 9.009120718635474, "grad_norm": 0.8391517996788025, "learning_rate": 1.9302264756889334e-05, "loss": 0.3333, "step": 653900 }, { "epoch": 9.010498470695214, "grad_norm": 5.324915409088135, "learning_rate": 1.929461408699538e-05, "loss": 0.3862, "step": 654000 }, { "epoch": 9.011876222754953, "grad_norm": 0.32781800627708435, "learning_rate": 1.9286964114190953e-05, "loss": 0.3942, "step": 654100 }, { "epoch": 9.013253974814692, "grad_norm": 1.898881435394287, "learning_rate": 1.927931483912594e-05, "loss": 0.3558, "step": 654200 }, { "epoch": 9.014631726874432, "grad_norm": 7.0831451416015625, "learning_rate": 1.927166626245018e-05, "loss": 0.2918, "step": 654300 }, { "epoch": 9.016009478934171, "grad_norm": 5.255655288696289, "learning_rate": 1.9264018384813488e-05, "loss": 0.3313, "step": 654400 }, { "epoch": 9.01738723099391, "grad_norm": 5.007075786590576, "learning_rate": 1.9256371206865546e-05, "loss": 0.3338, "step": 654500 }, { "epoch": 9.01876498305365, "grad_norm": 5.827671527862549, "learning_rate": 1.924872472925606e-05, "loss": 0.3802, "step": 654600 }, { "epoch": 9.020142735113389, "grad_norm": 
2.536600112915039, "learning_rate": 1.924107895263461e-05, "loss": 0.3368, "step": 654700 }, { "epoch": 9.02152048717313, "grad_norm": 2.9947972297668457, "learning_rate": 1.9233433877650757e-05, "loss": 0.3148, "step": 654800 }, { "epoch": 9.022898239232868, "grad_norm": 3.742250919342041, "learning_rate": 1.9225789504954e-05, "loss": 0.3346, "step": 654900 }, { "epoch": 9.024275991292606, "grad_norm": 2.8474090099334717, "learning_rate": 1.921814583519374e-05, "loss": 0.3194, "step": 655000 }, { "epoch": 9.025653743352347, "grad_norm": 0.39667606353759766, "learning_rate": 1.921050286901937e-05, "loss": 0.344, "step": 655100 }, { "epoch": 9.027031495412086, "grad_norm": 1.5153580904006958, "learning_rate": 1.920293702621148e-05, "loss": 0.3408, "step": 655200 }, { "epoch": 9.028409247471824, "grad_norm": 3.461282253265381, "learning_rate": 1.9195295462104678e-05, "loss": 0.3435, "step": 655300 }, { "epoch": 9.029786999531565, "grad_norm": 11.459829330444336, "learning_rate": 1.918765460352502e-05, "loss": 0.3204, "step": 655400 }, { "epoch": 9.031164751591303, "grad_norm": 2.755305290222168, "learning_rate": 1.918001445112162e-05, "loss": 0.3978, "step": 655500 }, { "epoch": 9.032542503651044, "grad_norm": 4.273592948913574, "learning_rate": 1.9172375005543557e-05, "loss": 0.3399, "step": 655600 }, { "epoch": 9.033920255710782, "grad_norm": 1.006514072418213, "learning_rate": 1.9164736267439843e-05, "loss": 0.292, "step": 655700 }, { "epoch": 9.035298007770521, "grad_norm": 1.3249393701553345, "learning_rate": 1.9157098237459427e-05, "loss": 0.3759, "step": 655800 }, { "epoch": 9.036675759830262, "grad_norm": 6.889577865600586, "learning_rate": 1.91494609162512e-05, "loss": 0.3488, "step": 655900 }, { "epoch": 9.03805351189, "grad_norm": 5.249415397644043, "learning_rate": 1.914182430446399e-05, "loss": 0.3441, "step": 656000 }, { "epoch": 9.039431263949739, "grad_norm": 5.642026424407959, "learning_rate": 1.9134188402746553e-05, "loss": 0.3226, "step": 656100 }, 
{ "epoch": 9.04080901600948, "grad_norm": 1.900230884552002, "learning_rate": 1.912655321174762e-05, "loss": 0.3411, "step": 656200 }, { "epoch": 9.042186768069218, "grad_norm": 1.1722453832626343, "learning_rate": 1.911891873211583e-05, "loss": 0.3359, "step": 656300 }, { "epoch": 9.043564520128957, "grad_norm": 4.841881275177002, "learning_rate": 1.9111284964499772e-05, "loss": 0.2931, "step": 656400 }, { "epoch": 9.044942272188697, "grad_norm": 1.3248008489608765, "learning_rate": 1.9103651909547967e-05, "loss": 0.3582, "step": 656500 }, { "epoch": 9.046320024248436, "grad_norm": 2.876638412475586, "learning_rate": 1.9096019567908903e-05, "loss": 0.3125, "step": 656600 }, { "epoch": 9.047697776308176, "grad_norm": 2.9264562129974365, "learning_rate": 1.9088387940230955e-05, "loss": 0.3023, "step": 656700 }, { "epoch": 9.049075528367915, "grad_norm": 0.5946580767631531, "learning_rate": 1.9080757027162488e-05, "loss": 0.3055, "step": 656800 }, { "epoch": 9.050453280427654, "grad_norm": 6.849308490753174, "learning_rate": 1.9073126829351785e-05, "loss": 0.3995, "step": 656900 }, { "epoch": 9.051831032487394, "grad_norm": 1.8169491291046143, "learning_rate": 1.9065573638720256e-05, "loss": 0.3333, "step": 657000 }, { "epoch": 9.053208784547133, "grad_norm": 6.851864337921143, "learning_rate": 1.905794486620095e-05, "loss": 0.3623, "step": 657100 }, { "epoch": 9.054586536606871, "grad_norm": 1.1169012784957886, "learning_rate": 1.905031681087741e-05, "loss": 0.3315, "step": 657200 }, { "epoch": 9.055964288666612, "grad_norm": 7.5937957763671875, "learning_rate": 1.9042689473397678e-05, "loss": 0.3001, "step": 657300 }, { "epoch": 9.05734204072635, "grad_norm": 2.8620786666870117, "learning_rate": 1.903506285440975e-05, "loss": 0.3373, "step": 657400 }, { "epoch": 9.058719792786091, "grad_norm": 3.499518394470215, "learning_rate": 1.902743695456154e-05, "loss": 0.3202, "step": 657500 }, { "epoch": 9.06009754484583, "grad_norm": 1.1459527015686035, "learning_rate": 
1.9019811774500907e-05, "loss": 0.3682, "step": 657600 }, { "epoch": 9.061475296905568, "grad_norm": 9.876166343688965, "learning_rate": 1.9012187314875647e-05, "loss": 0.2768, "step": 657700 }, { "epoch": 9.062853048965309, "grad_norm": 3.774568796157837, "learning_rate": 1.900456357633349e-05, "loss": 0.3148, "step": 657800 }, { "epoch": 9.064230801025047, "grad_norm": 5.276428699493408, "learning_rate": 1.8996940559522125e-05, "loss": 0.2929, "step": 657900 }, { "epoch": 9.065608553084786, "grad_norm": 12.878783226013184, "learning_rate": 1.8989318265089166e-05, "loss": 0.3364, "step": 658000 }, { "epoch": 9.066986305144527, "grad_norm": 2.5375170707702637, "learning_rate": 1.8981696693682146e-05, "loss": 0.3486, "step": 658100 }, { "epoch": 9.068364057204265, "grad_norm": 42.197322845458984, "learning_rate": 1.8974075845948593e-05, "loss": 0.3415, "step": 658200 }, { "epoch": 9.069741809264006, "grad_norm": 1.193588376045227, "learning_rate": 1.89664557225359e-05, "loss": 0.329, "step": 658300 }, { "epoch": 9.071119561323744, "grad_norm": 3.3420767784118652, "learning_rate": 1.8958836324091456e-05, "loss": 0.2652, "step": 658400 }, { "epoch": 9.072497313383483, "grad_norm": 2.342928886413574, "learning_rate": 1.895121765126256e-05, "loss": 0.2867, "step": 658500 }, { "epoch": 9.073875065443223, "grad_norm": 3.6669256687164307, "learning_rate": 1.8943599704696457e-05, "loss": 0.3036, "step": 658600 }, { "epoch": 9.075252817502962, "grad_norm": 4.23557710647583, "learning_rate": 1.8935982485040336e-05, "loss": 0.3305, "step": 658700 }, { "epoch": 9.0766305695627, "grad_norm": 2.9444212913513184, "learning_rate": 1.8928365992941326e-05, "loss": 0.3561, "step": 658800 }, { "epoch": 9.078008321622441, "grad_norm": 4.4864301681518555, "learning_rate": 1.8920750229046467e-05, "loss": 0.2939, "step": 658900 }, { "epoch": 9.07938607368218, "grad_norm": 25.02345085144043, "learning_rate": 1.891313519400277e-05, "loss": 0.3943, "step": 659000 }, { "epoch": 
9.08076382574192, "grad_norm": 1.5346360206604004, "learning_rate": 1.8905520888457163e-05, "loss": 0.3065, "step": 659100 }, { "epoch": 9.082141577801659, "grad_norm": 3.7177698612213135, "learning_rate": 1.889790731305653e-05, "loss": 0.3285, "step": 659200 }, { "epoch": 9.083519329861398, "grad_norm": 4.2940778732299805, "learning_rate": 1.8890294468447683e-05, "loss": 0.3337, "step": 659300 }, { "epoch": 9.084897081921138, "grad_norm": 2.8907344341278076, "learning_rate": 1.8882682355277364e-05, "loss": 0.3025, "step": 659400 }, { "epoch": 9.086274833980877, "grad_norm": 2.1692159175872803, "learning_rate": 1.8875070974192264e-05, "loss": 0.3567, "step": 659500 }, { "epoch": 9.087652586040615, "grad_norm": 4.225934982299805, "learning_rate": 1.8867460325839e-05, "loss": 0.3404, "step": 659600 }, { "epoch": 9.089030338100356, "grad_norm": 2.248513698577881, "learning_rate": 1.8859850410864158e-05, "loss": 0.3381, "step": 659700 }, { "epoch": 9.090408090160095, "grad_norm": 14.0415678024292, "learning_rate": 1.8852241229914223e-05, "loss": 0.2877, "step": 659800 }, { "epoch": 9.091785842219835, "grad_norm": 2.9047765731811523, "learning_rate": 1.884463278363564e-05, "loss": 0.304, "step": 659900 }, { "epoch": 9.093163594279574, "grad_norm": 3.4089772701263428, "learning_rate": 1.8837025072674784e-05, "loss": 0.3505, "step": 660000 }, { "epoch": 9.094541346339312, "grad_norm": 2.795149564743042, "learning_rate": 1.8829418097677958e-05, "loss": 0.3162, "step": 660100 }, { "epoch": 9.095919098399053, "grad_norm": 0.02902807481586933, "learning_rate": 1.882181185929143e-05, "loss": 0.3304, "step": 660200 }, { "epoch": 9.097296850458791, "grad_norm": 8.011312484741211, "learning_rate": 1.8814206358161376e-05, "loss": 0.3649, "step": 660300 }, { "epoch": 9.09867460251853, "grad_norm": 3.283021926879883, "learning_rate": 1.880660159493394e-05, "loss": 0.3459, "step": 660400 }, { "epoch": 9.10005235457827, "grad_norm": 3.5296435356140137, "learning_rate": 
1.8798997570255166e-05, "loss": 0.2651, "step": 660500 }, { "epoch": 9.10143010663801, "grad_norm": 3.820683479309082, "learning_rate": 1.879139428477106e-05, "loss": 0.3926, "step": 660600 }, { "epoch": 9.102807858697748, "grad_norm": 5.917936325073242, "learning_rate": 1.8783791739127563e-05, "loss": 0.3278, "step": 660700 }, { "epoch": 9.104185610757488, "grad_norm": 1.6316719055175781, "learning_rate": 1.877618993397055e-05, "loss": 0.3287, "step": 660800 }, { "epoch": 9.105563362817227, "grad_norm": 1.9378429651260376, "learning_rate": 1.8768588869945838e-05, "loss": 0.3004, "step": 660900 }, { "epoch": 9.106941114876967, "grad_norm": 3.2813785076141357, "learning_rate": 1.8760988547699174e-05, "loss": 0.3673, "step": 661000 }, { "epoch": 9.108318866936706, "grad_norm": 1.8384367227554321, "learning_rate": 1.8753388967876227e-05, "loss": 0.3135, "step": 661100 }, { "epoch": 9.109696618996445, "grad_norm": 3.2492265701293945, "learning_rate": 1.8745790131122644e-05, "loss": 0.3072, "step": 661200 }, { "epoch": 9.111074371056185, "grad_norm": 2.9144251346588135, "learning_rate": 1.873819203808397e-05, "loss": 0.307, "step": 661300 }, { "epoch": 9.112452123115924, "grad_norm": 18.725872039794922, "learning_rate": 1.8730594689405713e-05, "loss": 0.339, "step": 661400 }, { "epoch": 9.113829875175663, "grad_norm": 1.7501806020736694, "learning_rate": 1.872299808573331e-05, "loss": 0.3095, "step": 661500 }, { "epoch": 9.115207627235403, "grad_norm": 1.690932273864746, "learning_rate": 1.87154022277121e-05, "loss": 0.3457, "step": 661600 }, { "epoch": 9.116585379295142, "grad_norm": 1.1774755716323853, "learning_rate": 1.8707807115987426e-05, "loss": 0.3121, "step": 661700 }, { "epoch": 9.117963131354882, "grad_norm": 1.8819571733474731, "learning_rate": 1.8700288691152862e-05, "loss": 0.3147, "step": 661800 }, { "epoch": 9.11934088341462, "grad_norm": 9.761354446411133, "learning_rate": 1.869269506647784e-05, "loss": 0.3224, "step": 661900 }, { "epoch": 
9.12071863547436, "grad_norm": 2.3311543464660645, "learning_rate": 1.8685102190028427e-05, "loss": 0.3367, "step": 662000 }, { "epoch": 9.1220963875341, "grad_norm": 2.0261716842651367, "learning_rate": 1.8677510062449682e-05, "loss": 0.3428, "step": 662100 }, { "epoch": 9.123474139593839, "grad_norm": 1.8166346549987793, "learning_rate": 1.8669918684386587e-05, "loss": 0.3158, "step": 662200 }, { "epoch": 9.124851891653577, "grad_norm": 1.3290455341339111, "learning_rate": 1.8662328056484073e-05, "loss": 0.3242, "step": 662300 }, { "epoch": 9.126229643713318, "grad_norm": 1.2308013439178467, "learning_rate": 1.8654738179387006e-05, "loss": 0.3231, "step": 662400 }, { "epoch": 9.127607395773056, "grad_norm": 8.199753761291504, "learning_rate": 1.8647149053740185e-05, "loss": 0.355, "step": 662500 }, { "epoch": 9.128985147832797, "grad_norm": 3.0945870876312256, "learning_rate": 1.8639560680188345e-05, "loss": 0.3227, "step": 662600 }, { "epoch": 9.130362899892535, "grad_norm": 5.005845069885254, "learning_rate": 1.8631973059376156e-05, "loss": 0.3159, "step": 662700 }, { "epoch": 9.131740651952274, "grad_norm": 2.084890842437744, "learning_rate": 1.8624386191948212e-05, "loss": 0.3662, "step": 662800 }, { "epoch": 9.133118404012015, "grad_norm": 3.2349278926849365, "learning_rate": 1.8616800078549076e-05, "loss": 0.3511, "step": 662900 }, { "epoch": 9.134496156071753, "grad_norm": 4.741024017333984, "learning_rate": 1.860921471982322e-05, "loss": 0.2891, "step": 663000 }, { "epoch": 9.135873908131492, "grad_norm": 1.9143905639648438, "learning_rate": 1.8601630116415053e-05, "loss": 0.3318, "step": 663100 }, { "epoch": 9.137251660191232, "grad_norm": 0.8522217869758606, "learning_rate": 1.8594046268968947e-05, "loss": 0.2929, "step": 663200 }, { "epoch": 9.138629412250971, "grad_norm": 1.7559139728546143, "learning_rate": 1.8586463178129156e-05, "loss": 0.3288, "step": 663300 }, { "epoch": 9.140007164310711, "grad_norm": 13.857282638549805, "learning_rate": 
1.8578880844539927e-05, "loss": 0.3344, "step": 663400 }, { "epoch": 9.14138491637045, "grad_norm": 4.089639663696289, "learning_rate": 1.857129926884541e-05, "loss": 0.3391, "step": 663500 }, { "epoch": 9.142762668430189, "grad_norm": 7.112442493438721, "learning_rate": 1.856371845168969e-05, "loss": 0.3215, "step": 663600 }, { "epoch": 9.14414042048993, "grad_norm": 2.9991023540496826, "learning_rate": 1.8556138393716822e-05, "loss": 0.3308, "step": 663700 }, { "epoch": 9.145518172549668, "grad_norm": 4.503046035766602, "learning_rate": 1.8548559095570743e-05, "loss": 0.3489, "step": 663800 }, { "epoch": 9.146895924609407, "grad_norm": 2.8440442085266113, "learning_rate": 1.8540980557895367e-05, "loss": 0.2849, "step": 663900 }, { "epoch": 9.148273676669147, "grad_norm": 2.5758185386657715, "learning_rate": 1.8533402781334527e-05, "loss": 0.3564, "step": 664000 }, { "epoch": 9.149651428728886, "grad_norm": 96.88946533203125, "learning_rate": 1.852582576653199e-05, "loss": 0.3813, "step": 664100 }, { "epoch": 9.151029180788626, "grad_norm": 0.5691716074943542, "learning_rate": 1.8518249514131464e-05, "loss": 0.3976, "step": 664200 }, { "epoch": 9.152406932848365, "grad_norm": 1.4628241062164307, "learning_rate": 1.8510674024776602e-05, "loss": 0.286, "step": 664300 }, { "epoch": 9.153784684908103, "grad_norm": 1.9692130088806152, "learning_rate": 1.8503099299110966e-05, "loss": 0.3096, "step": 664400 }, { "epoch": 9.155162436967844, "grad_norm": 1.3080925941467285, "learning_rate": 1.8495525337778072e-05, "loss": 0.2746, "step": 664500 }, { "epoch": 9.156540189027583, "grad_norm": 1.6796101331710815, "learning_rate": 1.8487952141421358e-05, "loss": 0.3394, "step": 664600 }, { "epoch": 9.157917941087321, "grad_norm": 3.575744152069092, "learning_rate": 1.8480379710684223e-05, "loss": 0.2725, "step": 664700 }, { "epoch": 9.159295693147062, "grad_norm": 3.0670909881591797, "learning_rate": 1.8472808046209986e-05, "loss": 0.3052, "step": 664800 }, { "epoch": 
9.1606734452068, "grad_norm": 3.4189727306365967, "learning_rate": 1.8465237148641877e-05, "loss": 0.3519, "step": 664900 }, { "epoch": 9.162051197266539, "grad_norm": 2.5494418144226074, "learning_rate": 1.8457667018623094e-05, "loss": 0.337, "step": 665000 }, { "epoch": 9.16342894932628, "grad_norm": 1.8558762073516846, "learning_rate": 1.8450097656796754e-05, "loss": 0.3081, "step": 665100 }, { "epoch": 9.164806701386018, "grad_norm": 2.674231767654419, "learning_rate": 1.8442529063805922e-05, "loss": 0.2847, "step": 665200 }, { "epoch": 9.166184453445759, "grad_norm": 1.8738274574279785, "learning_rate": 1.8434961240293586e-05, "loss": 0.2987, "step": 665300 }, { "epoch": 9.167562205505497, "grad_norm": 88.1220932006836, "learning_rate": 1.8427394186902674e-05, "loss": 0.3353, "step": 665400 }, { "epoch": 9.168939957565236, "grad_norm": 1.8143916130065918, "learning_rate": 1.8419827904276033e-05, "loss": 0.2877, "step": 665500 }, { "epoch": 9.170317709624976, "grad_norm": 4.368610382080078, "learning_rate": 1.841226239305646e-05, "loss": 0.407, "step": 665600 }, { "epoch": 9.171695461684715, "grad_norm": 4.862356185913086, "learning_rate": 1.8404697653886696e-05, "loss": 0.3121, "step": 665700 }, { "epoch": 9.173073213744454, "grad_norm": 3.308805227279663, "learning_rate": 1.83971336874094e-05, "loss": 0.2568, "step": 665800 }, { "epoch": 9.174450965804194, "grad_norm": 2.6034023761749268, "learning_rate": 1.838957049426716e-05, "loss": 0.3045, "step": 665900 }, { "epoch": 9.175828717863933, "grad_norm": 2.0631253719329834, "learning_rate": 1.838200807510253e-05, "loss": 0.3384, "step": 666000 }, { "epoch": 9.177206469923673, "grad_norm": 16.743928909301758, "learning_rate": 1.8374446430557944e-05, "loss": 0.3411, "step": 666100 }, { "epoch": 9.178584221983412, "grad_norm": 1.4728195667266846, "learning_rate": 1.8366885561275826e-05, "loss": 0.3629, "step": 666200 }, { "epoch": 9.17996197404315, "grad_norm": 2.939405679702759, "learning_rate": 
1.8359325467898504e-05, "loss": 0.289, "step": 666300 }, { "epoch": 9.181339726102891, "grad_norm": 13.283120155334473, "learning_rate": 1.8351841740390532e-05, "loss": 0.3518, "step": 666400 }, { "epoch": 9.18271747816263, "grad_norm": 13.674266815185547, "learning_rate": 1.8344283192974478e-05, "loss": 0.3232, "step": 666500 }, { "epoch": 9.184095230222368, "grad_norm": 2.1873505115509033, "learning_rate": 1.83367254233834e-05, "loss": 0.3638, "step": 666600 }, { "epoch": 9.185472982282109, "grad_norm": 1.5052802562713623, "learning_rate": 1.8329168432259378e-05, "loss": 0.3584, "step": 666700 }, { "epoch": 9.186850734341848, "grad_norm": 4.314074993133545, "learning_rate": 1.832161222024441e-05, "loss": 0.3115, "step": 666800 }, { "epoch": 9.188228486401588, "grad_norm": 3.674027919769287, "learning_rate": 1.8314056787980447e-05, "loss": 0.3559, "step": 666900 }, { "epoch": 9.189606238461327, "grad_norm": 5.653722763061523, "learning_rate": 1.8306502136109355e-05, "loss": 0.3865, "step": 667000 }, { "epoch": 9.190983990521065, "grad_norm": 1.302977204322815, "learning_rate": 1.8298948265272938e-05, "loss": 0.3282, "step": 667100 }, { "epoch": 9.192361742580806, "grad_norm": 7.035192012786865, "learning_rate": 1.829139517611294e-05, "loss": 0.3382, "step": 667200 }, { "epoch": 9.193739494640544, "grad_norm": 1.8187967538833618, "learning_rate": 1.8283842869271026e-05, "loss": 0.361, "step": 667300 }, { "epoch": 9.195117246700283, "grad_norm": 2.8751726150512695, "learning_rate": 1.8276291345388813e-05, "loss": 0.3543, "step": 667400 }, { "epoch": 9.196494998760024, "grad_norm": 1.1306523084640503, "learning_rate": 1.8268740605107838e-05, "loss": 0.3185, "step": 667500 }, { "epoch": 9.197872750819762, "grad_norm": 2.5310068130493164, "learning_rate": 1.8261190649069584e-05, "loss": 0.3017, "step": 667600 }, { "epoch": 9.199250502879503, "grad_norm": 2.3662636280059814, "learning_rate": 1.8253641477915443e-05, "loss": 0.28, "step": 667700 }, { "epoch": 
9.200628254939241, "grad_norm": 4.916684150695801, "learning_rate": 1.824609309228676e-05, "loss": 0.3157, "step": 667800 }, { "epoch": 9.20200600699898, "grad_norm": 2.3282675743103027, "learning_rate": 1.823854549282481e-05, "loss": 0.2895, "step": 667900 }, { "epoch": 9.20338375905872, "grad_norm": 4.90512228012085, "learning_rate": 1.823099868017081e-05, "loss": 0.3449, "step": 668000 }, { "epoch": 9.204761511118459, "grad_norm": 2.478827714920044, "learning_rate": 1.822345265496588e-05, "loss": 0.353, "step": 668100 }, { "epoch": 9.206139263178198, "grad_norm": 2.5820541381835938, "learning_rate": 1.8215907417851126e-05, "loss": 0.2918, "step": 668200 }, { "epoch": 9.207517015237938, "grad_norm": 2.040433645248413, "learning_rate": 1.8208362969467514e-05, "loss": 0.3412, "step": 668300 }, { "epoch": 9.208894767297677, "grad_norm": 2.73866605758667, "learning_rate": 1.820089474313663e-05, "loss": 0.3219, "step": 668400 }, { "epoch": 9.210272519357417, "grad_norm": 0.9639937877655029, "learning_rate": 1.8193351866234795e-05, "loss": 0.3192, "step": 668500 }, { "epoch": 9.211650271417156, "grad_norm": 0.4384617805480957, "learning_rate": 1.8185809779980334e-05, "loss": 0.3682, "step": 668600 }, { "epoch": 9.213028023476895, "grad_norm": 4.066040992736816, "learning_rate": 1.8178268485013994e-05, "loss": 0.3221, "step": 668700 }, { "epoch": 9.214405775536635, "grad_norm": 4.149028301239014, "learning_rate": 1.8170727981976425e-05, "loss": 0.3076, "step": 668800 }, { "epoch": 9.215783527596374, "grad_norm": 3.9431777000427246, "learning_rate": 1.816318827150824e-05, "loss": 0.3431, "step": 668900 }, { "epoch": 9.217161279656112, "grad_norm": 4.759339809417725, "learning_rate": 1.815564935424998e-05, "loss": 0.3543, "step": 669000 }, { "epoch": 9.218539031715853, "grad_norm": 4.536906719207764, "learning_rate": 1.8148111230842113e-05, "loss": 0.2961, "step": 669100 }, { "epoch": 9.219916783775592, "grad_norm": 0.23387058079242706, "learning_rate": 
1.8140573901925043e-05, "loss": 0.3275, "step": 669200 }, { "epoch": 9.22129453583533, "grad_norm": 19.41399574279785, "learning_rate": 1.8133037368139108e-05, "loss": 0.2992, "step": 669300 }, { "epoch": 9.22267228789507, "grad_norm": 3.484515905380249, "learning_rate": 1.8125501630124558e-05, "loss": 0.3622, "step": 669400 }, { "epoch": 9.22405003995481, "grad_norm": 3.081313133239746, "learning_rate": 1.811796668852161e-05, "loss": 0.3477, "step": 669500 }, { "epoch": 9.22542779201455, "grad_norm": 4.139540672302246, "learning_rate": 1.811043254397039e-05, "loss": 0.363, "step": 669600 }, { "epoch": 9.226805544074288, "grad_norm": 10.904473304748535, "learning_rate": 1.810289919711096e-05, "loss": 0.2931, "step": 669700 }, { "epoch": 9.228183296134027, "grad_norm": 5.84497594833374, "learning_rate": 1.8095366648583326e-05, "loss": 0.3407, "step": 669800 }, { "epoch": 9.229561048193768, "grad_norm": 2.066365957260132, "learning_rate": 1.8087834899027397e-05, "loss": 0.3507, "step": 669900 }, { "epoch": 9.230938800253506, "grad_norm": 2.467874765396118, "learning_rate": 1.808030394908305e-05, "loss": 0.3029, "step": 670000 }, { "epoch": 9.232316552313245, "grad_norm": 4.054433345794678, "learning_rate": 1.8072773799390075e-05, "loss": 0.3059, "step": 670100 }, { "epoch": 9.233694304372985, "grad_norm": 6.117511749267578, "learning_rate": 1.8065244450588197e-05, "loss": 0.3228, "step": 670200 }, { "epoch": 9.235072056432724, "grad_norm": 2.0441293716430664, "learning_rate": 1.805771590331706e-05, "loss": 0.3137, "step": 670300 }, { "epoch": 9.236449808492464, "grad_norm": 1.0492639541625977, "learning_rate": 1.8050188158216277e-05, "loss": 0.2872, "step": 670400 }, { "epoch": 9.237827560552203, "grad_norm": 1.9644291400909424, "learning_rate": 1.8042661215925354e-05, "loss": 0.3598, "step": 670500 }, { "epoch": 9.239205312611942, "grad_norm": 6.718084335327148, "learning_rate": 1.803513507708374e-05, "loss": 0.2659, "step": 670600 }, { "epoch": 9.240583064671682, 
"grad_norm": 1.1509180068969727, "learning_rate": 1.8027609742330823e-05, "loss": 0.3199, "step": 670700 }, { "epoch": 9.241960816731421, "grad_norm": 3.4187633991241455, "learning_rate": 1.8020085212305914e-05, "loss": 0.322, "step": 670800 }, { "epoch": 9.24333856879116, "grad_norm": 4.264100551605225, "learning_rate": 1.8012561487648277e-05, "loss": 0.2814, "step": 670900 }, { "epoch": 9.2447163208509, "grad_norm": 2.0538039207458496, "learning_rate": 1.8005113794191756e-05, "loss": 0.3257, "step": 671000 }, { "epoch": 9.246094072910639, "grad_norm": 3.604013442993164, "learning_rate": 1.7997591674116478e-05, "loss": 0.3224, "step": 671100 }, { "epoch": 9.24747182497038, "grad_norm": 3.001390218734741, "learning_rate": 1.7990070361319404e-05, "loss": 0.285, "step": 671200 }, { "epoch": 9.248849577030118, "grad_norm": 3.397099494934082, "learning_rate": 1.798254985643951e-05, "loss": 0.3398, "step": 671300 }, { "epoch": 9.250227329089856, "grad_norm": 3.9213922023773193, "learning_rate": 1.7975030160115684e-05, "loss": 0.3565, "step": 671400 }, { "epoch": 9.251605081149597, "grad_norm": 2.9863834381103516, "learning_rate": 1.7967511272986796e-05, "loss": 0.3358, "step": 671500 }, { "epoch": 9.252982833209336, "grad_norm": 1.9003570079803467, "learning_rate": 1.7959993195691575e-05, "loss": 0.3503, "step": 671600 }, { "epoch": 9.254360585269074, "grad_norm": 4.73380184173584, "learning_rate": 1.7952475928868747e-05, "loss": 0.3247, "step": 671700 }, { "epoch": 9.255738337328815, "grad_norm": 4.568268299102783, "learning_rate": 1.7944959473156927e-05, "loss": 0.3281, "step": 671800 }, { "epoch": 9.257116089388553, "grad_norm": 3.1382946968078613, "learning_rate": 1.7937443829194687e-05, "loss": 0.3268, "step": 671900 }, { "epoch": 9.258493841448294, "grad_norm": 1.7915526628494263, "learning_rate": 1.7929928997620527e-05, "loss": 0.3586, "step": 672000 }, { "epoch": 9.259871593508032, "grad_norm": 2.7113542556762695, "learning_rate": 1.7922414979072842e-05, "loss": 
0.3164, "step": 672100 }, { "epoch": 9.261249345567771, "grad_norm": 2.5293684005737305, "learning_rate": 1.791490177419001e-05, "loss": 0.3133, "step": 672200 }, { "epoch": 9.262627097627512, "grad_norm": 1.965452790260315, "learning_rate": 1.7907389383610306e-05, "loss": 0.3511, "step": 672300 }, { "epoch": 9.26400484968725, "grad_norm": 2.111309051513672, "learning_rate": 1.7899877807971947e-05, "loss": 0.3235, "step": 672400 }, { "epoch": 9.265382601746989, "grad_norm": 0.14067216217517853, "learning_rate": 1.7892367047913088e-05, "loss": 0.3647, "step": 672500 }, { "epoch": 9.26676035380673, "grad_norm": 21.046375274658203, "learning_rate": 1.7884857104071802e-05, "loss": 0.3543, "step": 672600 }, { "epoch": 9.268138105866468, "grad_norm": 4.697961807250977, "learning_rate": 1.7877347977086096e-05, "loss": 0.3182, "step": 672700 }, { "epoch": 9.269515857926208, "grad_norm": 4.218894004821777, "learning_rate": 1.7869839667593906e-05, "loss": 0.3453, "step": 672800 }, { "epoch": 9.270893609985947, "grad_norm": 2.187321186065674, "learning_rate": 1.7862332176233097e-05, "loss": 0.3644, "step": 672900 }, { "epoch": 9.272271362045686, "grad_norm": 3.0138494968414307, "learning_rate": 1.785482550364148e-05, "loss": 0.282, "step": 673000 }, { "epoch": 9.273649114105426, "grad_norm": 0.10651703178882599, "learning_rate": 1.7847319650456793e-05, "loss": 0.2964, "step": 673100 }, { "epoch": 9.275026866165165, "grad_norm": 1.2092101573944092, "learning_rate": 1.7839889663586754e-05, "loss": 0.3494, "step": 673200 }, { "epoch": 9.276404618224904, "grad_norm": 1.141021490097046, "learning_rate": 1.7832385442918832e-05, "loss": 0.307, "step": 673300 }, { "epoch": 9.277782370284644, "grad_norm": 1.4206140041351318, "learning_rate": 1.782488204356422e-05, "loss": 0.3453, "step": 673400 }, { "epoch": 9.279160122344383, "grad_norm": 11.561460494995117, "learning_rate": 1.7817379466160374e-05, "loss": 0.2967, "step": 673500 }, { "epoch": 9.280537874404121, "grad_norm": 
16.791946411132812, "learning_rate": 1.780987771134468e-05, "loss": 0.3177, "step": 673600 }, { "epoch": 9.281915626463862, "grad_norm": 10.034136772155762, "learning_rate": 1.780237677975446e-05, "loss": 0.3102, "step": 673700 }, { "epoch": 9.2832933785236, "grad_norm": 1.6947712898254395, "learning_rate": 1.779487667202693e-05, "loss": 0.3548, "step": 673800 }, { "epoch": 9.284671130583341, "grad_norm": 1.68232262134552, "learning_rate": 1.7787377388799282e-05, "loss": 0.3516, "step": 673900 }, { "epoch": 9.28604888264308, "grad_norm": 0.639404833316803, "learning_rate": 1.7779878930708617e-05, "loss": 0.3438, "step": 674000 }, { "epoch": 9.287426634702818, "grad_norm": 5.2984113693237305, "learning_rate": 1.7772381298391958e-05, "loss": 0.3559, "step": 674100 }, { "epoch": 9.288804386762559, "grad_norm": 0.9397193789482117, "learning_rate": 1.776488449248629e-05, "loss": 0.362, "step": 674200 }, { "epoch": 9.290182138822297, "grad_norm": 1.8699778318405151, "learning_rate": 1.775738851362847e-05, "loss": 0.316, "step": 674300 }, { "epoch": 9.291559890882036, "grad_norm": 3.7849740982055664, "learning_rate": 1.774989336245535e-05, "loss": 0.33, "step": 674400 }, { "epoch": 9.292937642941776, "grad_norm": 4.329867362976074, "learning_rate": 1.7742399039603664e-05, "loss": 0.3824, "step": 674500 }, { "epoch": 9.294315395001515, "grad_norm": 3.1027746200561523, "learning_rate": 1.7734905545710096e-05, "loss": 0.3064, "step": 674600 }, { "epoch": 9.295693147061256, "grad_norm": 2.9407665729522705, "learning_rate": 1.7727412881411265e-05, "loss": 0.3151, "step": 674700 }, { "epoch": 9.297070899120994, "grad_norm": 6.45418119430542, "learning_rate": 1.7719921047343714e-05, "loss": 0.3519, "step": 674800 }, { "epoch": 9.298448651180733, "grad_norm": 0.926025390625, "learning_rate": 1.77124300441439e-05, "loss": 0.3172, "step": 674900 }, { "epoch": 9.299826403240473, "grad_norm": 1.4097199440002441, "learning_rate": 1.7704939872448222e-05, "loss": 0.3266, "step": 675000 
}, { "epoch": 9.301204155300212, "grad_norm": 2.1308705806732178, "learning_rate": 1.769745053289301e-05, "loss": 0.3102, "step": 675100 }, { "epoch": 9.30258190735995, "grad_norm": 2.81240177154541, "learning_rate": 1.768996202611453e-05, "loss": 0.3598, "step": 675200 }, { "epoch": 9.303959659419691, "grad_norm": 2.48207426071167, "learning_rate": 1.7682474352748966e-05, "loss": 0.3153, "step": 675300 }, { "epoch": 9.30533741147943, "grad_norm": 6.205196380615234, "learning_rate": 1.767498751343243e-05, "loss": 0.3422, "step": 675400 }, { "epoch": 9.30671516353917, "grad_norm": 1.7789554595947266, "learning_rate": 1.7667501508800964e-05, "loss": 0.3088, "step": 675500 }, { "epoch": 9.308092915598909, "grad_norm": 0.8953598141670227, "learning_rate": 1.7660091187046727e-05, "loss": 0.3108, "step": 675600 }, { "epoch": 9.309470667658648, "grad_norm": 7.384527683258057, "learning_rate": 1.7652681684607667e-05, "loss": 0.3331, "step": 675700 }, { "epoch": 9.310848419718388, "grad_norm": 20.89877700805664, "learning_rate": 1.7645198171108912e-05, "loss": 0.2974, "step": 675800 }, { "epoch": 9.312226171778127, "grad_norm": 24.380756378173828, "learning_rate": 1.7637715494825982e-05, "loss": 0.3273, "step": 675900 }, { "epoch": 9.313603923837865, "grad_norm": 2.0169479846954346, "learning_rate": 1.763023365639458e-05, "loss": 0.2865, "step": 676000 }, { "epoch": 9.314981675897606, "grad_norm": 2.934162139892578, "learning_rate": 1.7622752656450316e-05, "loss": 0.3153, "step": 676100 }, { "epoch": 9.316359427957345, "grad_norm": 4.041893005371094, "learning_rate": 1.761527249562875e-05, "loss": 0.3614, "step": 676200 }, { "epoch": 9.317737180017085, "grad_norm": 21.639204025268555, "learning_rate": 1.7607793174565354e-05, "loss": 0.3131, "step": 676300 }, { "epoch": 9.319114932076824, "grad_norm": 4.907148838043213, "learning_rate": 1.7600314693895543e-05, "loss": 0.3674, "step": 676400 }, { "epoch": 9.320492684136562, "grad_norm": 1.8368486166000366, "learning_rate": 
1.759283705425464e-05, "loss": 0.355, "step": 676500 }, { "epoch": 9.321870436196303, "grad_norm": 5.02472448348999, "learning_rate": 1.7585360256277906e-05, "loss": 0.302, "step": 676600 }, { "epoch": 9.323248188256041, "grad_norm": 2.847317695617676, "learning_rate": 1.7577884300600547e-05, "loss": 0.348, "step": 676700 }, { "epoch": 9.32462594031578, "grad_norm": 11.740234375, "learning_rate": 1.757040918785768e-05, "loss": 0.3361, "step": 676800 }, { "epoch": 9.32600369237552, "grad_norm": 3.729403495788574, "learning_rate": 1.756293491868436e-05, "loss": 0.3309, "step": 676900 }, { "epoch": 9.32738144443526, "grad_norm": 3.784461736679077, "learning_rate": 1.7555461493715544e-05, "loss": 0.2769, "step": 677000 }, { "epoch": 9.328759196495, "grad_norm": 3.5498251914978027, "learning_rate": 1.7547988913586148e-05, "loss": 0.3702, "step": 677100 }, { "epoch": 9.330136948554738, "grad_norm": 2.397456645965576, "learning_rate": 1.7540517178931013e-05, "loss": 0.3027, "step": 677200 }, { "epoch": 9.331514700614477, "grad_norm": 3.777019500732422, "learning_rate": 1.75330462903849e-05, "loss": 0.3347, "step": 677300 }, { "epoch": 9.332892452674217, "grad_norm": 2.6081326007843018, "learning_rate": 1.752565094480704e-05, "loss": 0.3224, "step": 677400 }, { "epoch": 9.334270204733956, "grad_norm": 2.033360242843628, "learning_rate": 1.751818174190604e-05, "loss": 0.364, "step": 677500 }, { "epoch": 9.335647956793695, "grad_norm": 3.4498722553253174, "learning_rate": 1.7510713387011563e-05, "loss": 0.3195, "step": 677600 }, { "epoch": 9.337025708853435, "grad_norm": 1.201979160308838, "learning_rate": 1.750324588075808e-05, "loss": 0.3146, "step": 677700 }, { "epoch": 9.338403460913174, "grad_norm": 5.784176349639893, "learning_rate": 1.749577922378001e-05, "loss": 0.3293, "step": 677800 }, { "epoch": 9.339781212972913, "grad_norm": 7.811093807220459, "learning_rate": 1.7488313416711677e-05, "loss": 0.2953, "step": 677900 }, { "epoch": 9.341158965032653, "grad_norm": 
3.642213821411133, "learning_rate": 1.748084846018734e-05, "loss": 0.3078, "step": 678000 }, { "epoch": 9.342536717092392, "grad_norm": 3.5646255016326904, "learning_rate": 1.7473384354841188e-05, "loss": 0.2859, "step": 678100 }, { "epoch": 9.343914469152132, "grad_norm": 9.357978820800781, "learning_rate": 1.7465921101307315e-05, "loss": 0.329, "step": 678200 }, { "epoch": 9.34529222121187, "grad_norm": 3.851332664489746, "learning_rate": 1.7458458700219787e-05, "loss": 0.3201, "step": 678300 }, { "epoch": 9.34666997327161, "grad_norm": 3.7411699295043945, "learning_rate": 1.7450997152212564e-05, "loss": 0.3757, "step": 678400 }, { "epoch": 9.34804772533135, "grad_norm": 1.7756787538528442, "learning_rate": 1.744353645791954e-05, "loss": 0.3122, "step": 678500 }, { "epoch": 9.349425477391089, "grad_norm": 1.9279903173446655, "learning_rate": 1.743607661797456e-05, "loss": 0.3409, "step": 678600 }, { "epoch": 9.350803229450827, "grad_norm": 1.9116443395614624, "learning_rate": 1.742861763301134e-05, "loss": 0.3362, "step": 678700 }, { "epoch": 9.352180981510568, "grad_norm": 2.4937658309936523, "learning_rate": 1.742115950366358e-05, "loss": 0.3468, "step": 678800 }, { "epoch": 9.353558733570306, "grad_norm": 1.7982778549194336, "learning_rate": 1.7413702230564883e-05, "loss": 0.3173, "step": 678900 }, { "epoch": 9.354936485630047, "grad_norm": 13.983357429504395, "learning_rate": 1.7406245814348778e-05, "loss": 0.3355, "step": 679000 }, { "epoch": 9.356314237689785, "grad_norm": 3.6085236072540283, "learning_rate": 1.739879025564875e-05, "loss": 0.2948, "step": 679100 }, { "epoch": 9.357691989749524, "grad_norm": 9.648759841918945, "learning_rate": 1.7391335555098146e-05, "loss": 0.3697, "step": 679200 }, { "epoch": 9.359069741809265, "grad_norm": 1.647913932800293, "learning_rate": 1.7383881713330314e-05, "loss": 0.2968, "step": 679300 }, { "epoch": 9.360447493869003, "grad_norm": 2.4777965545654297, "learning_rate": 1.737642873097848e-05, "loss": 0.3126, 
"step": 679400 }, { "epoch": 9.361825245928742, "grad_norm": 4.222289562225342, "learning_rate": 1.736897660867581e-05, "loss": 0.3261, "step": 679500 }, { "epoch": 9.363202997988482, "grad_norm": 20.759096145629883, "learning_rate": 1.7361525347055417e-05, "loss": 0.3089, "step": 679600 }, { "epoch": 9.364580750048221, "grad_norm": 0.8587434887886047, "learning_rate": 1.7354074946750317e-05, "loss": 0.3204, "step": 679700 }, { "epoch": 9.365958502107961, "grad_norm": 1.9344478845596313, "learning_rate": 1.7346625408393452e-05, "loss": 0.2952, "step": 679800 }, { "epoch": 9.3673362541677, "grad_norm": 3.457611560821533, "learning_rate": 1.73391767326177e-05, "loss": 0.3534, "step": 679900 }, { "epoch": 9.368714006227439, "grad_norm": 2.8440911769866943, "learning_rate": 1.7331728920055863e-05, "loss": 0.2572, "step": 680000 }, { "epoch": 9.37009175828718, "grad_norm": 3.2267119884490967, "learning_rate": 1.732428197134068e-05, "loss": 0.3592, "step": 680100 }, { "epoch": 9.371469510346918, "grad_norm": 4.819392204284668, "learning_rate": 1.7316835887104808e-05, "loss": 0.3647, "step": 680200 }, { "epoch": 9.372847262406657, "grad_norm": 53.90210723876953, "learning_rate": 1.730939066798082e-05, "loss": 0.3705, "step": 680300 }, { "epoch": 9.374225014466397, "grad_norm": 2.236374855041504, "learning_rate": 1.730194631460123e-05, "loss": 0.26, "step": 680400 }, { "epoch": 9.375602766526136, "grad_norm": 8.308146476745605, "learning_rate": 1.7294502827598465e-05, "loss": 0.3124, "step": 680500 }, { "epoch": 9.376980518585876, "grad_norm": 0.9520137906074524, "learning_rate": 1.7287060207604905e-05, "loss": 0.3051, "step": 680600 }, { "epoch": 9.378358270645615, "grad_norm": 2.7053496837615967, "learning_rate": 1.7279618455252825e-05, "loss": 0.3135, "step": 680700 }, { "epoch": 9.379736022705353, "grad_norm": 2.6034014225006104, "learning_rate": 1.7272177571174453e-05, "loss": 0.3162, "step": 680800 }, { "epoch": 9.381113774765094, "grad_norm": 2.3545210361480713, 
"learning_rate": 1.7264737556001915e-05, "loss": 0.3039, "step": 680900 }, { "epoch": 9.382491526824833, "grad_norm": 24.03419303894043, "learning_rate": 1.7257298410367276e-05, "loss": 0.3631, "step": 681000 }, { "epoch": 9.383869278884571, "grad_norm": 2.7791194915771484, "learning_rate": 1.724986013490255e-05, "loss": 0.3174, "step": 681100 }, { "epoch": 9.385247030944312, "grad_norm": 0.7346885800361633, "learning_rate": 1.7242422730239643e-05, "loss": 0.3408, "step": 681200 }, { "epoch": 9.38662478300405, "grad_norm": 4.430757522583008, "learning_rate": 1.7234986197010402e-05, "loss": 0.3224, "step": 681300 }, { "epoch": 9.38800253506379, "grad_norm": 5.133533000946045, "learning_rate": 1.7227550535846606e-05, "loss": 0.2908, "step": 681400 }, { "epoch": 9.38938028712353, "grad_norm": 2.1571388244628906, "learning_rate": 1.722011574737993e-05, "loss": 0.384, "step": 681500 }, { "epoch": 9.390758039183268, "grad_norm": 2.6111044883728027, "learning_rate": 1.721268183224202e-05, "loss": 0.3019, "step": 681600 }, { "epoch": 9.392135791243009, "grad_norm": 2.909450054168701, "learning_rate": 1.720524879106442e-05, "loss": 0.3285, "step": 681700 }, { "epoch": 9.393513543302747, "grad_norm": 19.432626724243164, "learning_rate": 1.71978166244786e-05, "loss": 0.3627, "step": 681800 }, { "epoch": 9.394891295362486, "grad_norm": 3.875836133956909, "learning_rate": 1.719038533311597e-05, "loss": 0.3003, "step": 681900 }, { "epoch": 9.396269047422226, "grad_norm": 2.5819971561431885, "learning_rate": 1.7182954917607846e-05, "loss": 0.3676, "step": 682000 }, { "epoch": 9.397646799481965, "grad_norm": 2.9338810443878174, "learning_rate": 1.7175525378585476e-05, "loss": 0.3217, "step": 682100 }, { "epoch": 9.399024551541704, "grad_norm": 3.7809653282165527, "learning_rate": 1.716809671668005e-05, "loss": 0.372, "step": 682200 }, { "epoch": 9.400402303601444, "grad_norm": 1.086075782775879, "learning_rate": 1.7160668932522667e-05, "loss": 0.3311, "step": 682300 }, { "epoch": 
9.401780055661183, "grad_norm": 2.3986122608184814, "learning_rate": 1.7153316291452085e-05, "loss": 0.3289, "step": 682400 }, { "epoch": 9.403157807720923, "grad_norm": 0.9070461392402649, "learning_rate": 1.714589025589058e-05, "loss": 0.2922, "step": 682500 }, { "epoch": 9.404535559780662, "grad_norm": 2.0744144916534424, "learning_rate": 1.713846509996366e-05, "loss": 0.307, "step": 682600 }, { "epoch": 9.4059133118404, "grad_norm": 3.639514446258545, "learning_rate": 1.7131040824302132e-05, "loss": 0.2896, "step": 682700 }, { "epoch": 9.407291063900141, "grad_norm": 1.4293047189712524, "learning_rate": 1.7123617429536743e-05, "loss": 0.2998, "step": 682800 }, { "epoch": 9.40866881595988, "grad_norm": 4.762004375457764, "learning_rate": 1.7116194916298136e-05, "loss": 0.3372, "step": 682900 }, { "epoch": 9.410046568019618, "grad_norm": 5.242640972137451, "learning_rate": 1.7108773285216895e-05, "loss": 0.3574, "step": 683000 }, { "epoch": 9.411424320079359, "grad_norm": 0.8914658427238464, "learning_rate": 1.710135253692353e-05, "loss": 0.31, "step": 683100 }, { "epoch": 9.412802072139097, "grad_norm": 7.477059841156006, "learning_rate": 1.709393267204845e-05, "loss": 0.305, "step": 683200 }, { "epoch": 9.414179824198838, "grad_norm": 3.8547801971435547, "learning_rate": 1.7086513691222038e-05, "loss": 0.3114, "step": 683300 }, { "epoch": 9.415557576258577, "grad_norm": 2.740771770477295, "learning_rate": 1.707909559507456e-05, "loss": 0.357, "step": 683400 }, { "epoch": 9.416935328318315, "grad_norm": 5.158901214599609, "learning_rate": 1.707167838423622e-05, "loss": 0.3032, "step": 683500 }, { "epoch": 9.418313080378056, "grad_norm": 1.5831928253173828, "learning_rate": 1.706426205933717e-05, "loss": 0.3429, "step": 683600 }, { "epoch": 9.419690832437794, "grad_norm": 5.148906707763672, "learning_rate": 1.7056846621007428e-05, "loss": 0.299, "step": 683700 }, { "epoch": 9.421068584497533, "grad_norm": 3.8267674446105957, "learning_rate": 
1.7049432069877003e-05, "loss": 0.2365, "step": 683800 }, { "epoch": 9.422446336557273, "grad_norm": 4.152651786804199, "learning_rate": 1.704201840657578e-05, "loss": 0.2809, "step": 683900 }, { "epoch": 9.423824088617012, "grad_norm": 2.201584577560425, "learning_rate": 1.7034605631733596e-05, "loss": 0.3479, "step": 684000 }, { "epoch": 9.425201840676753, "grad_norm": 3.6741957664489746, "learning_rate": 1.702719374598022e-05, "loss": 0.3312, "step": 684100 }, { "epoch": 9.426579592736491, "grad_norm": 0.8137636184692383, "learning_rate": 1.70197827499453e-05, "loss": 0.3093, "step": 684200 }, { "epoch": 9.42795734479623, "grad_norm": 1.0340723991394043, "learning_rate": 1.7012446740906037e-05, "loss": 0.3416, "step": 684300 }, { "epoch": 9.42933509685597, "grad_norm": 2.266164541244507, "learning_rate": 1.700503751728389e-05, "loss": 0.3557, "step": 684400 }, { "epoch": 9.430712848915709, "grad_norm": 4.826414108276367, "learning_rate": 1.6997629185262508e-05, "loss": 0.2965, "step": 684500 }, { "epoch": 9.432090600975448, "grad_norm": 3.8050553798675537, "learning_rate": 1.699022174547125e-05, "loss": 0.373, "step": 684600 }, { "epoch": 9.433468353035188, "grad_norm": 2.3289053440093994, "learning_rate": 1.698281519853943e-05, "loss": 0.3381, "step": 684700 }, { "epoch": 9.434846105094927, "grad_norm": 0.971208393573761, "learning_rate": 1.6975409545096264e-05, "loss": 0.3062, "step": 684800 }, { "epoch": 9.436223857154667, "grad_norm": 1.1102173328399658, "learning_rate": 1.696800478577089e-05, "loss": 0.3748, "step": 684900 }, { "epoch": 9.437601609214406, "grad_norm": 1.7782244682312012, "learning_rate": 1.6960600921192398e-05, "loss": 0.3018, "step": 685000 }, { "epoch": 9.438979361274145, "grad_norm": 4.1507110595703125, "learning_rate": 1.695319795198978e-05, "loss": 0.3308, "step": 685100 }, { "epoch": 9.440357113333885, "grad_norm": 2.7041454315185547, "learning_rate": 1.6945795878791956e-05, "loss": 0.2937, "step": 685200 }, { "epoch": 
9.441734865393624, "grad_norm": 3.111372470855713, "learning_rate": 1.693839470222776e-05, "loss": 0.3103, "step": 685300 }, { "epoch": 9.443112617453362, "grad_norm": 3.622607469558716, "learning_rate": 1.693099442292596e-05, "loss": 0.3264, "step": 685400 }, { "epoch": 9.444490369513103, "grad_norm": 1.2135097980499268, "learning_rate": 1.6923595041515265e-05, "loss": 0.2964, "step": 685500 }, { "epoch": 9.445868121572842, "grad_norm": 1.7570555210113525, "learning_rate": 1.6916196558624275e-05, "loss": 0.3142, "step": 685600 }, { "epoch": 9.447245873632582, "grad_norm": 2.5322115421295166, "learning_rate": 1.6908798974881533e-05, "loss": 0.3333, "step": 685700 }, { "epoch": 9.44862362569232, "grad_norm": 2.4079370498657227, "learning_rate": 1.6901402290915515e-05, "loss": 0.2382, "step": 685800 }, { "epoch": 9.45000137775206, "grad_norm": 3.14982271194458, "learning_rate": 1.689400650735458e-05, "loss": 0.2722, "step": 685900 }, { "epoch": 9.4513791298118, "grad_norm": 1.9461138248443604, "learning_rate": 1.6886611624827056e-05, "loss": 0.3227, "step": 686000 }, { "epoch": 9.452756881871538, "grad_norm": 3.121630907058716, "learning_rate": 1.6879217643961175e-05, "loss": 0.3746, "step": 686100 }, { "epoch": 9.454134633931277, "grad_norm": 1.5833219289779663, "learning_rate": 1.6871824565385082e-05, "loss": 0.2665, "step": 686200 }, { "epoch": 9.455512385991018, "grad_norm": 2.4706714153289795, "learning_rate": 1.686443238972688e-05, "loss": 0.3622, "step": 686300 }, { "epoch": 9.456890138050756, "grad_norm": 1.2587929964065552, "learning_rate": 1.685704111761455e-05, "loss": 0.3371, "step": 686400 }, { "epoch": 9.458267890110495, "grad_norm": 4.239849090576172, "learning_rate": 1.6849650749676023e-05, "loss": 0.3447, "step": 686500 }, { "epoch": 9.459645642170235, "grad_norm": 5.095478057861328, "learning_rate": 1.6842261286539153e-05, "loss": 0.3254, "step": 686600 }, { "epoch": 9.461023394229974, "grad_norm": 4.181543350219727, "learning_rate": 
1.6834872728831712e-05, "loss": 0.3111, "step": 686700 }, { "epoch": 9.462401146289714, "grad_norm": 2.247666835784912, "learning_rate": 1.6827485077181395e-05, "loss": 0.3168, "step": 686800 }, { "epoch": 9.463778898349453, "grad_norm": 1.4321414232254028, "learning_rate": 1.6820172195175336e-05, "loss": 0.3077, "step": 686900 }, { "epoch": 9.465156650409192, "grad_norm": 3.2892682552337646, "learning_rate": 1.681278634844581e-05, "loss": 0.2918, "step": 687000 }, { "epoch": 9.466534402468932, "grad_norm": 4.064759731292725, "learning_rate": 1.680540140964977e-05, "loss": 0.3004, "step": 687100 }, { "epoch": 9.46791215452867, "grad_norm": 3.0618879795074463, "learning_rate": 1.6798017379414593e-05, "loss": 0.2754, "step": 687200 }, { "epoch": 9.46928990658841, "grad_norm": 1.4956387281417847, "learning_rate": 1.679063425836758e-05, "loss": 0.2771, "step": 687300 }, { "epoch": 9.47066765864815, "grad_norm": 3.4466359615325928, "learning_rate": 1.6783252047135995e-05, "loss": 0.3947, "step": 687400 }, { "epoch": 9.472045410707889, "grad_norm": 2.664890766143799, "learning_rate": 1.6775870746346962e-05, "loss": 0.3361, "step": 687500 }, { "epoch": 9.473423162767629, "grad_norm": 2.4691033363342285, "learning_rate": 1.676856415601292e-05, "loss": 0.3701, "step": 687600 }, { "epoch": 9.474800914827368, "grad_norm": 2.4638006687164307, "learning_rate": 1.67611846688701e-05, "loss": 0.3707, "step": 687700 }, { "epoch": 9.476178666887106, "grad_norm": 55.51746368408203, "learning_rate": 1.6753806094044596e-05, "loss": 0.3542, "step": 687800 }, { "epoch": 9.477556418946847, "grad_norm": 3.844655752182007, "learning_rate": 1.6746428432163236e-05, "loss": 0.3211, "step": 687900 }, { "epoch": 9.478934171006586, "grad_norm": 11.643254280090332, "learning_rate": 1.6739051683852805e-05, "loss": 0.3182, "step": 688000 }, { "epoch": 9.480311923066324, "grad_norm": 2.1945297718048096, "learning_rate": 1.6731675849739976e-05, "loss": 0.3429, "step": 688100 }, { "epoch": 
9.481689675126065, "grad_norm": 16.415245056152344, "learning_rate": 1.6724300930451373e-05, "loss": 0.3414, "step": 688200 }, { "epoch": 9.483067427185803, "grad_norm": 1.9102420806884766, "learning_rate": 1.6716926926613536e-05, "loss": 0.3383, "step": 688300 }, { "epoch": 9.484445179245544, "grad_norm": 3.0731265544891357, "learning_rate": 1.670955383885292e-05, "loss": 0.3558, "step": 688400 }, { "epoch": 9.485822931305282, "grad_norm": 1.145594596862793, "learning_rate": 1.670218166779592e-05, "loss": 0.3362, "step": 688500 }, { "epoch": 9.487200683365021, "grad_norm": 2.1142444610595703, "learning_rate": 1.669481041406882e-05, "loss": 0.3044, "step": 688600 }, { "epoch": 9.488578435424762, "grad_norm": 55.36131286621094, "learning_rate": 1.6687440078297854e-05, "loss": 0.3174, "step": 688700 }, { "epoch": 9.4899561874845, "grad_norm": 1.893140435218811, "learning_rate": 1.668007066110917e-05, "loss": 0.3003, "step": 688800 }, { "epoch": 9.491333939544239, "grad_norm": 1.9860502481460571, "learning_rate": 1.6672702163128838e-05, "loss": 0.3562, "step": 688900 }, { "epoch": 9.49271169160398, "grad_norm": 1.3350133895874023, "learning_rate": 1.6665334584982845e-05, "loss": 0.3237, "step": 689000 }, { "epoch": 9.494089443663718, "grad_norm": 3.0858402252197266, "learning_rate": 1.6657967927297127e-05, "loss": 0.3581, "step": 689100 }, { "epoch": 9.495467195723458, "grad_norm": 1.2162363529205322, "learning_rate": 1.6650602190697482e-05, "loss": 0.2917, "step": 689200 }, { "epoch": 9.496844947783197, "grad_norm": 2.634979248046875, "learning_rate": 1.6643237375809692e-05, "loss": 0.3358, "step": 689300 }, { "epoch": 9.498222699842936, "grad_norm": 2.258401870727539, "learning_rate": 1.6635873483259433e-05, "loss": 0.294, "step": 689400 }, { "epoch": 9.499600451902676, "grad_norm": 3.4919583797454834, "learning_rate": 1.662851051367229e-05, "loss": 0.3145, "step": 689500 }, { "epoch": 9.500978203962415, "grad_norm": 3.4044504165649414, "learning_rate": 
1.662114846767382e-05, "loss": 0.3252, "step": 689600 }, { "epoch": 9.502355956022154, "grad_norm": 6.067110061645508, "learning_rate": 1.6613787345889422e-05, "loss": 0.3135, "step": 689700 }, { "epoch": 9.503733708081894, "grad_norm": 24.15770721435547, "learning_rate": 1.6606427148944494e-05, "loss": 0.3438, "step": 689800 }, { "epoch": 9.505111460141633, "grad_norm": 2.949833631515503, "learning_rate": 1.6599067877464305e-05, "loss": 0.3092, "step": 689900 }, { "epoch": 9.506489212201373, "grad_norm": 5.739774703979492, "learning_rate": 1.6591709532074063e-05, "loss": 0.3439, "step": 690000 }, { "epoch": 9.507866964261112, "grad_norm": 1.464240312576294, "learning_rate": 1.6584352113398913e-05, "loss": 0.3439, "step": 690100 }, { "epoch": 9.50924471632085, "grad_norm": 2.0904932022094727, "learning_rate": 1.6576995622063897e-05, "loss": 0.3141, "step": 690200 }, { "epoch": 9.51062246838059, "grad_norm": 5.582279205322266, "learning_rate": 1.6569640058693983e-05, "loss": 0.2866, "step": 690300 }, { "epoch": 9.51200022044033, "grad_norm": 2.8447728157043457, "learning_rate": 1.656228542391406e-05, "loss": 0.3404, "step": 690400 }, { "epoch": 9.513377972500068, "grad_norm": 4.412662506103516, "learning_rate": 1.6554931718348945e-05, "loss": 0.2327, "step": 690500 }, { "epoch": 9.514755724559809, "grad_norm": 1.2669757604599, "learning_rate": 1.6547578942623377e-05, "loss": 0.3176, "step": 690600 }, { "epoch": 9.516133476619547, "grad_norm": 2.0791256427764893, "learning_rate": 1.6540227097362018e-05, "loss": 0.356, "step": 690700 }, { "epoch": 9.517511228679286, "grad_norm": 4.004638195037842, "learning_rate": 1.6532876183189435e-05, "loss": 0.3339, "step": 690800 }, { "epoch": 9.518888980739026, "grad_norm": 1.1769605875015259, "learning_rate": 1.6525526200730123e-05, "loss": 0.3044, "step": 690900 }, { "epoch": 9.520266732798765, "grad_norm": 2.9707272052764893, "learning_rate": 1.65181771506085e-05, "loss": 0.2599, "step": 691000 }, { "epoch": 
9.521644484858506, "grad_norm": 4.949897766113281, "learning_rate": 1.6510829033448918e-05, "loss": 0.3621, "step": 691100 }, { "epoch": 9.523022236918244, "grad_norm": 3.8138391971588135, "learning_rate": 1.650348184987563e-05, "loss": 0.3635, "step": 691200 }, { "epoch": 9.524399988977983, "grad_norm": 7.296191692352295, "learning_rate": 1.6496135600512822e-05, "loss": 0.3191, "step": 691300 }, { "epoch": 9.525777741037723, "grad_norm": 5.1465959548950195, "learning_rate": 1.6488790285984584e-05, "loss": 0.2777, "step": 691400 }, { "epoch": 9.527155493097462, "grad_norm": 2.4102354049682617, "learning_rate": 1.648144590691494e-05, "loss": 0.2873, "step": 691500 }, { "epoch": 9.5285332451572, "grad_norm": 7.835136890411377, "learning_rate": 1.6474102463927837e-05, "loss": 0.3528, "step": 691600 }, { "epoch": 9.529910997216941, "grad_norm": 3.2894773483276367, "learning_rate": 1.6466759957647143e-05, "loss": 0.3366, "step": 691700 }, { "epoch": 9.53128874927668, "grad_norm": 2.3257899284362793, "learning_rate": 1.645941838869664e-05, "loss": 0.2946, "step": 691800 }, { "epoch": 9.53266650133642, "grad_norm": 0.7014309763908386, "learning_rate": 1.6452077757700023e-05, "loss": 0.2742, "step": 691900 }, { "epoch": 9.534044253396159, "grad_norm": 5.086578369140625, "learning_rate": 1.6444738065280917e-05, "loss": 0.3439, "step": 692000 }, { "epoch": 9.535422005455898, "grad_norm": 4.228861331939697, "learning_rate": 1.6437399312062876e-05, "loss": 0.3527, "step": 692100 }, { "epoch": 9.536799757515638, "grad_norm": 0.7030498385429382, "learning_rate": 1.6430061498669357e-05, "loss": 0.35, "step": 692200 }, { "epoch": 9.538177509575377, "grad_norm": 2.3977200984954834, "learning_rate": 1.6422724625723743e-05, "loss": 0.2661, "step": 692300 }, { "epoch": 9.539555261635115, "grad_norm": 2.7346768379211426, "learning_rate": 1.641538869384936e-05, "loss": 0.3489, "step": 692400 }, { "epoch": 9.540933013694856, "grad_norm": 1.3992122411727905, "learning_rate": 
1.6408053703669397e-05, "loss": 0.3124, "step": 692500 }, { "epoch": 9.542310765754594, "grad_norm": 5.367378234863281, "learning_rate": 1.6400719655807025e-05, "loss": 0.3768, "step": 692600 }, { "epoch": 9.543688517814335, "grad_norm": 2.5158753395080566, "learning_rate": 1.6393386550885302e-05, "loss": 0.3369, "step": 692700 }, { "epoch": 9.545066269874074, "grad_norm": 0.03340692073106766, "learning_rate": 1.638605438952721e-05, "loss": 0.3301, "step": 692800 }, { "epoch": 9.546444021933812, "grad_norm": 2.227006435394287, "learning_rate": 1.637872317235566e-05, "loss": 0.3786, "step": 692900 }, { "epoch": 9.547821773993553, "grad_norm": 3.787794589996338, "learning_rate": 1.6371392899993474e-05, "loss": 0.3269, "step": 693000 }, { "epoch": 9.549199526053291, "grad_norm": 2.7167136669158936, "learning_rate": 1.6364063573063393e-05, "loss": 0.3373, "step": 693100 }, { "epoch": 9.55057727811303, "grad_norm": 4.415078639984131, "learning_rate": 1.635673519218808e-05, "loss": 0.2869, "step": 693200 }, { "epoch": 9.55195503017277, "grad_norm": 3.1602120399475098, "learning_rate": 1.6349407757990115e-05, "loss": 0.3058, "step": 693300 }, { "epoch": 9.55333278223251, "grad_norm": 1.7955783605575562, "learning_rate": 1.634208127109202e-05, "loss": 0.3554, "step": 693400 }, { "epoch": 9.55471053429225, "grad_norm": 2.6662306785583496, "learning_rate": 1.6334755732116203e-05, "loss": 0.304, "step": 693500 }, { "epoch": 9.556088286351988, "grad_norm": 3.034860610961914, "learning_rate": 1.632750438289198e-05, "loss": 0.3325, "step": 693600 }, { "epoch": 9.557466038411727, "grad_norm": 2.304090976715088, "learning_rate": 1.632018073213291e-05, "loss": 0.3176, "step": 693700 }, { "epoch": 9.558843790471467, "grad_norm": 5.1975812911987305, "learning_rate": 1.6312858031156687e-05, "loss": 0.2813, "step": 693800 }, { "epoch": 9.560221542531206, "grad_norm": 4.145631313323975, "learning_rate": 1.6305536280585407e-05, "loss": 0.3323, "step": 693900 }, { "epoch": 
9.561599294590945, "grad_norm": 2.6459829807281494, "learning_rate": 1.6298215481041097e-05, "loss": 0.3432, "step": 694000 }, { "epoch": 9.562977046650685, "grad_norm": 1.3821395635604858, "learning_rate": 1.6290895633145683e-05, "loss": 0.3033, "step": 694100 }, { "epoch": 9.564354798710424, "grad_norm": 4.769979476928711, "learning_rate": 1.6283576737521025e-05, "loss": 0.3476, "step": 694200 }, { "epoch": 9.565732550770164, "grad_norm": 1.6830343008041382, "learning_rate": 1.6276258794788914e-05, "loss": 0.3083, "step": 694300 }, { "epoch": 9.567110302829903, "grad_norm": 5.125626087188721, "learning_rate": 1.6268941805571035e-05, "loss": 0.3219, "step": 694400 }, { "epoch": 9.568488054889642, "grad_norm": 2.8273632526397705, "learning_rate": 1.6261625770489004e-05, "loss": 0.3178, "step": 694500 }, { "epoch": 9.569865806949382, "grad_norm": 4.085282802581787, "learning_rate": 1.625431069016438e-05, "loss": 0.2738, "step": 694600 }, { "epoch": 9.57124355900912, "grad_norm": 1.0584511756896973, "learning_rate": 1.6246996565218568e-05, "loss": 0.3193, "step": 694700 }, { "epoch": 9.57262131106886, "grad_norm": 1.325571894645691, "learning_rate": 1.623968339627298e-05, "loss": 0.3041, "step": 694800 }, { "epoch": 9.5739990631286, "grad_norm": 2.0018537044525146, "learning_rate": 1.6232371183948893e-05, "loss": 0.2926, "step": 694900 }, { "epoch": 9.575376815188338, "grad_norm": 109.8460464477539, "learning_rate": 1.6225059928867514e-05, "loss": 0.3145, "step": 695000 }, { "epoch": 9.576754567248077, "grad_norm": 7.8019561767578125, "learning_rate": 1.6217749631649985e-05, "loss": 0.3566, "step": 695100 }, { "epoch": 9.578132319307818, "grad_norm": 1.925970435142517, "learning_rate": 1.621044029291735e-05, "loss": 0.3246, "step": 695200 }, { "epoch": 9.579510071367556, "grad_norm": 2.6698803901672363, "learning_rate": 1.620313191329056e-05, "loss": 0.3754, "step": 695300 }, { "epoch": 9.580887823427297, "grad_norm": 5.3387861251831055, "learning_rate": 
1.6195824493390506e-05, "loss": 0.3424, "step": 695400 }, { "epoch": 9.582265575487035, "grad_norm": 0.6787081956863403, "learning_rate": 1.618851803383799e-05, "loss": 0.31, "step": 695500 }, { "epoch": 9.583643327546774, "grad_norm": 2.0670363903045654, "learning_rate": 1.6181212535253746e-05, "loss": 0.3114, "step": 695600 }, { "epoch": 9.585021079606515, "grad_norm": 3.8436789512634277, "learning_rate": 1.61739079982584e-05, "loss": 0.3358, "step": 695700 }, { "epoch": 9.586398831666253, "grad_norm": 0.8414889574050903, "learning_rate": 1.6166604423472516e-05, "loss": 0.3309, "step": 695800 }, { "epoch": 9.587776583725992, "grad_norm": 5.926618576049805, "learning_rate": 1.6159301811516563e-05, "loss": 0.3124, "step": 695900 }, { "epoch": 9.589154335785732, "grad_norm": 3.80885910987854, "learning_rate": 1.6152000163010936e-05, "loss": 0.3386, "step": 696000 }, { "epoch": 9.590532087845471, "grad_norm": 6.9158034324646, "learning_rate": 1.614469947857596e-05, "loss": 0.3089, "step": 696100 }, { "epoch": 9.591909839905211, "grad_norm": 1.3664408922195435, "learning_rate": 1.613739975883185e-05, "loss": 0.2705, "step": 696200 }, { "epoch": 9.59328759196495, "grad_norm": 1.1309088468551636, "learning_rate": 1.6130101004398774e-05, "loss": 0.2748, "step": 696300 }, { "epoch": 9.594665344024689, "grad_norm": 15.730361938476562, "learning_rate": 1.6122803215896774e-05, "loss": 0.311, "step": 696400 }, { "epoch": 9.59604309608443, "grad_norm": 2.9531161785125732, "learning_rate": 1.6115506393945843e-05, "loss": 0.2845, "step": 696500 }, { "epoch": 9.597420848144168, "grad_norm": 3.1972222328186035, "learning_rate": 1.610821053916589e-05, "loss": 0.3344, "step": 696600 }, { "epoch": 9.598798600203907, "grad_norm": 2.167327404022217, "learning_rate": 1.6100915652176736e-05, "loss": 0.3334, "step": 696700 }, { "epoch": 9.600176352263647, "grad_norm": 6.5683746337890625, "learning_rate": 1.6093621733598117e-05, "loss": 0.3285, "step": 696800 }, { "epoch": 
9.601554104323386, "grad_norm": 2.083061933517456, "learning_rate": 1.6086328784049683e-05, "loss": 0.2731, "step": 696900 }, { "epoch": 9.602931856383126, "grad_norm": 1.7082992792129517, "learning_rate": 1.6079036804151005e-05, "loss": 0.2815, "step": 697000 }, { "epoch": 9.604309608442865, "grad_norm": 6.441465854644775, "learning_rate": 1.6071818699813007e-05, "loss": 0.3118, "step": 697100 }, { "epoch": 9.605687360502603, "grad_norm": 3.2115180492401123, "learning_rate": 1.6064528651360295e-05, "loss": 0.3486, "step": 697200 }, { "epoch": 9.607065112562344, "grad_norm": 6.019472122192383, "learning_rate": 1.605723957440938e-05, "loss": 0.3699, "step": 697300 }, { "epoch": 9.608442864622083, "grad_norm": 5.26919412612915, "learning_rate": 1.6049951469579504e-05, "loss": 0.3735, "step": 697400 }, { "epoch": 9.609820616681821, "grad_norm": 16.880033493041992, "learning_rate": 1.6042664337489818e-05, "loss": 0.2804, "step": 697500 }, { "epoch": 9.611198368741562, "grad_norm": 5.169192314147949, "learning_rate": 1.6035378178759428e-05, "loss": 0.2727, "step": 697600 }, { "epoch": 9.6125761208013, "grad_norm": 3.3329882621765137, "learning_rate": 1.6028092994007304e-05, "loss": 0.2687, "step": 697700 }, { "epoch": 9.61395387286104, "grad_norm": 6.606675624847412, "learning_rate": 1.6020808783852363e-05, "loss": 0.2946, "step": 697800 }, { "epoch": 9.61533162492078, "grad_norm": 2.520848512649536, "learning_rate": 1.601352554891347e-05, "loss": 0.3851, "step": 697900 }, { "epoch": 9.616709376980518, "grad_norm": 4.3599958419799805, "learning_rate": 1.600624328980932e-05, "loss": 0.307, "step": 698000 }, { "epoch": 9.618087129040259, "grad_norm": 2.7149875164031982, "learning_rate": 1.599896200715861e-05, "loss": 0.3296, "step": 698100 }, { "epoch": 9.619464881099997, "grad_norm": 9.638872146606445, "learning_rate": 1.5991681701579924e-05, "loss": 0.3016, "step": 698200 }, { "epoch": 9.620842633159736, "grad_norm": 0.9224613308906555, "learning_rate": 
1.5984402373691744e-05, "loss": 0.2632, "step": 698300 }, { "epoch": 9.622220385219476, "grad_norm": 4.34193754196167, "learning_rate": 1.5977124024112493e-05, "loss": 0.3456, "step": 698400 }, { "epoch": 9.623598137279215, "grad_norm": 4.808312892913818, "learning_rate": 1.5969919422319324e-05, "loss": 0.3476, "step": 698500 }, { "epoch": 9.624975889338955, "grad_norm": 6.366484642028809, "learning_rate": 1.5962643021414338e-05, "loss": 0.3375, "step": 698600 }, { "epoch": 9.626353641398694, "grad_norm": 2.1494252681732178, "learning_rate": 1.5955367600666865e-05, "loss": 0.3255, "step": 698700 }, { "epoch": 9.627731393458433, "grad_norm": 0.07877679169178009, "learning_rate": 1.5948093160694962e-05, "loss": 0.3152, "step": 698800 }, { "epoch": 9.629109145518173, "grad_norm": 3.3718981742858887, "learning_rate": 1.5940892431842506e-05, "loss": 0.3654, "step": 698900 }, { "epoch": 9.630486897577912, "grad_norm": 4.589846611022949, "learning_rate": 1.593361994545253e-05, "loss": 0.292, "step": 699000 }, { "epoch": 9.63186464963765, "grad_norm": 3.9629225730895996, "learning_rate": 1.5926348441685715e-05, "loss": 0.293, "step": 699100 }, { "epoch": 9.633242401697391, "grad_norm": 0.20854812860488892, "learning_rate": 1.5919077921159797e-05, "loss": 0.3158, "step": 699200 }, { "epoch": 9.63462015375713, "grad_norm": 2.1737875938415527, "learning_rate": 1.591180838449246e-05, "loss": 0.3727, "step": 699300 }, { "epoch": 9.635997905816868, "grad_norm": 2.6579370498657227, "learning_rate": 1.5904539832301296e-05, "loss": 0.323, "step": 699400 }, { "epoch": 9.637375657876609, "grad_norm": 0.4594109058380127, "learning_rate": 1.589727226520379e-05, "loss": 0.3268, "step": 699500 }, { "epoch": 9.638753409936347, "grad_norm": 3.778564691543579, "learning_rate": 1.5890005683817358e-05, "loss": 0.2986, "step": 699600 }, { "epoch": 9.640131161996088, "grad_norm": 2.7593014240264893, "learning_rate": 1.588274008875935e-05, "loss": 0.3273, "step": 699700 }, { "epoch": 
9.641508914055827, "grad_norm": 3.726316452026367, "learning_rate": 1.587547548064699e-05, "loss": 0.3043, "step": 699800 }, { "epoch": 9.642886666115565, "grad_norm": 6.5598225593566895, "learning_rate": 1.5868211860097467e-05, "loss": 0.3298, "step": 699900 }, { "epoch": 9.644264418175306, "grad_norm": 3.1059436798095703, "learning_rate": 1.5860949227727857e-05, "loss": 0.3183, "step": 700000 }, { "epoch": 9.645642170235044, "grad_norm": 1.6653426885604858, "learning_rate": 1.585368758415516e-05, "loss": 0.2629, "step": 700100 }, { "epoch": 9.647019922294783, "grad_norm": 6.720180988311768, "learning_rate": 1.5846426929996273e-05, "loss": 0.2941, "step": 700200 }, { "epoch": 9.648397674354523, "grad_norm": 5.051051616668701, "learning_rate": 1.583916726586804e-05, "loss": 0.2755, "step": 700300 }, { "epoch": 9.649775426414262, "grad_norm": 3.5669496059417725, "learning_rate": 1.5831908592387208e-05, "loss": 0.3523, "step": 700400 }, { "epoch": 9.651153178474003, "grad_norm": 3.3381452560424805, "learning_rate": 1.582465091017043e-05, "loss": 0.2955, "step": 700500 }, { "epoch": 9.652530930533741, "grad_norm": 10.62447738647461, "learning_rate": 1.5817394219834295e-05, "loss": 0.4033, "step": 700600 }, { "epoch": 9.65390868259348, "grad_norm": 4.705462455749512, "learning_rate": 1.5810138521995292e-05, "loss": 0.2642, "step": 700700 }, { "epoch": 9.65528643465322, "grad_norm": 1.075913429260254, "learning_rate": 1.5802883817269813e-05, "loss": 0.3894, "step": 700800 }, { "epoch": 9.656664186712959, "grad_norm": 4.889693260192871, "learning_rate": 1.5795630106274195e-05, "loss": 0.3047, "step": 700900 }, { "epoch": 9.658041938772698, "grad_norm": 3.811338186264038, "learning_rate": 1.5788377389624693e-05, "loss": 0.3193, "step": 701000 }, { "epoch": 9.659419690832438, "grad_norm": 2.9069392681121826, "learning_rate": 1.5781125667937435e-05, "loss": 0.2691, "step": 701100 }, { "epoch": 9.660797442892177, "grad_norm": 3.4906105995178223, "learning_rate": 
1.5773874941828518e-05, "loss": 0.3038, "step": 701200 }, { "epoch": 9.662175194951917, "grad_norm": 2.4097721576690674, "learning_rate": 1.5766625211913902e-05, "loss": 0.3174, "step": 701300 }, { "epoch": 9.663552947011656, "grad_norm": 2.8913867473602295, "learning_rate": 1.5759376478809512e-05, "loss": 0.3832, "step": 701400 }, { "epoch": 9.664930699071395, "grad_norm": 7.890986442565918, "learning_rate": 1.5752128743131144e-05, "loss": 0.3149, "step": 701500 }, { "epoch": 9.666308451131135, "grad_norm": 2.572356939315796, "learning_rate": 1.5744882005494543e-05, "loss": 0.3155, "step": 701600 }, { "epoch": 9.667686203190874, "grad_norm": 3.5634989738464355, "learning_rate": 1.5737636266515364e-05, "loss": 0.2649, "step": 701700 }, { "epoch": 9.669063955250612, "grad_norm": 2.391688585281372, "learning_rate": 1.573039152680916e-05, "loss": 0.3004, "step": 701800 }, { "epoch": 9.670441707310353, "grad_norm": 19.462615966796875, "learning_rate": 1.5723147786991397e-05, "loss": 0.2901, "step": 701900 }, { "epoch": 9.671819459370091, "grad_norm": 2.35721492767334, "learning_rate": 1.5715905047677492e-05, "loss": 0.2816, "step": 702000 }, { "epoch": 9.673197211429832, "grad_norm": 4.3905181884765625, "learning_rate": 1.5708663309482728e-05, "loss": 0.345, "step": 702100 }, { "epoch": 9.67457496348957, "grad_norm": 3.691394090652466, "learning_rate": 1.570142257302234e-05, "loss": 0.3047, "step": 702200 }, { "epoch": 9.67595271554931, "grad_norm": 2.775655508041382, "learning_rate": 1.5694182838911477e-05, "loss": 0.3513, "step": 702300 }, { "epoch": 9.67733046760905, "grad_norm": 3.659954309463501, "learning_rate": 1.568694410776518e-05, "loss": 0.2753, "step": 702400 }, { "epoch": 9.678708219668788, "grad_norm": 3.304738759994507, "learning_rate": 1.5679706380198407e-05, "loss": 0.3199, "step": 702500 }, { "epoch": 9.680085971728527, "grad_norm": 1.7630168199539185, "learning_rate": 1.567246965682605e-05, "loss": 0.3914, "step": 702600 }, { "epoch": 
9.681463723788267, "grad_norm": 4.834746360778809, "learning_rate": 1.566523393826291e-05, "loss": 0.3158, "step": 702700 }, { "epoch": 9.682841475848006, "grad_norm": 1.9091852903366089, "learning_rate": 1.5657999225123687e-05, "loss": 0.2939, "step": 702800 }, { "epoch": 9.684219227907747, "grad_norm": 2.500666856765747, "learning_rate": 1.5650765518023023e-05, "loss": 0.3269, "step": 702900 }, { "epoch": 9.685596979967485, "grad_norm": 1.8422175645828247, "learning_rate": 1.564353281757545e-05, "loss": 0.3114, "step": 703000 }, { "epoch": 9.686974732027224, "grad_norm": 0.5250390768051147, "learning_rate": 1.5636301124395405e-05, "loss": 0.3565, "step": 703100 }, { "epoch": 9.688352484086964, "grad_norm": 11.935446739196777, "learning_rate": 1.562907043909728e-05, "loss": 0.3233, "step": 703200 }, { "epoch": 9.689730236146703, "grad_norm": 3.246448516845703, "learning_rate": 1.5621840762295357e-05, "loss": 0.3015, "step": 703300 }, { "epoch": 9.691107988206442, "grad_norm": 4.2183942794799805, "learning_rate": 1.5614612094603837e-05, "loss": 0.4, "step": 703400 }, { "epoch": 9.692485740266182, "grad_norm": 0.8239351511001587, "learning_rate": 1.560738443663681e-05, "loss": 0.3339, "step": 703500 }, { "epoch": 9.69386349232592, "grad_norm": 5.141870975494385, "learning_rate": 1.560015778900832e-05, "loss": 0.3047, "step": 703600 }, { "epoch": 9.69524124438566, "grad_norm": 2.0303468704223633, "learning_rate": 1.559293215233231e-05, "loss": 0.3216, "step": 703700 }, { "epoch": 9.6966189964454, "grad_norm": 4.558074951171875, "learning_rate": 1.5585707527222622e-05, "loss": 0.306, "step": 703800 }, { "epoch": 9.697996748505139, "grad_norm": 6.400722503662109, "learning_rate": 1.5578483914293035e-05, "loss": 0.3079, "step": 703900 }, { "epoch": 9.699374500564879, "grad_norm": 4.560937881469727, "learning_rate": 1.5571261314157247e-05, "loss": 0.3149, "step": 704000 }, { "epoch": 9.700752252624618, "grad_norm": 4.5805487632751465, "learning_rate": 
1.556403972742882e-05, "loss": 0.3337, "step": 704100 }, { "epoch": 9.702130004684356, "grad_norm": 38.73161697387695, "learning_rate": 1.5556819154721283e-05, "loss": 0.288, "step": 704200 }, { "epoch": 9.703507756744097, "grad_norm": 1.80768620967865, "learning_rate": 1.5549671787204335e-05, "loss": 0.3344, "step": 704300 }, { "epoch": 9.704885508803835, "grad_norm": 2.6821987628936768, "learning_rate": 1.554245323422326e-05, "loss": 0.311, "step": 704400 }, { "epoch": 9.706263260863574, "grad_norm": 2.034491539001465, "learning_rate": 1.5535235697096957e-05, "loss": 0.3513, "step": 704500 }, { "epoch": 9.707641012923315, "grad_norm": 5.47861385345459, "learning_rate": 1.5528019176438582e-05, "loss": 0.2697, "step": 704600 }, { "epoch": 9.709018764983053, "grad_norm": 15.920827865600586, "learning_rate": 1.5520803672861232e-05, "loss": 0.2776, "step": 704700 }, { "epoch": 9.710396517042794, "grad_norm": 3.699014902114868, "learning_rate": 1.551358918697789e-05, "loss": 0.3833, "step": 704800 }, { "epoch": 9.711774269102532, "grad_norm": 1.7043111324310303, "learning_rate": 1.5506375719401473e-05, "loss": 0.324, "step": 704900 }, { "epoch": 9.713152021162271, "grad_norm": 29.512975692749023, "learning_rate": 1.54991632707448e-05, "loss": 0.3091, "step": 705000 }, { "epoch": 9.714529773222011, "grad_norm": 7.502317428588867, "learning_rate": 1.5491951841620606e-05, "loss": 0.337, "step": 705100 }, { "epoch": 9.71590752528175, "grad_norm": 1.7898133993148804, "learning_rate": 1.5484741432641525e-05, "loss": 0.312, "step": 705200 }, { "epoch": 9.717285277341489, "grad_norm": 2.392545700073242, "learning_rate": 1.547753204442013e-05, "loss": 0.3264, "step": 705300 }, { "epoch": 9.71866302940123, "grad_norm": 8.011979103088379, "learning_rate": 1.5470323677568905e-05, "loss": 0.2837, "step": 705400 }, { "epoch": 9.720040781460968, "grad_norm": 1.7386094331741333, "learning_rate": 1.546311633270021e-05, "loss": 0.2984, "step": 705500 }, { "epoch": 9.721418533520708, 
"grad_norm": 4.689943790435791, "learning_rate": 1.5455910010426382e-05, "loss": 0.3077, "step": 705600 }, { "epoch": 9.722796285580447, "grad_norm": 2.996116876602173, "learning_rate": 1.5448704711359615e-05, "loss": 0.2759, "step": 705700 }, { "epoch": 9.724174037640186, "grad_norm": 8.044126510620117, "learning_rate": 1.5441500436112024e-05, "loss": 0.3215, "step": 705800 }, { "epoch": 9.725551789699926, "grad_norm": 2.8170111179351807, "learning_rate": 1.543429718529566e-05, "loss": 0.3509, "step": 705900 }, { "epoch": 9.726929541759665, "grad_norm": 2.577775239944458, "learning_rate": 1.5427094959522494e-05, "loss": 0.3055, "step": 706000 }, { "epoch": 9.728307293819404, "grad_norm": 5.114053249359131, "learning_rate": 1.5419893759404355e-05, "loss": 0.3041, "step": 706100 }, { "epoch": 9.729685045879144, "grad_norm": 4.764209747314453, "learning_rate": 1.5412693585553063e-05, "loss": 0.3564, "step": 706200 }, { "epoch": 9.731062797938883, "grad_norm": 3.8005058765411377, "learning_rate": 1.5405494438580274e-05, "loss": 0.2987, "step": 706300 }, { "epoch": 9.732440549998623, "grad_norm": 3.850269079208374, "learning_rate": 1.539829631909762e-05, "loss": 0.2657, "step": 706400 }, { "epoch": 9.733818302058362, "grad_norm": 3.09354567527771, "learning_rate": 1.5391099227716595e-05, "loss": 0.2989, "step": 706500 }, { "epoch": 9.7351960541181, "grad_norm": 4.967903137207031, "learning_rate": 1.538390316504864e-05, "loss": 0.2744, "step": 706600 }, { "epoch": 9.73657380617784, "grad_norm": 2.23189377784729, "learning_rate": 1.5376708131705116e-05, "loss": 0.2572, "step": 706700 }, { "epoch": 9.73795155823758, "grad_norm": 3.1625421047210693, "learning_rate": 1.5369514128297235e-05, "loss": 0.2929, "step": 706800 }, { "epoch": 9.739329310297318, "grad_norm": 4.154350757598877, "learning_rate": 1.5362321155436188e-05, "loss": 0.3265, "step": 706900 }, { "epoch": 9.740707062357059, "grad_norm": 0.32776451110839844, "learning_rate": 1.535512921373307e-05, "loss": 
0.2713, "step": 707000 }, { "epoch": 9.742084814416797, "grad_norm": 4.893360614776611, "learning_rate": 1.5347938303798846e-05, "loss": 0.3338, "step": 707100 }, { "epoch": 9.743462566476538, "grad_norm": 4.980701923370361, "learning_rate": 1.5340748426244433e-05, "loss": 0.3142, "step": 707200 }, { "epoch": 9.744840318536276, "grad_norm": 2.2574617862701416, "learning_rate": 1.5333559581680655e-05, "loss": 0.3143, "step": 707300 }, { "epoch": 9.746218070596015, "grad_norm": 2.1633872985839844, "learning_rate": 1.5326371770718237e-05, "loss": 0.2726, "step": 707400 }, { "epoch": 9.747595822655756, "grad_norm": 4.102578639984131, "learning_rate": 1.5319184993967804e-05, "loss": 0.3183, "step": 707500 }, { "epoch": 9.748973574715494, "grad_norm": 14.609067916870117, "learning_rate": 1.5311999252039924e-05, "loss": 0.2999, "step": 707600 }, { "epoch": 9.750351326775233, "grad_norm": 2.2417547702789307, "learning_rate": 1.5304886387482626e-05, "loss": 0.3263, "step": 707700 }, { "epoch": 9.751729078834973, "grad_norm": 4.8949079513549805, "learning_rate": 1.5297702706667712e-05, "loss": 0.3341, "step": 707800 }, { "epoch": 9.753106830894712, "grad_norm": 3.1890227794647217, "learning_rate": 1.529052006250037e-05, "loss": 0.302, "step": 707900 }, { "epoch": 9.75448458295445, "grad_norm": 3.2141008377075195, "learning_rate": 1.5283410266523478e-05, "loss": 0.3177, "step": 708000 }, { "epoch": 9.755862335014191, "grad_norm": 0.39695999026298523, "learning_rate": 1.527622968710012e-05, "loss": 0.3236, "step": 708100 }, { "epoch": 9.75724008707393, "grad_norm": 1.4784473180770874, "learning_rate": 1.5269050146148595e-05, "loss": 0.2904, "step": 708200 }, { "epoch": 9.75861783913367, "grad_norm": 3.742617607116699, "learning_rate": 1.5261871644278824e-05, "loss": 0.3331, "step": 708300 }, { "epoch": 9.759995591193409, "grad_norm": 3.3070783615112305, "learning_rate": 1.5254694182100678e-05, "loss": 0.3384, "step": 708400 }, { "epoch": 9.761373343253148, "grad_norm": 
0.3331131339073181, "learning_rate": 1.5247517760223906e-05, "loss": 0.3308, "step": 708500 }, { "epoch": 9.762751095312888, "grad_norm": 2.4163436889648438, "learning_rate": 1.5240342379258174e-05, "loss": 0.3483, "step": 708600 }, { "epoch": 9.764128847372627, "grad_norm": 0.8620812296867371, "learning_rate": 1.5233168039813079e-05, "loss": 0.3437, "step": 708700 }, { "epoch": 9.765506599432365, "grad_norm": 3.4354171752929688, "learning_rate": 1.5225994742498122e-05, "loss": 0.3456, "step": 708800 }, { "epoch": 9.766884351492106, "grad_norm": 2.2740471363067627, "learning_rate": 1.5218822487922708e-05, "loss": 0.3948, "step": 708900 }, { "epoch": 9.768262103551844, "grad_norm": 3.7027769088745117, "learning_rate": 1.5211651276696141e-05, "loss": 0.3068, "step": 709000 }, { "epoch": 9.769639855611585, "grad_norm": 3.6294806003570557, "learning_rate": 1.5204481109427663e-05, "loss": 0.3617, "step": 709100 }, { "epoch": 9.771017607671324, "grad_norm": 2.9316868782043457, "learning_rate": 1.5197311986726432e-05, "loss": 0.307, "step": 709200 }, { "epoch": 9.772395359731062, "grad_norm": 2.4531657695770264, "learning_rate": 1.5190143909201477e-05, "loss": 0.3562, "step": 709300 }, { "epoch": 9.773773111790803, "grad_norm": 2.2681167125701904, "learning_rate": 1.5182976877461774e-05, "loss": 0.3072, "step": 709400 }, { "epoch": 9.775150863850541, "grad_norm": 14.267127990722656, "learning_rate": 1.517581089211622e-05, "loss": 0.3428, "step": 709500 }, { "epoch": 9.77652861591028, "grad_norm": 2.158982276916504, "learning_rate": 1.5168645953773558e-05, "loss": 0.3107, "step": 709600 }, { "epoch": 9.77790636797002, "grad_norm": 4.477296352386475, "learning_rate": 1.516148206304251e-05, "loss": 0.3151, "step": 709700 }, { "epoch": 9.779284120029759, "grad_norm": 2.5479581356048584, "learning_rate": 1.5154319220531698e-05, "loss": 0.3081, "step": 709800 }, { "epoch": 9.7806618720895, "grad_norm": 3.171149730682373, "learning_rate": 1.5147157426849613e-05, "loss": 0.3362, 
"step": 709900 }, { "epoch": 9.782039624149238, "grad_norm": 6.223041534423828, "learning_rate": 1.5139996682604716e-05, "loss": 0.3002, "step": 710000 }, { "epoch": 9.783417376208977, "grad_norm": 1.9789401292800903, "learning_rate": 1.5132836988405318e-05, "loss": 0.3222, "step": 710100 }, { "epoch": 9.784795128268717, "grad_norm": 3.9383902549743652, "learning_rate": 1.5125678344859701e-05, "loss": 0.3473, "step": 710200 }, { "epoch": 9.786172880328456, "grad_norm": 4.3298563957214355, "learning_rate": 1.5118520752576006e-05, "loss": 0.2954, "step": 710300 }, { "epoch": 9.787550632388195, "grad_norm": 3.706490993499756, "learning_rate": 1.5111364212162314e-05, "loss": 0.329, "step": 710400 }, { "epoch": 9.788928384447935, "grad_norm": 2.9419243335723877, "learning_rate": 1.5104208724226626e-05, "loss": 0.361, "step": 710500 }, { "epoch": 9.790306136507674, "grad_norm": 82.68695068359375, "learning_rate": 1.509705428937682e-05, "loss": 0.3263, "step": 710600 }, { "epoch": 9.791683888567414, "grad_norm": 5.371208667755127, "learning_rate": 1.5089900908220694e-05, "loss": 0.3345, "step": 710700 }, { "epoch": 9.793061640627153, "grad_norm": 1.4671239852905273, "learning_rate": 1.5082748581365988e-05, "loss": 0.3247, "step": 710800 }, { "epoch": 9.794439392686892, "grad_norm": 2.2416317462921143, "learning_rate": 1.507559730942031e-05, "loss": 0.3345, "step": 710900 }, { "epoch": 9.795817144746632, "grad_norm": 1.836391806602478, "learning_rate": 1.5068447092991197e-05, "loss": 0.2734, "step": 711000 }, { "epoch": 9.79719489680637, "grad_norm": 10.556929588317871, "learning_rate": 1.5061297932686114e-05, "loss": 0.3113, "step": 711100 }, { "epoch": 9.79857264886611, "grad_norm": 2.046957492828369, "learning_rate": 1.5054149829112408e-05, "loss": 0.2577, "step": 711200 }, { "epoch": 9.79995040092585, "grad_norm": 2.323071002960205, "learning_rate": 1.5047002782877337e-05, "loss": 0.3219, "step": 711300 }, { "epoch": 9.801328152985588, "grad_norm": 0.9787241816520691, 
"learning_rate": 1.5039856794588092e-05, "loss": 0.3118, "step": 711400 }, { "epoch": 9.802705905045329, "grad_norm": 2.4513192176818848, "learning_rate": 1.5032711864851764e-05, "loss": 0.3404, "step": 711500 }, { "epoch": 9.804083657105068, "grad_norm": 2.3759207725524902, "learning_rate": 1.5025567994275336e-05, "loss": 0.2763, "step": 711600 }, { "epoch": 9.805461409164806, "grad_norm": 6.9838786125183105, "learning_rate": 1.5018425183465736e-05, "loss": 0.3166, "step": 711700 }, { "epoch": 9.806839161224547, "grad_norm": 2.0206189155578613, "learning_rate": 1.501128343302977e-05, "loss": 0.2958, "step": 711800 }, { "epoch": 9.808216913284285, "grad_norm": 3.627972364425659, "learning_rate": 1.500414274357416e-05, "loss": 0.2664, "step": 711900 }, { "epoch": 9.809594665344024, "grad_norm": 5.444515705108643, "learning_rate": 1.4997003115705555e-05, "loss": 0.3267, "step": 712000 }, { "epoch": 9.810972417403764, "grad_norm": 3.542719841003418, "learning_rate": 1.4989864550030507e-05, "loss": 0.3198, "step": 712100 }, { "epoch": 9.812350169463503, "grad_norm": 2.6820003986358643, "learning_rate": 1.4982727047155458e-05, "loss": 0.3348, "step": 712200 }, { "epoch": 9.813727921523242, "grad_norm": 2.5595321655273438, "learning_rate": 1.497559060768679e-05, "loss": 0.295, "step": 712300 }, { "epoch": 9.815105673582982, "grad_norm": 2.1263515949249268, "learning_rate": 1.496845523223077e-05, "loss": 0.3062, "step": 712400 }, { "epoch": 9.816483425642721, "grad_norm": 2.0316073894500732, "learning_rate": 1.49613209213936e-05, "loss": 0.2799, "step": 712500 }, { "epoch": 9.817861177702461, "grad_norm": 4.8766608238220215, "learning_rate": 1.4954187675781348e-05, "loss": 0.2872, "step": 712600 }, { "epoch": 9.8192389297622, "grad_norm": 1.3855955600738525, "learning_rate": 1.4947055496000042e-05, "loss": 0.3232, "step": 712700 }, { "epoch": 9.820616681821939, "grad_norm": 2.3175644874572754, "learning_rate": 1.4939924382655612e-05, "loss": 0.3342, "step": 712800 }, { 
"epoch": 9.82199443388168, "grad_norm": 1.045776128768921, "learning_rate": 1.4932794336353841e-05, "loss": 0.3062, "step": 712900 }, { "epoch": 9.823372185941418, "grad_norm": 4.187001705169678, "learning_rate": 1.4925665357700487e-05, "loss": 0.2842, "step": 713000 }, { "epoch": 9.824749938001156, "grad_norm": 4.426389217376709, "learning_rate": 1.4918537447301199e-05, "loss": 0.3434, "step": 713100 }, { "epoch": 9.826127690060897, "grad_norm": 8.635143280029297, "learning_rate": 1.491141060576151e-05, "loss": 0.337, "step": 713200 }, { "epoch": 9.827505442120636, "grad_norm": 3.3043558597564697, "learning_rate": 1.4904284833686893e-05, "loss": 0.3429, "step": 713300 }, { "epoch": 9.828883194180376, "grad_norm": 4.820959568023682, "learning_rate": 1.489723137340394e-05, "loss": 0.3258, "step": 713400 }, { "epoch": 9.830260946240115, "grad_norm": 2.582156181335449, "learning_rate": 1.4890107731365744e-05, "loss": 0.3299, "step": 713500 }, { "epoch": 9.831638698299853, "grad_norm": 3.1191978454589844, "learning_rate": 1.4882985160602402e-05, "loss": 0.3117, "step": 713600 }, { "epoch": 9.833016450359594, "grad_norm": 1.1854172945022583, "learning_rate": 1.4875863661719016e-05, "loss": 0.3368, "step": 713700 }, { "epoch": 9.834394202419332, "grad_norm": 1.875817894935608, "learning_rate": 1.4868743235320606e-05, "loss": 0.2716, "step": 713800 }, { "epoch": 9.835771954479071, "grad_norm": 3.114182472229004, "learning_rate": 1.4861623882012082e-05, "loss": 0.335, "step": 713900 }, { "epoch": 9.837149706538812, "grad_norm": 6.847827434539795, "learning_rate": 1.4854505602398257e-05, "loss": 0.3188, "step": 714000 }, { "epoch": 9.83852745859855, "grad_norm": 2.2460103034973145, "learning_rate": 1.4847388397083872e-05, "loss": 0.346, "step": 714100 }, { "epoch": 9.83990521065829, "grad_norm": 1.74933660030365, "learning_rate": 1.4840272266673586e-05, "loss": 0.3253, "step": 714200 }, { "epoch": 9.84128296271803, "grad_norm": 4.5215277671813965, "learning_rate": 
1.4833157211771923e-05, "loss": 0.2876, "step": 714300 }, { "epoch": 9.842660714777768, "grad_norm": 5.034599781036377, "learning_rate": 1.4826114367442515e-05, "loss": 0.2875, "step": 714400 }, { "epoch": 9.844038466837508, "grad_norm": 0.8060349822044373, "learning_rate": 1.4819001454601254e-05, "loss": 0.3501, "step": 714500 }, { "epoch": 9.845416218897247, "grad_norm": 0.05606954172253609, "learning_rate": 1.4811889619075706e-05, "loss": 0.3047, "step": 714600 }, { "epoch": 9.846793970956986, "grad_norm": 1.3229053020477295, "learning_rate": 1.4804778861470035e-05, "loss": 0.3594, "step": 714700 }, { "epoch": 9.848171723016726, "grad_norm": 1.2164642810821533, "learning_rate": 1.479766918238835e-05, "loss": 0.3533, "step": 714800 }, { "epoch": 9.849549475076465, "grad_norm": 2.3053691387176514, "learning_rate": 1.4790560582434662e-05, "loss": 0.3331, "step": 714900 }, { "epoch": 9.850927227136205, "grad_norm": 2.7995054721832275, "learning_rate": 1.478345306221287e-05, "loss": 0.3558, "step": 715000 }, { "epoch": 9.852304979195944, "grad_norm": 2.0508270263671875, "learning_rate": 1.4776346622326792e-05, "loss": 0.307, "step": 715100 }, { "epoch": 9.853682731255683, "grad_norm": 5.861050605773926, "learning_rate": 1.4769241263380163e-05, "loss": 0.2463, "step": 715200 }, { "epoch": 9.855060483315423, "grad_norm": 2.458979845046997, "learning_rate": 1.4762136985976626e-05, "loss": 0.3354, "step": 715300 }, { "epoch": 9.856438235375162, "grad_norm": 1.6369178295135498, "learning_rate": 1.4755033790719707e-05, "loss": 0.2857, "step": 715400 }, { "epoch": 9.8578159874349, "grad_norm": 3.129263162612915, "learning_rate": 1.4747931678212884e-05, "loss": 0.3243, "step": 715500 }, { "epoch": 9.859193739494641, "grad_norm": 5.650224208831787, "learning_rate": 1.4740830649059502e-05, "loss": 0.2871, "step": 715600 }, { "epoch": 9.86057149155438, "grad_norm": 0.9417009353637695, "learning_rate": 1.4733730703862825e-05, "loss": 0.3099, "step": 715700 }, { "epoch": 
9.86194924361412, "grad_norm": 2.6635608673095703, "learning_rate": 1.4726631843226031e-05, "loss": 0.3388, "step": 715800 }, { "epoch": 9.863326995673859, "grad_norm": 2.5292627811431885, "learning_rate": 1.4719534067752224e-05, "loss": 0.3035, "step": 715900 }, { "epoch": 9.864704747733597, "grad_norm": 0.5057100057601929, "learning_rate": 1.471243737804437e-05, "loss": 0.3263, "step": 716000 }, { "epoch": 9.866082499793338, "grad_norm": 4.652040481567383, "learning_rate": 1.4705341774705395e-05, "loss": 0.311, "step": 716100 }, { "epoch": 9.867460251853077, "grad_norm": 2.5622265338897705, "learning_rate": 1.4698247258338079e-05, "loss": 0.3066, "step": 716200 }, { "epoch": 9.868838003912815, "grad_norm": 8.261514663696289, "learning_rate": 1.4691153829545165e-05, "loss": 0.3083, "step": 716300 }, { "epoch": 9.870215755972556, "grad_norm": 2.6371586322784424, "learning_rate": 1.4684061488929253e-05, "loss": 0.3174, "step": 716400 }, { "epoch": 9.871593508032294, "grad_norm": 62.25011444091797, "learning_rate": 1.467697023709289e-05, "loss": 0.3197, "step": 716500 }, { "epoch": 9.872971260092033, "grad_norm": 0.07412987947463989, "learning_rate": 1.46698800746385e-05, "loss": 0.3445, "step": 716600 }, { "epoch": 9.874349012151773, "grad_norm": 2.3381614685058594, "learning_rate": 1.4662791002168447e-05, "loss": 0.3293, "step": 716700 }, { "epoch": 9.875726764211512, "grad_norm": 9.242097854614258, "learning_rate": 1.4655703020284961e-05, "loss": 0.3575, "step": 716800 }, { "epoch": 9.877104516271253, "grad_norm": 3.0195701122283936, "learning_rate": 1.4648616129590226e-05, "loss": 0.3236, "step": 716900 }, { "epoch": 9.878482268330991, "grad_norm": 3.3999199867248535, "learning_rate": 1.4641530330686286e-05, "loss": 0.2787, "step": 717000 }, { "epoch": 9.87986002039073, "grad_norm": 1.4309579133987427, "learning_rate": 1.4634445624175128e-05, "loss": 0.3437, "step": 717100 }, { "epoch": 9.88123777245047, "grad_norm": 3.080718517303467, "learning_rate": 
1.4627362010658647e-05, "loss": 0.2883, "step": 717200 }, { "epoch": 9.882615524510209, "grad_norm": 1.9679582118988037, "learning_rate": 1.4620279490738615e-05, "loss": 0.3478, "step": 717300 }, { "epoch": 9.88399327656995, "grad_norm": 1.5996785163879395, "learning_rate": 1.4613198065016727e-05, "loss": 0.3943, "step": 717400 }, { "epoch": 9.885371028629688, "grad_norm": 3.8279483318328857, "learning_rate": 1.4606117734094597e-05, "loss": 0.3022, "step": 717500 }, { "epoch": 9.886748780689427, "grad_norm": 2.4598217010498047, "learning_rate": 1.4599038498573724e-05, "loss": 0.3554, "step": 717600 }, { "epoch": 9.888126532749167, "grad_norm": 1.9888883829116821, "learning_rate": 1.4591960359055529e-05, "loss": 0.2952, "step": 717700 }, { "epoch": 9.889504284808906, "grad_norm": 1.9636355638504028, "learning_rate": 1.4584883316141353e-05, "loss": 0.311, "step": 717800 }, { "epoch": 9.890882036868645, "grad_norm": 1.9344828128814697, "learning_rate": 1.4577807370432408e-05, "loss": 0.3049, "step": 717900 }, { "epoch": 9.892259788928385, "grad_norm": 3.5512256622314453, "learning_rate": 1.4570732522529828e-05, "loss": 0.3654, "step": 718000 }, { "epoch": 9.893637540988124, "grad_norm": 1.6024874448776245, "learning_rate": 1.4563658773034663e-05, "loss": 0.3393, "step": 718100 }, { "epoch": 9.895015293047862, "grad_norm": 2.5506958961486816, "learning_rate": 1.455658612254788e-05, "loss": 0.3253, "step": 718200 }, { "epoch": 9.896393045107603, "grad_norm": 2.268040180206299, "learning_rate": 1.4549514571670324e-05, "loss": 0.3194, "step": 718300 }, { "epoch": 9.897770797167341, "grad_norm": 1.7642951011657715, "learning_rate": 1.454244412100275e-05, "loss": 0.3306, "step": 718400 }, { "epoch": 9.899148549227082, "grad_norm": 3.9760279655456543, "learning_rate": 1.4535374771145848e-05, "loss": 0.291, "step": 718500 }, { "epoch": 9.90052630128682, "grad_norm": 2.7596538066864014, "learning_rate": 1.4528306522700175e-05, "loss": 0.3369, "step": 718600 }, { "epoch": 
9.90190405334656, "grad_norm": 1.5413869619369507, "learning_rate": 1.4521239376266224e-05, "loss": 0.3209, "step": 718700 }, { "epoch": 9.9032818054063, "grad_norm": 2.7431859970092773, "learning_rate": 1.45141733324444e-05, "loss": 0.2955, "step": 718800 }, { "epoch": 9.904659557466038, "grad_norm": 1.5346322059631348, "learning_rate": 1.4507108391834984e-05, "loss": 0.255, "step": 718900 }, { "epoch": 9.906037309525777, "grad_norm": 2.869378089904785, "learning_rate": 1.4500115187940302e-05, "loss": 0.3162, "step": 719000 }, { "epoch": 9.907415061585517, "grad_norm": 13.489603042602539, "learning_rate": 1.4493123066474644e-05, "loss": 0.3316, "step": 719100 }, { "epoch": 9.908792813645256, "grad_norm": 1.4707773923873901, "learning_rate": 1.4486061416997158e-05, "loss": 0.2865, "step": 719200 }, { "epoch": 9.910170565704997, "grad_norm": 1.8340994119644165, "learning_rate": 1.4479000873120344e-05, "loss": 0.31, "step": 719300 }, { "epoch": 9.911548317764735, "grad_norm": 1.2378714084625244, "learning_rate": 1.447194143544402e-05, "loss": 0.2684, "step": 719400 }, { "epoch": 9.912926069824474, "grad_norm": 2.8528451919555664, "learning_rate": 1.4464883104567913e-05, "loss": 0.2904, "step": 719500 }, { "epoch": 9.914303821884214, "grad_norm": 2.7626633644104004, "learning_rate": 1.445782588109167e-05, "loss": 0.3193, "step": 719600 }, { "epoch": 9.915681573943953, "grad_norm": 0.23233921825885773, "learning_rate": 1.4450769765614857e-05, "loss": 0.2861, "step": 719700 }, { "epoch": 9.917059326003692, "grad_norm": 4.531364917755127, "learning_rate": 1.4443714758736898e-05, "loss": 0.251, "step": 719800 }, { "epoch": 9.918437078063432, "grad_norm": 2.497335433959961, "learning_rate": 1.4436660861057181e-05, "loss": 0.3549, "step": 719900 }, { "epoch": 9.91981483012317, "grad_norm": 2.77644681930542, "learning_rate": 1.4429608073174941e-05, "loss": 0.3022, "step": 720000 }, { "epoch": 9.921192582182911, "grad_norm": 1.5866984128952026, "learning_rate": 
1.4422556395689372e-05, "loss": 0.321, "step": 720100 }, { "epoch": 9.92257033424265, "grad_norm": 1.859381914138794, "learning_rate": 1.4415505829199532e-05, "loss": 0.31, "step": 720200 }, { "epoch": 9.923948086302389, "grad_norm": 3.298774242401123, "learning_rate": 1.4408456374304424e-05, "loss": 0.3041, "step": 720300 }, { "epoch": 9.925325838362129, "grad_norm": 2.7661261558532715, "learning_rate": 1.440140803160291e-05, "loss": 0.3219, "step": 720400 }, { "epoch": 9.926703590421868, "grad_norm": 8.83280086517334, "learning_rate": 1.4394360801693806e-05, "loss": 0.3213, "step": 720500 }, { "epoch": 9.928081342481606, "grad_norm": 2.0057129859924316, "learning_rate": 1.438731468517579e-05, "loss": 0.3385, "step": 720600 }, { "epoch": 9.929459094541347, "grad_norm": 5.539236068725586, "learning_rate": 1.4380269682647487e-05, "loss": 0.3112, "step": 720700 }, { "epoch": 9.930836846601085, "grad_norm": 1.8977854251861572, "learning_rate": 1.4373225794707385e-05, "loss": 0.3497, "step": 720800 }, { "epoch": 9.932214598660824, "grad_norm": 1.5052602291107178, "learning_rate": 1.4366183021953903e-05, "loss": 0.2446, "step": 720900 }, { "epoch": 9.933592350720565, "grad_norm": 0.15553897619247437, "learning_rate": 1.4359141364985382e-05, "loss": 0.3405, "step": 721000 }, { "epoch": 9.934970102780303, "grad_norm": 5.117846965789795, "learning_rate": 1.4352100824400026e-05, "loss": 0.2837, "step": 721100 }, { "epoch": 9.936347854840044, "grad_norm": 2.749969959259033, "learning_rate": 1.4345061400795961e-05, "loss": 0.3189, "step": 721200 }, { "epoch": 9.937725606899782, "grad_norm": 8.230259895324707, "learning_rate": 1.4338023094771239e-05, "loss": 0.3192, "step": 721300 }, { "epoch": 9.939103358959521, "grad_norm": 6.327287197113037, "learning_rate": 1.4330985906923782e-05, "loss": 0.283, "step": 721400 }, { "epoch": 9.940481111019261, "grad_norm": 6.003288745880127, "learning_rate": 1.4323949837851445e-05, "loss": 0.3371, "step": 721500 }, { "epoch": 
9.941858863079, "grad_norm": 3.5814056396484375, "learning_rate": 1.4316914888151986e-05, "loss": 0.3138, "step": 721600 }, { "epoch": 9.94323661513874, "grad_norm": 11.655320167541504, "learning_rate": 1.4309881058423052e-05, "loss": 0.314, "step": 721700 }, { "epoch": 9.94461436719848, "grad_norm": 3.9682466983795166, "learning_rate": 1.430284834926219e-05, "loss": 0.3669, "step": 721800 }, { "epoch": 9.945992119258218, "grad_norm": 3.8927829265594482, "learning_rate": 1.4295816761266882e-05, "loss": 0.2976, "step": 721900 }, { "epoch": 9.947369871317958, "grad_norm": 3.0761945247650146, "learning_rate": 1.4288786295034499e-05, "loss": 0.3407, "step": 722000 }, { "epoch": 9.948747623377697, "grad_norm": 4.82087516784668, "learning_rate": 1.42817569511623e-05, "loss": 0.2746, "step": 722100 }, { "epoch": 9.950125375437436, "grad_norm": 1.373894214630127, "learning_rate": 1.427472873024748e-05, "loss": 0.2938, "step": 722200 }, { "epoch": 9.951503127497176, "grad_norm": 1.8402490615844727, "learning_rate": 1.4267701632887118e-05, "loss": 0.3042, "step": 722300 }, { "epoch": 9.952880879556915, "grad_norm": 4.238638877868652, "learning_rate": 1.4260675659678182e-05, "loss": 0.3074, "step": 722400 }, { "epoch": 9.954258631616653, "grad_norm": 1.7759859561920166, "learning_rate": 1.4253650811217586e-05, "loss": 0.3583, "step": 722500 }, { "epoch": 9.955636383676394, "grad_norm": 1.4588953256607056, "learning_rate": 1.424662708810213e-05, "loss": 0.3156, "step": 722600 }, { "epoch": 9.957014135736133, "grad_norm": 1.7758018970489502, "learning_rate": 1.4239604490928506e-05, "loss": 0.2542, "step": 722700 }, { "epoch": 9.958391887795873, "grad_norm": 6.296766757965088, "learning_rate": 1.4232583020293314e-05, "loss": 0.2991, "step": 722800 }, { "epoch": 9.959769639855612, "grad_norm": 3.489271879196167, "learning_rate": 1.4225562676793065e-05, "loss": 0.3329, "step": 722900 }, { "epoch": 9.96114739191535, "grad_norm": 4.678731441497803, "learning_rate": 
1.4218543461024189e-05, "loss": 0.313, "step": 723000 }, { "epoch": 9.96252514397509, "grad_norm": 4.383260726928711, "learning_rate": 1.4211525373582986e-05, "loss": 0.2831, "step": 723100 }, { "epoch": 9.96390289603483, "grad_norm": 2.5311295986175537, "learning_rate": 1.4204578579060728e-05, "loss": 0.2954, "step": 723200 }, { "epoch": 9.965280648094568, "grad_norm": 2.3203036785125732, "learning_rate": 1.4197562738765315e-05, "loss": 0.3161, "step": 723300 }, { "epoch": 9.966658400154309, "grad_norm": 5.560645580291748, "learning_rate": 1.4190548028580002e-05, "loss": 0.2859, "step": 723400 }, { "epoch": 9.968036152214047, "grad_norm": 2.037083625793457, "learning_rate": 1.4183534449100716e-05, "loss": 0.3011, "step": 723500 }, { "epoch": 9.969413904273788, "grad_norm": 1.9922387599945068, "learning_rate": 1.4176522000923302e-05, "loss": 0.2708, "step": 723600 }, { "epoch": 9.970791656333526, "grad_norm": 3.6708500385284424, "learning_rate": 1.4169510684643519e-05, "loss": 0.3035, "step": 723700 }, { "epoch": 9.972169408393265, "grad_norm": 1.4355281591415405, "learning_rate": 1.4162500500857003e-05, "loss": 0.3347, "step": 723800 }, { "epoch": 9.973547160453005, "grad_norm": 5.6878662109375, "learning_rate": 1.4155491450159289e-05, "loss": 0.3428, "step": 723900 }, { "epoch": 9.974924912512744, "grad_norm": 2.0668327808380127, "learning_rate": 1.4148483533145851e-05, "loss": 0.2941, "step": 724000 }, { "epoch": 9.976302664572483, "grad_norm": 5.348653793334961, "learning_rate": 1.414147675041205e-05, "loss": 0.3552, "step": 724100 }, { "epoch": 9.977680416632223, "grad_norm": 3.0700888633728027, "learning_rate": 1.4134471102553132e-05, "loss": 0.3476, "step": 724200 }, { "epoch": 9.979058168691962, "grad_norm": 13.120688438415527, "learning_rate": 1.4127466590164279e-05, "loss": 0.3498, "step": 724300 }, { "epoch": 9.980435920751702, "grad_norm": 2.3017940521240234, "learning_rate": 1.4120463213840556e-05, "loss": 0.265, "step": 724400 }, { "epoch": 
9.981813672811441, "grad_norm": 75.67904663085938, "learning_rate": 1.411346097417692e-05, "loss": 0.3503, "step": 724500 }, { "epoch": 9.98319142487118, "grad_norm": 8.453766822814941, "learning_rate": 1.4106459871768263e-05, "loss": 0.2894, "step": 724600 }, { "epoch": 9.98456917693092, "grad_norm": 3.7739968299865723, "learning_rate": 1.4099459907209369e-05, "loss": 0.2567, "step": 724700 }, { "epoch": 9.985946928990659, "grad_norm": 6.567869663238525, "learning_rate": 1.4092461081094898e-05, "loss": 0.3133, "step": 724800 }, { "epoch": 9.987324681050398, "grad_norm": 1.419635534286499, "learning_rate": 1.4085463394019464e-05, "loss": 0.3031, "step": 724900 }, { "epoch": 9.988702433110138, "grad_norm": 2.9086742401123047, "learning_rate": 1.4078466846577533e-05, "loss": 0.2775, "step": 725000 }, { "epoch": 9.990080185169877, "grad_norm": 3.0090432167053223, "learning_rate": 1.4071471439363518e-05, "loss": 0.276, "step": 725100 }, { "epoch": 9.991457937229615, "grad_norm": 1.9325841665267944, "learning_rate": 1.4064477172971695e-05, "loss": 0.2695, "step": 725200 }, { "epoch": 9.992835689289356, "grad_norm": 5.092696189880371, "learning_rate": 1.4057484047996282e-05, "loss": 0.2886, "step": 725300 }, { "epoch": 9.994213441349094, "grad_norm": 2.2072484493255615, "learning_rate": 1.4050492065031358e-05, "loss": 0.2941, "step": 725400 }, { "epoch": 9.995591193408835, "grad_norm": 2.657590627670288, "learning_rate": 1.4043501224670952e-05, "loss": 0.359, "step": 725500 }, { "epoch": 9.996968945468574, "grad_norm": 14.954183578491211, "learning_rate": 1.4036511527508947e-05, "loss": 0.3251, "step": 725600 }, { "epoch": 9.998346697528312, "grad_norm": 4.210543632507324, "learning_rate": 1.4029522974139181e-05, "loss": 0.3161, "step": 725700 }, { "epoch": 9.999724449588053, "grad_norm": 9.302190780639648, "learning_rate": 1.4022535565155338e-05, "loss": 0.3349, "step": 725800 }, { "epoch": 10.001102201647791, "grad_norm": 3.400059223175049, "learning_rate": 
1.4015549301151056e-05, "loss": 0.2936, "step": 725900 }, { "epoch": 10.00247995370753, "grad_norm": 4.8409833908081055, "learning_rate": 1.400856418271985e-05, "loss": 0.2517, "step": 726000 }, { "epoch": 10.00385770576727, "grad_norm": 5.489073276519775, "learning_rate": 1.4001580210455143e-05, "loss": 0.2631, "step": 726100 }, { "epoch": 10.005235457827009, "grad_norm": 6.839758396148682, "learning_rate": 1.399459738495024e-05, "loss": 0.3201, "step": 726200 }, { "epoch": 10.00661320988675, "grad_norm": 2.8148086071014404, "learning_rate": 1.3987615706798397e-05, "loss": 0.2547, "step": 726300 }, { "epoch": 10.007990961946488, "grad_norm": 2.114227771759033, "learning_rate": 1.3980635176592716e-05, "loss": 0.2726, "step": 726400 }, { "epoch": 10.009368714006227, "grad_norm": 5.301149845123291, "learning_rate": 1.3973655794926242e-05, "loss": 0.3044, "step": 726500 }, { "epoch": 10.010746466065967, "grad_norm": 0.5168023705482483, "learning_rate": 1.3966677562391916e-05, "loss": 0.2595, "step": 726600 }, { "epoch": 10.012124218125706, "grad_norm": 1.8498117923736572, "learning_rate": 1.3959700479582572e-05, "loss": 0.2763, "step": 726700 }, { "epoch": 10.013501970185445, "grad_norm": 1.5371010303497314, "learning_rate": 1.3952724547090929e-05, "loss": 0.2726, "step": 726800 }, { "epoch": 10.014879722245185, "grad_norm": 4.805660724639893, "learning_rate": 1.3945749765509646e-05, "loss": 0.3086, "step": 726900 }, { "epoch": 10.016257474304924, "grad_norm": 2.421093463897705, "learning_rate": 1.3938776135431275e-05, "loss": 0.2814, "step": 727000 }, { "epoch": 10.017635226364664, "grad_norm": 3.1444449424743652, "learning_rate": 1.3931803657448238e-05, "loss": 0.2762, "step": 727100 }, { "epoch": 10.019012978424403, "grad_norm": 5.2588372230529785, "learning_rate": 1.3924832332152905e-05, "loss": 0.2743, "step": 727200 }, { "epoch": 10.020390730484142, "grad_norm": 1.2897651195526123, "learning_rate": 1.3917862160137517e-05, "loss": 0.2888, "step": 727300 }, { 
"epoch": 10.021768482543882, "grad_norm": 2.2728793621063232, "learning_rate": 1.391089314199421e-05, "loss": 0.2659, "step": 727400 }, { "epoch": 10.02314623460362, "grad_norm": 1.017311692237854, "learning_rate": 1.3903925278315054e-05, "loss": 0.2736, "step": 727500 }, { "epoch": 10.02452398666336, "grad_norm": 2.155237913131714, "learning_rate": 1.3896958569692013e-05, "loss": 0.2486, "step": 727600 }, { "epoch": 10.0259017387231, "grad_norm": 4.465235710144043, "learning_rate": 1.3889993016716934e-05, "loss": 0.3012, "step": 727700 }, { "epoch": 10.027279490782838, "grad_norm": 3.0036001205444336, "learning_rate": 1.3883028619981569e-05, "loss": 0.3288, "step": 727800 }, { "epoch": 10.028657242842579, "grad_norm": 2.107567071914673, "learning_rate": 1.3876065380077582e-05, "loss": 0.2845, "step": 727900 }, { "epoch": 10.030034994902318, "grad_norm": 3.331310272216797, "learning_rate": 1.3869103297596553e-05, "loss": 0.2522, "step": 728000 }, { "epoch": 10.031412746962056, "grad_norm": 8.626947402954102, "learning_rate": 1.3862142373129925e-05, "loss": 0.2874, "step": 728100 }, { "epoch": 10.032790499021797, "grad_norm": 5.837547302246094, "learning_rate": 1.3855182607269072e-05, "loss": 0.2735, "step": 728200 }, { "epoch": 10.034168251081535, "grad_norm": 6.366975784301758, "learning_rate": 1.3848224000605284e-05, "loss": 0.2652, "step": 728300 }, { "epoch": 10.035546003141274, "grad_norm": 6.6881890296936035, "learning_rate": 1.384126655372969e-05, "loss": 0.2909, "step": 728400 }, { "epoch": 10.036923755201014, "grad_norm": 2.935879945755005, "learning_rate": 1.3834310267233379e-05, "loss": 0.3375, "step": 728500 }, { "epoch": 10.038301507260753, "grad_norm": 0.49858301877975464, "learning_rate": 1.3827355141707332e-05, "loss": 0.2761, "step": 728600 }, { "epoch": 10.039679259320494, "grad_norm": 2.6344661712646484, "learning_rate": 1.3820401177742405e-05, "loss": 0.2603, "step": 728700 }, { "epoch": 10.041057011380232, "grad_norm": 11.682106971740723, 
"learning_rate": 1.3813448375929395e-05, "loss": 0.3186, "step": 728800 }, { "epoch": 10.04243476343997, "grad_norm": 3.943646192550659, "learning_rate": 1.3806496736858953e-05, "loss": 0.2964, "step": 728900 }, { "epoch": 10.043812515499711, "grad_norm": 3.1700892448425293, "learning_rate": 1.3799546261121682e-05, "loss": 0.2298, "step": 729000 }, { "epoch": 10.04519026755945, "grad_norm": 2.388519048690796, "learning_rate": 1.3792596949308032e-05, "loss": 0.2763, "step": 729100 }, { "epoch": 10.046568019619189, "grad_norm": 4.996067047119141, "learning_rate": 1.37856488020084e-05, "loss": 0.3055, "step": 729200 }, { "epoch": 10.047945771678929, "grad_norm": 1.8150806427001953, "learning_rate": 1.3778701819813073e-05, "loss": 0.2841, "step": 729300 }, { "epoch": 10.049323523738668, "grad_norm": 3.8512775897979736, "learning_rate": 1.3771756003312228e-05, "loss": 0.3006, "step": 729400 }, { "epoch": 10.050701275798406, "grad_norm": 3.4396886825561523, "learning_rate": 1.376481135309593e-05, "loss": 0.3091, "step": 729500 }, { "epoch": 10.052079027858147, "grad_norm": 6.079519271850586, "learning_rate": 1.375786786975419e-05, "loss": 0.3078, "step": 729600 }, { "epoch": 10.053456779917886, "grad_norm": 0.27762430906295776, "learning_rate": 1.3750925553876866e-05, "loss": 0.2583, "step": 729700 }, { "epoch": 10.054834531977626, "grad_norm": 3.54455304145813, "learning_rate": 1.3743984406053755e-05, "loss": 0.2898, "step": 729800 }, { "epoch": 10.056212284037365, "grad_norm": 5.610214710235596, "learning_rate": 1.3737113820879626e-05, "loss": 0.2955, "step": 729900 }, { "epoch": 10.057590036097103, "grad_norm": 3.868779420852661, "learning_rate": 1.373017499923865e-05, "loss": 0.3041, "step": 730000 }, { "epoch": 10.058967788156844, "grad_norm": 0.3032437264919281, "learning_rate": 1.3723237347414767e-05, "loss": 0.2272, "step": 730100 }, { "epoch": 10.060345540216582, "grad_norm": 2.3694279193878174, "learning_rate": 1.3716300865997338e-05, "loss": 0.291, "step": 
730200 }, { "epoch": 10.061723292276321, "grad_norm": 5.370370864868164, "learning_rate": 1.370936555557568e-05, "loss": 0.2935, "step": 730300 }, { "epoch": 10.063101044336062, "grad_norm": 5.260372638702393, "learning_rate": 1.3702431416738957e-05, "loss": 0.3145, "step": 730400 }, { "epoch": 10.0644787963958, "grad_norm": 2.998494863510132, "learning_rate": 1.3695498450076287e-05, "loss": 0.2738, "step": 730500 }, { "epoch": 10.06585654845554, "grad_norm": 3.5449092388153076, "learning_rate": 1.3688566656176636e-05, "loss": 0.3368, "step": 730600 }, { "epoch": 10.06723430051528, "grad_norm": 3.2793149948120117, "learning_rate": 1.3681636035628916e-05, "loss": 0.3254, "step": 730700 }, { "epoch": 10.068612052575018, "grad_norm": 0.44621741771698, "learning_rate": 1.3674706589021898e-05, "loss": 0.2847, "step": 730800 }, { "epoch": 10.069989804634758, "grad_norm": 2.9050819873809814, "learning_rate": 1.3667778316944287e-05, "loss": 0.3426, "step": 730900 }, { "epoch": 10.071367556694497, "grad_norm": 1.4298114776611328, "learning_rate": 1.3660851219984683e-05, "loss": 0.2409, "step": 731000 }, { "epoch": 10.072745308754236, "grad_norm": 1.2119760513305664, "learning_rate": 1.3653925298731566e-05, "loss": 0.2892, "step": 731100 }, { "epoch": 10.074123060813976, "grad_norm": 3.488784074783325, "learning_rate": 1.3647000553773322e-05, "loss": 0.3079, "step": 731200 }, { "epoch": 10.075500812873715, "grad_norm": 12.563276290893555, "learning_rate": 1.3640076985698264e-05, "loss": 0.2986, "step": 731300 }, { "epoch": 10.076878564933455, "grad_norm": 2.4049036502838135, "learning_rate": 1.3633154595094564e-05, "loss": 0.2963, "step": 731400 }, { "epoch": 10.078256316993194, "grad_norm": 2.8991074562072754, "learning_rate": 1.362630258884244e-05, "loss": 0.3828, "step": 731500 }, { "epoch": 10.079634069052933, "grad_norm": 5.3424248695373535, "learning_rate": 1.3619382543156274e-05, "loss": 0.2668, "step": 731600 }, { "epoch": 10.081011821112673, "grad_norm": 
1.4488914012908936, "learning_rate": 1.3612463676699566e-05, "loss": 0.3032, "step": 731700 }, { "epoch": 10.082389573172412, "grad_norm": 2.85404634475708, "learning_rate": 1.3605545990060124e-05, "loss": 0.3251, "step": 731800 }, { "epoch": 10.08376732523215, "grad_norm": 3.097522258758545, "learning_rate": 1.3598629483825623e-05, "loss": 0.2487, "step": 731900 }, { "epoch": 10.085145077291891, "grad_norm": 3.816513776779175, "learning_rate": 1.3591714158583669e-05, "loss": 0.279, "step": 732000 }, { "epoch": 10.08652282935163, "grad_norm": 4.017214298248291, "learning_rate": 1.358480001492177e-05, "loss": 0.284, "step": 732100 }, { "epoch": 10.08790058141137, "grad_norm": 3.5168819427490234, "learning_rate": 1.3577887053427281e-05, "loss": 0.2769, "step": 732200 }, { "epoch": 10.089278333471109, "grad_norm": 2.658149480819702, "learning_rate": 1.3570975274687512e-05, "loss": 0.2712, "step": 732300 }, { "epoch": 10.090656085530847, "grad_norm": 6.469781398773193, "learning_rate": 1.356406467928966e-05, "loss": 0.3032, "step": 732400 }, { "epoch": 10.092033837590588, "grad_norm": 2.4033329486846924, "learning_rate": 1.3557155267820805e-05, "loss": 0.2852, "step": 732500 }, { "epoch": 10.093411589650326, "grad_norm": 2.9667887687683105, "learning_rate": 1.3550247040867937e-05, "loss": 0.319, "step": 732600 }, { "epoch": 10.094789341710065, "grad_norm": 3.0170135498046875, "learning_rate": 1.3543339999017959e-05, "loss": 0.2911, "step": 732700 }, { "epoch": 10.096167093769806, "grad_norm": 6.031506061553955, "learning_rate": 1.3536434142857654e-05, "loss": 0.2968, "step": 732800 }, { "epoch": 10.097544845829544, "grad_norm": 1.1898102760314941, "learning_rate": 1.352952947297369e-05, "loss": 0.3323, "step": 732900 }, { "epoch": 10.098922597889285, "grad_norm": 1.4634368419647217, "learning_rate": 1.3522625989952674e-05, "loss": 0.2327, "step": 733000 }, { "epoch": 10.100300349949023, "grad_norm": 26.642776489257812, "learning_rate": 1.3515723694381098e-05, "loss": 
0.2999, "step": 733100 }, { "epoch": 10.101678102008762, "grad_norm": 9.944792747497559, "learning_rate": 1.3508822586845336e-05, "loss": 0.2775, "step": 733200 }, { "epoch": 10.103055854068502, "grad_norm": 34.712032318115234, "learning_rate": 1.3501922667931665e-05, "loss": 0.2859, "step": 733300 }, { "epoch": 10.104433606128241, "grad_norm": 10.03830337524414, "learning_rate": 1.3495023938226283e-05, "loss": 0.3115, "step": 733400 }, { "epoch": 10.10581135818798, "grad_norm": 1.809373140335083, "learning_rate": 1.348812639831526e-05, "loss": 0.2655, "step": 733500 }, { "epoch": 10.10718911024772, "grad_norm": 1.4340425729751587, "learning_rate": 1.348123004878458e-05, "loss": 0.2798, "step": 733600 }, { "epoch": 10.108566862307459, "grad_norm": 2.3071742057800293, "learning_rate": 1.347433489022014e-05, "loss": 0.289, "step": 733700 }, { "epoch": 10.109944614367198, "grad_norm": 4.753567218780518, "learning_rate": 1.3467440923207708e-05, "loss": 0.3061, "step": 733800 }, { "epoch": 10.111322366426938, "grad_norm": 2.845501661300659, "learning_rate": 1.3460548148332947e-05, "loss": 0.2425, "step": 733900 }, { "epoch": 10.112700118486677, "grad_norm": 1.8497999906539917, "learning_rate": 1.3453656566181444e-05, "loss": 0.2571, "step": 734000 }, { "epoch": 10.114077870546417, "grad_norm": 1.6104137897491455, "learning_rate": 1.3446766177338689e-05, "loss": 0.3211, "step": 734100 }, { "epoch": 10.115455622606156, "grad_norm": 4.806046485900879, "learning_rate": 1.3439876982390035e-05, "loss": 0.2859, "step": 734200 }, { "epoch": 10.116833374665894, "grad_norm": 3.6091084480285645, "learning_rate": 1.343298898192077e-05, "loss": 0.2629, "step": 734300 }, { "epoch": 10.118211126725635, "grad_norm": 3.761121988296509, "learning_rate": 1.3426102176516056e-05, "loss": 0.2801, "step": 734400 }, { "epoch": 10.119588878785374, "grad_norm": 8.344184875488281, "learning_rate": 1.3419216566760955e-05, "loss": 0.2581, "step": 734500 }, { "epoch": 10.120966630845112, 
"grad_norm": 0.8365576863288879, "learning_rate": 1.3412332153240445e-05, "loss": 0.2947, "step": 734600 }, { "epoch": 10.122344382904853, "grad_norm": 0.5754960775375366, "learning_rate": 1.3405448936539398e-05, "loss": 0.2758, "step": 734700 }, { "epoch": 10.123722134964591, "grad_norm": 2.604860782623291, "learning_rate": 1.339856691724256e-05, "loss": 0.2448, "step": 734800 }, { "epoch": 10.125099887024332, "grad_norm": 3.7445244789123535, "learning_rate": 1.3391686095934619e-05, "loss": 0.2776, "step": 734900 }, { "epoch": 10.12647763908407, "grad_norm": 1.6025595664978027, "learning_rate": 1.338480647320011e-05, "loss": 0.2502, "step": 735000 }, { "epoch": 10.12785539114381, "grad_norm": 2.389474868774414, "learning_rate": 1.3377928049623514e-05, "loss": 0.2614, "step": 735100 }, { "epoch": 10.12923314320355, "grad_norm": 23.347993850708008, "learning_rate": 1.3371050825789167e-05, "loss": 0.3194, "step": 735200 }, { "epoch": 10.130610895263288, "grad_norm": 4.336905479431152, "learning_rate": 1.3364174802281338e-05, "loss": 0.3071, "step": 735300 }, { "epoch": 10.131988647323027, "grad_norm": 1.7863801717758179, "learning_rate": 1.3357299979684198e-05, "loss": 0.272, "step": 735400 }, { "epoch": 10.133366399382767, "grad_norm": 1.7998559474945068, "learning_rate": 1.3350426358581755e-05, "loss": 0.254, "step": 735500 }, { "epoch": 10.134744151442506, "grad_norm": 6.019591331481934, "learning_rate": 1.3343622657796016e-05, "loss": 0.3062, "step": 735600 }, { "epoch": 10.136121903502247, "grad_norm": 4.577052116394043, "learning_rate": 1.3336751429405245e-05, "loss": 0.2543, "step": 735700 }, { "epoch": 10.137499655561985, "grad_norm": 5.137942790985107, "learning_rate": 1.3329881404254911e-05, "loss": 0.3173, "step": 735800 }, { "epoch": 10.138877407621724, "grad_norm": 2.155247926712036, "learning_rate": 1.3323012582928636e-05, "loss": 0.2948, "step": 735900 }, { "epoch": 10.140255159681464, "grad_norm": 3.973073959350586, "learning_rate": 
1.331614496600998e-05, "loss": 0.273, "step": 736000 }, { "epoch": 10.141632911741203, "grad_norm": 2.047118902206421, "learning_rate": 1.3309278554082372e-05, "loss": 0.3261, "step": 736100 }, { "epoch": 10.143010663800942, "grad_norm": 0.5330919623374939, "learning_rate": 1.330241334772914e-05, "loss": 0.312, "step": 736200 }, { "epoch": 10.144388415860682, "grad_norm": 3.573786973953247, "learning_rate": 1.3295549347533531e-05, "loss": 0.2882, "step": 736300 }, { "epoch": 10.14576616792042, "grad_norm": 4.5614094734191895, "learning_rate": 1.3288686554078683e-05, "loss": 0.3029, "step": 736400 }, { "epoch": 10.147143919980161, "grad_norm": 3.086592197418213, "learning_rate": 1.3281824967947616e-05, "loss": 0.339, "step": 736500 }, { "epoch": 10.1485216720399, "grad_norm": 1.4972175359725952, "learning_rate": 1.3275033187524442e-05, "loss": 0.3316, "step": 736600 }, { "epoch": 10.149899424099639, "grad_norm": 6.534455299377441, "learning_rate": 1.3268174005701828e-05, "loss": 0.3234, "step": 736700 }, { "epoch": 10.151277176159379, "grad_norm": 1.3562685251235962, "learning_rate": 1.3261316032945641e-05, "loss": 0.2655, "step": 736800 }, { "epoch": 10.152654928219118, "grad_norm": 1.747830867767334, "learning_rate": 1.3254459269838511e-05, "loss": 0.2523, "step": 736900 }, { "epoch": 10.154032680278856, "grad_norm": 3.305440664291382, "learning_rate": 1.3247603716962937e-05, "loss": 0.2712, "step": 737000 }, { "epoch": 10.155410432338597, "grad_norm": 0.5590237379074097, "learning_rate": 1.3240749374901352e-05, "loss": 0.2486, "step": 737100 }, { "epoch": 10.156788184398335, "grad_norm": 3.1297619342803955, "learning_rate": 1.3233896244236058e-05, "loss": 0.3334, "step": 737200 }, { "epoch": 10.158165936458076, "grad_norm": 2.03391170501709, "learning_rate": 1.3227044325549249e-05, "loss": 0.2675, "step": 737300 }, { "epoch": 10.159543688517815, "grad_norm": 10.185530662536621, "learning_rate": 1.322019361942305e-05, "loss": 0.2704, "step": 737400 }, { "epoch": 
10.160921440577553, "grad_norm": 1.7408205270767212, "learning_rate": 1.3213344126439466e-05, "loss": 0.2709, "step": 737500 }, { "epoch": 10.162299192637294, "grad_norm": 13.838605880737305, "learning_rate": 1.3206495847180385e-05, "loss": 0.2715, "step": 737600 }, { "epoch": 10.163676944697032, "grad_norm": 0.7557150721549988, "learning_rate": 1.3199648782227601e-05, "loss": 0.2838, "step": 737700 }, { "epoch": 10.165054696756771, "grad_norm": 0.598028302192688, "learning_rate": 1.3192802932162812e-05, "loss": 0.2817, "step": 737800 }, { "epoch": 10.166432448816511, "grad_norm": 1.9788265228271484, "learning_rate": 1.3185958297567617e-05, "loss": 0.2724, "step": 737900 }, { "epoch": 10.16781020087625, "grad_norm": 1.9298508167266846, "learning_rate": 1.3179114879023487e-05, "loss": 0.2627, "step": 738000 }, { "epoch": 10.169187952935989, "grad_norm": 15.164461135864258, "learning_rate": 1.3172272677111813e-05, "loss": 0.3314, "step": 738100 }, { "epoch": 10.17056570499573, "grad_norm": 11.63528823852539, "learning_rate": 1.3165431692413895e-05, "loss": 0.2752, "step": 738200 }, { "epoch": 10.171943457055468, "grad_norm": 3.259536027908325, "learning_rate": 1.315859192551087e-05, "loss": 0.2667, "step": 738300 }, { "epoch": 10.173321209115208, "grad_norm": 2.716491460800171, "learning_rate": 1.3151753376983829e-05, "loss": 0.2895, "step": 738400 }, { "epoch": 10.174698961174947, "grad_norm": 1.2367470264434814, "learning_rate": 1.3144916047413756e-05, "loss": 0.2931, "step": 738500 }, { "epoch": 10.176076713234686, "grad_norm": 3.94279408454895, "learning_rate": 1.3138079937381494e-05, "loss": 0.3295, "step": 738600 }, { "epoch": 10.177454465294426, "grad_norm": 1.1917893886566162, "learning_rate": 1.3131245047467828e-05, "loss": 0.3154, "step": 738700 }, { "epoch": 10.178832217354165, "grad_norm": 3.9291369915008545, "learning_rate": 1.3124411378253394e-05, "loss": 0.2687, "step": 738800 }, { "epoch": 10.180209969413903, "grad_norm": 11.463781356811523, 
"learning_rate": 1.311757893031877e-05, "loss": 0.3287, "step": 738900 }, { "epoch": 10.181587721473644, "grad_norm": 3.051361560821533, "learning_rate": 1.3110747704244383e-05, "loss": 0.361, "step": 739000 }, { "epoch": 10.182965473533383, "grad_norm": 9.015741348266602, "learning_rate": 1.3103917700610605e-05, "loss": 0.3095, "step": 739100 }, { "epoch": 10.184343225593123, "grad_norm": 2.560722827911377, "learning_rate": 1.3097088919997659e-05, "loss": 0.3095, "step": 739200 }, { "epoch": 10.185720977652862, "grad_norm": 2.2690556049346924, "learning_rate": 1.3090261362985702e-05, "loss": 0.2518, "step": 739300 }, { "epoch": 10.1870987297126, "grad_norm": 4.059639930725098, "learning_rate": 1.3083435030154752e-05, "loss": 0.295, "step": 739400 }, { "epoch": 10.18847648177234, "grad_norm": 2.6916866302490234, "learning_rate": 1.3076609922084765e-05, "loss": 0.3174, "step": 739500 }, { "epoch": 10.18985423383208, "grad_norm": 1.2301732301712036, "learning_rate": 1.3069786039355541e-05, "loss": 0.3035, "step": 739600 }, { "epoch": 10.191231985891818, "grad_norm": 3.22831392288208, "learning_rate": 1.3062963382546819e-05, "loss": 0.3063, "step": 739700 }, { "epoch": 10.192609737951559, "grad_norm": 4.637147903442383, "learning_rate": 1.3056141952238225e-05, "loss": 0.3091, "step": 739800 }, { "epoch": 10.193987490011297, "grad_norm": 5.76689338684082, "learning_rate": 1.3049321749009266e-05, "loss": 0.2603, "step": 739900 }, { "epoch": 10.195365242071038, "grad_norm": 1.3951846361160278, "learning_rate": 1.3042502773439347e-05, "loss": 0.323, "step": 740000 }, { "epoch": 10.196742994130776, "grad_norm": 1.2893242835998535, "learning_rate": 1.303568502610779e-05, "loss": 0.302, "step": 740100 }, { "epoch": 10.198120746190515, "grad_norm": 3.65417742729187, "learning_rate": 1.3028868507593776e-05, "loss": 0.2824, "step": 740200 }, { "epoch": 10.199498498250255, "grad_norm": 4.525059223175049, "learning_rate": 1.302205321847642e-05, "loss": 0.3006, "step": 740300 }, { 
"epoch": 10.200876250309994, "grad_norm": 2.6735799312591553, "learning_rate": 1.301523915933472e-05, "loss": 0.281, "step": 740400 }, { "epoch": 10.202254002369733, "grad_norm": 1.3909764289855957, "learning_rate": 1.3008426330747557e-05, "loss": 0.2684, "step": 740500 }, { "epoch": 10.203631754429473, "grad_norm": 0.34083470702171326, "learning_rate": 1.3001614733293706e-05, "loss": 0.3036, "step": 740600 }, { "epoch": 10.205009506489212, "grad_norm": 1.9238654375076294, "learning_rate": 1.2994804367551855e-05, "loss": 0.3122, "step": 740700 }, { "epoch": 10.206387258548952, "grad_norm": 2.0558156967163086, "learning_rate": 1.2987995234100596e-05, "loss": 0.3425, "step": 740800 }, { "epoch": 10.207765010608691, "grad_norm": 2.3343918323516846, "learning_rate": 1.2981187333518369e-05, "loss": 0.2769, "step": 740900 }, { "epoch": 10.20914276266843, "grad_norm": 1.6474955081939697, "learning_rate": 1.297438066638357e-05, "loss": 0.3105, "step": 741000 }, { "epoch": 10.21052051472817, "grad_norm": 2.036238670349121, "learning_rate": 1.2967575233274445e-05, "loss": 0.2899, "step": 741100 }, { "epoch": 10.211898266787909, "grad_norm": 1.719221591949463, "learning_rate": 1.296077103476914e-05, "loss": 0.3463, "step": 741200 }, { "epoch": 10.213276018847647, "grad_norm": 2.2464818954467773, "learning_rate": 1.2953968071445719e-05, "loss": 0.2829, "step": 741300 }, { "epoch": 10.214653770907388, "grad_norm": 4.058666229248047, "learning_rate": 1.2947166343882135e-05, "loss": 0.2862, "step": 741400 }, { "epoch": 10.216031522967127, "grad_norm": 0.3164874017238617, "learning_rate": 1.2940365852656223e-05, "loss": 0.2515, "step": 741500 }, { "epoch": 10.217409275026867, "grad_norm": 10.059370040893555, "learning_rate": 1.293356659834571e-05, "loss": 0.3382, "step": 741600 }, { "epoch": 10.218787027086606, "grad_norm": 1.440412163734436, "learning_rate": 1.2926768581528233e-05, "loss": 0.2707, "step": 741700 }, { "epoch": 10.220164779146344, "grad_norm": 5.088151454925537, 
"learning_rate": 1.2919971802781333e-05, "loss": 0.3139, "step": 741800 }, { "epoch": 10.221542531206085, "grad_norm": 2.0129246711730957, "learning_rate": 1.2913176262682408e-05, "loss": 0.2599, "step": 741900 }, { "epoch": 10.222920283265823, "grad_norm": 1.8397879600524902, "learning_rate": 1.2906449898681477e-05, "loss": 0.2954, "step": 742000 }, { "epoch": 10.224298035325562, "grad_norm": 3.9937195777893066, "learning_rate": 1.2899656825209485e-05, "loss": 0.2788, "step": 742100 }, { "epoch": 10.225675787385303, "grad_norm": 3.7888314723968506, "learning_rate": 1.2892864992111353e-05, "loss": 0.2661, "step": 742200 }, { "epoch": 10.227053539445041, "grad_norm": 2.9476096630096436, "learning_rate": 1.2886074399964057e-05, "loss": 0.2453, "step": 742300 }, { "epoch": 10.22843129150478, "grad_norm": 4.9511799812316895, "learning_rate": 1.2879285049344505e-05, "loss": 0.3062, "step": 742400 }, { "epoch": 10.22980904356452, "grad_norm": 1.0618526935577393, "learning_rate": 1.28724969408295e-05, "loss": 0.2955, "step": 742500 }, { "epoch": 10.231186795624259, "grad_norm": 4.4451003074646, "learning_rate": 1.2865710074995713e-05, "loss": 0.2665, "step": 742600 }, { "epoch": 10.232564547684, "grad_norm": 0.5552589893341064, "learning_rate": 1.2858924452419712e-05, "loss": 0.2429, "step": 742700 }, { "epoch": 10.233942299743738, "grad_norm": 2.503857135772705, "learning_rate": 1.2852140073677981e-05, "loss": 0.2642, "step": 742800 }, { "epoch": 10.235320051803477, "grad_norm": 0.1566755622625351, "learning_rate": 1.2845356939346903e-05, "loss": 0.2925, "step": 742900 }, { "epoch": 10.236697803863217, "grad_norm": 4.694526195526123, "learning_rate": 1.2838575050002712e-05, "loss": 0.2714, "step": 743000 }, { "epoch": 10.238075555922956, "grad_norm": 37.567081451416016, "learning_rate": 1.2831794406221579e-05, "loss": 0.3262, "step": 743100 }, { "epoch": 10.239453307982695, "grad_norm": 1.7142767906188965, "learning_rate": 1.2825015008579572e-05, "loss": 0.2406, "step": 
743200 }, { "epoch": 10.240831060042435, "grad_norm": 2.3837833404541016, "learning_rate": 1.2818236857652598e-05, "loss": 0.3072, "step": 743300 }, { "epoch": 10.242208812102174, "grad_norm": 0.5881015658378601, "learning_rate": 1.2811459954016516e-05, "loss": 0.2713, "step": 743400 }, { "epoch": 10.243586564161914, "grad_norm": 1.1353763341903687, "learning_rate": 1.2804752048625922e-05, "loss": 0.2909, "step": 743500 }, { "epoch": 10.244964316221653, "grad_norm": 4.626894950866699, "learning_rate": 1.2797977628811448e-05, "loss": 0.3014, "step": 743600 }, { "epoch": 10.246342068281391, "grad_norm": 6.538792610168457, "learning_rate": 1.2791204458008993e-05, "loss": 0.2946, "step": 743700 }, { "epoch": 10.247719820341132, "grad_norm": 1.1734533309936523, "learning_rate": 1.2784432536793956e-05, "loss": 0.3298, "step": 743800 }, { "epoch": 10.24909757240087, "grad_norm": 4.123232364654541, "learning_rate": 1.277766186574167e-05, "loss": 0.3029, "step": 743900 }, { "epoch": 10.25047532446061, "grad_norm": 2.223881244659424, "learning_rate": 1.2770892445427317e-05, "loss": 0.2839, "step": 744000 }, { "epoch": 10.25185307652035, "grad_norm": 1.725261926651001, "learning_rate": 1.2764124276426003e-05, "loss": 0.3164, "step": 744100 }, { "epoch": 10.253230828580088, "grad_norm": 0.7118489742279053, "learning_rate": 1.2757357359312733e-05, "loss": 0.3455, "step": 744200 }, { "epoch": 10.254608580639829, "grad_norm": 3.9301257133483887, "learning_rate": 1.2750591694662379e-05, "loss": 0.2928, "step": 744300 }, { "epoch": 10.255986332699567, "grad_norm": 2.872012138366699, "learning_rate": 1.2743827283049704e-05, "loss": 0.2894, "step": 744400 }, { "epoch": 10.257364084759306, "grad_norm": 4.396772384643555, "learning_rate": 1.2737064125049395e-05, "loss": 0.2665, "step": 744500 }, { "epoch": 10.258741836819047, "grad_norm": 4.133399963378906, "learning_rate": 1.2730302221236027e-05, "loss": 0.2834, "step": 744600 }, { "epoch": 10.260119588878785, "grad_norm": 
1.3539600372314453, "learning_rate": 1.2723541572184036e-05, "loss": 0.2854, "step": 744700 }, { "epoch": 10.261497340938524, "grad_norm": 2.0183892250061035, "learning_rate": 1.2716782178467793e-05, "loss": 0.2574, "step": 744800 }, { "epoch": 10.262875092998264, "grad_norm": 0.10228355973958969, "learning_rate": 1.2710024040661535e-05, "loss": 0.2971, "step": 744900 }, { "epoch": 10.264252845058003, "grad_norm": 0.18874327838420868, "learning_rate": 1.2703267159339388e-05, "loss": 0.2545, "step": 745000 }, { "epoch": 10.265630597117744, "grad_norm": 0.07201740890741348, "learning_rate": 1.2696511535075398e-05, "loss": 0.2964, "step": 745100 }, { "epoch": 10.267008349177482, "grad_norm": 2.653935432434082, "learning_rate": 1.2689757168443495e-05, "loss": 0.3243, "step": 745200 }, { "epoch": 10.26838610123722, "grad_norm": 6.803628444671631, "learning_rate": 1.2683004060017483e-05, "loss": 0.2882, "step": 745300 }, { "epoch": 10.269763853296961, "grad_norm": 3.905977964401245, "learning_rate": 1.2676252210371084e-05, "loss": 0.3046, "step": 745400 }, { "epoch": 10.2711416053567, "grad_norm": 3.8674843311309814, "learning_rate": 1.2669501620077893e-05, "loss": 0.2301, "step": 745500 }, { "epoch": 10.272519357416439, "grad_norm": 3.3966798782348633, "learning_rate": 1.2662752289711418e-05, "loss": 0.3044, "step": 745600 }, { "epoch": 10.273897109476179, "grad_norm": 2.188098192214966, "learning_rate": 1.2656004219845033e-05, "loss": 0.2905, "step": 745700 }, { "epoch": 10.275274861535918, "grad_norm": 7.0580644607543945, "learning_rate": 1.2649257411052035e-05, "loss": 0.2874, "step": 745800 }, { "epoch": 10.276652613595658, "grad_norm": 7.363040447235107, "learning_rate": 1.2642511863905613e-05, "loss": 0.2318, "step": 745900 }, { "epoch": 10.278030365655397, "grad_norm": 1.6303383111953735, "learning_rate": 1.2635767578978797e-05, "loss": 0.3022, "step": 746000 }, { "epoch": 10.279408117715136, "grad_norm": 4.955488681793213, "learning_rate": 1.262902455684457e-05, 
"loss": 0.2944, "step": 746100 }, { "epoch": 10.280785869774876, "grad_norm": 2.6587889194488525, "learning_rate": 1.2622282798075798e-05, "loss": 0.2932, "step": 746200 }, { "epoch": 10.282163621834615, "grad_norm": 2.686431884765625, "learning_rate": 1.2615542303245206e-05, "loss": 0.2757, "step": 746300 }, { "epoch": 10.283541373894353, "grad_norm": 1.9033284187316895, "learning_rate": 1.2608803072925442e-05, "loss": 0.2779, "step": 746400 }, { "epoch": 10.284919125954094, "grad_norm": 5.9717912673950195, "learning_rate": 1.260206510768905e-05, "loss": 0.2772, "step": 746500 }, { "epoch": 10.286296878013832, "grad_norm": 0.2728531062602997, "learning_rate": 1.2595328408108447e-05, "loss": 0.2913, "step": 746600 }, { "epoch": 10.287674630073571, "grad_norm": 2.6363723278045654, "learning_rate": 1.2588592974755937e-05, "loss": 0.2491, "step": 746700 }, { "epoch": 10.289052382133312, "grad_norm": 31.184005737304688, "learning_rate": 1.2581858808203738e-05, "loss": 0.289, "step": 746800 }, { "epoch": 10.29043013419305, "grad_norm": 1.7844102382659912, "learning_rate": 1.2575125909023966e-05, "loss": 0.2829, "step": 746900 }, { "epoch": 10.29180788625279, "grad_norm": 1.8636345863342285, "learning_rate": 1.2568394277788602e-05, "loss": 0.2821, "step": 747000 }, { "epoch": 10.29318563831253, "grad_norm": 2.7658934593200684, "learning_rate": 1.2561663915069526e-05, "loss": 0.3015, "step": 747100 }, { "epoch": 10.294563390372268, "grad_norm": 10.444513320922852, "learning_rate": 1.2554934821438533e-05, "loss": 0.3579, "step": 747200 }, { "epoch": 10.295941142432008, "grad_norm": 1.9455314874649048, "learning_rate": 1.2548206997467274e-05, "loss": 0.2568, "step": 747300 }, { "epoch": 10.297318894491747, "grad_norm": 1.7967429161071777, "learning_rate": 1.2541480443727322e-05, "loss": 0.2987, "step": 747400 }, { "epoch": 10.298696646551486, "grad_norm": 2.6173243522644043, "learning_rate": 1.2534822407327163e-05, "loss": 0.2736, "step": 747500 }, { "epoch": 
10.300074398611226, "grad_norm": 1.535183072090149, "learning_rate": 1.2528098383047525e-05, "loss": 0.2561, "step": 747600 }, { "epoch": 10.301452150670965, "grad_norm": 7.821176052093506, "learning_rate": 1.252137563070753e-05, "loss": 0.2792, "step": 747700 }, { "epoch": 10.302829902730705, "grad_norm": 0.6094760894775391, "learning_rate": 1.2514654150878296e-05, "loss": 0.2472, "step": 747800 }, { "epoch": 10.304207654790444, "grad_norm": 2.171246290206909, "learning_rate": 1.2507933944130863e-05, "loss": 0.2789, "step": 747900 }, { "epoch": 10.305585406850183, "grad_norm": 9.747644424438477, "learning_rate": 1.2501215011036132e-05, "loss": 0.3167, "step": 748000 }, { "epoch": 10.306963158909923, "grad_norm": 0.5684568285942078, "learning_rate": 1.249449735216493e-05, "loss": 0.2438, "step": 748100 }, { "epoch": 10.308340910969662, "grad_norm": 2.6274960041046143, "learning_rate": 1.2487780968087934e-05, "loss": 0.2968, "step": 748200 }, { "epoch": 10.3097186630294, "grad_norm": 1.7250436544418335, "learning_rate": 1.2481065859375756e-05, "loss": 0.2701, "step": 748300 }, { "epoch": 10.31109641508914, "grad_norm": 0.046784158796072006, "learning_rate": 1.2474352026598861e-05, "loss": 0.3154, "step": 748400 }, { "epoch": 10.31247416714888, "grad_norm": 2.8722221851348877, "learning_rate": 1.2467639470327632e-05, "loss": 0.2613, "step": 748500 }, { "epoch": 10.31385191920862, "grad_norm": 0.864767849445343, "learning_rate": 1.2460928191132345e-05, "loss": 0.2587, "step": 748600 }, { "epoch": 10.315229671268359, "grad_norm": 2.326507091522217, "learning_rate": 1.2454218189583146e-05, "loss": 0.2634, "step": 748700 }, { "epoch": 10.316607423328097, "grad_norm": 4.557476997375488, "learning_rate": 1.2447509466250079e-05, "loss": 0.2237, "step": 748800 }, { "epoch": 10.317985175387838, "grad_norm": 2.7181835174560547, "learning_rate": 1.2440869089816697e-05, "loss": 0.3133, "step": 748900 }, { "epoch": 10.319362927447576, "grad_norm": 1.1447982788085938, 
"learning_rate": 1.2434162911829248e-05, "loss": 0.2767, "step": 749000 }, { "epoch": 10.320740679507315, "grad_norm": 4.563024044036865, "learning_rate": 1.2427458013761725e-05, "loss": 0.2612, "step": 749100 }, { "epoch": 10.322118431567056, "grad_norm": 3.108729600906372, "learning_rate": 1.2420754396183766e-05, "loss": 0.2696, "step": 749200 }, { "epoch": 10.323496183626794, "grad_norm": 6.1577582359313965, "learning_rate": 1.2414052059664856e-05, "loss": 0.2363, "step": 749300 }, { "epoch": 10.324873935686535, "grad_norm": 52.30073928833008, "learning_rate": 1.2407351004774411e-05, "loss": 0.2866, "step": 749400 }, { "epoch": 10.326251687746273, "grad_norm": 4.92311429977417, "learning_rate": 1.2400651232081702e-05, "loss": 0.2829, "step": 749500 }, { "epoch": 10.327629439806012, "grad_norm": 0.09638363122940063, "learning_rate": 1.2393952742155919e-05, "loss": 0.2792, "step": 749600 }, { "epoch": 10.329007191865752, "grad_norm": 0.4088597297668457, "learning_rate": 1.2387255535566143e-05, "loss": 0.2588, "step": 749700 }, { "epoch": 10.330384943925491, "grad_norm": 1.1428639888763428, "learning_rate": 1.2380559612881325e-05, "loss": 0.2564, "step": 749800 }, { "epoch": 10.33176269598523, "grad_norm": 2.0343477725982666, "learning_rate": 1.237386497467031e-05, "loss": 0.2187, "step": 749900 }, { "epoch": 10.33314044804497, "grad_norm": 1.163087248802185, "learning_rate": 1.2367171621501858e-05, "loss": 0.2757, "step": 750000 }, { "epoch": 10.334518200104709, "grad_norm": 0.33483877778053284, "learning_rate": 1.2360479553944586e-05, "loss": 0.2293, "step": 750100 }, { "epoch": 10.33589595216445, "grad_norm": 49.04188537597656, "learning_rate": 1.235378877256703e-05, "loss": 0.2841, "step": 750200 }, { "epoch": 10.337273704224188, "grad_norm": 2.908079147338867, "learning_rate": 1.2347099277937615e-05, "loss": 0.2342, "step": 750300 }, { "epoch": 10.338651456283927, "grad_norm": 11.657766342163086, "learning_rate": 1.2340411070624638e-05, "loss": 0.3136, "step": 
750400 }, { "epoch": 10.340029208343667, "grad_norm": 2.917191505432129, "learning_rate": 1.2333724151196288e-05, "loss": 0.304, "step": 750500 }, { "epoch": 10.341406960403406, "grad_norm": 1.011428952217102, "learning_rate": 1.2327038520220658e-05, "loss": 0.3094, "step": 750600 }, { "epoch": 10.342784712463144, "grad_norm": 1.028139352798462, "learning_rate": 1.2320354178265743e-05, "loss": 0.2623, "step": 750700 }, { "epoch": 10.344162464522885, "grad_norm": 3.5651354789733887, "learning_rate": 1.2313671125899389e-05, "loss": 0.3188, "step": 750800 }, { "epoch": 10.345540216582624, "grad_norm": 2.692758798599243, "learning_rate": 1.2306989363689372e-05, "loss": 0.2815, "step": 750900 }, { "epoch": 10.346917968642362, "grad_norm": 1.5006853342056274, "learning_rate": 1.2300308892203339e-05, "loss": 0.3066, "step": 751000 }, { "epoch": 10.348295720702103, "grad_norm": 2.941396474838257, "learning_rate": 1.2293629712008817e-05, "loss": 0.2623, "step": 751100 }, { "epoch": 10.349673472761841, "grad_norm": 2.8104448318481445, "learning_rate": 1.2286951823673244e-05, "loss": 0.2835, "step": 751200 }, { "epoch": 10.351051224821582, "grad_norm": 4.43010139465332, "learning_rate": 1.2280275227763954e-05, "loss": 0.3457, "step": 751300 }, { "epoch": 10.35242897688132, "grad_norm": 2.877288818359375, "learning_rate": 1.2273599924848147e-05, "loss": 0.3042, "step": 751400 }, { "epoch": 10.35380672894106, "grad_norm": 1.2747961282730103, "learning_rate": 1.2266925915492914e-05, "loss": 0.2414, "step": 751500 }, { "epoch": 10.3551844810008, "grad_norm": 4.4095258712768555, "learning_rate": 1.2260253200265258e-05, "loss": 0.2994, "step": 751600 }, { "epoch": 10.356562233060538, "grad_norm": 15.69942855834961, "learning_rate": 1.2253581779732069e-05, "loss": 0.2389, "step": 751700 }, { "epoch": 10.357939985120277, "grad_norm": 3.9615352153778076, "learning_rate": 1.2246911654460096e-05, "loss": 0.2768, "step": 751800 }, { "epoch": 10.359317737180017, "grad_norm": 
1.496427059173584, "learning_rate": 1.224024282501601e-05, "loss": 0.2717, "step": 751900 }, { "epoch": 10.360695489239756, "grad_norm": 111.40361022949219, "learning_rate": 1.2233575291966388e-05, "loss": 0.2973, "step": 752000 }, { "epoch": 10.362073241299496, "grad_norm": 2.0507266521453857, "learning_rate": 1.2226909055877626e-05, "loss": 0.2757, "step": 752100 }, { "epoch": 10.363450993359235, "grad_norm": 3.4131627082824707, "learning_rate": 1.2220244117316078e-05, "loss": 0.2844, "step": 752200 }, { "epoch": 10.364828745418974, "grad_norm": 0.6258629560470581, "learning_rate": 1.2213580476847972e-05, "loss": 0.2986, "step": 752300 }, { "epoch": 10.366206497478714, "grad_norm": 3.588021993637085, "learning_rate": 1.2206918135039399e-05, "loss": 0.3108, "step": 752400 }, { "epoch": 10.367584249538453, "grad_norm": 1.8724514245986938, "learning_rate": 1.2200257092456377e-05, "loss": 0.2382, "step": 752500 }, { "epoch": 10.368962001598192, "grad_norm": 3.2880165576934814, "learning_rate": 1.2193597349664782e-05, "loss": 0.2265, "step": 752600 }, { "epoch": 10.370339753657932, "grad_norm": 1.7806172370910645, "learning_rate": 1.2186938907230407e-05, "loss": 0.255, "step": 752700 }, { "epoch": 10.37171750571767, "grad_norm": 4.624688148498535, "learning_rate": 1.2180281765718901e-05, "loss": 0.3445, "step": 752800 }, { "epoch": 10.373095257777411, "grad_norm": 2.98949933052063, "learning_rate": 1.2173625925695838e-05, "loss": 0.2988, "step": 752900 }, { "epoch": 10.37447300983715, "grad_norm": 3.386470317840576, "learning_rate": 1.2166971387726671e-05, "loss": 0.2419, "step": 753000 }, { "epoch": 10.375850761896888, "grad_norm": 2.0045390129089355, "learning_rate": 1.2160318152376726e-05, "loss": 0.2826, "step": 753100 }, { "epoch": 10.377228513956629, "grad_norm": 9.526384353637695, "learning_rate": 1.2153666220211225e-05, "loss": 0.4032, "step": 753200 }, { "epoch": 10.378606266016368, "grad_norm": 4.81276798248291, "learning_rate": 1.2147015591795296e-05, 
"loss": 0.3102, "step": 753300 }, { "epoch": 10.379984018076106, "grad_norm": 2.2522106170654297, "learning_rate": 1.2140366267693926e-05, "loss": 0.3012, "step": 753400 }, { "epoch": 10.381361770135847, "grad_norm": 0.10712818056344986, "learning_rate": 1.2133718248472026e-05, "loss": 0.2639, "step": 753500 }, { "epoch": 10.382739522195585, "grad_norm": 2.9620654582977295, "learning_rate": 1.2127071534694381e-05, "loss": 0.2425, "step": 753600 }, { "epoch": 10.384117274255326, "grad_norm": 2.1191298961639404, "learning_rate": 1.2120426126925659e-05, "loss": 0.2882, "step": 753700 }, { "epoch": 10.385495026315064, "grad_norm": 0.9867560863494873, "learning_rate": 1.2113782025730408e-05, "loss": 0.319, "step": 753800 }, { "epoch": 10.386872778374803, "grad_norm": 10.295557022094727, "learning_rate": 1.21071392316731e-05, "loss": 0.289, "step": 753900 }, { "epoch": 10.388250530434544, "grad_norm": 1.3176130056381226, "learning_rate": 1.210049774531805e-05, "loss": 0.3031, "step": 754000 }, { "epoch": 10.389628282494282, "grad_norm": 4.669096946716309, "learning_rate": 1.2093923962532621e-05, "loss": 0.3646, "step": 754100 }, { "epoch": 10.391006034554021, "grad_norm": 2.310152530670166, "learning_rate": 1.2087285080183599e-05, "loss": 0.2546, "step": 754200 }, { "epoch": 10.392383786613761, "grad_norm": 1.4878326654434204, "learning_rate": 1.2080647507223555e-05, "loss": 0.2774, "step": 754300 }, { "epoch": 10.3937615386735, "grad_norm": 4.410424709320068, "learning_rate": 1.2074011244216399e-05, "loss": 0.2829, "step": 754400 }, { "epoch": 10.39513929073324, "grad_norm": 3.3908345699310303, "learning_rate": 1.2067376291725896e-05, "loss": 0.2398, "step": 754500 }, { "epoch": 10.39651704279298, "grad_norm": 2.1960129737854004, "learning_rate": 1.2060742650315728e-05, "loss": 0.2901, "step": 754600 }, { "epoch": 10.397894794852718, "grad_norm": 5.312730312347412, "learning_rate": 1.2054110320549481e-05, "loss": 0.2523, "step": 754700 }, { "epoch": 10.399272546912458, 
"grad_norm": 5.1576409339904785, "learning_rate": 1.2047479302990563e-05, "loss": 0.2979, "step": 754800 }, { "epoch": 10.400650298972197, "grad_norm": 1.6031330823898315, "learning_rate": 1.2040849598202332e-05, "loss": 0.3266, "step": 754900 }, { "epoch": 10.402028051031936, "grad_norm": 3.285905122756958, "learning_rate": 1.2034221206748025e-05, "loss": 0.2421, "step": 755000 }, { "epoch": 10.403405803091676, "grad_norm": 0.20130255818367004, "learning_rate": 1.202759412919074e-05, "loss": 0.3022, "step": 755100 }, { "epoch": 10.404783555151415, "grad_norm": 0.8661023378372192, "learning_rate": 1.202096836609349e-05, "loss": 0.2495, "step": 755200 }, { "epoch": 10.406161307211153, "grad_norm": 4.751830577850342, "learning_rate": 1.2014343918019179e-05, "loss": 0.2838, "step": 755300 }, { "epoch": 10.407539059270894, "grad_norm": 0.5373607873916626, "learning_rate": 1.2007720785530576e-05, "loss": 0.2949, "step": 755400 }, { "epoch": 10.408916811330633, "grad_norm": 1.6983964443206787, "learning_rate": 1.2001098969190339e-05, "loss": 0.288, "step": 755500 }, { "epoch": 10.410294563390373, "grad_norm": 3.3707869052886963, "learning_rate": 1.1994478469561035e-05, "loss": 0.2924, "step": 755600 }, { "epoch": 10.411672315450112, "grad_norm": 4.937296390533447, "learning_rate": 1.1987859287205124e-05, "loss": 0.2731, "step": 755700 }, { "epoch": 10.41305006750985, "grad_norm": 1.05008864402771, "learning_rate": 1.1981241422684913e-05, "loss": 0.3511, "step": 755800 }, { "epoch": 10.41442781956959, "grad_norm": 5.632351398468018, "learning_rate": 1.197462487656265e-05, "loss": 0.2902, "step": 755900 }, { "epoch": 10.41580557162933, "grad_norm": 2.395057201385498, "learning_rate": 1.1968009649400427e-05, "loss": 0.2568, "step": 756000 }, { "epoch": 10.417183323689068, "grad_norm": 3.044116258621216, "learning_rate": 1.1961395741760237e-05, "loss": 0.2637, "step": 756100 }, { "epoch": 10.418561075748809, "grad_norm": 4.785831928253174, "learning_rate": 
1.1954783154203976e-05, "loss": 0.2885, "step": 756200 }, { "epoch": 10.419938827808547, "grad_norm": 4.038703918457031, "learning_rate": 1.1948171887293424e-05, "loss": 0.3689, "step": 756300 }, { "epoch": 10.421316579868288, "grad_norm": 1.6830896139144897, "learning_rate": 1.194156194159023e-05, "loss": 0.2559, "step": 756400 }, { "epoch": 10.422694331928026, "grad_norm": 2.0679242610931396, "learning_rate": 1.1934953317655937e-05, "loss": 0.2213, "step": 756500 }, { "epoch": 10.424072083987765, "grad_norm": 3.4630651473999023, "learning_rate": 1.1928346016051989e-05, "loss": 0.2433, "step": 756600 }, { "epoch": 10.425449836047505, "grad_norm": 2.459794759750366, "learning_rate": 1.1921740037339724e-05, "loss": 0.2893, "step": 756700 }, { "epoch": 10.426827588107244, "grad_norm": 4.568902969360352, "learning_rate": 1.1915135382080328e-05, "loss": 0.2658, "step": 756800 }, { "epoch": 10.428205340166983, "grad_norm": 6.512533187866211, "learning_rate": 1.1908532050834913e-05, "loss": 0.3044, "step": 756900 }, { "epoch": 10.429583092226723, "grad_norm": 2.0240204334259033, "learning_rate": 1.1901930044164488e-05, "loss": 0.2797, "step": 757000 }, { "epoch": 10.430960844286462, "grad_norm": 1.690431833267212, "learning_rate": 1.1895329362629882e-05, "loss": 0.2829, "step": 757100 }, { "epoch": 10.432338596346202, "grad_norm": 4.47620964050293, "learning_rate": 1.1888730006791883e-05, "loss": 0.2543, "step": 757200 }, { "epoch": 10.433716348405941, "grad_norm": 3.4773223400115967, "learning_rate": 1.1882131977211145e-05, "loss": 0.226, "step": 757300 }, { "epoch": 10.43509410046568, "grad_norm": 3.734255313873291, "learning_rate": 1.1875535274448185e-05, "loss": 0.2784, "step": 757400 }, { "epoch": 10.43647185252542, "grad_norm": 4.417179584503174, "learning_rate": 1.1868939899063448e-05, "loss": 0.276, "step": 757500 }, { "epoch": 10.437849604585159, "grad_norm": 1.5073133707046509, "learning_rate": 1.1862345851617225e-05, "loss": 0.2577, "step": 757600 }, { 
"epoch": 10.439227356644897, "grad_norm": 1.0989325046539307, "learning_rate": 1.1855753132669734e-05, "loss": 0.2756, "step": 757700 }, { "epoch": 10.440605108704638, "grad_norm": 2.778719902038574, "learning_rate": 1.1849161742781037e-05, "loss": 0.2515, "step": 757800 }, { "epoch": 10.441982860764377, "grad_norm": 1.7250019311904907, "learning_rate": 1.1842571682511122e-05, "loss": 0.3062, "step": 757900 }, { "epoch": 10.443360612824117, "grad_norm": 0.7426195740699768, "learning_rate": 1.1835982952419855e-05, "loss": 0.3213, "step": 758000 }, { "epoch": 10.444738364883856, "grad_norm": 2.1860883235931396, "learning_rate": 1.1829395553066973e-05, "loss": 0.3038, "step": 758100 }, { "epoch": 10.446116116943594, "grad_norm": 2.356360673904419, "learning_rate": 1.1822809485012097e-05, "loss": 0.2423, "step": 758200 }, { "epoch": 10.447493869003335, "grad_norm": 2.5138392448425293, "learning_rate": 1.1816224748814774e-05, "loss": 0.3191, "step": 758300 }, { "epoch": 10.448871621063073, "grad_norm": 1.3884849548339844, "learning_rate": 1.1809641345034382e-05, "loss": 0.2583, "step": 758400 }, { "epoch": 10.450249373122812, "grad_norm": 4.124725818634033, "learning_rate": 1.180305927423023e-05, "loss": 0.2689, "step": 758500 }, { "epoch": 10.451627125182553, "grad_norm": 13.507261276245117, "learning_rate": 1.1796478536961507e-05, "loss": 0.2307, "step": 758600 }, { "epoch": 10.453004877242291, "grad_norm": 7.8579277992248535, "learning_rate": 1.1789899133787273e-05, "loss": 0.2265, "step": 758700 }, { "epoch": 10.454382629302032, "grad_norm": 3.59801983833313, "learning_rate": 1.1783321065266467e-05, "loss": 0.2886, "step": 758800 }, { "epoch": 10.45576038136177, "grad_norm": 1.100722074508667, "learning_rate": 1.1776744331957945e-05, "loss": 0.2614, "step": 758900 }, { "epoch": 10.457138133421509, "grad_norm": 2.635401487350464, "learning_rate": 1.177016893442044e-05, "loss": 0.3052, "step": 759000 }, { "epoch": 10.45851588548125, "grad_norm": 1.6851189136505127, 
"learning_rate": 1.176359487321255e-05, "loss": 0.2378, "step": 759100 }, { "epoch": 10.459893637540988, "grad_norm": 2.566514253616333, "learning_rate": 1.1757022148892788e-05, "loss": 0.2898, "step": 759200 }, { "epoch": 10.461271389600727, "grad_norm": 2.4905130863189697, "learning_rate": 1.1750450762019539e-05, "loss": 0.2195, "step": 759300 }, { "epoch": 10.462649141660467, "grad_norm": 3.3154752254486084, "learning_rate": 1.174388071315106e-05, "loss": 0.2952, "step": 759400 }, { "epoch": 10.464026893720206, "grad_norm": 3.634671211242676, "learning_rate": 1.1737312002845524e-05, "loss": 0.266, "step": 759500 }, { "epoch": 10.465404645779945, "grad_norm": 3.4203808307647705, "learning_rate": 1.1730744631660983e-05, "loss": 0.261, "step": 759600 }, { "epoch": 10.466782397839685, "grad_norm": 2.1456093788146973, "learning_rate": 1.172417860015536e-05, "loss": 0.2862, "step": 759700 }, { "epoch": 10.468160149899424, "grad_norm": 0.7388341426849365, "learning_rate": 1.1717613908886463e-05, "loss": 0.2782, "step": 759800 }, { "epoch": 10.469537901959164, "grad_norm": 1.9185842275619507, "learning_rate": 1.1711050558412019e-05, "loss": 0.2849, "step": 759900 }, { "epoch": 10.470915654018903, "grad_norm": 2.684519052505493, "learning_rate": 1.1704488549289596e-05, "loss": 0.2788, "step": 760000 }, { "epoch": 10.472293406078641, "grad_norm": 5.093529224395752, "learning_rate": 1.1697927882076676e-05, "loss": 0.3208, "step": 760100 }, { "epoch": 10.473671158138382, "grad_norm": 8.305863380432129, "learning_rate": 1.1691368557330639e-05, "loss": 0.2597, "step": 760200 }, { "epoch": 10.47504891019812, "grad_norm": 2.561605453491211, "learning_rate": 1.1684810575608718e-05, "loss": 0.2565, "step": 760300 }, { "epoch": 10.47642666225786, "grad_norm": 8.820394515991211, "learning_rate": 1.1678319497196891e-05, "loss": 0.2822, "step": 760400 }, { "epoch": 10.4778044143176, "grad_norm": 7.952504634857178, "learning_rate": 1.1671764189750344e-05, "loss": 0.2441, "step": 
760500 }, { "epoch": 10.479182166377338, "grad_norm": 1.5357391834259033, "learning_rate": 1.1665210226993407e-05, "loss": 0.2639, "step": 760600 }, { "epoch": 10.480559918437079, "grad_norm": 3.7912161350250244, "learning_rate": 1.1658657609482879e-05, "loss": 0.37, "step": 760700 }, { "epoch": 10.481937670496817, "grad_norm": 1.7062973976135254, "learning_rate": 1.1652106337775433e-05, "loss": 0.2361, "step": 760800 }, { "epoch": 10.483315422556556, "grad_norm": 3.3193728923797607, "learning_rate": 1.164555641242762e-05, "loss": 0.2882, "step": 760900 }, { "epoch": 10.484693174616297, "grad_norm": 4.0004353523254395, "learning_rate": 1.1639073313111158e-05, "loss": 0.3063, "step": 761000 }, { "epoch": 10.486070926676035, "grad_norm": 0.07798045128583908, "learning_rate": 1.16325260686744e-05, "loss": 0.2888, "step": 761100 }, { "epoch": 10.487448678735774, "grad_norm": 3.9397013187408447, "learning_rate": 1.162598017226072e-05, "loss": 0.265, "step": 761200 }, { "epoch": 10.488826430795514, "grad_norm": 2.286264657974243, "learning_rate": 1.1619435624426233e-05, "loss": 0.3416, "step": 761300 }, { "epoch": 10.490204182855253, "grad_norm": 1.174958348274231, "learning_rate": 1.1612892425726945e-05, "loss": 0.2919, "step": 761400 }, { "epoch": 10.491581934914993, "grad_norm": 1.3542767763137817, "learning_rate": 1.1606350576718726e-05, "loss": 0.2532, "step": 761500 }, { "epoch": 10.492959686974732, "grad_norm": 4.155183792114258, "learning_rate": 1.1599810077957325e-05, "loss": 0.2757, "step": 761600 }, { "epoch": 10.49433743903447, "grad_norm": 5.336221694946289, "learning_rate": 1.1593270929998414e-05, "loss": 0.2772, "step": 761700 }, { "epoch": 10.495715191094211, "grad_norm": 1.6508089303970337, "learning_rate": 1.1586733133397503e-05, "loss": 0.2409, "step": 761800 }, { "epoch": 10.49709294315395, "grad_norm": 2.2604453563690186, "learning_rate": 1.158026204646312e-05, "loss": 0.2615, "step": 761900 }, { "epoch": 10.498470695213689, "grad_norm": 
2.0784566402435303, "learning_rate": 1.1573726940716941e-05, "loss": 0.2886, "step": 762000 }, { "epoch": 10.499848447273429, "grad_norm": 4.2743916511535645, "learning_rate": 1.156719318798914e-05, "loss": 0.2743, "step": 762100 }, { "epoch": 10.501226199333168, "grad_norm": 0.0776483416557312, "learning_rate": 1.1560660788834797e-05, "loss": 0.2677, "step": 762200 }, { "epoch": 10.502603951392908, "grad_norm": 1.19465172290802, "learning_rate": 1.1554129743808858e-05, "loss": 0.2956, "step": 762300 }, { "epoch": 10.503981703452647, "grad_norm": 1.1728020906448364, "learning_rate": 1.1547600053466174e-05, "loss": 0.2795, "step": 762400 }, { "epoch": 10.505359455512385, "grad_norm": 1.5463505983352661, "learning_rate": 1.1541071718361495e-05, "loss": 0.2705, "step": 762500 }, { "epoch": 10.506737207572126, "grad_norm": 3.5012853145599365, "learning_rate": 1.1534544739049402e-05, "loss": 0.3235, "step": 762600 }, { "epoch": 10.508114959631865, "grad_norm": 2.580396890640259, "learning_rate": 1.1528019116084414e-05, "loss": 0.2739, "step": 762700 }, { "epoch": 10.509492711691603, "grad_norm": 5.822792053222656, "learning_rate": 1.152149485002092e-05, "loss": 0.2574, "step": 762800 }, { "epoch": 10.510870463751344, "grad_norm": 2.132537603378296, "learning_rate": 1.1514971941413176e-05, "loss": 0.268, "step": 762900 }, { "epoch": 10.512248215811082, "grad_norm": 1.9471551179885864, "learning_rate": 1.1508450390815352e-05, "loss": 0.2648, "step": 763000 }, { "epoch": 10.513625967870823, "grad_norm": 3.351454019546509, "learning_rate": 1.1501930198781473e-05, "loss": 0.274, "step": 763100 }, { "epoch": 10.515003719930561, "grad_norm": 2.9792375564575195, "learning_rate": 1.1495411365865477e-05, "loss": 0.2832, "step": 763200 }, { "epoch": 10.5163814719903, "grad_norm": 0.9721190929412842, "learning_rate": 1.1488893892621157e-05, "loss": 0.2185, "step": 763300 }, { "epoch": 10.51775922405004, "grad_norm": 3.034794569015503, "learning_rate": 1.1482377779602212e-05, 
"loss": 0.2446, "step": 763400 }, { "epoch": 10.51913697610978, "grad_norm": 2.5384576320648193, "learning_rate": 1.1475863027362227e-05, "loss": 0.2745, "step": 763500 }, { "epoch": 10.520514728169518, "grad_norm": 3.1671626567840576, "learning_rate": 1.146934963645466e-05, "loss": 0.2794, "step": 763600 }, { "epoch": 10.521892480229258, "grad_norm": 3.7602734565734863, "learning_rate": 1.1462837607432845e-05, "loss": 0.2817, "step": 763700 }, { "epoch": 10.523270232288997, "grad_norm": 2.8570141792297363, "learning_rate": 1.1456326940850027e-05, "loss": 0.2683, "step": 763800 }, { "epoch": 10.524647984348736, "grad_norm": 3.899150848388672, "learning_rate": 1.1449817637259307e-05, "loss": 0.312, "step": 763900 }, { "epoch": 10.526025736408476, "grad_norm": 1.554233193397522, "learning_rate": 1.144330969721369e-05, "loss": 0.2748, "step": 764000 }, { "epoch": 10.527403488468215, "grad_norm": 1.1329936981201172, "learning_rate": 1.143680312126607e-05, "loss": 0.308, "step": 764100 }, { "epoch": 10.528781240527955, "grad_norm": 1.8282331228256226, "learning_rate": 1.1430297909969203e-05, "loss": 0.2853, "step": 764200 }, { "epoch": 10.530158992587694, "grad_norm": 3.2350423336029053, "learning_rate": 1.1423794063875728e-05, "loss": 0.2603, "step": 764300 }, { "epoch": 10.531536744647433, "grad_norm": 1.669077754020691, "learning_rate": 1.1417291583538198e-05, "loss": 0.3047, "step": 764400 }, { "epoch": 10.532914496707173, "grad_norm": 3.144928216934204, "learning_rate": 1.1410855473884288e-05, "loss": 0.2843, "step": 764500 }, { "epoch": 10.534292248766912, "grad_norm": 3.8136847019195557, "learning_rate": 1.1404355713044435e-05, "loss": 0.2771, "step": 764600 }, { "epoch": 10.535670000826652, "grad_norm": 1.5378443002700806, "learning_rate": 1.139785731961192e-05, "loss": 0.2354, "step": 764700 }, { "epoch": 10.53704775288639, "grad_norm": 0.5432489514350891, "learning_rate": 1.1391360294138792e-05, "loss": 0.247, "step": 764800 }, { "epoch": 10.53842550494613, 
"grad_norm": 9.730171203613281, "learning_rate": 1.138486463717703e-05, "loss": 0.3176, "step": 764900 }, { "epoch": 10.53980325700587, "grad_norm": 0.02651713229715824, "learning_rate": 1.137837034927845e-05, "loss": 0.2837, "step": 765000 }, { "epoch": 10.541181009065609, "grad_norm": 3.5589966773986816, "learning_rate": 1.1371877430994786e-05, "loss": 0.2787, "step": 765100 }, { "epoch": 10.542558761125347, "grad_norm": 0.10133332014083862, "learning_rate": 1.1365385882877657e-05, "loss": 0.2651, "step": 765200 }, { "epoch": 10.543936513185088, "grad_norm": 9.68149471282959, "learning_rate": 1.1358895705478541e-05, "loss": 0.255, "step": 765300 }, { "epoch": 10.545314265244826, "grad_norm": 0.5385342836380005, "learning_rate": 1.1352406899348802e-05, "loss": 0.2949, "step": 765400 }, { "epoch": 10.546692017304565, "grad_norm": 3.5299580097198486, "learning_rate": 1.134598433259048e-05, "loss": 0.2727, "step": 765500 }, { "epoch": 10.548069769364306, "grad_norm": 7.250683784484863, "learning_rate": 1.1339498256926735e-05, "loss": 0.2468, "step": 765600 }, { "epoch": 10.549447521424044, "grad_norm": 1.0019599199295044, "learning_rate": 1.1333013554180278e-05, "loss": 0.292, "step": 765700 }, { "epoch": 10.550825273483785, "grad_norm": 2.9831621646881104, "learning_rate": 1.1326530224902041e-05, "loss": 0.2646, "step": 765800 }, { "epoch": 10.552203025543523, "grad_norm": 2.2045834064483643, "learning_rate": 1.1320048269642794e-05, "loss": 0.2564, "step": 765900 }, { "epoch": 10.553580777603262, "grad_norm": 1.5811694860458374, "learning_rate": 1.1313567688953232e-05, "loss": 0.2693, "step": 766000 }, { "epoch": 10.554958529663002, "grad_norm": 10.051183700561523, "learning_rate": 1.1307088483383894e-05, "loss": 0.2753, "step": 766100 }, { "epoch": 10.556336281722741, "grad_norm": 7.651517391204834, "learning_rate": 1.1300610653485235e-05, "loss": 0.3351, "step": 766200 }, { "epoch": 10.55771403378248, "grad_norm": 1.2165464162826538, "learning_rate": 
1.1294134199807593e-05, "loss": 0.3092, "step": 766300 }, { "epoch": 10.55909178584222, "grad_norm": 2.310483455657959, "learning_rate": 1.1287659122901142e-05, "loss": 0.2323, "step": 766400 }, { "epoch": 10.560469537901959, "grad_norm": 0.7303256988525391, "learning_rate": 1.1281185423315986e-05, "loss": 0.3074, "step": 766500 }, { "epoch": 10.5618472899617, "grad_norm": 2.2746641635894775, "learning_rate": 1.1274713101602117e-05, "loss": 0.2492, "step": 766600 }, { "epoch": 10.563225042021438, "grad_norm": 4.0299072265625, "learning_rate": 1.1268242158309367e-05, "loss": 0.2878, "step": 766700 }, { "epoch": 10.564602794081177, "grad_norm": 2.308422803878784, "learning_rate": 1.1261772593987485e-05, "loss": 0.3317, "step": 766800 }, { "epoch": 10.565980546140917, "grad_norm": 1.060150384902954, "learning_rate": 1.1255304409186109e-05, "loss": 0.2804, "step": 766900 }, { "epoch": 10.567358298200656, "grad_norm": 4.13946008682251, "learning_rate": 1.124883760445473e-05, "loss": 0.2509, "step": 767000 }, { "epoch": 10.568736050260394, "grad_norm": 1.7353650331497192, "learning_rate": 1.1242372180342723e-05, "loss": 0.2983, "step": 767100 }, { "epoch": 10.570113802320135, "grad_norm": 2.8163986206054688, "learning_rate": 1.1235908137399377e-05, "loss": 0.2609, "step": 767200 }, { "epoch": 10.571491554379874, "grad_norm": 4.111927509307861, "learning_rate": 1.122944547617385e-05, "loss": 0.2775, "step": 767300 }, { "epoch": 10.572869306439614, "grad_norm": 1.1802095174789429, "learning_rate": 1.122298419721517e-05, "loss": 0.2631, "step": 767400 }, { "epoch": 10.574247058499353, "grad_norm": 2.2581400871276855, "learning_rate": 1.1216524301072241e-05, "loss": 0.281, "step": 767500 }, { "epoch": 10.575624810559091, "grad_norm": 3.908735990524292, "learning_rate": 1.1210065788293893e-05, "loss": 0.2821, "step": 767600 }, { "epoch": 10.577002562618832, "grad_norm": 4.9254302978515625, "learning_rate": 1.1203608659428782e-05, "loss": 0.2582, "step": 767700 }, { "epoch": 
10.57838031467857, "grad_norm": 2.7829370498657227, "learning_rate": 1.1197152915025492e-05, "loss": 0.2516, "step": 767800 }, { "epoch": 10.579758066738309, "grad_norm": 5.179299354553223, "learning_rate": 1.119069855563247e-05, "loss": 0.2832, "step": 767900 }, { "epoch": 10.58113581879805, "grad_norm": 52.03510665893555, "learning_rate": 1.1184245581798048e-05, "loss": 0.2683, "step": 768000 }, { "epoch": 10.582513570857788, "grad_norm": 4.16298770904541, "learning_rate": 1.1177793994070426e-05, "loss": 0.3071, "step": 768100 }, { "epoch": 10.583891322917527, "grad_norm": 1.292060136795044, "learning_rate": 1.1171343792997702e-05, "loss": 0.3333, "step": 768200 }, { "epoch": 10.585269074977267, "grad_norm": 0.9507945775985718, "learning_rate": 1.1164894979127875e-05, "loss": 0.2938, "step": 768300 }, { "epoch": 10.586646827037006, "grad_norm": 0.2684326469898224, "learning_rate": 1.1158447553008777e-05, "loss": 0.2605, "step": 768400 }, { "epoch": 10.588024579096746, "grad_norm": 2.818380117416382, "learning_rate": 1.1152001515188172e-05, "loss": 0.3408, "step": 768500 }, { "epoch": 10.589402331156485, "grad_norm": 3.7058634757995605, "learning_rate": 1.1145556866213675e-05, "loss": 0.2753, "step": 768600 }, { "epoch": 10.590780083216224, "grad_norm": 5.835156440734863, "learning_rate": 1.1139113606632781e-05, "loss": 0.2879, "step": 768700 }, { "epoch": 10.592157835275964, "grad_norm": 1.4286340475082397, "learning_rate": 1.113267173699289e-05, "loss": 0.2724, "step": 768800 }, { "epoch": 10.593535587335703, "grad_norm": 2.8151721954345703, "learning_rate": 1.1126231257841276e-05, "loss": 0.2566, "step": 768900 }, { "epoch": 10.594913339395443, "grad_norm": 5.139896869659424, "learning_rate": 1.1119792169725075e-05, "loss": 0.2791, "step": 769000 }, { "epoch": 10.596291091455182, "grad_norm": 4.433551788330078, "learning_rate": 1.111335447319134e-05, "loss": 0.2404, "step": 769100 }, { "epoch": 10.59766884351492, "grad_norm": 1.5295460224151611, 
"learning_rate": 1.1106918168786967e-05, "loss": 0.2794, "step": 769200 }, { "epoch": 10.599046595574661, "grad_norm": 5.430045127868652, "learning_rate": 1.110048325705877e-05, "loss": 0.2423, "step": 769300 }, { "epoch": 10.6004243476344, "grad_norm": 4.622608184814453, "learning_rate": 1.1094049738553414e-05, "loss": 0.3259, "step": 769400 }, { "epoch": 10.601802099694138, "grad_norm": 4.558523654937744, "learning_rate": 1.1087617613817459e-05, "loss": 0.2754, "step": 769500 }, { "epoch": 10.603179851753879, "grad_norm": 7.286407947540283, "learning_rate": 1.1081186883397374e-05, "loss": 0.2962, "step": 769600 }, { "epoch": 10.604557603813618, "grad_norm": 8.92766284942627, "learning_rate": 1.1074757547839447e-05, "loss": 0.3157, "step": 769700 }, { "epoch": 10.605935355873356, "grad_norm": 1.6710656881332397, "learning_rate": 1.1068329607689891e-05, "loss": 0.2983, "step": 769800 }, { "epoch": 10.607313107933097, "grad_norm": 2.8927485942840576, "learning_rate": 1.1061903063494814e-05, "loss": 0.295, "step": 769900 }, { "epoch": 10.608690859992835, "grad_norm": 3.3385496139526367, "learning_rate": 1.1055477915800154e-05, "loss": 0.2927, "step": 770000 }, { "epoch": 10.610068612052576, "grad_norm": 3.53165864944458, "learning_rate": 1.1049054165151777e-05, "loss": 0.3037, "step": 770100 }, { "epoch": 10.611446364112314, "grad_norm": 7.7597784996032715, "learning_rate": 1.1042631812095421e-05, "loss": 0.2645, "step": 770200 }, { "epoch": 10.612824116172053, "grad_norm": 1.823333740234375, "learning_rate": 1.1036210857176687e-05, "loss": 0.2731, "step": 770300 }, { "epoch": 10.614201868231794, "grad_norm": 2.3267180919647217, "learning_rate": 1.102979130094106e-05, "loss": 0.2912, "step": 770400 }, { "epoch": 10.615579620291532, "grad_norm": 3.1392085552215576, "learning_rate": 1.1023373143933932e-05, "loss": 0.2996, "step": 770500 }, { "epoch": 10.616957372351271, "grad_norm": 14.297091484069824, "learning_rate": 1.1016956386700537e-05, "loss": 0.3103, "step": 
770600 }, { "epoch": 10.618335124411011, "grad_norm": 2.082451820373535, "learning_rate": 1.1010541029786028e-05, "loss": 0.2963, "step": 770700 }, { "epoch": 10.61971287647075, "grad_norm": 5.52354097366333, "learning_rate": 1.1004127073735427e-05, "loss": 0.2985, "step": 770800 }, { "epoch": 10.62109062853049, "grad_norm": 2.6013951301574707, "learning_rate": 1.0997714519093621e-05, "loss": 0.3285, "step": 770900 }, { "epoch": 10.62246838059023, "grad_norm": 1.1924774646759033, "learning_rate": 1.0991303366405385e-05, "loss": 0.2757, "step": 771000 }, { "epoch": 10.623846132649968, "grad_norm": 5.271380424499512, "learning_rate": 1.0984893616215383e-05, "loss": 0.3092, "step": 771100 }, { "epoch": 10.625223884709708, "grad_norm": 3.838233232498169, "learning_rate": 1.097848526906817e-05, "loss": 0.251, "step": 771200 }, { "epoch": 10.626601636769447, "grad_norm": 20.113243103027344, "learning_rate": 1.0972078325508158e-05, "loss": 0.288, "step": 771300 }, { "epoch": 10.627979388829186, "grad_norm": 3.4025511741638184, "learning_rate": 1.0965672786079636e-05, "loss": 0.2736, "step": 771400 }, { "epoch": 10.629357140888926, "grad_norm": 2.321033477783203, "learning_rate": 1.0959268651326814e-05, "loss": 0.2741, "step": 771500 }, { "epoch": 10.630734892948665, "grad_norm": 2.4975409507751465, "learning_rate": 1.0952865921793728e-05, "loss": 0.2762, "step": 771600 }, { "epoch": 10.632112645008405, "grad_norm": 7.202080726623535, "learning_rate": 1.0946464598024338e-05, "loss": 0.3082, "step": 771700 }, { "epoch": 10.633490397068144, "grad_norm": 6.662028789520264, "learning_rate": 1.0940128672774091e-05, "loss": 0.2679, "step": 771800 }, { "epoch": 10.634868149127882, "grad_norm": 3.882417917251587, "learning_rate": 1.0933730148092242e-05, "loss": 0.2894, "step": 771900 }, { "epoch": 10.636245901187623, "grad_norm": 2.4470911026000977, "learning_rate": 1.0927333030799781e-05, "loss": 0.3134, "step": 772000 }, { "epoch": 10.637623653247362, "grad_norm": 
3.0687241554260254, "learning_rate": 1.0920937321440154e-05, "loss": 0.2108, "step": 772100 }, { "epoch": 10.6390014053071, "grad_norm": 3.4485580921173096, "learning_rate": 1.0914543020556718e-05, "loss": 0.3533, "step": 772200 }, { "epoch": 10.64037915736684, "grad_norm": 2.846353054046631, "learning_rate": 1.0908150128692714e-05, "loss": 0.2601, "step": 772300 }, { "epoch": 10.64175690942658, "grad_norm": 1.0572539567947388, "learning_rate": 1.090175864639124e-05, "loss": 0.2527, "step": 772400 }, { "epoch": 10.643134661486318, "grad_norm": 2.260268211364746, "learning_rate": 1.0895368574195275e-05, "loss": 0.2597, "step": 772500 }, { "epoch": 10.644512413546058, "grad_norm": 2.93265438079834, "learning_rate": 1.088897991264771e-05, "loss": 0.2727, "step": 772600 }, { "epoch": 10.645890165605797, "grad_norm": 5.983810901641846, "learning_rate": 1.0882592662291271e-05, "loss": 0.2436, "step": 772700 }, { "epoch": 10.647267917665538, "grad_norm": 1.459702730178833, "learning_rate": 1.0876206823668598e-05, "loss": 0.2735, "step": 772800 }, { "epoch": 10.648645669725276, "grad_norm": 2.6378164291381836, "learning_rate": 1.0869822397322213e-05, "loss": 0.2779, "step": 772900 }, { "epoch": 10.650023421785015, "grad_norm": 1.8068956136703491, "learning_rate": 1.0863439383794498e-05, "loss": 0.3083, "step": 773000 }, { "epoch": 10.651401173844755, "grad_norm": 2.5236852169036865, "learning_rate": 1.0857057783627707e-05, "loss": 0.3019, "step": 773100 }, { "epoch": 10.652778925904494, "grad_norm": 3.3121347427368164, "learning_rate": 1.0850677597364e-05, "loss": 0.2628, "step": 773200 }, { "epoch": 10.654156677964234, "grad_norm": 2.2889928817749023, "learning_rate": 1.084429882554542e-05, "loss": 0.2627, "step": 773300 }, { "epoch": 10.655534430023973, "grad_norm": 1.271621823310852, "learning_rate": 1.0837921468713854e-05, "loss": 0.2799, "step": 773400 }, { "epoch": 10.656912182083712, "grad_norm": 0.14077383279800415, "learning_rate": 1.0831545527411113e-05, "loss": 
0.2806, "step": 773500 }, { "epoch": 10.658289934143452, "grad_norm": 2.4750192165374756, "learning_rate": 1.0825171002178857e-05, "loss": 0.3032, "step": 773600 }, { "epoch": 10.659667686203191, "grad_norm": 4.034747123718262, "learning_rate": 1.0818797893558618e-05, "loss": 0.2859, "step": 773700 }, { "epoch": 10.66104543826293, "grad_norm": 4.517995357513428, "learning_rate": 1.081242620209184e-05, "loss": 0.288, "step": 773800 }, { "epoch": 10.66242319032267, "grad_norm": 1.7797316312789917, "learning_rate": 1.0806055928319841e-05, "loss": 0.2438, "step": 773900 }, { "epoch": 10.663800942382409, "grad_norm": 4.498326301574707, "learning_rate": 1.0799687072783785e-05, "loss": 0.2748, "step": 774000 }, { "epoch": 10.665178694442147, "grad_norm": 4.289167404174805, "learning_rate": 1.079331963602476e-05, "loss": 0.3089, "step": 774100 }, { "epoch": 10.666556446501888, "grad_norm": 1.539554476737976, "learning_rate": 1.078695361858369e-05, "loss": 0.2936, "step": 774200 }, { "epoch": 10.667934198561627, "grad_norm": 4.27036714553833, "learning_rate": 1.0780589021001428e-05, "loss": 0.2893, "step": 774300 }, { "epoch": 10.669311950621367, "grad_norm": 3.4730494022369385, "learning_rate": 1.0774225843818655e-05, "loss": 0.2613, "step": 774400 }, { "epoch": 10.670689702681106, "grad_norm": 3.011395215988159, "learning_rate": 1.0767864087575963e-05, "loss": 0.2605, "step": 774500 }, { "epoch": 10.672067454740844, "grad_norm": 1.6344833374023438, "learning_rate": 1.0761503752813835e-05, "loss": 0.2884, "step": 774600 }, { "epoch": 10.673445206800585, "grad_norm": 1.7804741859436035, "learning_rate": 1.0755144840072583e-05, "loss": 0.2662, "step": 774700 }, { "epoch": 10.674822958860323, "grad_norm": 5.178990364074707, "learning_rate": 1.0748787349892438e-05, "loss": 0.2516, "step": 774800 }, { "epoch": 10.676200710920062, "grad_norm": 1.7916109561920166, "learning_rate": 1.0742431282813519e-05, "loss": 0.2946, "step": 774900 }, { "epoch": 10.677578462979803, 
"grad_norm": 2.858278512954712, "learning_rate": 1.0736076639375783e-05, "loss": 0.2635, "step": 775000 }, { "epoch": 10.678956215039541, "grad_norm": 0.6872262358665466, "learning_rate": 1.0729723420119097e-05, "loss": 0.2692, "step": 775100 }, { "epoch": 10.680333967099282, "grad_norm": 7.852147102355957, "learning_rate": 1.0723435136474438e-05, "loss": 0.2918, "step": 775200 }, { "epoch": 10.68171171915902, "grad_norm": 3.1291286945343018, "learning_rate": 1.0717084752943691e-05, "loss": 0.2944, "step": 775300 }, { "epoch": 10.683089471218759, "grad_norm": 0.6081647276878357, "learning_rate": 1.0710735795207444e-05, "loss": 0.2385, "step": 775400 }, { "epoch": 10.6844672232785, "grad_norm": 4.6381988525390625, "learning_rate": 1.0704451732056988e-05, "loss": 0.3411, "step": 775500 }, { "epoch": 10.685844975338238, "grad_norm": 1.4564452171325684, "learning_rate": 1.0698169067379613e-05, "loss": 0.2644, "step": 775600 }, { "epoch": 10.687222727397977, "grad_norm": 4.635112285614014, "learning_rate": 1.0691824361709158e-05, "loss": 0.3095, "step": 775700 }, { "epoch": 10.688600479457717, "grad_norm": 5.736720561981201, "learning_rate": 1.06854810839792e-05, "loss": 0.2858, "step": 775800 }, { "epoch": 10.689978231517456, "grad_norm": 1.1003854274749756, "learning_rate": 1.0679139234728655e-05, "loss": 0.2512, "step": 775900 }, { "epoch": 10.691355983577196, "grad_norm": 0.9752110838890076, "learning_rate": 1.067279881449628e-05, "loss": 0.273, "step": 776000 }, { "epoch": 10.692733735636935, "grad_norm": 9.139801025390625, "learning_rate": 1.0666459823820744e-05, "loss": 0.2612, "step": 776100 }, { "epoch": 10.694111487696674, "grad_norm": 4.557865619659424, "learning_rate": 1.0660122263240564e-05, "loss": 0.278, "step": 776200 }, { "epoch": 10.695489239756414, "grad_norm": 1.6658694744110107, "learning_rate": 1.0653786133294139e-05, "loss": 0.2566, "step": 776300 }, { "epoch": 10.696866991816153, "grad_norm": 5.797301292419434, "learning_rate": 
1.0647451434519769e-05, "loss": 0.242, "step": 776400 }, { "epoch": 10.698244743875891, "grad_norm": 1.4015822410583496, "learning_rate": 1.0641118167455627e-05, "loss": 0.2775, "step": 776500 }, { "epoch": 10.699622495935632, "grad_norm": 0.024202750995755196, "learning_rate": 1.0634786332639732e-05, "loss": 0.2327, "step": 776600 }, { "epoch": 10.70100024799537, "grad_norm": 8.05154800415039, "learning_rate": 1.0628455930610022e-05, "loss": 0.299, "step": 776700 }, { "epoch": 10.70237800005511, "grad_norm": 12.252554893493652, "learning_rate": 1.0622126961904307e-05, "loss": 0.2668, "step": 776800 }, { "epoch": 10.70375575211485, "grad_norm": 3.6365504264831543, "learning_rate": 1.061579942706025e-05, "loss": 0.3129, "step": 776900 }, { "epoch": 10.705133504174588, "grad_norm": 2.3534231185913086, "learning_rate": 1.0609473326615403e-05, "loss": 0.2734, "step": 777000 }, { "epoch": 10.706511256234329, "grad_norm": 6.118051528930664, "learning_rate": 1.0603148661107218e-05, "loss": 0.311, "step": 777100 }, { "epoch": 10.707889008294067, "grad_norm": 16.952945709228516, "learning_rate": 1.0596825431072987e-05, "loss": 0.2963, "step": 777200 }, { "epoch": 10.709266760353806, "grad_norm": 1.9015147686004639, "learning_rate": 1.0590503637049922e-05, "loss": 0.2367, "step": 777300 }, { "epoch": 10.710644512413547, "grad_norm": 2.550863265991211, "learning_rate": 1.058418327957507e-05, "loss": 0.3152, "step": 777400 }, { "epoch": 10.712022264473285, "grad_norm": 1.7357666492462158, "learning_rate": 1.0577864359185401e-05, "loss": 0.2795, "step": 777500 }, { "epoch": 10.713400016533026, "grad_norm": 3.509672164916992, "learning_rate": 1.0571546876417719e-05, "loss": 0.297, "step": 777600 }, { "epoch": 10.714777768592764, "grad_norm": 4.266869068145752, "learning_rate": 1.0565230831808733e-05, "loss": 0.2736, "step": 777700 }, { "epoch": 10.716155520652503, "grad_norm": 3.8890371322631836, "learning_rate": 1.0558916225895037e-05, "loss": 0.3083, "step": 777800 }, { 
"epoch": 10.717533272712243, "grad_norm": 2.2083747386932373, "learning_rate": 1.0552603059213076e-05, "loss": 0.273, "step": 777900 }, { "epoch": 10.718911024771982, "grad_norm": 3.997001886367798, "learning_rate": 1.0546291332299178e-05, "loss": 0.2594, "step": 778000 }, { "epoch": 10.72028877683172, "grad_norm": 1.5380661487579346, "learning_rate": 1.0539981045689576e-05, "loss": 0.313, "step": 778100 }, { "epoch": 10.721666528891461, "grad_norm": 3.687814235687256, "learning_rate": 1.0533672199920342e-05, "loss": 0.263, "step": 778200 }, { "epoch": 10.7230442809512, "grad_norm": 3.641199827194214, "learning_rate": 1.0527364795527454e-05, "loss": 0.2477, "step": 778300 }, { "epoch": 10.724422033010939, "grad_norm": 3.2585480213165283, "learning_rate": 1.0521058833046766e-05, "loss": 0.2532, "step": 778400 }, { "epoch": 10.725799785070679, "grad_norm": 3.120032787322998, "learning_rate": 1.0514754313013996e-05, "loss": 0.2623, "step": 778500 }, { "epoch": 10.727177537130418, "grad_norm": 2.5921339988708496, "learning_rate": 1.0508451235964731e-05, "loss": 0.2749, "step": 778600 }, { "epoch": 10.728555289190158, "grad_norm": 5.440725326538086, "learning_rate": 1.0502149602434463e-05, "loss": 0.2984, "step": 778700 }, { "epoch": 10.729933041249897, "grad_norm": 3.4707186222076416, "learning_rate": 1.0495849412958556e-05, "loss": 0.2831, "step": 778800 }, { "epoch": 10.731310793309635, "grad_norm": 0.2926226556301117, "learning_rate": 1.0489550668072223e-05, "loss": 0.287, "step": 778900 }, { "epoch": 10.732688545369376, "grad_norm": 6.388592720031738, "learning_rate": 1.0483253368310598e-05, "loss": 0.3076, "step": 779000 }, { "epoch": 10.734066297429115, "grad_norm": 1.4608204364776611, "learning_rate": 1.0476957514208656e-05, "loss": 0.2912, "step": 779100 }, { "epoch": 10.735444049488853, "grad_norm": 12.02379322052002, "learning_rate": 1.047066310630125e-05, "loss": 0.2722, "step": 779200 }, { "epoch": 10.736821801548594, "grad_norm": 9.220056533813477, 
"learning_rate": 1.0464370145123138e-05, "loss": 0.231, "step": 779300 }, { "epoch": 10.738199553608332, "grad_norm": 2.7719051837921143, "learning_rate": 1.0458078631208944e-05, "loss": 0.2743, "step": 779400 }, { "epoch": 10.739577305668073, "grad_norm": 2.0675292015075684, "learning_rate": 1.0451788565093148e-05, "loss": 0.2602, "step": 779500 }, { "epoch": 10.740955057727811, "grad_norm": 1.8196871280670166, "learning_rate": 1.0445499947310138e-05, "loss": 0.3196, "step": 779600 }, { "epoch": 10.74233280978755, "grad_norm": 1.4246639013290405, "learning_rate": 1.0439212778394152e-05, "loss": 0.2636, "step": 779700 }, { "epoch": 10.74371056184729, "grad_norm": 0.8489853143692017, "learning_rate": 1.0432927058879329e-05, "loss": 0.2927, "step": 779800 }, { "epoch": 10.74508831390703, "grad_norm": 4.6068243980407715, "learning_rate": 1.0426642789299657e-05, "loss": 0.2591, "step": 779900 }, { "epoch": 10.746466065966768, "grad_norm": 2.7164809703826904, "learning_rate": 1.042035997018903e-05, "loss": 0.317, "step": 780000 }, { "epoch": 10.747843818026508, "grad_norm": 0.564778208732605, "learning_rate": 1.041407860208122e-05, "loss": 0.2914, "step": 780100 }, { "epoch": 10.749221570086247, "grad_norm": 2.1687119007110596, "learning_rate": 1.0407798685509824e-05, "loss": 0.2432, "step": 780200 }, { "epoch": 10.750599322145987, "grad_norm": 3.0310451984405518, "learning_rate": 1.0401520221008372e-05, "loss": 0.3147, "step": 780300 }, { "epoch": 10.751977074205726, "grad_norm": 1.4888153076171875, "learning_rate": 1.0395243209110265e-05, "loss": 0.2278, "step": 780400 }, { "epoch": 10.753354826265465, "grad_norm": 2.801438570022583, "learning_rate": 1.0388967650348742e-05, "loss": 0.2947, "step": 780500 }, { "epoch": 10.754732578325205, "grad_norm": 5.014444828033447, "learning_rate": 1.0382693545256959e-05, "loss": 0.3015, "step": 780600 }, { "epoch": 10.756110330384944, "grad_norm": 2.237200975418091, "learning_rate": 1.037642089436794e-05, "loss": 0.2948, "step": 
780700 }, { "epoch": 10.757488082444683, "grad_norm": 1.9687060117721558, "learning_rate": 1.0370212402973403e-05, "loss": 0.1908, "step": 780800 }, { "epoch": 10.758865834504423, "grad_norm": 0.17617133259773254, "learning_rate": 1.0363942647533131e-05, "loss": 0.2616, "step": 780900 }, { "epoch": 10.760243586564162, "grad_norm": 20.812875747680664, "learning_rate": 1.0357674347888586e-05, "loss": 0.2735, "step": 781000 }, { "epoch": 10.7616213386239, "grad_norm": 3.0998921394348145, "learning_rate": 1.0351470165794902e-05, "loss": 0.2434, "step": 781100 }, { "epoch": 10.76299909068364, "grad_norm": 2.067453145980835, "learning_rate": 1.0345204764768053e-05, "loss": 0.3096, "step": 781200 }, { "epoch": 10.76437684274338, "grad_norm": 2.0949366092681885, "learning_rate": 1.0338940821128823e-05, "loss": 0.3119, "step": 781300 }, { "epoch": 10.76575459480312, "grad_norm": 2.4511618614196777, "learning_rate": 1.0332678335409358e-05, "loss": 0.2608, "step": 781400 }, { "epoch": 10.767132346862859, "grad_norm": 4.615516185760498, "learning_rate": 1.0326417308141697e-05, "loss": 0.2034, "step": 781500 }, { "epoch": 10.768510098922597, "grad_norm": 2.0676310062408447, "learning_rate": 1.0320157739857754e-05, "loss": 0.2587, "step": 781600 }, { "epoch": 10.769887850982338, "grad_norm": 1.017842173576355, "learning_rate": 1.0313899631089299e-05, "loss": 0.2753, "step": 781700 }, { "epoch": 10.771265603042076, "grad_norm": 2.830345630645752, "learning_rate": 1.0307642982367988e-05, "loss": 0.2845, "step": 781800 }, { "epoch": 10.772643355101817, "grad_norm": 3.0426549911499023, "learning_rate": 1.0301387794225368e-05, "loss": 0.2454, "step": 781900 }, { "epoch": 10.774021107161555, "grad_norm": 1.7366609573364258, "learning_rate": 1.0295134067192834e-05, "loss": 0.2683, "step": 782000 }, { "epoch": 10.775398859221294, "grad_norm": 3.190941333770752, "learning_rate": 1.0288881801801681e-05, "loss": 0.288, "step": 782100 }, { "epoch": 10.776776611281035, "grad_norm": 
5.287817001342773, "learning_rate": 1.0282630998583082e-05, "loss": 0.3225, "step": 782200 }, { "epoch": 10.778154363340773, "grad_norm": 0.6237702369689941, "learning_rate": 1.027638165806806e-05, "loss": 0.2933, "step": 782300 }, { "epoch": 10.779532115400512, "grad_norm": 1.593192219734192, "learning_rate": 1.0270133780787522e-05, "loss": 0.2773, "step": 782400 }, { "epoch": 10.780909867460252, "grad_norm": 6.648743152618408, "learning_rate": 1.0263887367272266e-05, "loss": 0.3082, "step": 782500 }, { "epoch": 10.782287619519991, "grad_norm": 2.956794500350952, "learning_rate": 1.0257642418052969e-05, "loss": 0.2647, "step": 782600 }, { "epoch": 10.78366537157973, "grad_norm": 3.487407684326172, "learning_rate": 1.025139893366015e-05, "loss": 0.2942, "step": 782700 }, { "epoch": 10.78504312363947, "grad_norm": 8.754105567932129, "learning_rate": 1.0245156914624236e-05, "loss": 0.3385, "step": 782800 }, { "epoch": 10.786420875699209, "grad_norm": 0.9119150638580322, "learning_rate": 1.0238916361475533e-05, "loss": 0.2473, "step": 782900 }, { "epoch": 10.78779862775895, "grad_norm": 5.8906569480896, "learning_rate": 1.0232677274744178e-05, "loss": 0.2508, "step": 783000 }, { "epoch": 10.789176379818688, "grad_norm": 4.298729419708252, "learning_rate": 1.0226439654960225e-05, "loss": 0.264, "step": 783100 }, { "epoch": 10.790554131878427, "grad_norm": 2.2790465354919434, "learning_rate": 1.0220203502653602e-05, "loss": 0.2989, "step": 783200 }, { "epoch": 10.791931883938167, "grad_norm": 5.4851861000061035, "learning_rate": 1.0213968818354087e-05, "loss": 0.2796, "step": 783300 }, { "epoch": 10.793309635997906, "grad_norm": 1.391113519668579, "learning_rate": 1.0207735602591366e-05, "loss": 0.2437, "step": 783400 }, { "epoch": 10.794687388057644, "grad_norm": 6.2586870193481445, "learning_rate": 1.0201503855894959e-05, "loss": 0.2422, "step": 783500 }, { "epoch": 10.796065140117385, "grad_norm": 2.0227832794189453, "learning_rate": 1.0195273578794311e-05, "loss": 
0.286, "step": 783600 }, { "epoch": 10.797442892177123, "grad_norm": 1.4082139730453491, "learning_rate": 1.0189044771818692e-05, "loss": 0.2485, "step": 783700 }, { "epoch": 10.798820644236864, "grad_norm": 2.4410436153411865, "learning_rate": 1.018281743549728e-05, "loss": 0.2863, "step": 783800 }, { "epoch": 10.800198396296603, "grad_norm": 4.288768291473389, "learning_rate": 1.0176591570359134e-05, "loss": 0.2856, "step": 783900 }, { "epoch": 10.801576148356341, "grad_norm": 3.8041419982910156, "learning_rate": 1.0170367176933156e-05, "loss": 0.3127, "step": 784000 }, { "epoch": 10.802953900416082, "grad_norm": 6.286481857299805, "learning_rate": 1.016414425574814e-05, "loss": 0.2277, "step": 784100 }, { "epoch": 10.80433165247582, "grad_norm": 2.96346116065979, "learning_rate": 1.0157922807332765e-05, "loss": 0.286, "step": 784200 }, { "epoch": 10.805709404535559, "grad_norm": 12.562845230102539, "learning_rate": 1.015170283221556e-05, "loss": 0.3062, "step": 784300 }, { "epoch": 10.8070871565953, "grad_norm": 1.5047742128372192, "learning_rate": 1.0145484330924953e-05, "loss": 0.2456, "step": 784400 }, { "epoch": 10.808464908655038, "grad_norm": 2.896204948425293, "learning_rate": 1.0139267303989247e-05, "loss": 0.2627, "step": 784500 }, { "epoch": 10.809842660714779, "grad_norm": 3.08923602104187, "learning_rate": 1.01330517519366e-05, "loss": 0.2397, "step": 784600 }, { "epoch": 10.811220412774517, "grad_norm": 1.4122380018234253, "learning_rate": 1.0126837675295044e-05, "loss": 0.2702, "step": 784700 }, { "epoch": 10.812598164834256, "grad_norm": 2.1778883934020996, "learning_rate": 1.0120625074592509e-05, "loss": 0.2598, "step": 784800 }, { "epoch": 10.813975916893996, "grad_norm": 2.1174709796905518, "learning_rate": 1.0114413950356797e-05, "loss": 0.2657, "step": 784900 }, { "epoch": 10.815353668953735, "grad_norm": 1.0692293643951416, "learning_rate": 1.0108204303115555e-05, "loss": 0.2591, "step": 785000 }, { "epoch": 10.816731421013474, "grad_norm": 
6.256170272827148, "learning_rate": 1.010199613339634e-05, "loss": 0.2901, "step": 785100 }, { "epoch": 10.818109173073214, "grad_norm": 4.349870681762695, "learning_rate": 1.0095789441726566e-05, "loss": 0.2875, "step": 785200 }, { "epoch": 10.819486925132953, "grad_norm": 2.953732967376709, "learning_rate": 1.0089584228633505e-05, "loss": 0.2792, "step": 785300 }, { "epoch": 10.820864677192692, "grad_norm": 0.0522027425467968, "learning_rate": 1.0083380494644337e-05, "loss": 0.2931, "step": 785400 }, { "epoch": 10.822242429252432, "grad_norm": 2.148622989654541, "learning_rate": 1.0077178240286104e-05, "loss": 0.3317, "step": 785500 }, { "epoch": 10.82362018131217, "grad_norm": 4.769834041595459, "learning_rate": 1.0070977466085716e-05, "loss": 0.2721, "step": 785600 }, { "epoch": 10.824997933371911, "grad_norm": 2.5272176265716553, "learning_rate": 1.0064778172569951e-05, "loss": 0.2646, "step": 785700 }, { "epoch": 10.82637568543165, "grad_norm": 2.9587318897247314, "learning_rate": 1.0058580360265478e-05, "loss": 0.2996, "step": 785800 }, { "epoch": 10.827753437491388, "grad_norm": 1.2316334247589111, "learning_rate": 1.0052384029698842e-05, "loss": 0.2998, "step": 785900 }, { "epoch": 10.829131189551129, "grad_norm": 6.3303542137146, "learning_rate": 1.0046189181396433e-05, "loss": 0.2833, "step": 786000 }, { "epoch": 10.830508941610868, "grad_norm": 1.4202358722686768, "learning_rate": 1.0039995815884547e-05, "loss": 0.2599, "step": 786100 }, { "epoch": 10.831886693670608, "grad_norm": 6.6583147048950195, "learning_rate": 1.0033803933689363e-05, "loss": 0.2541, "step": 786200 }, { "epoch": 10.833264445730347, "grad_norm": 1.2129735946655273, "learning_rate": 1.0027613535336868e-05, "loss": 0.2253, "step": 786300 }, { "epoch": 10.834642197790085, "grad_norm": 2.8143773078918457, "learning_rate": 1.0021424621352993e-05, "loss": 0.2644, "step": 786400 }, { "epoch": 10.836019949849826, "grad_norm": 2.1632111072540283, "learning_rate": 1.0015237192263528e-05, 
"loss": 0.296, "step": 786500 }, { "epoch": 10.837397701909564, "grad_norm": 1.8440625667572021, "learning_rate": 1.0009051248594102e-05, "loss": 0.2441, "step": 786600 }, { "epoch": 10.838775453969303, "grad_norm": 4.708512306213379, "learning_rate": 1.0002866790870267e-05, "loss": 0.2372, "step": 786700 }, { "epoch": 10.840153206029044, "grad_norm": 1.7876691818237305, "learning_rate": 9.996683819617402e-06, "loss": 0.2759, "step": 786800 }, { "epoch": 10.841530958088782, "grad_norm": 3.8953144550323486, "learning_rate": 9.990564142841016e-06, "loss": 0.2666, "step": 786900 }, { "epoch": 10.84290871014852, "grad_norm": 30.061588287353516, "learning_rate": 9.984384131227999e-06, "loss": 0.2214, "step": 787000 }, { "epoch": 10.844286462208261, "grad_norm": 0.29400742053985596, "learning_rate": 9.978205607656158e-06, "loss": 0.2554, "step": 787100 }, { "epoch": 10.845664214268, "grad_norm": 1.9202631711959839, "learning_rate": 9.972028572650404e-06, "loss": 0.3088, "step": 787200 }, { "epoch": 10.84704196632774, "grad_norm": 4.585844993591309, "learning_rate": 9.965853026735492e-06, "loss": 0.3102, "step": 787300 }, { "epoch": 10.848419718387479, "grad_norm": 2.103652238845825, "learning_rate": 9.959678970436065e-06, "loss": 0.262, "step": 787400 }, { "epoch": 10.849797470447218, "grad_norm": 3.472585916519165, "learning_rate": 9.953506404276639e-06, "loss": 0.2783, "step": 787500 }, { "epoch": 10.851175222506958, "grad_norm": 3.5521347522735596, "learning_rate": 9.947335328781625e-06, "loss": 0.2862, "step": 787600 }, { "epoch": 10.852552974566697, "grad_norm": 4.6136250495910645, "learning_rate": 9.941165744475263e-06, "loss": 0.2973, "step": 787700 }, { "epoch": 10.853930726626436, "grad_norm": 2.368006944656372, "learning_rate": 9.934997651881709e-06, "loss": 0.3062, "step": 787800 }, { "epoch": 10.855308478686176, "grad_norm": 3.4752609729766846, "learning_rate": 9.928831051524967e-06, "loss": 0.2935, "step": 787900 }, { "epoch": 10.856686230745915, 
"grad_norm": 3.9899001121520996, "learning_rate": 9.922665943928915e-06, "loss": 0.2509, "step": 788000 }, { "epoch": 10.858063982805655, "grad_norm": 5.404050350189209, "learning_rate": 9.916502329617313e-06, "loss": 0.2766, "step": 788100 }, { "epoch": 10.859441734865394, "grad_norm": 2.512906789779663, "learning_rate": 9.910340209113805e-06, "loss": 0.2563, "step": 788200 }, { "epoch": 10.860819486925132, "grad_norm": 2.6941845417022705, "learning_rate": 9.904179582941874e-06, "loss": 0.2486, "step": 788300 }, { "epoch": 10.862197238984873, "grad_norm": 1.6795685291290283, "learning_rate": 9.898020451624915e-06, "loss": 0.3124, "step": 788400 }, { "epoch": 10.863574991044612, "grad_norm": 0.1793992817401886, "learning_rate": 9.891862815686164e-06, "loss": 0.3055, "step": 788500 }, { "epoch": 10.86495274310435, "grad_norm": 3.1668787002563477, "learning_rate": 9.885706675648757e-06, "loss": 0.2696, "step": 788600 }, { "epoch": 10.86633049516409, "grad_norm": 3.6055996417999268, "learning_rate": 9.87955203203567e-06, "loss": 0.2773, "step": 788700 }, { "epoch": 10.86770824722383, "grad_norm": 2.612092971801758, "learning_rate": 9.873398885369788e-06, "loss": 0.2717, "step": 788800 }, { "epoch": 10.86908599928357, "grad_norm": 2.8825790882110596, "learning_rate": 9.867247236173865e-06, "loss": 0.2737, "step": 788900 }, { "epoch": 10.870463751343308, "grad_norm": 1.5203964710235596, "learning_rate": 9.861097084970477e-06, "loss": 0.3036, "step": 789000 }, { "epoch": 10.871841503403047, "grad_norm": 2.303419351577759, "learning_rate": 9.854948432282134e-06, "loss": 0.2877, "step": 789100 }, { "epoch": 10.873219255462788, "grad_norm": 2.1686339378356934, "learning_rate": 9.848801278631202e-06, "loss": 0.3271, "step": 789200 }, { "epoch": 10.874597007522526, "grad_norm": 3.583930730819702, "learning_rate": 9.842655624539894e-06, "loss": 0.3394, "step": 789300 }, { "epoch": 10.875974759582265, "grad_norm": 5.633982181549072, "learning_rate": 9.836511470530327e-06, 
"loss": 0.3057, "step": 789400 }, { "epoch": 10.877352511642005, "grad_norm": 1.0355987548828125, "learning_rate": 9.830368817124482e-06, "loss": 0.2814, "step": 789500 }, { "epoch": 10.878730263701744, "grad_norm": 4.735191822052002, "learning_rate": 9.82428906893472e-06, "loss": 0.2501, "step": 789600 }, { "epoch": 10.880108015761483, "grad_norm": 3.952105760574341, "learning_rate": 9.818149403282677e-06, "loss": 0.2682, "step": 789700 }, { "epoch": 10.881485767821223, "grad_norm": 3.350972890853882, "learning_rate": 9.812011239794292e-06, "loss": 0.2955, "step": 789800 }, { "epoch": 10.882863519880962, "grad_norm": 2.655555248260498, "learning_rate": 9.805874578991054e-06, "loss": 0.2557, "step": 789900 }, { "epoch": 10.884241271940702, "grad_norm": 1.6831485033035278, "learning_rate": 9.79973942139429e-06, "loss": 0.2866, "step": 790000 }, { "epoch": 10.885619024000441, "grad_norm": 0.10608773678541183, "learning_rate": 9.793605767525213e-06, "loss": 0.2722, "step": 790100 }, { "epoch": 10.88699677606018, "grad_norm": 0.6877732872962952, "learning_rate": 9.787473617904908e-06, "loss": 0.3316, "step": 790200 }, { "epoch": 10.88837452811992, "grad_norm": 28.814584732055664, "learning_rate": 9.781342973054344e-06, "loss": 0.2958, "step": 790300 }, { "epoch": 10.889752280179659, "grad_norm": 1.559552550315857, "learning_rate": 9.775213833494333e-06, "loss": 0.2552, "step": 790400 }, { "epoch": 10.8911300322394, "grad_norm": 1.3686186075210571, "learning_rate": 9.769086199745588e-06, "loss": 0.3365, "step": 790500 }, { "epoch": 10.892507784299138, "grad_norm": 2.9944419860839844, "learning_rate": 9.762960072328687e-06, "loss": 0.2485, "step": 790600 }, { "epoch": 10.893885536358876, "grad_norm": 1.8911799192428589, "learning_rate": 9.756835451764074e-06, "loss": 0.2793, "step": 790700 }, { "epoch": 10.895263288418617, "grad_norm": 3.6825525760650635, "learning_rate": 9.750712338572053e-06, "loss": 0.3408, "step": 790800 }, { "epoch": 10.896641040478356, "grad_norm": 
2.4583678245544434, "learning_rate": 9.744590733272832e-06, "loss": 0.2775, "step": 790900 }, { "epoch": 10.898018792538094, "grad_norm": 5.033134937286377, "learning_rate": 9.738470636386454e-06, "loss": 0.2926, "step": 791000 }, { "epoch": 10.899396544597835, "grad_norm": 3.261211633682251, "learning_rate": 9.732352048432874e-06, "loss": 0.3007, "step": 791100 }, { "epoch": 10.900774296657573, "grad_norm": 1.8147833347320557, "learning_rate": 9.726234969931876e-06, "loss": 0.2786, "step": 791200 }, { "epoch": 10.902152048717312, "grad_norm": 2.4565181732177734, "learning_rate": 9.720119401403155e-06, "loss": 0.2951, "step": 791300 }, { "epoch": 10.903529800777052, "grad_norm": 0.3112247884273529, "learning_rate": 9.714005343366244e-06, "loss": 0.2408, "step": 791400 }, { "epoch": 10.904907552836791, "grad_norm": 4.672830581665039, "learning_rate": 9.707892796340574e-06, "loss": 0.2718, "step": 791500 }, { "epoch": 10.906285304896532, "grad_norm": 4.682251930236816, "learning_rate": 9.70178176084544e-06, "loss": 0.2391, "step": 791600 }, { "epoch": 10.90766305695627, "grad_norm": 1.8366261720657349, "learning_rate": 9.695672237400002e-06, "loss": 0.2814, "step": 791700 }, { "epoch": 10.909040809016009, "grad_norm": 1.409472942352295, "learning_rate": 9.689564226523286e-06, "loss": 0.2598, "step": 791800 }, { "epoch": 10.91041856107575, "grad_norm": 2.2234275341033936, "learning_rate": 9.683457728734215e-06, "loss": 0.2685, "step": 791900 }, { "epoch": 10.911796313135488, "grad_norm": 0.5442966222763062, "learning_rate": 9.67735274455155e-06, "loss": 0.2757, "step": 792000 }, { "epoch": 10.913174065195227, "grad_norm": 4.018232822418213, "learning_rate": 9.671249274493948e-06, "loss": 0.3243, "step": 792100 }, { "epoch": 10.914551817254967, "grad_norm": 0.061273328959941864, "learning_rate": 9.665147319079942e-06, "loss": 0.3161, "step": 792200 }, { "epoch": 10.915929569314706, "grad_norm": 2.9252161979675293, "learning_rate": 9.659046878827912e-06, "loss": 0.2236, 
"step": 792300 }, { "epoch": 10.917307321374446, "grad_norm": 1.966133952140808, "learning_rate": 9.652947954256116e-06, "loss": 0.2435, "step": 792400 }, { "epoch": 10.918685073434185, "grad_norm": 0.2597993314266205, "learning_rate": 9.646850545882694e-06, "loss": 0.2444, "step": 792500 }, { "epoch": 10.920062825493924, "grad_norm": 3.497936487197876, "learning_rate": 9.640754654225667e-06, "loss": 0.2782, "step": 792600 }, { "epoch": 10.921440577553664, "grad_norm": 0.7135214805603027, "learning_rate": 9.634660279802889e-06, "loss": 0.3057, "step": 792700 }, { "epoch": 10.922818329613403, "grad_norm": 2.791632890701294, "learning_rate": 9.628567423132123e-06, "loss": 0.2883, "step": 792800 }, { "epoch": 10.924196081673141, "grad_norm": 0.2594141364097595, "learning_rate": 9.622536990597859e-06, "loss": 0.3237, "step": 792900 }, { "epoch": 10.925573833732882, "grad_norm": 0.4906339943408966, "learning_rate": 9.61644715579341e-06, "loss": 0.274, "step": 793000 }, { "epoch": 10.92695158579262, "grad_norm": 5.79037618637085, "learning_rate": 9.610358840288256e-06, "loss": 0.3077, "step": 793100 }, { "epoch": 10.928329337852361, "grad_norm": 1.7678356170654297, "learning_rate": 9.604272044599641e-06, "loss": 0.2919, "step": 793200 }, { "epoch": 10.9297070899121, "grad_norm": 1.488146185874939, "learning_rate": 9.598186769244681e-06, "loss": 0.2493, "step": 793300 }, { "epoch": 10.931084841971838, "grad_norm": 3.3995895385742188, "learning_rate": 9.592163844755469e-06, "loss": 0.289, "step": 793400 }, { "epoch": 10.932462594031579, "grad_norm": 2.4592740535736084, "learning_rate": 9.586081596402357e-06, "loss": 0.2593, "step": 793500 }, { "epoch": 10.933840346091317, "grad_norm": 2.7111661434173584, "learning_rate": 9.580000869928254e-06, "loss": 0.2668, "step": 793600 }, { "epoch": 10.935218098151056, "grad_norm": 0.9519830942153931, "learning_rate": 9.573921665849765e-06, "loss": 0.2518, "step": 793700 }, { "epoch": 10.936595850210796, "grad_norm": 
1.3351515531539917, "learning_rate": 9.567843984683329e-06, "loss": 0.2927, "step": 793800 }, { "epoch": 10.937973602270535, "grad_norm": 1.2917457818984985, "learning_rate": 9.561767826945295e-06, "loss": 0.3068, "step": 793900 }, { "epoch": 10.939351354330274, "grad_norm": 5.182137489318848, "learning_rate": 9.555693193151848e-06, "loss": 0.2685, "step": 794000 }, { "epoch": 10.940729106390014, "grad_norm": 2.707472324371338, "learning_rate": 9.549620083819076e-06, "loss": 0.3139, "step": 794100 }, { "epoch": 10.942106858449753, "grad_norm": 2.3209433555603027, "learning_rate": 9.5435484994629e-06, "loss": 0.3056, "step": 794200 }, { "epoch": 10.943484610509493, "grad_norm": 1.4986072778701782, "learning_rate": 9.537478440599146e-06, "loss": 0.2547, "step": 794300 }, { "epoch": 10.944862362569232, "grad_norm": 1.9973628520965576, "learning_rate": 9.531409907743513e-06, "loss": 0.252, "step": 794400 }, { "epoch": 10.94624011462897, "grad_norm": 2.1416170597076416, "learning_rate": 9.525342901411515e-06, "loss": 0.2691, "step": 794500 }, { "epoch": 10.947617866688711, "grad_norm": 1.4314957857131958, "learning_rate": 9.519277422118596e-06, "loss": 0.264, "step": 794600 }, { "epoch": 10.94899561874845, "grad_norm": 2.565685749053955, "learning_rate": 9.513213470380055e-06, "loss": 0.3419, "step": 794700 }, { "epoch": 10.95037337080819, "grad_norm": 0.5604676008224487, "learning_rate": 9.507151046711043e-06, "loss": 0.2747, "step": 794800 }, { "epoch": 10.951751122867929, "grad_norm": 1.9582836627960205, "learning_rate": 9.501090151626603e-06, "loss": 0.2715, "step": 794900 }, { "epoch": 10.953128874927668, "grad_norm": 1.1105514764785767, "learning_rate": 9.495030785641641e-06, "loss": 0.2651, "step": 795000 }, { "epoch": 10.954506626987408, "grad_norm": 1.4833425283432007, "learning_rate": 9.488972949270929e-06, "loss": 0.2687, "step": 795100 }, { "epoch": 10.955884379047147, "grad_norm": 2.4590578079223633, "learning_rate": 9.4829166430291e-06, "loss": 0.2663, 
"step": 795200 }, { "epoch": 10.957262131106885, "grad_norm": 1.5963034629821777, "learning_rate": 9.476861867430675e-06, "loss": 0.2504, "step": 795300 }, { "epoch": 10.958639883166626, "grad_norm": 2.6099188327789307, "learning_rate": 9.470808622990049e-06, "loss": 0.2456, "step": 795400 }, { "epoch": 10.960017635226365, "grad_norm": 3.3358867168426514, "learning_rate": 9.464756910221469e-06, "loss": 0.3019, "step": 795500 }, { "epoch": 10.961395387286103, "grad_norm": 1.5220539569854736, "learning_rate": 9.458706729639049e-06, "loss": 0.286, "step": 795600 }, { "epoch": 10.962773139345844, "grad_norm": 2.2715752124786377, "learning_rate": 9.452658081756798e-06, "loss": 0.2799, "step": 795700 }, { "epoch": 10.964150891405582, "grad_norm": 0.5971372127532959, "learning_rate": 9.446610967088564e-06, "loss": 0.2653, "step": 795800 }, { "epoch": 10.965528643465323, "grad_norm": 2.510394334793091, "learning_rate": 9.440565386148089e-06, "loss": 0.3195, "step": 795900 }, { "epoch": 10.966906395525061, "grad_norm": 1.0859014987945557, "learning_rate": 9.434521339448982e-06, "loss": 0.2384, "step": 796000 }, { "epoch": 10.9682841475848, "grad_norm": 2.482124090194702, "learning_rate": 9.428478827504712e-06, "loss": 0.2625, "step": 796100 }, { "epoch": 10.96966189964454, "grad_norm": 1.537619948387146, "learning_rate": 9.422437850828611e-06, "loss": 0.255, "step": 796200 }, { "epoch": 10.97103965170428, "grad_norm": 3.8805816173553467, "learning_rate": 9.4163984099339e-06, "loss": 0.3077, "step": 796300 }, { "epoch": 10.972417403764018, "grad_norm": 2.1127543449401855, "learning_rate": 9.410360505333664e-06, "loss": 0.2404, "step": 796400 }, { "epoch": 10.973795155823758, "grad_norm": 1.609237790107727, "learning_rate": 9.404324137540845e-06, "loss": 0.3018, "step": 796500 }, { "epoch": 10.975172907883497, "grad_norm": 1.9624664783477783, "learning_rate": 9.39828930706826e-06, "loss": 0.2487, "step": 796600 }, { "epoch": 10.976550659943237, "grad_norm": 4.536030292510986, 
"learning_rate": 9.392256014428631e-06, "loss": 0.2436, "step": 796700 }, { "epoch": 10.977928412002976, "grad_norm": 3.4404239654541016, "learning_rate": 9.386284570060922e-06, "loss": 0.2802, "step": 796800 }, { "epoch": 10.979306164062715, "grad_norm": 2.6929402351379395, "learning_rate": 9.380254339233568e-06, "loss": 0.2337, "step": 796900 }, { "epoch": 10.980683916122455, "grad_norm": 1.7988147735595703, "learning_rate": 9.374225647771303e-06, "loss": 0.2414, "step": 797000 }, { "epoch": 10.982061668182194, "grad_norm": 2.566760301589966, "learning_rate": 9.368198496186316e-06, "loss": 0.2462, "step": 797100 }, { "epoch": 10.983439420241933, "grad_norm": 2.2484383583068848, "learning_rate": 9.362172884990623e-06, "loss": 0.2922, "step": 797200 }, { "epoch": 10.984817172301673, "grad_norm": 5.80052375793457, "learning_rate": 9.35614881469613e-06, "loss": 0.2992, "step": 797300 }, { "epoch": 10.986194924361412, "grad_norm": 2.1852049827575684, "learning_rate": 9.350126285814615e-06, "loss": 0.2221, "step": 797400 }, { "epoch": 10.987572676421152, "grad_norm": 0.43161359429359436, "learning_rate": 9.344105298857736e-06, "loss": 0.2653, "step": 797500 }, { "epoch": 10.98895042848089, "grad_norm": 5.528257369995117, "learning_rate": 9.338085854336989e-06, "loss": 0.2699, "step": 797600 }, { "epoch": 10.99032818054063, "grad_norm": 0.2190031260251999, "learning_rate": 9.332067952763775e-06, "loss": 0.222, "step": 797700 }, { "epoch": 10.99170593260037, "grad_norm": 2.633159637451172, "learning_rate": 9.326051594649331e-06, "loss": 0.3166, "step": 797800 }, { "epoch": 10.993083684660109, "grad_norm": 4.198328018188477, "learning_rate": 9.320036780504775e-06, "loss": 0.2449, "step": 797900 }, { "epoch": 10.994461436719847, "grad_norm": 1.6569182872772217, "learning_rate": 9.314023510841102e-06, "loss": 0.2913, "step": 798000 }, { "epoch": 10.995839188779588, "grad_norm": 5.501071453094482, "learning_rate": 9.308011786169178e-06, "loss": 0.2805, "step": 798100 }, { 
"epoch": 10.997216940839326, "grad_norm": 3.474391222000122, "learning_rate": 9.302001606999717e-06, "loss": 0.2498, "step": 798200 }, { "epoch": 10.998594692899065, "grad_norm": 2.8189375400543213, "learning_rate": 9.295992973843326e-06, "loss": 0.2577, "step": 798300 }, { "epoch": 10.999972444958805, "grad_norm": 9.539854049682617, "learning_rate": 9.289985887210456e-06, "loss": 0.2943, "step": 798400 }, { "epoch": 11.001350197018544, "grad_norm": 0.051772598177194595, "learning_rate": 9.283980347611454e-06, "loss": 0.3072, "step": 798500 }, { "epoch": 11.002727949078285, "grad_norm": 30.267906188964844, "learning_rate": 9.277976355556504e-06, "loss": 0.2881, "step": 798600 }, { "epoch": 11.004105701138023, "grad_norm": 2.5710878372192383, "learning_rate": 9.271973911555697e-06, "loss": 0.2722, "step": 798700 }, { "epoch": 11.005483453197762, "grad_norm": 0.3158681392669678, "learning_rate": 9.26597301611895e-06, "loss": 0.2993, "step": 798800 }, { "epoch": 11.006861205257502, "grad_norm": 2.6168432235717773, "learning_rate": 9.259973669756089e-06, "loss": 0.3254, "step": 798900 }, { "epoch": 11.008238957317241, "grad_norm": 2.0278635025024414, "learning_rate": 9.253975872976772e-06, "loss": 0.2269, "step": 799000 }, { "epoch": 11.00961670937698, "grad_norm": 5.680697441101074, "learning_rate": 9.247979626290558e-06, "loss": 0.248, "step": 799100 }, { "epoch": 11.01099446143672, "grad_norm": 3.3901679515838623, "learning_rate": 9.241984930206845e-06, "loss": 0.2131, "step": 799200 }, { "epoch": 11.012372213496459, "grad_norm": 1.2839778661727905, "learning_rate": 9.235991785234916e-06, "loss": 0.2889, "step": 799300 }, { "epoch": 11.0137499655562, "grad_norm": 1.7533848285675049, "learning_rate": 9.230000191883935e-06, "loss": 0.2235, "step": 799400 }, { "epoch": 11.015127717615938, "grad_norm": 1.2543329000473022, "learning_rate": 9.224010150662905e-06, "loss": 0.243, "step": 799500 }, { "epoch": 11.016505469675677, "grad_norm": 1.1958026885986328, 
"learning_rate": 9.218021662080702e-06, "loss": 0.263, "step": 799600 }, { "epoch": 11.017883221735417, "grad_norm": 1.521730661392212, "learning_rate": 9.212034726646102e-06, "loss": 0.2509, "step": 799700 }, { "epoch": 11.019260973795156, "grad_norm": 3.23846435546875, "learning_rate": 9.2060493448677e-06, "loss": 0.2206, "step": 799800 }, { "epoch": 11.020638725854894, "grad_norm": 2.9404137134552, "learning_rate": 9.200065517253996e-06, "loss": 0.2636, "step": 799900 }, { "epoch": 11.022016477914635, "grad_norm": 2.66621994972229, "learning_rate": 9.19408324431336e-06, "loss": 0.1668, "step": 800000 }, { "epoch": 11.023394229974373, "grad_norm": 2.9230761528015137, "learning_rate": 9.188102526554003e-06, "loss": 0.2811, "step": 800100 }, { "epoch": 11.024771982034114, "grad_norm": 2.5457375049591064, "learning_rate": 9.182123364484014e-06, "loss": 0.2705, "step": 800200 }, { "epoch": 11.026149734093853, "grad_norm": 1.7223994731903076, "learning_rate": 9.176145758611354e-06, "loss": 0.2996, "step": 800300 }, { "epoch": 11.027527486153591, "grad_norm": 2.6430580615997314, "learning_rate": 9.170169709443869e-06, "loss": 0.2233, "step": 800400 }, { "epoch": 11.028905238213332, "grad_norm": 2.9103152751922607, "learning_rate": 9.164195217489239e-06, "loss": 0.2395, "step": 800500 }, { "epoch": 11.03028299027307, "grad_norm": 1.6545778512954712, "learning_rate": 9.158222283255024e-06, "loss": 0.2305, "step": 800600 }, { "epoch": 11.031660742332809, "grad_norm": 1.0206098556518555, "learning_rate": 9.15225090724867e-06, "loss": 0.3106, "step": 800700 }, { "epoch": 11.03303849439255, "grad_norm": 2.5714898109436035, "learning_rate": 9.14628108997746e-06, "loss": 0.3489, "step": 800800 }, { "epoch": 11.034416246452288, "grad_norm": 3.59450626373291, "learning_rate": 9.140312831948563e-06, "loss": 0.2657, "step": 800900 }, { "epoch": 11.035793998512029, "grad_norm": 1.8729352951049805, "learning_rate": 9.134346133669034e-06, "loss": 0.207, "step": 801000 }, { "epoch": 
11.037171750571767, "grad_norm": 6.837676048278809, "learning_rate": 9.128380995645758e-06, "loss": 0.2125, "step": 801100 }, { "epoch": 11.038549502631506, "grad_norm": 11.230830192565918, "learning_rate": 9.122417418385493e-06, "loss": 0.2479, "step": 801200 }, { "epoch": 11.039927254691246, "grad_norm": 3.2756309509277344, "learning_rate": 9.116455402394887e-06, "loss": 0.2557, "step": 801300 }, { "epoch": 11.041305006750985, "grad_norm": 2.231621265411377, "learning_rate": 9.110494948180457e-06, "loss": 0.2321, "step": 801400 }, { "epoch": 11.042682758810724, "grad_norm": 1.2976126670837402, "learning_rate": 9.10453605624855e-06, "loss": 0.2535, "step": 801500 }, { "epoch": 11.044060510870464, "grad_norm": 2.2005112171173096, "learning_rate": 9.098578727105424e-06, "loss": 0.2508, "step": 801600 }, { "epoch": 11.045438262930203, "grad_norm": 1.8822119235992432, "learning_rate": 9.092622961257178e-06, "loss": 0.2127, "step": 801700 }, { "epoch": 11.046816014989943, "grad_norm": 0.2824857234954834, "learning_rate": 9.086668759209774e-06, "loss": 0.2373, "step": 801800 }, { "epoch": 11.048193767049682, "grad_norm": 5.193145275115967, "learning_rate": 9.08071612146906e-06, "loss": 0.2958, "step": 801900 }, { "epoch": 11.04957151910942, "grad_norm": 1.5914549827575684, "learning_rate": 9.074765048540752e-06, "loss": 0.2585, "step": 802000 }, { "epoch": 11.050949271169161, "grad_norm": 3.134385108947754, "learning_rate": 9.068815540930408e-06, "loss": 0.265, "step": 802100 }, { "epoch": 11.0523270232289, "grad_norm": 2.1289002895355225, "learning_rate": 9.062867599143484e-06, "loss": 0.2643, "step": 802200 }, { "epoch": 11.053704775288638, "grad_norm": 4.590718746185303, "learning_rate": 9.056921223685274e-06, "loss": 0.2631, "step": 802300 }, { "epoch": 11.055082527348379, "grad_norm": 1.478545904159546, "learning_rate": 9.050976415060969e-06, "loss": 0.2742, "step": 802400 }, { "epoch": 11.056460279408117, "grad_norm": 1.5437530279159546, "learning_rate": 
9.045033173775595e-06, "loss": 0.2155, "step": 802500 }, { "epoch": 11.057838031467856, "grad_norm": 4.8874101638793945, "learning_rate": 9.039091500334066e-06, "loss": 0.3035, "step": 802600 }, { "epoch": 11.059215783527597, "grad_norm": 4.0117034912109375, "learning_rate": 9.033151395241177e-06, "loss": 0.2683, "step": 802700 }, { "epoch": 11.060593535587335, "grad_norm": 2.4324307441711426, "learning_rate": 9.027212859001535e-06, "loss": 0.2695, "step": 802800 }, { "epoch": 11.061971287647076, "grad_norm": 1.3751418590545654, "learning_rate": 9.021275892119669e-06, "loss": 0.2259, "step": 802900 }, { "epoch": 11.063349039706814, "grad_norm": 4.446757793426514, "learning_rate": 9.015340495099959e-06, "loss": 0.2446, "step": 803000 }, { "epoch": 11.064726791766553, "grad_norm": 2.4575319290161133, "learning_rate": 9.009406668446632e-06, "loss": 0.2142, "step": 803100 }, { "epoch": 11.066104543826293, "grad_norm": 0.7464675903320312, "learning_rate": 9.003474412663805e-06, "loss": 0.3184, "step": 803200 }, { "epoch": 11.067482295886032, "grad_norm": 3.1375722885131836, "learning_rate": 8.997543728255466e-06, "loss": 0.2399, "step": 803300 }, { "epoch": 11.06886004794577, "grad_norm": 3.3817145824432373, "learning_rate": 8.991614615725443e-06, "loss": 0.2403, "step": 803400 }, { "epoch": 11.070237800005511, "grad_norm": 2.955003499984741, "learning_rate": 8.985687075577438e-06, "loss": 0.2406, "step": 803500 }, { "epoch": 11.07161555206525, "grad_norm": 0.18885573744773865, "learning_rate": 8.979761108315036e-06, "loss": 0.254, "step": 803600 }, { "epoch": 11.07299330412499, "grad_norm": 3.5578806400299072, "learning_rate": 8.973836714441683e-06, "loss": 0.2276, "step": 803700 }, { "epoch": 11.074371056184729, "grad_norm": 5.592334747314453, "learning_rate": 8.967913894460673e-06, "loss": 0.2655, "step": 803800 }, { "epoch": 11.075748808244468, "grad_norm": 2.156681537628174, "learning_rate": 8.961992648875198e-06, "loss": 0.2558, "step": 803900 }, { "epoch": 
11.077126560304208, "grad_norm": 14.878314971923828, "learning_rate": 8.956072978188283e-06, "loss": 0.2464, "step": 804000 }, { "epoch": 11.078504312363947, "grad_norm": 1.813043475151062, "learning_rate": 8.950154882902835e-06, "loss": 0.2349, "step": 804100 }, { "epoch": 11.079882064423686, "grad_norm": 2.362821578979492, "learning_rate": 8.944238363521623e-06, "loss": 0.2504, "step": 804200 }, { "epoch": 11.081259816483426, "grad_norm": 1.6859230995178223, "learning_rate": 8.938323420547308e-06, "loss": 0.2555, "step": 804300 }, { "epoch": 11.082637568543165, "grad_norm": 1.8618556261062622, "learning_rate": 8.932410054482376e-06, "loss": 0.2499, "step": 804400 }, { "epoch": 11.084015320602905, "grad_norm": 2.7331643104553223, "learning_rate": 8.926498265829193e-06, "loss": 0.2535, "step": 804500 }, { "epoch": 11.085393072662644, "grad_norm": 2.594290256500244, "learning_rate": 8.920588055090004e-06, "loss": 0.3183, "step": 804600 }, { "epoch": 11.086770824722382, "grad_norm": 2.9784297943115234, "learning_rate": 8.91467942276692e-06, "loss": 0.2438, "step": 804700 }, { "epoch": 11.088148576782123, "grad_norm": 1.6363935470581055, "learning_rate": 8.908772369361893e-06, "loss": 0.2563, "step": 804800 }, { "epoch": 11.089526328841862, "grad_norm": 2.9799437522888184, "learning_rate": 8.902866895376763e-06, "loss": 0.2468, "step": 804900 }, { "epoch": 11.0909040809016, "grad_norm": 0.405773401260376, "learning_rate": 8.896963001313254e-06, "loss": 0.255, "step": 805000 }, { "epoch": 11.09228183296134, "grad_norm": 3.7427282333374023, "learning_rate": 8.891119702984551e-06, "loss": 0.2647, "step": 805100 }, { "epoch": 11.09365958502108, "grad_norm": 0.04064955562353134, "learning_rate": 8.885218954457059e-06, "loss": 0.2292, "step": 805200 }, { "epoch": 11.09503733708082, "grad_norm": 1.1493785381317139, "learning_rate": 8.879319787350447e-06, "loss": 0.2376, "step": 805300 }, { "epoch": 11.096415089140558, "grad_norm": 2.099971294403076, "learning_rate": 
8.873422202165893e-06, "loss": 0.2452, "step": 805400 }, { "epoch": 11.097792841200297, "grad_norm": 2.8384177684783936, "learning_rate": 8.867526199404411e-06, "loss": 0.2156, "step": 805500 }, { "epoch": 11.099170593260038, "grad_norm": 0.3695179522037506, "learning_rate": 8.861631779566896e-06, "loss": 0.2054, "step": 805600 }, { "epoch": 11.100548345319776, "grad_norm": 4.0854997634887695, "learning_rate": 8.855738943154122e-06, "loss": 0.2159, "step": 805700 }, { "epoch": 11.101926097379515, "grad_norm": 1.8802731037139893, "learning_rate": 8.849847690666703e-06, "loss": 0.2298, "step": 805800 }, { "epoch": 11.103303849439255, "grad_norm": 3.3940842151641846, "learning_rate": 8.843958022605131e-06, "loss": 0.3119, "step": 805900 }, { "epoch": 11.104681601498994, "grad_norm": 2.429543972015381, "learning_rate": 8.838069939469783e-06, "loss": 0.315, "step": 806000 }, { "epoch": 11.106059353558734, "grad_norm": 1.094051480293274, "learning_rate": 8.832183441760864e-06, "loss": 0.3217, "step": 806100 }, { "epoch": 11.107437105618473, "grad_norm": 1.3612397909164429, "learning_rate": 8.826298529978456e-06, "loss": 0.2853, "step": 806200 }, { "epoch": 11.108814857678212, "grad_norm": 1.6259535551071167, "learning_rate": 8.820415204622522e-06, "loss": 0.236, "step": 806300 }, { "epoch": 11.110192609737952, "grad_norm": 4.023580551147461, "learning_rate": 8.814533466192894e-06, "loss": 0.2455, "step": 806400 }, { "epoch": 11.11157036179769, "grad_norm": 0.9507949948310852, "learning_rate": 8.808653315189227e-06, "loss": 0.2696, "step": 806500 }, { "epoch": 11.11294811385743, "grad_norm": 1.7639344930648804, "learning_rate": 8.802774752111098e-06, "loss": 0.2289, "step": 806600 }, { "epoch": 11.11432586591717, "grad_norm": 4.252608299255371, "learning_rate": 8.796897777457905e-06, "loss": 0.2456, "step": 806700 }, { "epoch": 11.115703617976909, "grad_norm": 3.606848955154419, "learning_rate": 8.791022391728926e-06, "loss": 0.2544, "step": 806800 }, { "epoch": 
11.117081370036647, "grad_norm": 5.053679943084717, "learning_rate": 8.785148595423306e-06, "loss": 0.2693, "step": 806900 }, { "epoch": 11.118459122096388, "grad_norm": 0.5052111148834229, "learning_rate": 8.779276389040066e-06, "loss": 0.2511, "step": 807000 }, { "epoch": 11.119836874156126, "grad_norm": 2.9068005084991455, "learning_rate": 8.773464471363464e-06, "loss": 0.2321, "step": 807100 }, { "epoch": 11.121214626215867, "grad_norm": 4.298202991485596, "learning_rate": 8.767595430409787e-06, "loss": 0.2916, "step": 807200 }, { "epoch": 11.122592378275606, "grad_norm": 1.2209360599517822, "learning_rate": 8.7617279808697e-06, "loss": 0.2222, "step": 807300 }, { "epoch": 11.123970130335344, "grad_norm": 2.6864845752716064, "learning_rate": 8.75586212324169e-06, "loss": 0.2031, "step": 807400 }, { "epoch": 11.125347882395085, "grad_norm": 8.401939392089844, "learning_rate": 8.749997858024092e-06, "loss": 0.2732, "step": 807500 }, { "epoch": 11.126725634454823, "grad_norm": 1.0111336708068848, "learning_rate": 8.744135185715095e-06, "loss": 0.2857, "step": 807600 }, { "epoch": 11.128103386514562, "grad_norm": 2.792137384414673, "learning_rate": 8.738274106812775e-06, "loss": 0.2324, "step": 807700 }, { "epoch": 11.129481138574302, "grad_norm": 1.0686736106872559, "learning_rate": 8.732414621815057e-06, "loss": 0.2457, "step": 807800 }, { "epoch": 11.130858890634041, "grad_norm": 1.4294456243515015, "learning_rate": 8.726556731219716e-06, "loss": 0.2718, "step": 807900 }, { "epoch": 11.132236642693782, "grad_norm": 1.3688405752182007, "learning_rate": 8.720700435524432e-06, "loss": 0.2688, "step": 808000 }, { "epoch": 11.13361439475352, "grad_norm": 3.928964853286743, "learning_rate": 8.714845735226724e-06, "loss": 0.2611, "step": 808100 }, { "epoch": 11.134992146813259, "grad_norm": 5.94156551361084, "learning_rate": 8.70899263082397e-06, "loss": 0.2849, "step": 808200 }, { "epoch": 11.136369898873, "grad_norm": 2.9743258953094482, "learning_rate": 
8.703199629989768e-06, "loss": 0.3191, "step": 808300 }, { "epoch": 11.137747650932738, "grad_norm": 0.4662185311317444, "learning_rate": 8.697349702897192e-06, "loss": 0.2271, "step": 808400 }, { "epoch": 11.139125402992477, "grad_norm": 0.9455997943878174, "learning_rate": 8.691501373185965e-06, "loss": 0.2527, "step": 808500 }, { "epoch": 11.140503155052217, "grad_norm": 1.784304141998291, "learning_rate": 8.685654641352912e-06, "loss": 0.2657, "step": 808600 }, { "epoch": 11.141880907111956, "grad_norm": 1.861839771270752, "learning_rate": 8.679809507894747e-06, "loss": 0.2373, "step": 808700 }, { "epoch": 11.143258659171696, "grad_norm": 2.105329990386963, "learning_rate": 8.67396597330806e-06, "loss": 0.2243, "step": 808800 }, { "epoch": 11.144636411231435, "grad_norm": 2.089622974395752, "learning_rate": 8.668124038089272e-06, "loss": 0.2589, "step": 808900 }, { "epoch": 11.146014163291174, "grad_norm": 2.660186290740967, "learning_rate": 8.662283702734688e-06, "loss": 0.2309, "step": 809000 }, { "epoch": 11.147391915350914, "grad_norm": 5.6004414558410645, "learning_rate": 8.656444967740473e-06, "loss": 0.2496, "step": 809100 }, { "epoch": 11.148769667410653, "grad_norm": 2.58648419380188, "learning_rate": 8.650607833602668e-06, "loss": 0.2646, "step": 809200 }, { "epoch": 11.150147419470391, "grad_norm": 2.803346872329712, "learning_rate": 8.644772300817155e-06, "loss": 0.2604, "step": 809300 }, { "epoch": 11.151525171530132, "grad_norm": 1.6983903646469116, "learning_rate": 8.638938369879705e-06, "loss": 0.2919, "step": 809400 }, { "epoch": 11.15290292358987, "grad_norm": 3.894742488861084, "learning_rate": 8.633106041285934e-06, "loss": 0.2834, "step": 809500 }, { "epoch": 11.154280675649611, "grad_norm": 4.846644401550293, "learning_rate": 8.627275315531317e-06, "loss": 0.2678, "step": 809600 }, { "epoch": 11.15565842770935, "grad_norm": 3.635613203048706, "learning_rate": 8.621446193111214e-06, "loss": 0.2639, "step": 809700 }, { "epoch": 
11.157036179769088, "grad_norm": 2.513417959213257, "learning_rate": 8.615618674520847e-06, "loss": 0.3041, "step": 809800 }, { "epoch": 11.158413931828829, "grad_norm": 11.026405334472656, "learning_rate": 8.609792760255289e-06, "loss": 0.2554, "step": 809900 }, { "epoch": 11.159791683888567, "grad_norm": 0.08315371721982956, "learning_rate": 8.603968450809468e-06, "loss": 0.2605, "step": 810000 }, { "epoch": 11.161169435948306, "grad_norm": 2.526890516281128, "learning_rate": 8.598145746678196e-06, "loss": 0.2653, "step": 810100 }, { "epoch": 11.162547188008046, "grad_norm": 3.762650489807129, "learning_rate": 8.592324648356153e-06, "loss": 0.3039, "step": 810200 }, { "epoch": 11.163924940067785, "grad_norm": 1.738736629486084, "learning_rate": 8.586505156337854e-06, "loss": 0.2277, "step": 810300 }, { "epoch": 11.165302692127526, "grad_norm": 2.326447010040283, "learning_rate": 8.580687271117703e-06, "loss": 0.3003, "step": 810400 }, { "epoch": 11.166680444187264, "grad_norm": 0.4140963554382324, "learning_rate": 8.574870993189978e-06, "loss": 0.2757, "step": 810500 }, { "epoch": 11.168058196247003, "grad_norm": 4.045792579650879, "learning_rate": 8.569056323048763e-06, "loss": 0.2509, "step": 810600 }, { "epoch": 11.169435948306743, "grad_norm": 2.8864753246307373, "learning_rate": 8.563243261188063e-06, "loss": 0.279, "step": 810700 }, { "epoch": 11.170813700366482, "grad_norm": 0.9622142314910889, "learning_rate": 8.557489914667541e-06, "loss": 0.2455, "step": 810800 }, { "epoch": 11.17219145242622, "grad_norm": 3.372331380844116, "learning_rate": 8.551680054754168e-06, "loss": 0.2429, "step": 810900 }, { "epoch": 11.173569204485961, "grad_norm": 2.4155023097991943, "learning_rate": 8.545871804597516e-06, "loss": 0.2758, "step": 811000 }, { "epoch": 11.1749469565457, "grad_norm": 1.3167738914489746, "learning_rate": 8.540065164691009e-06, "loss": 0.2344, "step": 811100 }, { "epoch": 11.176324708605438, "grad_norm": 0.4571068584918976, "learning_rate": 
8.53426013552797e-06, "loss": 0.2495, "step": 811200 }, { "epoch": 11.177702460665179, "grad_norm": 2.1056313514709473, "learning_rate": 8.528456717601566e-06, "loss": 0.2238, "step": 811300 }, { "epoch": 11.179080212724918, "grad_norm": 4.007205009460449, "learning_rate": 8.522654911404813e-06, "loss": 0.2452, "step": 811400 }, { "epoch": 11.180457964784658, "grad_norm": 3.4492440223693848, "learning_rate": 8.516854717430615e-06, "loss": 0.2214, "step": 811500 }, { "epoch": 11.181835716844397, "grad_norm": 6.595731258392334, "learning_rate": 8.51105613617174e-06, "loss": 0.2465, "step": 811600 }, { "epoch": 11.183213468904135, "grad_norm": 1.945192813873291, "learning_rate": 8.505259168120779e-06, "loss": 0.2729, "step": 811700 }, { "epoch": 11.184591220963876, "grad_norm": 0.3217388391494751, "learning_rate": 8.499463813770227e-06, "loss": 0.2178, "step": 811800 }, { "epoch": 11.185968973023614, "grad_norm": 3.6789467334747314, "learning_rate": 8.493670073612438e-06, "loss": 0.2311, "step": 811900 }, { "epoch": 11.187346725083353, "grad_norm": 1.3070392608642578, "learning_rate": 8.487877948139603e-06, "loss": 0.2507, "step": 812000 }, { "epoch": 11.188724477143094, "grad_norm": 4.377415657043457, "learning_rate": 8.482087437843811e-06, "loss": 0.2734, "step": 812100 }, { "epoch": 11.190102229202832, "grad_norm": 2.521705389022827, "learning_rate": 8.47629854321698e-06, "loss": 0.2785, "step": 812200 }, { "epoch": 11.191479981262573, "grad_norm": 2.87312912940979, "learning_rate": 8.470511264750915e-06, "loss": 0.2135, "step": 812300 }, { "epoch": 11.192857733322311, "grad_norm": 2.738273859024048, "learning_rate": 8.46472560293727e-06, "loss": 0.2837, "step": 812400 }, { "epoch": 11.19423548538205, "grad_norm": 0.4938458800315857, "learning_rate": 8.458941558267574e-06, "loss": 0.2707, "step": 812500 }, { "epoch": 11.19561323744179, "grad_norm": 3.2724947929382324, "learning_rate": 8.453216947494634e-06, "loss": 0.2319, "step": 812600 }, { "epoch": 
11.19699098950153, "grad_norm": 3.5377845764160156, "learning_rate": 8.447436122403146e-06, "loss": 0.2661, "step": 812700 }, { "epoch": 11.198368741561268, "grad_norm": 3.653688907623291, "learning_rate": 8.441656915924423e-06, "loss": 0.2337, "step": 812800 }, { "epoch": 11.199746493621008, "grad_norm": 6.7691216468811035, "learning_rate": 8.435879328549444e-06, "loss": 0.2487, "step": 812900 }, { "epoch": 11.201124245680747, "grad_norm": 2.1774661540985107, "learning_rate": 8.430103360769058e-06, "loss": 0.2838, "step": 813000 }, { "epoch": 11.202501997740487, "grad_norm": 0.24400591850280762, "learning_rate": 8.424329013073941e-06, "loss": 0.2455, "step": 813100 }, { "epoch": 11.203879749800226, "grad_norm": 2.814746379852295, "learning_rate": 8.418556285954676e-06, "loss": 0.2258, "step": 813200 }, { "epoch": 11.205257501859965, "grad_norm": 4.687884330749512, "learning_rate": 8.412785179901674e-06, "loss": 0.27, "step": 813300 }, { "epoch": 11.206635253919705, "grad_norm": 1.1854617595672607, "learning_rate": 8.407073382221865e-06, "loss": 0.2543, "step": 813400 }, { "epoch": 11.208013005979444, "grad_norm": 2.406607151031494, "learning_rate": 8.401305503549206e-06, "loss": 0.2375, "step": 813500 }, { "epoch": 11.209390758039183, "grad_norm": 5.085211753845215, "learning_rate": 8.395539247408354e-06, "loss": 0.236, "step": 813600 }, { "epoch": 11.210768510098923, "grad_norm": 1.9845657348632812, "learning_rate": 8.389774614289187e-06, "loss": 0.2448, "step": 813700 }, { "epoch": 11.212146262158662, "grad_norm": 3.0627851486206055, "learning_rate": 8.384011604681435e-06, "loss": 0.253, "step": 813800 }, { "epoch": 11.213524014218402, "grad_norm": 1.4288195371627808, "learning_rate": 8.378250219074684e-06, "loss": 0.207, "step": 813900 }, { "epoch": 11.21490176627814, "grad_norm": 0.9665579199790955, "learning_rate": 8.372490457958405e-06, "loss": 0.259, "step": 814000 }, { "epoch": 11.21627951833788, "grad_norm": 0.6886402368545532, "learning_rate": 
8.366732321821925e-06, "loss": 0.2641, "step": 814100 }, { "epoch": 11.21765727039762, "grad_norm": 2.7925777435302734, "learning_rate": 8.36097581115441e-06, "loss": 0.2834, "step": 814200 }, { "epoch": 11.219035022457359, "grad_norm": 0.9384520649909973, "learning_rate": 8.355220926444926e-06, "loss": 0.2398, "step": 814300 }, { "epoch": 11.220412774517097, "grad_norm": 4.041644096374512, "learning_rate": 8.349467668182366e-06, "loss": 0.2637, "step": 814400 }, { "epoch": 11.221790526576838, "grad_norm": 0.6133745908737183, "learning_rate": 8.343716036855491e-06, "loss": 0.2538, "step": 814500 }, { "epoch": 11.223168278636576, "grad_norm": 1.0587536096572876, "learning_rate": 8.337966032952943e-06, "loss": 0.2363, "step": 814600 }, { "epoch": 11.224546030696317, "grad_norm": 0.3194536566734314, "learning_rate": 8.332217656963222e-06, "loss": 0.2388, "step": 814700 }, { "epoch": 11.225923782756055, "grad_norm": 1.4994847774505615, "learning_rate": 8.326470909374672e-06, "loss": 0.2857, "step": 814800 }, { "epoch": 11.227301534815794, "grad_norm": 2.0027263164520264, "learning_rate": 8.320725790675505e-06, "loss": 0.2446, "step": 814900 }, { "epoch": 11.228679286875535, "grad_norm": 0.6959890127182007, "learning_rate": 8.314982301353797e-06, "loss": 0.2217, "step": 815000 }, { "epoch": 11.230057038935273, "grad_norm": 1.6134371757507324, "learning_rate": 8.309240441897503e-06, "loss": 0.2418, "step": 815100 }, { "epoch": 11.231434790995012, "grad_norm": 0.4799920320510864, "learning_rate": 8.303500212794405e-06, "loss": 0.2415, "step": 815200 }, { "epoch": 11.232812543054752, "grad_norm": 0.3603655695915222, "learning_rate": 8.297761614532177e-06, "loss": 0.3008, "step": 815300 }, { "epoch": 11.234190295114491, "grad_norm": 3.2173988819122314, "learning_rate": 8.292024647598342e-06, "loss": 0.2451, "step": 815400 }, { "epoch": 11.23556804717423, "grad_norm": 2.5884618759155273, "learning_rate": 8.286289312480272e-06, "loss": 0.2684, "step": 815500 }, { "epoch": 
11.23694579923397, "grad_norm": 1.267899990081787, "learning_rate": 8.28055560966522e-06, "loss": 0.2404, "step": 815600 }, { "epoch": 11.238323551293709, "grad_norm": 1.1178793907165527, "learning_rate": 8.274823539640303e-06, "loss": 0.251, "step": 815700 }, { "epoch": 11.23970130335345, "grad_norm": 0.8813108801841736, "learning_rate": 8.269093102892476e-06, "loss": 0.2574, "step": 815800 }, { "epoch": 11.241079055413188, "grad_norm": 1.0528873205184937, "learning_rate": 8.263364299908572e-06, "loss": 0.2211, "step": 815900 }, { "epoch": 11.242456807472927, "grad_norm": 3.4537768363952637, "learning_rate": 8.257637131175293e-06, "loss": 0.2969, "step": 816000 }, { "epoch": 11.243834559532667, "grad_norm": 2.411853313446045, "learning_rate": 8.251911597179185e-06, "loss": 0.2534, "step": 816100 }, { "epoch": 11.245212311592406, "grad_norm": 2.665461778640747, "learning_rate": 8.246187698406648e-06, "loss": 0.201, "step": 816200 }, { "epoch": 11.246590063652144, "grad_norm": 3.1475048065185547, "learning_rate": 8.240465435343977e-06, "loss": 0.2649, "step": 816300 }, { "epoch": 11.247967815711885, "grad_norm": 4.453071117401123, "learning_rate": 8.234744808477291e-06, "loss": 0.2324, "step": 816400 }, { "epoch": 11.249345567771623, "grad_norm": 1.4761182069778442, "learning_rate": 8.229025818292605e-06, "loss": 0.2241, "step": 816500 }, { "epoch": 11.250723319831364, "grad_norm": 9.515789031982422, "learning_rate": 8.223308465275752e-06, "loss": 0.2405, "step": 816600 }, { "epoch": 11.252101071891103, "grad_norm": 4.889324188232422, "learning_rate": 8.217592749912477e-06, "loss": 0.3041, "step": 816700 }, { "epoch": 11.253478823950841, "grad_norm": 0.2434859573841095, "learning_rate": 8.211878672688332e-06, "loss": 0.2368, "step": 816800 }, { "epoch": 11.254856576010582, "grad_norm": 2.01082706451416, "learning_rate": 8.206166234088775e-06, "loss": 0.1933, "step": 816900 }, { "epoch": 11.25623432807032, "grad_norm": 2.4082260131835938, "learning_rate": 
8.20045543459911e-06, "loss": 0.2033, "step": 817000 }, { "epoch": 11.257612080130059, "grad_norm": 1.6180120706558228, "learning_rate": 8.194746274704495e-06, "loss": 0.2716, "step": 817100 }, { "epoch": 11.2589898321898, "grad_norm": 2.4339616298675537, "learning_rate": 8.189038754889935e-06, "loss": 0.2198, "step": 817200 }, { "epoch": 11.260367584249538, "grad_norm": 4.213840007781982, "learning_rate": 8.183332875640339e-06, "loss": 0.2347, "step": 817300 }, { "epoch": 11.261745336309279, "grad_norm": 4.162495136260986, "learning_rate": 8.177628637440429e-06, "loss": 0.2333, "step": 817400 }, { "epoch": 11.263123088369017, "grad_norm": 4.897585868835449, "learning_rate": 8.17192604077482e-06, "loss": 0.3174, "step": 817500 }, { "epoch": 11.264500840428756, "grad_norm": 2.8180994987487793, "learning_rate": 8.166225086127982e-06, "loss": 0.2626, "step": 817600 }, { "epoch": 11.265878592488496, "grad_norm": 2.5467939376831055, "learning_rate": 8.160525773984237e-06, "loss": 0.2976, "step": 817700 }, { "epoch": 11.267256344548235, "grad_norm": 2.797877311706543, "learning_rate": 8.154828104827758e-06, "loss": 0.2855, "step": 817800 }, { "epoch": 11.268634096607974, "grad_norm": 1.1890180110931396, "learning_rate": 8.149132079142598e-06, "loss": 0.3223, "step": 817900 }, { "epoch": 11.270011848667714, "grad_norm": 2.9433889389038086, "learning_rate": 8.143437697412677e-06, "loss": 0.2795, "step": 818000 }, { "epoch": 11.271389600727453, "grad_norm": 2.9260828495025635, "learning_rate": 8.137744960121743e-06, "loss": 0.2693, "step": 818100 }, { "epoch": 11.272767352787193, "grad_norm": 0.6027050614356995, "learning_rate": 8.132053867753438e-06, "loss": 0.2451, "step": 818200 }, { "epoch": 11.274145104846932, "grad_norm": 5.5438666343688965, "learning_rate": 8.126364420791244e-06, "loss": 0.2571, "step": 818300 }, { "epoch": 11.27552285690667, "grad_norm": 0.6320498585700989, "learning_rate": 8.120676619718499e-06, "loss": 0.2789, "step": 818400 }, { "epoch": 
11.276900608966411, "grad_norm": 2.2426869869232178, "learning_rate": 8.114990465018417e-06, "loss": 0.2221, "step": 818500 }, { "epoch": 11.27827836102615, "grad_norm": 2.597557306289673, "learning_rate": 8.10930595717408e-06, "loss": 0.2575, "step": 818600 }, { "epoch": 11.279656113085888, "grad_norm": 2.977400779724121, "learning_rate": 8.103623096668404e-06, "loss": 0.2387, "step": 818700 }, { "epoch": 11.281033865145629, "grad_norm": 1.482456088066101, "learning_rate": 8.097941883984165e-06, "loss": 0.2356, "step": 818800 }, { "epoch": 11.282411617205367, "grad_norm": 2.108266592025757, "learning_rate": 8.092262319604025e-06, "loss": 0.2447, "step": 818900 }, { "epoch": 11.283789369265108, "grad_norm": 3.8023102283477783, "learning_rate": 8.0865844040105e-06, "loss": 0.2554, "step": 819000 }, { "epoch": 11.285167121324847, "grad_norm": 1.9457556009292603, "learning_rate": 8.080908137685937e-06, "loss": 0.2658, "step": 819100 }, { "epoch": 11.286544873384585, "grad_norm": 63.96593475341797, "learning_rate": 8.075233521112578e-06, "loss": 0.2808, "step": 819200 }, { "epoch": 11.287922625444326, "grad_norm": 2.547879934310913, "learning_rate": 8.069560554772525e-06, "loss": 0.2544, "step": 819300 }, { "epoch": 11.289300377504064, "grad_norm": 1.9845930337905884, "learning_rate": 8.063889239147687e-06, "loss": 0.2184, "step": 819400 }, { "epoch": 11.290678129563803, "grad_norm": 0.7359409332275391, "learning_rate": 8.058219574719897e-06, "loss": 0.2235, "step": 819500 }, { "epoch": 11.292055881623543, "grad_norm": 3.082850217819214, "learning_rate": 8.052608233920918e-06, "loss": 0.2599, "step": 819600 }, { "epoch": 11.293433633683282, "grad_norm": 3.8624794483184814, "learning_rate": 8.046941856808098e-06, "loss": 0.2968, "step": 819700 }, { "epoch": 11.29481138574302, "grad_norm": 1.931824803352356, "learning_rate": 8.041277132332085e-06, "loss": 0.2476, "step": 819800 }, { "epoch": 11.296189137802761, "grad_norm": 0.8910874128341675, "learning_rate": 
8.035614060974126e-06, "loss": 0.2725, "step": 819900 }, { "epoch": 11.2975668898625, "grad_norm": 2.8127384185791016, "learning_rate": 8.02995264321533e-06, "loss": 0.2901, "step": 820000 }, { "epoch": 11.29894464192224, "grad_norm": 3.815598249435425, "learning_rate": 8.024292879536673e-06, "loss": 0.2368, "step": 820100 }, { "epoch": 11.300322393981979, "grad_norm": 0.46387845277786255, "learning_rate": 8.018634770418964e-06, "loss": 0.3128, "step": 820200 }, { "epoch": 11.301700146041718, "grad_norm": 2.069627285003662, "learning_rate": 8.012978316342898e-06, "loss": 0.2631, "step": 820300 }, { "epoch": 11.303077898101458, "grad_norm": 4.214239120483398, "learning_rate": 8.007323517789038e-06, "loss": 0.238, "step": 820400 }, { "epoch": 11.304455650161197, "grad_norm": 2.893728256225586, "learning_rate": 8.001670375237748e-06, "loss": 0.2818, "step": 820500 }, { "epoch": 11.305833402220935, "grad_norm": 3.32974910736084, "learning_rate": 7.996018889169315e-06, "loss": 0.3177, "step": 820600 }, { "epoch": 11.307211154280676, "grad_norm": 6.075284481048584, "learning_rate": 7.990369060063866e-06, "loss": 0.2842, "step": 820700 }, { "epoch": 11.308588906340415, "grad_norm": 2.7968854904174805, "learning_rate": 7.984720888401369e-06, "loss": 0.2269, "step": 820800 }, { "epoch": 11.309966658400155, "grad_norm": 4.132541179656982, "learning_rate": 7.979074374661682e-06, "loss": 0.2761, "step": 820900 }, { "epoch": 11.311344410459894, "grad_norm": 1.629943609237671, "learning_rate": 7.973429519324482e-06, "loss": 0.2532, "step": 821000 }, { "epoch": 11.312722162519632, "grad_norm": 1.9705862998962402, "learning_rate": 7.967786322869353e-06, "loss": 0.2355, "step": 821100 }, { "epoch": 11.314099914579373, "grad_norm": 1.30427086353302, "learning_rate": 7.962144785775694e-06, "loss": 0.2754, "step": 821200 }, { "epoch": 11.315477666639111, "grad_norm": 5.001206398010254, "learning_rate": 7.956504908522798e-06, "loss": 0.2215, "step": 821300 }, { "epoch": 
11.31685541869885, "grad_norm": 3.0664825439453125, "learning_rate": 7.950866691589785e-06, "loss": 0.2376, "step": 821400 }, { "epoch": 11.31823317075859, "grad_norm": 1.475354552268982, "learning_rate": 7.94523013545567e-06, "loss": 0.2293, "step": 821500 }, { "epoch": 11.31961092281833, "grad_norm": 1.8670979738235474, "learning_rate": 7.939651581322954e-06, "loss": 0.2205, "step": 821600 }, { "epoch": 11.32098867487807, "grad_norm": 4.623377799987793, "learning_rate": 7.934018331603092e-06, "loss": 0.2757, "step": 821700 }, { "epoch": 11.322366426937808, "grad_norm": 1.1972743272781372, "learning_rate": 7.928386744113479e-06, "loss": 0.2226, "step": 821800 }, { "epoch": 11.323744178997547, "grad_norm": 6.778616428375244, "learning_rate": 7.922756819332532e-06, "loss": 0.2771, "step": 821900 }, { "epoch": 11.325121931057287, "grad_norm": 2.5903711318969727, "learning_rate": 7.917128557738557e-06, "loss": 0.2705, "step": 822000 }, { "epoch": 11.326499683117026, "grad_norm": 7.197052955627441, "learning_rate": 7.911501959809697e-06, "loss": 0.2325, "step": 822100 }, { "epoch": 11.327877435176765, "grad_norm": 1.4442648887634277, "learning_rate": 7.905877026023951e-06, "loss": 0.2209, "step": 822200 }, { "epoch": 11.329255187236505, "grad_norm": 1.5552010536193848, "learning_rate": 7.900253756859196e-06, "loss": 0.2312, "step": 822300 }, { "epoch": 11.330632939296244, "grad_norm": 3.9230003356933594, "learning_rate": 7.894632152793163e-06, "loss": 0.2728, "step": 822400 }, { "epoch": 11.332010691355984, "grad_norm": 2.773829698562622, "learning_rate": 7.889012214303421e-06, "loss": 0.2302, "step": 822500 }, { "epoch": 11.333388443415723, "grad_norm": 3.3008341789245605, "learning_rate": 7.883393941867428e-06, "loss": 0.2351, "step": 822600 }, { "epoch": 11.334766195475462, "grad_norm": 3.915314197540283, "learning_rate": 7.877777335962469e-06, "loss": 0.227, "step": 822700 }, { "epoch": 11.336143947535202, "grad_norm": 2.4626643657684326, "learning_rate": 
7.872162397065723e-06, "loss": 0.1891, "step": 822800 }, { "epoch": 11.33752169959494, "grad_norm": 2.7219760417938232, "learning_rate": 7.866549125654187e-06, "loss": 0.2518, "step": 822900 }, { "epoch": 11.33889945165468, "grad_norm": 2.2657501697540283, "learning_rate": 7.860937522204746e-06, "loss": 0.2352, "step": 823000 }, { "epoch": 11.34027720371442, "grad_norm": 3.819279432296753, "learning_rate": 7.855327587194152e-06, "loss": 0.2787, "step": 823100 }, { "epoch": 11.341654955774159, "grad_norm": 0.12675903737545013, "learning_rate": 7.849719321098963e-06, "loss": 0.2518, "step": 823200 }, { "epoch": 11.343032707833899, "grad_norm": 1.066680908203125, "learning_rate": 7.844112724395649e-06, "loss": 0.275, "step": 823300 }, { "epoch": 11.344410459893638, "grad_norm": 0.3959878981113434, "learning_rate": 7.838507797560528e-06, "loss": 0.2252, "step": 823400 }, { "epoch": 11.345788211953376, "grad_norm": 3.517610788345337, "learning_rate": 7.832904541069748e-06, "loss": 0.2338, "step": 823500 }, { "epoch": 11.347165964013117, "grad_norm": 2.9037115573883057, "learning_rate": 7.827302955399343e-06, "loss": 0.2456, "step": 823600 }, { "epoch": 11.348543716072856, "grad_norm": 3.2053000926971436, "learning_rate": 7.821703041025206e-06, "loss": 0.2672, "step": 823700 }, { "epoch": 11.349921468132594, "grad_norm": 2.894637107849121, "learning_rate": 7.816104798423065e-06, "loss": 0.2373, "step": 823800 }, { "epoch": 11.351299220192335, "grad_norm": 3.433753252029419, "learning_rate": 7.810508228068518e-06, "loss": 0.2369, "step": 823900 }, { "epoch": 11.352676972252073, "grad_norm": 1.3889623880386353, "learning_rate": 7.804913330437029e-06, "loss": 0.2429, "step": 824000 }, { "epoch": 11.354054724311812, "grad_norm": 1.1524457931518555, "learning_rate": 7.799320106003922e-06, "loss": 0.2301, "step": 824100 }, { "epoch": 11.355432476371552, "grad_norm": 3.7813222408294678, "learning_rate": 7.793728555244349e-06, "loss": 0.2977, "step": 824200 }, { "epoch": 
11.356810228431291, "grad_norm": 2.7795779705047607, "learning_rate": 7.78813867863336e-06, "loss": 0.3137, "step": 824300 }, { "epoch": 11.358187980491032, "grad_norm": 0.5353711843490601, "learning_rate": 7.782550476645833e-06, "loss": 0.2466, "step": 824400 }, { "epoch": 11.35956573255077, "grad_norm": 7.459709167480469, "learning_rate": 7.777019806732113e-06, "loss": 0.1837, "step": 824500 }, { "epoch": 11.360943484610509, "grad_norm": 2.112063407897949, "learning_rate": 7.771434938657527e-06, "loss": 0.2484, "step": 824600 }, { "epoch": 11.36232123667025, "grad_norm": 2.9148638248443604, "learning_rate": 7.765851746625476e-06, "loss": 0.2424, "step": 824700 }, { "epoch": 11.363698988729988, "grad_norm": 4.938729286193848, "learning_rate": 7.760270231110287e-06, "loss": 0.2741, "step": 824800 }, { "epoch": 11.365076740789727, "grad_norm": 4.208714008331299, "learning_rate": 7.75469039258613e-06, "loss": 0.225, "step": 824900 }, { "epoch": 11.366454492849467, "grad_norm": 3.2872002124786377, "learning_rate": 7.74911223152703e-06, "loss": 0.2465, "step": 825000 }, { "epoch": 11.367832244909206, "grad_norm": 1.6545212268829346, "learning_rate": 7.7435357484069e-06, "loss": 0.2494, "step": 825100 }, { "epoch": 11.369209996968946, "grad_norm": 1.3026200532913208, "learning_rate": 7.737960943699476e-06, "loss": 0.2847, "step": 825200 }, { "epoch": 11.370587749028685, "grad_norm": 0.009530747309327126, "learning_rate": 7.732387817878377e-06, "loss": 0.2516, "step": 825300 }, { "epoch": 11.371965501088424, "grad_norm": 3.345026969909668, "learning_rate": 7.726816371417053e-06, "loss": 0.2219, "step": 825400 }, { "epoch": 11.373343253148164, "grad_norm": 7.327831745147705, "learning_rate": 7.72124660478885e-06, "loss": 0.2598, "step": 825500 }, { "epoch": 11.374721005207903, "grad_norm": 2.048129081726074, "learning_rate": 7.715678518466923e-06, "loss": 0.248, "step": 825600 }, { "epoch": 11.376098757267641, "grad_norm": 1.9174838066101074, "learning_rate": 
7.710112112924326e-06, "loss": 0.2183, "step": 825700 }, { "epoch": 11.377476509327382, "grad_norm": 5.567663669586182, "learning_rate": 7.704547388633953e-06, "loss": 0.2857, "step": 825800 }, { "epoch": 11.37885426138712, "grad_norm": 1.9314908981323242, "learning_rate": 7.698984346068558e-06, "loss": 0.2518, "step": 825900 }, { "epoch": 11.38023201344686, "grad_norm": 2.685676097869873, "learning_rate": 7.693422985700731e-06, "loss": 0.2404, "step": 826000 }, { "epoch": 11.3816097655066, "grad_norm": 0.1806098371744156, "learning_rate": 7.68786330800296e-06, "loss": 0.2852, "step": 826100 }, { "epoch": 11.382987517566338, "grad_norm": 2.8371596336364746, "learning_rate": 7.682305313447552e-06, "loss": 0.3025, "step": 826200 }, { "epoch": 11.384365269626079, "grad_norm": 1.0369031429290771, "learning_rate": 7.676749002506695e-06, "loss": 0.252, "step": 826300 }, { "epoch": 11.385743021685817, "grad_norm": 2.0922703742980957, "learning_rate": 7.671194375652431e-06, "loss": 0.2278, "step": 826400 }, { "epoch": 11.387120773745556, "grad_norm": 3.1735498905181885, "learning_rate": 7.66564143335665e-06, "loss": 0.2324, "step": 826500 }, { "epoch": 11.388498525805296, "grad_norm": 2.1867239475250244, "learning_rate": 7.660090176091093e-06, "loss": 0.2322, "step": 826600 }, { "epoch": 11.389876277865035, "grad_norm": 5.179723262786865, "learning_rate": 7.654540604327376e-06, "loss": 0.2584, "step": 826700 }, { "epoch": 11.391254029924776, "grad_norm": 2.861086845397949, "learning_rate": 7.648992718536972e-06, "loss": 0.212, "step": 826800 }, { "epoch": 11.392631781984514, "grad_norm": 3.130150556564331, "learning_rate": 7.643446519191186e-06, "loss": 0.2519, "step": 826900 }, { "epoch": 11.394009534044253, "grad_norm": 1.7339131832122803, "learning_rate": 7.637902006761209e-06, "loss": 0.2844, "step": 827000 }, { "epoch": 11.395387286103993, "grad_norm": 3.3964924812316895, "learning_rate": 7.632359181718072e-06, "loss": 0.2686, "step": 827100 }, { "epoch": 
11.396765038163732, "grad_norm": 1.8778835535049438, "learning_rate": 7.626818044532655e-06, "loss": 0.2757, "step": 827200 }, { "epoch": 11.39814279022347, "grad_norm": 3.3743250370025635, "learning_rate": 7.621278595675717e-06, "loss": 0.2364, "step": 827300 }, { "epoch": 11.399520542283211, "grad_norm": 2.3929708003997803, "learning_rate": 7.6157408356178705e-06, "loss": 0.3007, "step": 827400 }, { "epoch": 11.40089829434295, "grad_norm": 4.376112461090088, "learning_rate": 7.610204764829557e-06, "loss": 0.2602, "step": 827500 }, { "epoch": 11.40227604640269, "grad_norm": 13.557110786437988, "learning_rate": 7.604670383781113e-06, "loss": 0.2618, "step": 827600 }, { "epoch": 11.403653798462429, "grad_norm": 4.27698278427124, "learning_rate": 7.599137692942695e-06, "loss": 0.238, "step": 827700 }, { "epoch": 11.405031550522168, "grad_norm": 3.7122349739074707, "learning_rate": 7.59366199441552e-06, "loss": 0.2832, "step": 827800 }, { "epoch": 11.406409302581908, "grad_norm": 3.6863527297973633, "learning_rate": 7.588132668493303e-06, "loss": 0.2471, "step": 827900 }, { "epoch": 11.407787054641647, "grad_norm": 3.14986515045166, "learning_rate": 7.582605034186076e-06, "loss": 0.2478, "step": 828000 }, { "epoch": 11.409164806701385, "grad_norm": 2.0508217811584473, "learning_rate": 7.577079091963455e-06, "loss": 0.2425, "step": 828100 }, { "epoch": 11.410542558761126, "grad_norm": 0.10330581665039062, "learning_rate": 7.571554842294894e-06, "loss": 0.2787, "step": 828200 }, { "epoch": 11.411920310820864, "grad_norm": 2.3158676624298096, "learning_rate": 7.566032285649689e-06, "loss": 0.2445, "step": 828300 }, { "epoch": 11.413298062880603, "grad_norm": 3.709444999694824, "learning_rate": 7.560511422497025e-06, "loss": 0.223, "step": 828400 }, { "epoch": 11.414675814940344, "grad_norm": 3.366015911102295, "learning_rate": 7.554992253305936e-06, "loss": 0.206, "step": 828500 }, { "epoch": 11.416053567000082, "grad_norm": 3.258861541748047, "learning_rate": 
7.54947477854528e-06, "loss": 0.2309, "step": 828600 }, { "epoch": 11.417431319059823, "grad_norm": 1.229088306427002, "learning_rate": 7.5439589986838205e-06, "loss": 0.2624, "step": 828700 }, { "epoch": 11.418809071119561, "grad_norm": 1.9554978609085083, "learning_rate": 7.538444914190128e-06, "loss": 0.2266, "step": 828800 }, { "epoch": 11.4201868231793, "grad_norm": 3.9325079917907715, "learning_rate": 7.532932525532672e-06, "loss": 0.25, "step": 828900 }, { "epoch": 11.42156457523904, "grad_norm": 1.9586195945739746, "learning_rate": 7.52742183317974e-06, "loss": 0.251, "step": 829000 }, { "epoch": 11.42294232729878, "grad_norm": 3.803563117980957, "learning_rate": 7.52191283759951e-06, "loss": 0.3028, "step": 829100 }, { "epoch": 11.424320079358518, "grad_norm": 0.4155932068824768, "learning_rate": 7.516405539259993e-06, "loss": 0.2765, "step": 829200 }, { "epoch": 11.425697831418258, "grad_norm": 1.5774508714675903, "learning_rate": 7.5108999386290515e-06, "loss": 0.2887, "step": 829300 }, { "epoch": 11.427075583477997, "grad_norm": 2.342172622680664, "learning_rate": 7.505396036174424e-06, "loss": 0.2473, "step": 829400 }, { "epoch": 11.428453335537737, "grad_norm": 1.164090871810913, "learning_rate": 7.4998938323637e-06, "loss": 0.2701, "step": 829500 }, { "epoch": 11.429831087597476, "grad_norm": 5.180591106414795, "learning_rate": 7.494393327664313e-06, "loss": 0.2647, "step": 829600 }, { "epoch": 11.431208839657215, "grad_norm": 2.7870445251464844, "learning_rate": 7.488894522543556e-06, "loss": 0.2297, "step": 829700 }, { "epoch": 11.432586591716955, "grad_norm": 3.5560641288757324, "learning_rate": 7.483397417468594e-06, "loss": 0.2615, "step": 829800 }, { "epoch": 11.433964343776694, "grad_norm": 1.4392467737197876, "learning_rate": 7.477902012906427e-06, "loss": 0.273, "step": 829900 }, { "epoch": 11.435342095836432, "grad_norm": 0.049318090081214905, "learning_rate": 7.472408309323909e-06, "loss": 0.2499, "step": 830000 }, { "epoch": 
11.436719847896173, "grad_norm": 2.0783803462982178, "learning_rate": 7.4669163071877715e-06, "loss": 0.2269, "step": 830100 }, { "epoch": 11.438097599955912, "grad_norm": 0.8156190514564514, "learning_rate": 7.461426006964577e-06, "loss": 0.2097, "step": 830200 }, { "epoch": 11.439475352015652, "grad_norm": 0.13967573642730713, "learning_rate": 7.455992286670897e-06, "loss": 0.3045, "step": 830300 }, { "epoch": 11.44085310407539, "grad_norm": 5.60945987701416, "learning_rate": 7.450505374641975e-06, "loss": 0.2743, "step": 830400 }, { "epoch": 11.44223085613513, "grad_norm": 3.027225971221924, "learning_rate": 7.445020165920197e-06, "loss": 0.2571, "step": 830500 }, { "epoch": 11.44360860819487, "grad_norm": 3.76334810256958, "learning_rate": 7.439536660971562e-06, "loss": 0.2436, "step": 830600 }, { "epoch": 11.444986360254608, "grad_norm": 1.4432705640792847, "learning_rate": 7.434054860261907e-06, "loss": 0.2469, "step": 830700 }, { "epoch": 11.446364112314347, "grad_norm": 2.087167501449585, "learning_rate": 7.428574764256951e-06, "loss": 0.2584, "step": 830800 }, { "epoch": 11.447741864374088, "grad_norm": 1.2185357809066772, "learning_rate": 7.423096373422268e-06, "loss": 0.2636, "step": 830900 }, { "epoch": 11.449119616433826, "grad_norm": 3.5740129947662354, "learning_rate": 7.417619688223244e-06, "loss": 0.2329, "step": 831000 }, { "epoch": 11.450497368493567, "grad_norm": 1.6161437034606934, "learning_rate": 7.41214470912517e-06, "loss": 0.2437, "step": 831100 }, { "epoch": 11.451875120553305, "grad_norm": 1.40609872341156, "learning_rate": 7.40667143659318e-06, "loss": 0.3074, "step": 831200 }, { "epoch": 11.453252872613044, "grad_norm": 2.3260626792907715, "learning_rate": 7.401199871092239e-06, "loss": 0.2335, "step": 831300 }, { "epoch": 11.454630624672784, "grad_norm": 0.19453950226306915, "learning_rate": 7.395730013087202e-06, "loss": 0.2505, "step": 831400 }, { "epoch": 11.456008376732523, "grad_norm": 2.5065441131591797, "learning_rate": 
7.390261863042747e-06, "loss": 0.3053, "step": 831500 }, { "epoch": 11.457386128792262, "grad_norm": 0.7695389986038208, "learning_rate": 7.384795421423431e-06, "loss": 0.2472, "step": 831600 }, { "epoch": 11.458763880852002, "grad_norm": 5.614015579223633, "learning_rate": 7.379330688693648e-06, "loss": 0.2897, "step": 831700 }, { "epoch": 11.460141632911741, "grad_norm": 1.5453665256500244, "learning_rate": 7.373867665317661e-06, "loss": 0.2391, "step": 831800 }, { "epoch": 11.461519384971481, "grad_norm": 2.757171392440796, "learning_rate": 7.368406351759584e-06, "loss": 0.23, "step": 831900 }, { "epoch": 11.46289713703122, "grad_norm": 6.970754623413086, "learning_rate": 7.362946748483385e-06, "loss": 0.2459, "step": 832000 }, { "epoch": 11.464274889090959, "grad_norm": 4.505046367645264, "learning_rate": 7.357488855952867e-06, "loss": 0.2707, "step": 832100 }, { "epoch": 11.4656526411507, "grad_norm": 1.1355433464050293, "learning_rate": 7.3520326746317305e-06, "loss": 0.2396, "step": 832200 }, { "epoch": 11.467030393210438, "grad_norm": 2.8249053955078125, "learning_rate": 7.346578204983487e-06, "loss": 0.2348, "step": 832300 }, { "epoch": 11.468408145270176, "grad_norm": 3.498345375061035, "learning_rate": 7.3411254474715276e-06, "loss": 0.2403, "step": 832400 }, { "epoch": 11.469785897329917, "grad_norm": 3.023056983947754, "learning_rate": 7.3356744025591e-06, "loss": 0.2341, "step": 832500 }, { "epoch": 11.471163649389656, "grad_norm": 3.470332384109497, "learning_rate": 7.330225070709296e-06, "loss": 0.3224, "step": 832600 }, { "epoch": 11.472541401449394, "grad_norm": 0.06458701938390732, "learning_rate": 7.324777452385049e-06, "loss": 0.2624, "step": 832700 }, { "epoch": 11.473919153509135, "grad_norm": 1.161067008972168, "learning_rate": 7.319331548049174e-06, "loss": 0.2452, "step": 832800 }, { "epoch": 11.475296905568873, "grad_norm": 1.7118459939956665, "learning_rate": 7.313887358164337e-06, "loss": 0.2769, "step": 832900 }, { "epoch": 
11.476674657628614, "grad_norm": 0.36733904480934143, "learning_rate": 7.308444883193029e-06, "loss": 0.2156, "step": 833000 }, { "epoch": 11.478052409688352, "grad_norm": 4.318186283111572, "learning_rate": 7.303004123597639e-06, "loss": 0.2413, "step": 833100 }, { "epoch": 11.479430161748091, "grad_norm": 3.4880118370056152, "learning_rate": 7.297565079840373e-06, "loss": 0.2868, "step": 833200 }, { "epoch": 11.480807913807832, "grad_norm": 3.008613348007202, "learning_rate": 7.292127752383302e-06, "loss": 0.2551, "step": 833300 }, { "epoch": 11.48218566586757, "grad_norm": 1.0480797290802002, "learning_rate": 7.286692141688362e-06, "loss": 0.263, "step": 833400 }, { "epoch": 11.483563417927309, "grad_norm": 4.988347053527832, "learning_rate": 7.281258248217341e-06, "loss": 0.2553, "step": 833500 }, { "epoch": 11.48494116998705, "grad_norm": 1.2590678930282593, "learning_rate": 7.275826072431872e-06, "loss": 0.2465, "step": 833600 }, { "epoch": 11.486318922046788, "grad_norm": 1.4309666156768799, "learning_rate": 7.270395614793436e-06, "loss": 0.2601, "step": 833700 }, { "epoch": 11.487696674106529, "grad_norm": 2.491604804992676, "learning_rate": 7.2650211546450615e-06, "loss": 0.2371, "step": 833800 }, { "epoch": 11.489074426166267, "grad_norm": 1.3212556838989258, "learning_rate": 7.259594117491629e-06, "loss": 0.2475, "step": 833900 }, { "epoch": 11.490452178226006, "grad_norm": 4.518353462219238, "learning_rate": 7.254168799864214e-06, "loss": 0.2192, "step": 834000 }, { "epoch": 11.491829930285746, "grad_norm": 5.0000786781311035, "learning_rate": 7.248745202223739e-06, "loss": 0.2575, "step": 834100 }, { "epoch": 11.493207682345485, "grad_norm": 3.223984956741333, "learning_rate": 7.2433233250309705e-06, "loss": 0.2803, "step": 834200 }, { "epoch": 11.494585434405224, "grad_norm": 0.8992794156074524, "learning_rate": 7.2379031687465175e-06, "loss": 0.2776, "step": 834300 }, { "epoch": 11.495963186464964, "grad_norm": 0.08151814341545105, "learning_rate": 
7.232484733830844e-06, "loss": 0.2626, "step": 834400 }, { "epoch": 11.497340938524703, "grad_norm": 4.147632122039795, "learning_rate": 7.227068020744279e-06, "loss": 0.2282, "step": 834500 }, { "epoch": 11.498718690584443, "grad_norm": 0.3707927167415619, "learning_rate": 7.2216530299470095e-06, "loss": 0.2473, "step": 834600 }, { "epoch": 11.500096442644182, "grad_norm": 3.010037899017334, "learning_rate": 7.216239761899059e-06, "loss": 0.2504, "step": 834700 }, { "epoch": 11.50147419470392, "grad_norm": 3.460237503051758, "learning_rate": 7.210828217060303e-06, "loss": 0.2582, "step": 834800 }, { "epoch": 11.502851946763661, "grad_norm": 2.639714479446411, "learning_rate": 7.205418395890496e-06, "loss": 0.2535, "step": 834900 }, { "epoch": 11.5042296988234, "grad_norm": 2.8627631664276123, "learning_rate": 7.200010298849218e-06, "loss": 0.2206, "step": 835000 }, { "epoch": 11.505607450883138, "grad_norm": 1.9577405452728271, "learning_rate": 7.194603926395916e-06, "loss": 0.2695, "step": 835100 }, { "epoch": 11.506985202942879, "grad_norm": 6.856706619262695, "learning_rate": 7.189199278989902e-06, "loss": 0.2888, "step": 835200 }, { "epoch": 11.508362955002617, "grad_norm": 1.630034327507019, "learning_rate": 7.183796357090317e-06, "loss": 0.2348, "step": 835300 }, { "epoch": 11.509740707062358, "grad_norm": 2.0960500240325928, "learning_rate": 7.178395161156159e-06, "loss": 0.228, "step": 835400 }, { "epoch": 11.511118459122097, "grad_norm": 3.3258602619171143, "learning_rate": 7.172995691646301e-06, "loss": 0.2986, "step": 835500 }, { "epoch": 11.512496211181835, "grad_norm": 2.4146888256073, "learning_rate": 7.167597949019458e-06, "loss": 0.2644, "step": 835600 }, { "epoch": 11.513873963241576, "grad_norm": 0.9140627384185791, "learning_rate": 7.1622019337341784e-06, "loss": 0.2552, "step": 835700 }, { "epoch": 11.515251715301314, "grad_norm": 1.3609150648117065, "learning_rate": 7.156807646248902e-06, "loss": 0.2445, "step": 835800 }, { "epoch": 
11.516629467361053, "grad_norm": 2.699533462524414, "learning_rate": 7.151415087021891e-06, "loss": 0.2059, "step": 835900 }, { "epoch": 11.518007219420793, "grad_norm": 2.312948226928711, "learning_rate": 7.146024256511262e-06, "loss": 0.2191, "step": 836000 }, { "epoch": 11.519384971480532, "grad_norm": 2.9952449798583984, "learning_rate": 7.140635155175005e-06, "loss": 0.2373, "step": 836100 }, { "epoch": 11.520762723540273, "grad_norm": 2.5041937828063965, "learning_rate": 7.1352477834709555e-06, "loss": 0.2313, "step": 836200 }, { "epoch": 11.522140475600011, "grad_norm": 3.4156904220581055, "learning_rate": 7.129862141856784e-06, "loss": 0.2752, "step": 836300 }, { "epoch": 11.52351822765975, "grad_norm": 3.0293500423431396, "learning_rate": 7.1244782307900456e-06, "loss": 0.2655, "step": 836400 }, { "epoch": 11.52489597971949, "grad_norm": 19.489761352539062, "learning_rate": 7.119096050728113e-06, "loss": 0.2432, "step": 836500 }, { "epoch": 11.526273731779229, "grad_norm": 2.130206823348999, "learning_rate": 7.113769398042e-06, "loss": 0.2462, "step": 836600 }, { "epoch": 11.527651483838968, "grad_norm": 3.5766313076019287, "learning_rate": 7.108390664039837e-06, "loss": 0.2663, "step": 836700 }, { "epoch": 11.529029235898708, "grad_norm": 2.1663081645965576, "learning_rate": 7.103013662409202e-06, "loss": 0.2895, "step": 836800 }, { "epoch": 11.530406987958447, "grad_norm": 1.807091474533081, "learning_rate": 7.097638393606913e-06, "loss": 0.2139, "step": 836900 }, { "epoch": 11.531784740018185, "grad_norm": 2.310610055923462, "learning_rate": 7.0922648580896195e-06, "loss": 0.2193, "step": 837000 }, { "epoch": 11.533162492077926, "grad_norm": 1.3277782201766968, "learning_rate": 7.086893056313826e-06, "loss": 0.2319, "step": 837100 }, { "epoch": 11.534540244137665, "grad_norm": 7.29790735244751, "learning_rate": 7.0815766808259e-06, "loss": 0.2355, "step": 837200 }, { "epoch": 11.535917996197405, "grad_norm": 5.29612398147583, "learning_rate": 
7.076208330553256e-06, "loss": 0.2602, "step": 837300 }, { "epoch": 11.537295748257144, "grad_norm": 0.4412059187889099, "learning_rate": 7.07084171538621e-06, "loss": 0.2582, "step": 837400 }, { "epoch": 11.538673500316882, "grad_norm": 1.5155586004257202, "learning_rate": 7.065476835780676e-06, "loss": 0.2515, "step": 837500 }, { "epoch": 11.540051252376623, "grad_norm": 0.13803496956825256, "learning_rate": 7.060113692192419e-06, "loss": 0.246, "step": 837600 }, { "epoch": 11.541429004436361, "grad_norm": 1.4031720161437988, "learning_rate": 7.054752285077082e-06, "loss": 0.2418, "step": 837700 }, { "epoch": 11.542806756496102, "grad_norm": 2.182058572769165, "learning_rate": 7.049392614890126e-06, "loss": 0.2996, "step": 837800 }, { "epoch": 11.54418450855584, "grad_norm": 2.9265987873077393, "learning_rate": 7.04403468208689e-06, "loss": 0.2314, "step": 837900 }, { "epoch": 11.54556226061558, "grad_norm": 3.079571008682251, "learning_rate": 7.038678487122573e-06, "loss": 0.2492, "step": 838000 }, { "epoch": 11.54694001267532, "grad_norm": 4.0816497802734375, "learning_rate": 7.033324030452195e-06, "loss": 0.2177, "step": 838100 }, { "epoch": 11.548317764735058, "grad_norm": 3.5425825119018555, "learning_rate": 7.027971312530638e-06, "loss": 0.2991, "step": 838200 }, { "epoch": 11.549695516794797, "grad_norm": 3.13627552986145, "learning_rate": 7.0226203338126556e-06, "loss": 0.2338, "step": 838300 }, { "epoch": 11.551073268854537, "grad_norm": 0.2931537926197052, "learning_rate": 7.017271094752844e-06, "loss": 0.2394, "step": 838400 }, { "epoch": 11.552451020914276, "grad_norm": 1.6076018810272217, "learning_rate": 7.0119235958056315e-06, "loss": 0.235, "step": 838500 }, { "epoch": 11.553828772974015, "grad_norm": 2.1508982181549072, "learning_rate": 7.006577837425336e-06, "loss": 0.2503, "step": 838600 }, { "epoch": 11.555206525033755, "grad_norm": 3.9645638465881348, "learning_rate": 7.001233820066097e-06, "loss": 0.2565, "step": 838700 }, { "epoch": 
11.556584277093494, "grad_norm": 1.8535536527633667, "learning_rate": 6.995891544181909e-06, "loss": 0.2275, "step": 838800 }, { "epoch": 11.557962029153234, "grad_norm": 1.0841606855392456, "learning_rate": 6.990551010226628e-06, "loss": 0.2394, "step": 838900 }, { "epoch": 11.559339781212973, "grad_norm": 1.7006827592849731, "learning_rate": 6.985212218653977e-06, "loss": 0.2384, "step": 839000 }, { "epoch": 11.560717533272712, "grad_norm": 2.692349433898926, "learning_rate": 6.9798751699175e-06, "loss": 0.2523, "step": 839100 }, { "epoch": 11.562095285332452, "grad_norm": 0.5705708861351013, "learning_rate": 6.9745398644706e-06, "loss": 0.2699, "step": 839200 }, { "epoch": 11.56347303739219, "grad_norm": 1.5722116231918335, "learning_rate": 6.969206302766546e-06, "loss": 0.2228, "step": 839300 }, { "epoch": 11.56485078945193, "grad_norm": 1.0157722234725952, "learning_rate": 6.963874485258458e-06, "loss": 0.2048, "step": 839400 }, { "epoch": 11.56622854151167, "grad_norm": 0.03590843081474304, "learning_rate": 6.9585444123992855e-06, "loss": 0.2562, "step": 839500 }, { "epoch": 11.567606293571409, "grad_norm": 2.249206066131592, "learning_rate": 6.953216084641856e-06, "loss": 0.2291, "step": 839600 }, { "epoch": 11.568984045631149, "grad_norm": 2.757840156555176, "learning_rate": 6.947889502438856e-06, "loss": 0.2386, "step": 839700 }, { "epoch": 11.570361797690888, "grad_norm": 0.055385906249284744, "learning_rate": 6.942564666242767e-06, "loss": 0.2459, "step": 839800 }, { "epoch": 11.571739549750626, "grad_norm": 4.499365329742432, "learning_rate": 6.937241576505979e-06, "loss": 0.2494, "step": 839900 }, { "epoch": 11.573117301810367, "grad_norm": 2.5777533054351807, "learning_rate": 6.931920233680729e-06, "loss": 0.3055, "step": 840000 }, { "epoch": 11.574495053870105, "grad_norm": 3.3405919075012207, "learning_rate": 6.926600638219072e-06, "loss": 0.2401, "step": 840100 }, { "epoch": 11.575872805929844, "grad_norm": 2.5483813285827637, "learning_rate": 
6.921282790572949e-06, "loss": 0.2528, "step": 840200 }, { "epoch": 11.577250557989585, "grad_norm": 0.49346649646759033, "learning_rate": 6.915966691194129e-06, "loss": 0.2241, "step": 840300 }, { "epoch": 11.578628310049323, "grad_norm": 4.195450782775879, "learning_rate": 6.9106523405342526e-06, "loss": 0.2292, "step": 840400 }, { "epoch": 11.580006062109064, "grad_norm": 0.4413321912288666, "learning_rate": 6.905339739044784e-06, "loss": 0.2374, "step": 840500 }, { "epoch": 11.581383814168802, "grad_norm": 0.779982328414917, "learning_rate": 6.900028887177068e-06, "loss": 0.2414, "step": 840600 }, { "epoch": 11.582761566228541, "grad_norm": 2.2243194580078125, "learning_rate": 6.894719785382293e-06, "loss": 0.2511, "step": 840700 }, { "epoch": 11.584139318288281, "grad_norm": 47.015743255615234, "learning_rate": 6.889412434111491e-06, "loss": 0.2608, "step": 840800 }, { "epoch": 11.58551707034802, "grad_norm": 3.3263680934906006, "learning_rate": 6.884106833815538e-06, "loss": 0.2373, "step": 840900 }, { "epoch": 11.586894822407759, "grad_norm": 2.4658260345458984, "learning_rate": 6.878802984945184e-06, "loss": 0.1951, "step": 841000 }, { "epoch": 11.5882725744675, "grad_norm": 0.18568913638591766, "learning_rate": 6.873500887951007e-06, "loss": 0.2802, "step": 841100 }, { "epoch": 11.589650326527238, "grad_norm": 4.120938301086426, "learning_rate": 6.868200543283455e-06, "loss": 0.2541, "step": 841200 }, { "epoch": 11.591028078586977, "grad_norm": 1.1207767724990845, "learning_rate": 6.8629019513928284e-06, "loss": 0.2878, "step": 841300 }, { "epoch": 11.592405830646717, "grad_norm": 0.6157251596450806, "learning_rate": 6.8576051127292565e-06, "loss": 0.2254, "step": 841400 }, { "epoch": 11.593783582706456, "grad_norm": 3.177783489227295, "learning_rate": 6.852310027742729e-06, "loss": 0.259, "step": 841500 }, { "epoch": 11.595161334766196, "grad_norm": 15.110915184020996, "learning_rate": 6.847016696883097e-06, "loss": 0.2834, "step": 841600 }, { "epoch": 
11.596539086825935, "grad_norm": 5.06250524520874, "learning_rate": 6.841725120600067e-06, "loss": 0.2575, "step": 841700 }, { "epoch": 11.597916838885673, "grad_norm": 3.6861960887908936, "learning_rate": 6.836435299343162e-06, "loss": 0.2741, "step": 841800 }, { "epoch": 11.599294590945414, "grad_norm": 2.7153666019439697, "learning_rate": 6.831147233561806e-06, "loss": 0.2765, "step": 841900 }, { "epoch": 11.600672343005153, "grad_norm": 3.024301767349243, "learning_rate": 6.8258609237052294e-06, "loss": 0.2263, "step": 842000 }, { "epoch": 11.602050095064893, "grad_norm": 1.231520652770996, "learning_rate": 6.82057637022253e-06, "loss": 0.2378, "step": 842100 }, { "epoch": 11.603427847124632, "grad_norm": 1.4079076051712036, "learning_rate": 6.8153463928315165e-06, "loss": 0.2738, "step": 842200 }, { "epoch": 11.60480559918437, "grad_norm": 1.1549965143203735, "learning_rate": 6.810065335868344e-06, "loss": 0.253, "step": 842300 }, { "epoch": 11.60618335124411, "grad_norm": 2.221769094467163, "learning_rate": 6.80478603662098e-06, "loss": 0.2604, "step": 842400 }, { "epoch": 11.60756110330385, "grad_norm": 3.4098496437072754, "learning_rate": 6.799508495537916e-06, "loss": 0.208, "step": 842500 }, { "epoch": 11.608938855363588, "grad_norm": 6.142827987670898, "learning_rate": 6.794232713067504e-06, "loss": 0.2786, "step": 842600 }, { "epoch": 11.610316607423329, "grad_norm": 2.446105718612671, "learning_rate": 6.788958689657957e-06, "loss": 0.2241, "step": 842700 }, { "epoch": 11.611694359483067, "grad_norm": 2.647216320037842, "learning_rate": 6.783686425757321e-06, "loss": 0.2446, "step": 842800 }, { "epoch": 11.613072111542806, "grad_norm": 0.5367299914360046, "learning_rate": 6.778415921813505e-06, "loss": 0.3103, "step": 842900 }, { "epoch": 11.614449863602546, "grad_norm": 3.217264413833618, "learning_rate": 6.773147178274276e-06, "loss": 0.3026, "step": 843000 }, { "epoch": 11.615827615662285, "grad_norm": 2.834268808364868, "learning_rate": 
6.767880195587232e-06, "loss": 0.2602, "step": 843100 }, { "epoch": 11.617205367722025, "grad_norm": 10.983319282531738, "learning_rate": 6.762614974199824e-06, "loss": 0.295, "step": 843200 }, { "epoch": 11.618583119781764, "grad_norm": 1.2028474807739258, "learning_rate": 6.7573515145593644e-06, "loss": 0.2698, "step": 843300 }, { "epoch": 11.619960871841503, "grad_norm": 5.578560829162598, "learning_rate": 6.7520898171130195e-06, "loss": 0.2524, "step": 843400 }, { "epoch": 11.621338623901243, "grad_norm": 2.5727131366729736, "learning_rate": 6.746829882307782e-06, "loss": 0.2366, "step": 843500 }, { "epoch": 11.622716375960982, "grad_norm": 2.489309310913086, "learning_rate": 6.741571710590527e-06, "loss": 0.2462, "step": 843600 }, { "epoch": 11.62409412802072, "grad_norm": 1.6929024457931519, "learning_rate": 6.736315302407955e-06, "loss": 0.2186, "step": 843700 }, { "epoch": 11.625471880080461, "grad_norm": 2.097480297088623, "learning_rate": 6.731060658206617e-06, "loss": 0.2461, "step": 843800 }, { "epoch": 11.6268496321402, "grad_norm": 2.3578040599823, "learning_rate": 6.725807778432927e-06, "loss": 0.2533, "step": 843900 }, { "epoch": 11.62822738419994, "grad_norm": 2.406205177307129, "learning_rate": 6.720556663533157e-06, "loss": 0.2551, "step": 844000 }, { "epoch": 11.629605136259679, "grad_norm": 1.4332845211029053, "learning_rate": 6.715307313953405e-06, "loss": 0.2927, "step": 844100 }, { "epoch": 11.630982888319418, "grad_norm": 2.657992362976074, "learning_rate": 6.710059730139621e-06, "loss": 0.225, "step": 844200 }, { "epoch": 11.632360640379158, "grad_norm": 3.3074700832366943, "learning_rate": 6.704813912537625e-06, "loss": 0.2334, "step": 844300 }, { "epoch": 11.633738392438897, "grad_norm": 0.8223738670349121, "learning_rate": 6.6995698615930815e-06, "loss": 0.2664, "step": 844400 }, { "epoch": 11.635116144498635, "grad_norm": 2.2995352745056152, "learning_rate": 6.6943275777514825e-06, "loss": 0.2291, "step": 844500 }, { "epoch": 
11.636493896558376, "grad_norm": 4.352041244506836, "learning_rate": 6.6890870614581995e-06, "loss": 0.2339, "step": 844600 }, { "epoch": 11.637871648618114, "grad_norm": 1.7202414274215698, "learning_rate": 6.683848313158454e-06, "loss": 0.2454, "step": 844700 }, { "epoch": 11.639249400677855, "grad_norm": 1.3922858238220215, "learning_rate": 6.678663694340655e-06, "loss": 0.2758, "step": 844800 }, { "epoch": 11.640627152737594, "grad_norm": 3.378896474838257, "learning_rate": 6.673428465671927e-06, "loss": 0.2689, "step": 844900 }, { "epoch": 11.642004904797332, "grad_norm": 2.1404786109924316, "learning_rate": 6.668195006326998e-06, "loss": 0.2037, "step": 845000 }, { "epoch": 11.643382656857073, "grad_norm": 3.217641830444336, "learning_rate": 6.662963316750487e-06, "loss": 0.2665, "step": 845100 }, { "epoch": 11.644760408916811, "grad_norm": 2.824876070022583, "learning_rate": 6.657733397386842e-06, "loss": 0.2418, "step": 845200 }, { "epoch": 11.64613816097655, "grad_norm": 1.7568999528884888, "learning_rate": 6.6525052486803655e-06, "loss": 0.249, "step": 845300 }, { "epoch": 11.64751591303629, "grad_norm": 2.377793073654175, "learning_rate": 6.647278871075225e-06, "loss": 0.2878, "step": 845400 }, { "epoch": 11.648893665096029, "grad_norm": 3.0749120712280273, "learning_rate": 6.6420542650154345e-06, "loss": 0.2163, "step": 845500 }, { "epoch": 11.650271417155768, "grad_norm": 2.069171190261841, "learning_rate": 6.636831430944832e-06, "loss": 0.2063, "step": 845600 }, { "epoch": 11.651649169215508, "grad_norm": 4.620945453643799, "learning_rate": 6.631610369307137e-06, "loss": 0.2982, "step": 845700 }, { "epoch": 11.653026921275247, "grad_norm": 3.1854896545410156, "learning_rate": 6.6264432646563294e-06, "loss": 0.2258, "step": 845800 }, { "epoch": 11.654404673334987, "grad_norm": 2.1610448360443115, "learning_rate": 6.621225731479576e-06, "loss": 0.2812, "step": 845900 }, { "epoch": 11.655782425394726, "grad_norm": 2.504307985305786, "learning_rate": 
6.616009972061503e-06, "loss": 0.2451, "step": 846000 }, { "epoch": 11.657160177454465, "grad_norm": 1.9328135251998901, "learning_rate": 6.6107959868452245e-06, "loss": 0.2721, "step": 846100 }, { "epoch": 11.658537929514205, "grad_norm": 0.039619311690330505, "learning_rate": 6.605583776273706e-06, "loss": 0.2269, "step": 846200 }, { "epoch": 11.659915681573944, "grad_norm": 2.4206550121307373, "learning_rate": 6.600373340789739e-06, "loss": 0.2651, "step": 846300 }, { "epoch": 11.661293433633684, "grad_norm": 2.9655678272247314, "learning_rate": 6.595164680835969e-06, "loss": 0.2417, "step": 846400 }, { "epoch": 11.662671185693423, "grad_norm": 1.7894355058670044, "learning_rate": 6.5899577968549165e-06, "loss": 0.2761, "step": 846500 }, { "epoch": 11.664048937753162, "grad_norm": 0.5256152749061584, "learning_rate": 6.584752689288916e-06, "loss": 0.2438, "step": 846600 }, { "epoch": 11.665426689812902, "grad_norm": 0.6700682640075684, "learning_rate": 6.5795493585801786e-06, "loss": 0.2662, "step": 846700 }, { "epoch": 11.66680444187264, "grad_norm": 1.5879675149917603, "learning_rate": 6.574347805170759e-06, "loss": 0.2817, "step": 846800 }, { "epoch": 11.66818219393238, "grad_norm": 20.59263038635254, "learning_rate": 6.569148029502546e-06, "loss": 0.3195, "step": 846900 }, { "epoch": 11.66955994599212, "grad_norm": 2.663388967514038, "learning_rate": 6.563950032017284e-06, "loss": 0.243, "step": 847000 }, { "epoch": 11.670937698051858, "grad_norm": 2.23234486579895, "learning_rate": 6.558753813156573e-06, "loss": 0.2536, "step": 847100 }, { "epoch": 11.672315450111597, "grad_norm": 2.4206724166870117, "learning_rate": 6.553559373361867e-06, "loss": 0.2948, "step": 847200 }, { "epoch": 11.673693202171338, "grad_norm": 2.317769765853882, "learning_rate": 6.548366713074447e-06, "loss": 0.2916, "step": 847300 }, { "epoch": 11.675070954231076, "grad_norm": 4.524205207824707, "learning_rate": 6.543175832735471e-06, "loss": 0.2382, "step": 847400 }, { "epoch": 
11.676448706290817, "grad_norm": 1.031415581703186, "learning_rate": 6.537986732785923e-06, "loss": 0.1966, "step": 847500 }, { "epoch": 11.677826458350555, "grad_norm": 1.2295082807540894, "learning_rate": 6.532799413666634e-06, "loss": 0.2782, "step": 847600 }, { "epoch": 11.679204210410294, "grad_norm": 2.6194474697113037, "learning_rate": 6.527613875818305e-06, "loss": 0.2493, "step": 847700 }, { "epoch": 11.680581962470034, "grad_norm": 1.2957565784454346, "learning_rate": 6.522430119681476e-06, "loss": 0.2252, "step": 847800 }, { "epoch": 11.681959714529773, "grad_norm": 2.196861982345581, "learning_rate": 6.5172481456965225e-06, "loss": 0.2161, "step": 847900 }, { "epoch": 11.683337466589512, "grad_norm": 3.391505002975464, "learning_rate": 6.512067954303693e-06, "loss": 0.2171, "step": 848000 }, { "epoch": 11.684715218649252, "grad_norm": 1.6934423446655273, "learning_rate": 6.506889545943055e-06, "loss": 0.2348, "step": 848100 }, { "epoch": 11.686092970708991, "grad_norm": 4.701761245727539, "learning_rate": 6.501712921054559e-06, "loss": 0.2143, "step": 848200 }, { "epoch": 11.687470722768731, "grad_norm": 10.194208145141602, "learning_rate": 6.496538080077969e-06, "loss": 0.2064, "step": 848300 }, { "epoch": 11.68884847482847, "grad_norm": 2.013437509536743, "learning_rate": 6.491365023452925e-06, "loss": 0.2317, "step": 848400 }, { "epoch": 11.690226226888209, "grad_norm": 2.2102787494659424, "learning_rate": 6.486193751618916e-06, "loss": 0.2331, "step": 848500 }, { "epoch": 11.69160397894795, "grad_norm": 2.1643428802490234, "learning_rate": 6.481024265015235e-06, "loss": 0.2533, "step": 848600 }, { "epoch": 11.692981731007688, "grad_norm": 1.741525411605835, "learning_rate": 6.475856564081078e-06, "loss": 0.2438, "step": 848700 }, { "epoch": 11.694359483067426, "grad_norm": 1.0578083992004395, "learning_rate": 6.470690649255471e-06, "loss": 0.2508, "step": 848800 }, { "epoch": 11.695737235127167, "grad_norm": 2.697371244430542, "learning_rate": 
6.4655265209772726e-06, "loss": 0.2536, "step": 848900 }, { "epoch": 11.697114987186906, "grad_norm": 1.634110689163208, "learning_rate": 6.460364179685206e-06, "loss": 0.2694, "step": 849000 }, { "epoch": 11.698492739246646, "grad_norm": 2.7399866580963135, "learning_rate": 6.455203625817849e-06, "loss": 0.3063, "step": 849100 }, { "epoch": 11.699870491306385, "grad_norm": 2.803243637084961, "learning_rate": 6.4500448598136085e-06, "loss": 0.2526, "step": 849200 }, { "epoch": 11.701248243366123, "grad_norm": 1.7457787990570068, "learning_rate": 6.444887882110738e-06, "loss": 0.2406, "step": 849300 }, { "epoch": 11.702625995425864, "grad_norm": 1.6382092237472534, "learning_rate": 6.439732693147364e-06, "loss": 0.2248, "step": 849400 }, { "epoch": 11.704003747485602, "grad_norm": 1.9540066719055176, "learning_rate": 6.434579293361444e-06, "loss": 0.2177, "step": 849500 }, { "epoch": 11.705381499545341, "grad_norm": 2.8187649250030518, "learning_rate": 6.429427683190786e-06, "loss": 0.2725, "step": 849600 }, { "epoch": 11.706759251605082, "grad_norm": 0.25920233130455017, "learning_rate": 6.424277863073033e-06, "loss": 0.2198, "step": 849700 }, { "epoch": 11.70813700366482, "grad_norm": 1.2644122838974, "learning_rate": 6.419129833445709e-06, "loss": 0.251, "step": 849800 }, { "epoch": 11.709514755724559, "grad_norm": 2.229199171066284, "learning_rate": 6.413983594746143e-06, "loss": 0.2385, "step": 849900 }, { "epoch": 11.7108925077843, "grad_norm": 6.176631927490234, "learning_rate": 6.408839147411549e-06, "loss": 0.2272, "step": 850000 }, { "epoch": 11.712270259844038, "grad_norm": 4.0549139976501465, "learning_rate": 6.403696491878979e-06, "loss": 0.2202, "step": 850100 }, { "epoch": 11.713648011903778, "grad_norm": 0.3421748876571655, "learning_rate": 6.398555628585318e-06, "loss": 0.2301, "step": 850200 }, { "epoch": 11.715025763963517, "grad_norm": 8.835271835327148, "learning_rate": 6.393416557967303e-06, "loss": 0.2232, "step": 850300 }, { "epoch": 
11.716403516023256, "grad_norm": 6.385168075561523, "learning_rate": 6.3882792804615335e-06, "loss": 0.246, "step": 850400 }, { "epoch": 11.717781268082996, "grad_norm": 8.807048797607422, "learning_rate": 6.383143796504454e-06, "loss": 0.2604, "step": 850500 }, { "epoch": 11.719159020142735, "grad_norm": 3.4053125381469727, "learning_rate": 6.378010106532331e-06, "loss": 0.2763, "step": 850600 }, { "epoch": 11.720536772202475, "grad_norm": 2.7771785259246826, "learning_rate": 6.372878210981319e-06, "loss": 0.2316, "step": 850700 }, { "epoch": 11.721914524262214, "grad_norm": 2.2014052867889404, "learning_rate": 6.367748110287389e-06, "loss": 0.2479, "step": 850800 }, { "epoch": 11.723292276321953, "grad_norm": 1.3412858247756958, "learning_rate": 6.362619804886361e-06, "loss": 0.2076, "step": 850900 }, { "epoch": 11.724670028381693, "grad_norm": 4.020803928375244, "learning_rate": 6.357493295213918e-06, "loss": 0.2481, "step": 851000 }, { "epoch": 11.726047780441432, "grad_norm": 3.667405843734741, "learning_rate": 6.352368581705591e-06, "loss": 0.2889, "step": 851100 }, { "epoch": 11.72742553250117, "grad_norm": 2.3711185455322266, "learning_rate": 6.347245664796738e-06, "loss": 0.2992, "step": 851200 }, { "epoch": 11.728803284560911, "grad_norm": 2.894505500793457, "learning_rate": 6.342124544922587e-06, "loss": 0.2308, "step": 851300 }, { "epoch": 11.73018103662065, "grad_norm": 1.6895906925201416, "learning_rate": 6.337005222518194e-06, "loss": 0.2841, "step": 851400 }, { "epoch": 11.731558788680388, "grad_norm": 0.23754964768886566, "learning_rate": 6.331887698018483e-06, "loss": 0.2685, "step": 851500 }, { "epoch": 11.732936540740129, "grad_norm": 3.2579987049102783, "learning_rate": 6.3267719718582004e-06, "loss": 0.2747, "step": 851600 }, { "epoch": 11.734314292799867, "grad_norm": 6.439496994018555, "learning_rate": 6.321658044471959e-06, "loss": 0.2848, "step": 851700 }, { "epoch": 11.735692044859608, "grad_norm": 2.8947651386260986, "learning_rate": 
6.316597028668494e-06, "loss": 0.2633, "step": 851800 }, { "epoch": 11.737069796919346, "grad_norm": 2.745131731033325, "learning_rate": 6.311486682134979e-06, "loss": 0.2302, "step": 851900 }, { "epoch": 11.738447548979085, "grad_norm": 0.8721083402633667, "learning_rate": 6.306378135674061e-06, "loss": 0.2778, "step": 852000 }, { "epoch": 11.739825301038826, "grad_norm": 1.388418436050415, "learning_rate": 6.301271389719749e-06, "loss": 0.253, "step": 852100 }, { "epoch": 11.741203053098564, "grad_norm": 3.2740159034729004, "learning_rate": 6.29616644470589e-06, "loss": 0.2729, "step": 852200 }, { "epoch": 11.742580805158303, "grad_norm": 2.527209520339966, "learning_rate": 6.291063301066161e-06, "loss": 0.2941, "step": 852300 }, { "epoch": 11.743958557218043, "grad_norm": 4.9052414894104, "learning_rate": 6.285961959234119e-06, "loss": 0.2582, "step": 852400 }, { "epoch": 11.745336309277782, "grad_norm": 1.8431191444396973, "learning_rate": 6.280862419643139e-06, "loss": 0.2782, "step": 852500 }, { "epoch": 11.746714061337522, "grad_norm": 2.012159585952759, "learning_rate": 6.275764682726441e-06, "loss": 0.2934, "step": 852600 }, { "epoch": 11.748091813397261, "grad_norm": 5.73352575302124, "learning_rate": 6.27066874891712e-06, "loss": 0.2461, "step": 852700 }, { "epoch": 11.749469565457, "grad_norm": 4.38605260848999, "learning_rate": 6.265574618648104e-06, "loss": 0.2805, "step": 852800 }, { "epoch": 11.75084731751674, "grad_norm": 0.7146219611167908, "learning_rate": 6.260482292352158e-06, "loss": 0.2547, "step": 852900 }, { "epoch": 11.752225069576479, "grad_norm": 3.161679983139038, "learning_rate": 6.2553917704618946e-06, "loss": 0.2342, "step": 853000 }, { "epoch": 11.753602821636218, "grad_norm": 1.6239672899246216, "learning_rate": 6.250303053409787e-06, "loss": 0.2584, "step": 853100 }, { "epoch": 11.754980573695958, "grad_norm": 1.5768834352493286, "learning_rate": 6.245216141628156e-06, "loss": 0.271, "step": 853200 }, { "epoch": 
11.756358325755697, "grad_norm": 1.0062748193740845, "learning_rate": 6.240181877670289e-06, "loss": 0.2344, "step": 853300 }, { "epoch": 11.757736077815437, "grad_norm": 4.526578426361084, "learning_rate": 6.235098559662425e-06, "loss": 0.2503, "step": 853400 }, { "epoch": 11.759113829875176, "grad_norm": 2.852250337600708, "learning_rate": 6.230017048216739e-06, "loss": 0.2317, "step": 853500 }, { "epoch": 11.760491581934915, "grad_norm": 2.481154680252075, "learning_rate": 6.224937343764918e-06, "loss": 0.2559, "step": 853600 }, { "epoch": 11.761869333994655, "grad_norm": 2.323820114135742, "learning_rate": 6.219859446738507e-06, "loss": 0.2722, "step": 853700 }, { "epoch": 11.763247086054394, "grad_norm": 19.35616683959961, "learning_rate": 6.214783357568904e-06, "loss": 0.2672, "step": 853800 }, { "epoch": 11.764624838114132, "grad_norm": 2.4237706661224365, "learning_rate": 6.209709076687358e-06, "loss": 0.2722, "step": 853900 }, { "epoch": 11.766002590173873, "grad_norm": 1.6251882314682007, "learning_rate": 6.204636604524946e-06, "loss": 0.1929, "step": 854000 }, { "epoch": 11.767380342233611, "grad_norm": 2.4958372116088867, "learning_rate": 6.199565941512592e-06, "loss": 0.2408, "step": 854100 }, { "epoch": 11.76875809429335, "grad_norm": 0.703154444694519, "learning_rate": 6.194497088081081e-06, "loss": 0.2558, "step": 854200 }, { "epoch": 11.77013584635309, "grad_norm": 1.5437828302383423, "learning_rate": 6.189430044661052e-06, "loss": 0.2491, "step": 854300 }, { "epoch": 11.77151359841283, "grad_norm": 1.4526002407073975, "learning_rate": 6.184364811682955e-06, "loss": 0.2851, "step": 854400 }, { "epoch": 11.77289135047257, "grad_norm": 2.455385684967041, "learning_rate": 6.1793013895771286e-06, "loss": 0.2481, "step": 854500 }, { "epoch": 11.774269102532308, "grad_norm": 0.01590082049369812, "learning_rate": 6.1742397787737224e-06, "loss": 0.246, "step": 854600 }, { "epoch": 11.775646854592047, "grad_norm": 0.9369410276412964, "learning_rate": 
6.169179979702741e-06, "loss": 0.2364, "step": 854700 }, { "epoch": 11.777024606651787, "grad_norm": 3.834155321121216, "learning_rate": 6.164121992794048e-06, "loss": 0.2143, "step": 854800 }, { "epoch": 11.778402358711526, "grad_norm": 0.04187556728720665, "learning_rate": 6.159065818477354e-06, "loss": 0.2607, "step": 854900 }, { "epoch": 11.779780110771267, "grad_norm": 2.9726696014404297, "learning_rate": 6.154011457182189e-06, "loss": 0.2266, "step": 855000 }, { "epoch": 11.781157862831005, "grad_norm": 2.342405319213867, "learning_rate": 6.148958909337964e-06, "loss": 0.2263, "step": 855100 }, { "epoch": 11.782535614890744, "grad_norm": 0.011447679251432419, "learning_rate": 6.143908175373906e-06, "loss": 0.2763, "step": 855200 }, { "epoch": 11.783913366950484, "grad_norm": 2.4221973419189453, "learning_rate": 6.13885925571911e-06, "loss": 0.251, "step": 855300 }, { "epoch": 11.785291119010223, "grad_norm": 1.5310330390930176, "learning_rate": 6.133812150802497e-06, "loss": 0.2434, "step": 855400 }, { "epoch": 11.786668871069962, "grad_norm": 4.3105950355529785, "learning_rate": 6.1287668610528596e-06, "loss": 0.2476, "step": 855500 }, { "epoch": 11.788046623129702, "grad_norm": 1.146614909172058, "learning_rate": 6.123723386898799e-06, "loss": 0.2231, "step": 855600 }, { "epoch": 11.78942437518944, "grad_norm": 0.026918886229395866, "learning_rate": 6.118681728768809e-06, "loss": 0.2525, "step": 855700 }, { "epoch": 11.79080212724918, "grad_norm": 3.294219970703125, "learning_rate": 6.113641887091185e-06, "loss": 0.275, "step": 855800 }, { "epoch": 11.79217987930892, "grad_norm": 1.6708319187164307, "learning_rate": 6.108603862294099e-06, "loss": 0.2368, "step": 855900 }, { "epoch": 11.793557631368659, "grad_norm": 3.0290560722351074, "learning_rate": 6.103567654805547e-06, "loss": 0.2509, "step": 856000 }, { "epoch": 11.794935383428399, "grad_norm": 1.4740827083587646, "learning_rate": 6.098533265053388e-06, "loss": 0.2361, "step": 856100 }, { "epoch": 
11.796313135488138, "grad_norm": 1.8212511539459229, "learning_rate": 6.093500693465319e-06, "loss": 0.2402, "step": 856200 }, { "epoch": 11.797690887547876, "grad_norm": 1.5977269411087036, "learning_rate": 6.088469940468885e-06, "loss": 0.2547, "step": 856300 }, { "epoch": 11.799068639607617, "grad_norm": 1.766414761543274, "learning_rate": 6.083441006491464e-06, "loss": 0.2545, "step": 856400 }, { "epoch": 11.800446391667355, "grad_norm": 0.024048594757914543, "learning_rate": 6.0784138919603005e-06, "loss": 0.2307, "step": 856500 }, { "epoch": 11.801824143727094, "grad_norm": 5.119098663330078, "learning_rate": 6.073388597302463e-06, "loss": 0.2675, "step": 856600 }, { "epoch": 11.803201895786835, "grad_norm": 7.258779048919678, "learning_rate": 6.068365122944882e-06, "loss": 0.2393, "step": 856700 }, { "epoch": 11.804579647846573, "grad_norm": 2.4528822898864746, "learning_rate": 6.063343469314335e-06, "loss": 0.2634, "step": 856800 }, { "epoch": 11.805957399906314, "grad_norm": 2.5590145587921143, "learning_rate": 6.058323636837429e-06, "loss": 0.2004, "step": 856900 }, { "epoch": 11.807335151966052, "grad_norm": 1.577837347984314, "learning_rate": 6.053305625940616e-06, "loss": 0.1994, "step": 857000 }, { "epoch": 11.808712904025791, "grad_norm": 2.923936367034912, "learning_rate": 6.048289437050213e-06, "loss": 0.2318, "step": 857100 }, { "epoch": 11.810090656085531, "grad_norm": 5.290594100952148, "learning_rate": 6.043275070592373e-06, "loss": 0.251, "step": 857200 }, { "epoch": 11.81146840814527, "grad_norm": 0.627910315990448, "learning_rate": 6.038262526993083e-06, "loss": 0.2148, "step": 857300 }, { "epoch": 11.812846160205009, "grad_norm": 1.210464358329773, "learning_rate": 6.033251806678197e-06, "loss": 0.2952, "step": 857400 }, { "epoch": 11.81422391226475, "grad_norm": 4.6392903327941895, "learning_rate": 6.028242910073391e-06, "loss": 0.2146, "step": 857500 }, { "epoch": 11.815601664324488, "grad_norm": 2.447746992111206, "learning_rate": 
6.023235837604192e-06, "loss": 0.2665, "step": 857600 }, { "epoch": 11.816979416384228, "grad_norm": 0.88821941614151, "learning_rate": 6.018280633142093e-06, "loss": 0.1949, "step": 857700 }, { "epoch": 11.818357168443967, "grad_norm": 2.402385711669922, "learning_rate": 6.0132771919681295e-06, "loss": 0.2509, "step": 857800 }, { "epoch": 11.819734920503706, "grad_norm": 2.473785877227783, "learning_rate": 6.0082755762012e-06, "loss": 0.2573, "step": 857900 }, { "epoch": 11.821112672563446, "grad_norm": 2.7061538696289062, "learning_rate": 6.00327578626621e-06, "loss": 0.2898, "step": 858000 }, { "epoch": 11.822490424623185, "grad_norm": 2.047415256500244, "learning_rate": 5.998277822587914e-06, "loss": 0.2523, "step": 858100 }, { "epoch": 11.823868176682923, "grad_norm": 2.0770041942596436, "learning_rate": 5.993331637917423e-06, "loss": 0.2451, "step": 858200 }, { "epoch": 11.825245928742664, "grad_norm": 3.0573275089263916, "learning_rate": 5.9883373097530255e-06, "loss": 0.2033, "step": 858300 }, { "epoch": 11.826623680802403, "grad_norm": 1.6069756746292114, "learning_rate": 5.983344809114425e-06, "loss": 0.2668, "step": 858400 }, { "epoch": 11.828001432862141, "grad_norm": 2.2553911209106445, "learning_rate": 5.978354136425756e-06, "loss": 0.2654, "step": 858500 }, { "epoch": 11.829379184921882, "grad_norm": 4.689764499664307, "learning_rate": 5.973365292111005e-06, "loss": 0.2621, "step": 858600 }, { "epoch": 11.83075693698162, "grad_norm": 0.35373467206954956, "learning_rate": 5.96837827659401e-06, "loss": 0.2149, "step": 858700 }, { "epoch": 11.83213468904136, "grad_norm": 0.8575272560119629, "learning_rate": 5.963393090298426e-06, "loss": 0.1969, "step": 858800 }, { "epoch": 11.8335124411011, "grad_norm": 2.2419016361236572, "learning_rate": 5.958409733647782e-06, "loss": 0.2443, "step": 858900 }, { "epoch": 11.834890193160838, "grad_norm": 3.074385404586792, "learning_rate": 5.9534282070654434e-06, "loss": 0.2682, "step": 859000 }, { "epoch": 
11.836267945220579, "grad_norm": 2.369394063949585, "learning_rate": 5.948448510974612e-06, "loss": 0.3274, "step": 859100 }, { "epoch": 11.837645697280317, "grad_norm": 13.421640396118164, "learning_rate": 5.9434706457983295e-06, "loss": 0.275, "step": 859200 }, { "epoch": 11.839023449340058, "grad_norm": 2.7182819843292236, "learning_rate": 5.938494611959504e-06, "loss": 0.279, "step": 859300 }, { "epoch": 11.840401201399796, "grad_norm": 2.0081160068511963, "learning_rate": 5.93352040988086e-06, "loss": 0.3023, "step": 859400 }, { "epoch": 11.841778953459535, "grad_norm": 3.94633150100708, "learning_rate": 5.9285480399850014e-06, "loss": 0.2488, "step": 859500 }, { "epoch": 11.843156705519275, "grad_norm": 10.242376327514648, "learning_rate": 5.923577502694336e-06, "loss": 0.2418, "step": 859600 }, { "epoch": 11.844534457579014, "grad_norm": 0.5839519500732422, "learning_rate": 5.918608798431154e-06, "loss": 0.2283, "step": 859700 }, { "epoch": 11.845912209638753, "grad_norm": 4.316906452178955, "learning_rate": 5.913641927617556e-06, "loss": 0.2945, "step": 859800 }, { "epoch": 11.847289961698493, "grad_norm": 3.4589645862579346, "learning_rate": 5.9086768906755135e-06, "loss": 0.2747, "step": 859900 }, { "epoch": 11.848667713758232, "grad_norm": 5.400808811187744, "learning_rate": 5.903713688026836e-06, "loss": 0.2716, "step": 860000 }, { "epoch": 11.85004546581797, "grad_norm": 2.687760829925537, "learning_rate": 5.898752320093165e-06, "loss": 0.2363, "step": 860100 }, { "epoch": 11.851423217877711, "grad_norm": 1.7053422927856445, "learning_rate": 5.8937927872959885e-06, "loss": 0.2538, "step": 860200 }, { "epoch": 11.85280096993745, "grad_norm": 1.6721010208129883, "learning_rate": 5.888835090056662e-06, "loss": 0.235, "step": 860300 }, { "epoch": 11.85417872199719, "grad_norm": 0.1034838855266571, "learning_rate": 5.883879228796346e-06, "loss": 0.2095, "step": 860400 }, { "epoch": 11.855556474056929, "grad_norm": 1.4383562803268433, "learning_rate": 
5.878925203936077e-06, "loss": 0.2646, "step": 860500 }, { "epoch": 11.856934226116667, "grad_norm": 2.245439052581787, "learning_rate": 5.873973015896733e-06, "loss": 0.2436, "step": 860600 }, { "epoch": 11.858311978176408, "grad_norm": 0.015083450824022293, "learning_rate": 5.869022665099018e-06, "loss": 0.2509, "step": 860700 }, { "epoch": 11.859689730236147, "grad_norm": 1.7889741659164429, "learning_rate": 5.864123627997033e-06, "loss": 0.2687, "step": 860800 }, { "epoch": 11.861067482295885, "grad_norm": 1.2468163967132568, "learning_rate": 5.859176934561183e-06, "loss": 0.2137, "step": 860900 }, { "epoch": 11.862445234355626, "grad_norm": 2.873322010040283, "learning_rate": 5.854232079623966e-06, "loss": 0.2442, "step": 861000 }, { "epoch": 11.863822986415364, "grad_norm": 6.619929313659668, "learning_rate": 5.84928906360548e-06, "loss": 0.2811, "step": 861100 }, { "epoch": 11.865200738475105, "grad_norm": 2.808436632156372, "learning_rate": 5.844347886925654e-06, "loss": 0.2339, "step": 861200 }, { "epoch": 11.866578490534843, "grad_norm": 0.7952606678009033, "learning_rate": 5.839408550004253e-06, "loss": 0.2548, "step": 861300 }, { "epoch": 11.867956242594582, "grad_norm": 2.291318655014038, "learning_rate": 5.834471053260921e-06, "loss": 0.2271, "step": 861400 }, { "epoch": 11.869333994654323, "grad_norm": 0.4325435757637024, "learning_rate": 5.829535397115104e-06, "loss": 0.261, "step": 861500 }, { "epoch": 11.870711746714061, "grad_norm": 2.2044060230255127, "learning_rate": 5.824601581986122e-06, "loss": 0.221, "step": 861600 }, { "epoch": 11.8720894987738, "grad_norm": 0.16177278757095337, "learning_rate": 5.819669608293133e-06, "loss": 0.2211, "step": 861700 }, { "epoch": 11.87346725083354, "grad_norm": 1.851431131362915, "learning_rate": 5.814739476455127e-06, "loss": 0.2219, "step": 861800 }, { "epoch": 11.874845002893279, "grad_norm": 2.610363721847534, "learning_rate": 5.809860460665949e-06, "loss": 0.2337, "step": 861900 }, { "epoch": 
11.87622275495302, "grad_norm": 2.0685129165649414, "learning_rate": 5.804933995365266e-06, "loss": 0.2659, "step": 862000 }, { "epoch": 11.877600507012758, "grad_norm": 3.0322415828704834, "learning_rate": 5.800009373171441e-06, "loss": 0.242, "step": 862100 }, { "epoch": 11.878978259072497, "grad_norm": 2.206204652786255, "learning_rate": 5.795086594502827e-06, "loss": 0.2253, "step": 862200 }, { "epoch": 11.880356011132237, "grad_norm": 0.14909206330776215, "learning_rate": 5.790165659777657e-06, "loss": 0.2298, "step": 862300 }, { "epoch": 11.881733763191976, "grad_norm": 3.1101622581481934, "learning_rate": 5.78524656941397e-06, "loss": 0.2383, "step": 862400 }, { "epoch": 11.883111515251715, "grad_norm": 3.2435362339019775, "learning_rate": 5.780329323829688e-06, "loss": 0.2325, "step": 862500 }, { "epoch": 11.884489267311455, "grad_norm": 1.3852076530456543, "learning_rate": 5.775413923442543e-06, "loss": 0.2124, "step": 862600 }, { "epoch": 11.885867019371194, "grad_norm": 5.233317852020264, "learning_rate": 5.770500368670125e-06, "loss": 0.198, "step": 862700 }, { "epoch": 11.887244771430932, "grad_norm": 2.648132562637329, "learning_rate": 5.765588659929881e-06, "loss": 0.2392, "step": 862800 }, { "epoch": 11.888622523490673, "grad_norm": 5.508492946624756, "learning_rate": 5.760678797639062e-06, "loss": 0.2509, "step": 862900 }, { "epoch": 11.890000275550412, "grad_norm": 2.671232223510742, "learning_rate": 5.7557707822147924e-06, "loss": 0.2147, "step": 863000 }, { "epoch": 11.891378027610152, "grad_norm": 0.22083638608455658, "learning_rate": 5.750864614074047e-06, "loss": 0.2501, "step": 863100 }, { "epoch": 11.89275577966989, "grad_norm": 0.7079687118530273, "learning_rate": 5.745960293633614e-06, "loss": 0.2703, "step": 863200 }, { "epoch": 11.89413353172963, "grad_norm": 0.8027519583702087, "learning_rate": 5.7410578213101455e-06, "loss": 0.2168, "step": 863300 }, { "epoch": 11.89551128378937, "grad_norm": 2.065736770629883, "learning_rate": 
5.7361571975201425e-06, "loss": 0.2071, "step": 863400 }, { "epoch": 11.896889035849108, "grad_norm": 1.3039445877075195, "learning_rate": 5.731258422679926e-06, "loss": 0.2674, "step": 863500 }, { "epoch": 11.898266787908849, "grad_norm": 1.9713029861450195, "learning_rate": 5.726361497205667e-06, "loss": 0.2245, "step": 863600 }, { "epoch": 11.899644539968588, "grad_norm": 2.251127004623413, "learning_rate": 5.721466421513394e-06, "loss": 0.252, "step": 863700 }, { "epoch": 11.901022292028326, "grad_norm": 5.01297664642334, "learning_rate": 5.7165731960189754e-06, "loss": 0.3057, "step": 863800 }, { "epoch": 11.902400044088067, "grad_norm": 0.8853189945220947, "learning_rate": 5.711681821138104e-06, "loss": 0.2352, "step": 863900 }, { "epoch": 11.903777796147805, "grad_norm": 2.0619289875030518, "learning_rate": 5.706792297286325e-06, "loss": 0.2204, "step": 864000 }, { "epoch": 11.905155548207544, "grad_norm": 2.3923721313476562, "learning_rate": 5.7019046248790375e-06, "loss": 0.2724, "step": 864100 }, { "epoch": 11.906533300267284, "grad_norm": 2.575901746749878, "learning_rate": 5.697018804331466e-06, "loss": 0.2564, "step": 864200 }, { "epoch": 11.907911052327023, "grad_norm": 3.117515802383423, "learning_rate": 5.69213483605869e-06, "loss": 0.2141, "step": 864300 }, { "epoch": 11.909288804386762, "grad_norm": 2.222916603088379, "learning_rate": 5.687252720475637e-06, "loss": 0.2245, "step": 864400 }, { "epoch": 11.910666556446502, "grad_norm": 2.944580316543579, "learning_rate": 5.682372457997056e-06, "loss": 0.2625, "step": 864500 }, { "epoch": 11.91204430850624, "grad_norm": 1.9862767457962036, "learning_rate": 5.677494049037547e-06, "loss": 0.1844, "step": 864600 }, { "epoch": 11.913422060565981, "grad_norm": 0.038270335644483566, "learning_rate": 5.672617494011561e-06, "loss": 0.2479, "step": 864700 }, { "epoch": 11.91479981262572, "grad_norm": 1.7927380800247192, "learning_rate": 5.667742793333395e-06, "loss": 0.2493, "step": 864800 }, { "epoch": 
11.916177564685459, "grad_norm": 7.083962440490723, "learning_rate": 5.6628699474171645e-06, "loss": 0.2163, "step": 864900 }, { "epoch": 11.917555316745199, "grad_norm": 2.262510061264038, "learning_rate": 5.657998956676849e-06, "loss": 0.281, "step": 865000 }, { "epoch": 11.918933068804938, "grad_norm": 1.6414446830749512, "learning_rate": 5.6531298215262774e-06, "loss": 0.3048, "step": 865100 }, { "epoch": 11.920310820864676, "grad_norm": 2.964630365371704, "learning_rate": 5.648262542379086e-06, "loss": 0.2005, "step": 865200 }, { "epoch": 11.921688572924417, "grad_norm": 1.1724342107772827, "learning_rate": 5.643397119648781e-06, "loss": 0.2459, "step": 865300 }, { "epoch": 11.923066324984156, "grad_norm": 1.5080740451812744, "learning_rate": 5.638533553748716e-06, "loss": 0.2525, "step": 865400 }, { "epoch": 11.924444077043896, "grad_norm": 4.028144836425781, "learning_rate": 5.633671845092061e-06, "loss": 0.2629, "step": 865500 }, { "epoch": 11.925821829103635, "grad_norm": 0.2799889147281647, "learning_rate": 5.628811994091857e-06, "loss": 0.2503, "step": 865600 }, { "epoch": 11.927199581163373, "grad_norm": 5.697725772857666, "learning_rate": 5.623954001160961e-06, "loss": 0.2125, "step": 865700 }, { "epoch": 11.928577333223114, "grad_norm": 0.8435084223747253, "learning_rate": 5.6190978667120945e-06, "loss": 0.263, "step": 865800 }, { "epoch": 11.929955085282852, "grad_norm": 3.621826171875, "learning_rate": 5.614243591157801e-06, "loss": 0.2493, "step": 865900 }, { "epoch": 11.931332837342591, "grad_norm": 0.9587473273277283, "learning_rate": 5.609391174910479e-06, "loss": 0.2625, "step": 866000 }, { "epoch": 11.932710589402332, "grad_norm": 3.8569228649139404, "learning_rate": 5.604589114740699e-06, "loss": 0.2509, "step": 866100 }, { "epoch": 11.93408834146207, "grad_norm": 2.1629412174224854, "learning_rate": 5.5997403997405334e-06, "loss": 0.2198, "step": 866200 }, { "epoch": 11.93546609352181, "grad_norm": 3.3584821224212646, "learning_rate": 
5.594893545279458e-06, "loss": 0.2788, "step": 866300 }, { "epoch": 11.93684384558155, "grad_norm": 2.0743823051452637, "learning_rate": 5.590048551769239e-06, "loss": 0.1929, "step": 866400 }, { "epoch": 11.938221597641288, "grad_norm": 0.39749419689178467, "learning_rate": 5.585205419621496e-06, "loss": 0.2537, "step": 866500 }, { "epoch": 11.939599349701028, "grad_norm": 0.5560060143470764, "learning_rate": 5.58036414924766e-06, "loss": 0.2549, "step": 866600 }, { "epoch": 11.940977101760767, "grad_norm": 2.6576650142669678, "learning_rate": 5.575524741059037e-06, "loss": 0.2328, "step": 866700 }, { "epoch": 11.942354853820506, "grad_norm": 2.8591365814208984, "learning_rate": 5.5706871954667415e-06, "loss": 0.2104, "step": 866800 }, { "epoch": 11.943732605880246, "grad_norm": 4.699713706970215, "learning_rate": 5.565851512881769e-06, "loss": 0.2423, "step": 866900 }, { "epoch": 11.945110357939985, "grad_norm": 3.336887836456299, "learning_rate": 5.561017693714917e-06, "loss": 0.2518, "step": 867000 }, { "epoch": 11.946488109999724, "grad_norm": 1.633097529411316, "learning_rate": 5.556185738376859e-06, "loss": 0.2308, "step": 867100 }, { "epoch": 11.947865862059464, "grad_norm": 1.0202280282974243, "learning_rate": 5.551355647278079e-06, "loss": 0.2672, "step": 867200 }, { "epoch": 11.949243614119203, "grad_norm": 1.6913769245147705, "learning_rate": 5.54652742082893e-06, "loss": 0.2413, "step": 867300 }, { "epoch": 11.950621366178943, "grad_norm": 3.2953696250915527, "learning_rate": 5.541701059439583e-06, "loss": 0.2349, "step": 867400 }, { "epoch": 11.951999118238682, "grad_norm": 1.0987346172332764, "learning_rate": 5.536876563520078e-06, "loss": 0.2648, "step": 867500 }, { "epoch": 11.95337687029842, "grad_norm": 1.3132703304290771, "learning_rate": 5.532053933480265e-06, "loss": 0.2708, "step": 867600 }, { "epoch": 11.95475462235816, "grad_norm": 3.4657421112060547, "learning_rate": 5.527233169729855e-06, "loss": 0.2191, "step": 867700 }, { "epoch": 
11.9561323744179, "grad_norm": 1.6055585145950317, "learning_rate": 5.522414272678409e-06, "loss": 0.2337, "step": 867800 }, { "epoch": 11.95751012647764, "grad_norm": 0.8866155743598938, "learning_rate": 5.517597242735306e-06, "loss": 0.2758, "step": 867900 }, { "epoch": 11.958887878537379, "grad_norm": 10.017451286315918, "learning_rate": 5.51278208030977e-06, "loss": 0.2741, "step": 868000 }, { "epoch": 11.960265630597117, "grad_norm": 0.23922261595726013, "learning_rate": 5.507968785810889e-06, "loss": 0.2453, "step": 868100 }, { "epoch": 11.961643382656858, "grad_norm": 2.668151378631592, "learning_rate": 5.503205464659598e-06, "loss": 0.264, "step": 868200 }, { "epoch": 11.963021134716596, "grad_norm": 0.8464562296867371, "learning_rate": 5.498395888551132e-06, "loss": 0.2098, "step": 868300 }, { "epoch": 11.964398886776335, "grad_norm": 0.1544121652841568, "learning_rate": 5.493588181591486e-06, "loss": 0.2496, "step": 868400 }, { "epoch": 11.965776638836076, "grad_norm": 0.04807332530617714, "learning_rate": 5.48878234418911e-06, "loss": 0.2393, "step": 868500 }, { "epoch": 11.967154390895814, "grad_norm": 2.580390691757202, "learning_rate": 5.4839783767522905e-06, "loss": 0.2847, "step": 868600 }, { "epoch": 11.968532142955553, "grad_norm": 3.217088222503662, "learning_rate": 5.479176279689132e-06, "loss": 0.2727, "step": 868700 }, { "epoch": 11.969909895015293, "grad_norm": 3.766310214996338, "learning_rate": 5.474424046408715e-06, "loss": 0.229, "step": 868800 }, { "epoch": 11.971287647075032, "grad_norm": 1.7429208755493164, "learning_rate": 5.46962567260272e-06, "loss": 0.2284, "step": 868900 }, { "epoch": 11.972665399134772, "grad_norm": 1.832765817642212, "learning_rate": 5.464829170389732e-06, "loss": 0.2363, "step": 869000 }, { "epoch": 11.974043151194511, "grad_norm": 2.1697936058044434, "learning_rate": 5.460034540177231e-06, "loss": 0.2654, "step": 869100 }, { "epoch": 11.97542090325425, "grad_norm": 1.1668633222579956, "learning_rate": 
5.45524178237255e-06, "loss": 0.2486, "step": 869200 }, { "epoch": 11.97679865531399, "grad_norm": 1.0695405006408691, "learning_rate": 5.450450897382867e-06, "loss": 0.2484, "step": 869300 }, { "epoch": 11.978176407373729, "grad_norm": 3.3183772563934326, "learning_rate": 5.445661885615186e-06, "loss": 0.2399, "step": 869400 }, { "epoch": 11.979554159433468, "grad_norm": 1.032438039779663, "learning_rate": 5.44087474747635e-06, "loss": 0.2586, "step": 869500 }, { "epoch": 11.980931911493208, "grad_norm": 3.3048338890075684, "learning_rate": 5.436089483373056e-06, "loss": 0.2623, "step": 869600 }, { "epoch": 11.982309663552947, "grad_norm": 3.9166269302368164, "learning_rate": 5.431306093711846e-06, "loss": 0.2211, "step": 869700 }, { "epoch": 11.983687415612687, "grad_norm": 1.294677734375, "learning_rate": 5.426524578899079e-06, "loss": 0.2232, "step": 869800 }, { "epoch": 11.985065167672426, "grad_norm": 3.3069231510162354, "learning_rate": 5.421744939340986e-06, "loss": 0.253, "step": 869900 }, { "epoch": 11.986442919732164, "grad_norm": 2.4778196811676025, "learning_rate": 5.416967175443611e-06, "loss": 0.286, "step": 870000 }, { "epoch": 11.987820671791905, "grad_norm": 2.4469211101531982, "learning_rate": 5.41219128761284e-06, "loss": 0.2347, "step": 870100 }, { "epoch": 11.989198423851644, "grad_norm": 0.7883899807929993, "learning_rate": 5.407417276254422e-06, "loss": 0.2491, "step": 870200 }, { "epoch": 11.990576175911382, "grad_norm": 1.967146873474121, "learning_rate": 5.402645141773938e-06, "loss": 0.2675, "step": 870300 }, { "epoch": 11.991953927971123, "grad_norm": 8.64643669128418, "learning_rate": 5.397874884576792e-06, "loss": 0.2785, "step": 870400 }, { "epoch": 11.993331680030861, "grad_norm": 2.2792370319366455, "learning_rate": 5.393106505068254e-06, "loss": 0.2357, "step": 870500 }, { "epoch": 11.994709432090602, "grad_norm": 1.4582953453063965, "learning_rate": 5.388340003653418e-06, "loss": 0.291, "step": 870600 }, { "epoch": 
11.99608718415034, "grad_norm": 1.8107428550720215, "learning_rate": 5.383575380737216e-06, "loss": 0.2461, "step": 870700 }, { "epoch": 11.99746493621008, "grad_norm": 1.6535526514053345, "learning_rate": 5.378812636724431e-06, "loss": 0.2305, "step": 870800 }, { "epoch": 11.99884268826982, "grad_norm": 5.824621677398682, "learning_rate": 5.374051772019689e-06, "loss": 0.2651, "step": 870900 }, { "epoch": 12.000220440329558, "grad_norm": 0.4002041518688202, "learning_rate": 5.369292787027438e-06, "loss": 0.2277, "step": 871000 }, { "epoch": 12.001598192389297, "grad_norm": 2.512850522994995, "learning_rate": 5.364535682151991e-06, "loss": 0.2099, "step": 871100 }, { "epoch": 12.002975944449037, "grad_norm": 5.065108776092529, "learning_rate": 5.359780457797477e-06, "loss": 0.2162, "step": 871200 }, { "epoch": 12.004353696508776, "grad_norm": 0.08953443169593811, "learning_rate": 5.355027114367887e-06, "loss": 0.1947, "step": 871300 }, { "epoch": 12.005731448568516, "grad_norm": 1.594722867012024, "learning_rate": 5.350275652267031e-06, "loss": 0.1853, "step": 871400 }, { "epoch": 12.007109200628255, "grad_norm": 0.008780485019087791, "learning_rate": 5.345526071898575e-06, "loss": 0.1885, "step": 871500 }, { "epoch": 12.008486952687994, "grad_norm": 1.3999711275100708, "learning_rate": 5.340778373666026e-06, "loss": 0.223, "step": 871600 }, { "epoch": 12.009864704747734, "grad_norm": 3.716535806655884, "learning_rate": 5.336032557972723e-06, "loss": 0.2287, "step": 871700 }, { "epoch": 12.011242456807473, "grad_norm": 2.367682695388794, "learning_rate": 5.331288625221834e-06, "loss": 0.1967, "step": 871800 }, { "epoch": 12.012620208867212, "grad_norm": 1.9853510856628418, "learning_rate": 5.3265465758164e-06, "loss": 0.2365, "step": 871900 }, { "epoch": 12.013997960926952, "grad_norm": 2.095587730407715, "learning_rate": 5.321806410159267e-06, "loss": 0.266, "step": 872000 }, { "epoch": 12.01537571298669, "grad_norm": 2.4229087829589844, "learning_rate": 
5.31706812865314e-06, "loss": 0.27, "step": 872100 }, { "epoch": 12.01675346504643, "grad_norm": 4.114567279815674, "learning_rate": 5.312331731700572e-06, "loss": 0.2225, "step": 872200 }, { "epoch": 12.01813121710617, "grad_norm": 2.40048885345459, "learning_rate": 5.307597219703931e-06, "loss": 0.2504, "step": 872300 }, { "epoch": 12.019508969165908, "grad_norm": 3.690300464630127, "learning_rate": 5.302864593065439e-06, "loss": 0.2213, "step": 872400 }, { "epoch": 12.020886721225649, "grad_norm": 4.3351664543151855, "learning_rate": 5.298133852187156e-06, "loss": 0.265, "step": 872500 }, { "epoch": 12.022264473285388, "grad_norm": 0.5285466909408569, "learning_rate": 5.293404997470995e-06, "loss": 0.2106, "step": 872600 }, { "epoch": 12.023642225345126, "grad_norm": 2.2210826873779297, "learning_rate": 5.28867802931868e-06, "loss": 0.195, "step": 872700 }, { "epoch": 12.025019977404867, "grad_norm": 2.008704662322998, "learning_rate": 5.283952948131807e-06, "loss": 0.2497, "step": 872800 }, { "epoch": 12.026397729464605, "grad_norm": 0.09192755073308945, "learning_rate": 5.279229754311786e-06, "loss": 0.2486, "step": 872900 }, { "epoch": 12.027775481524344, "grad_norm": 3.7384469509124756, "learning_rate": 5.274508448259871e-06, "loss": 0.2452, "step": 873000 }, { "epoch": 12.029153233584085, "grad_norm": 6.8460540771484375, "learning_rate": 5.2697890303771695e-06, "loss": 0.2496, "step": 873100 }, { "epoch": 12.030530985643823, "grad_norm": 2.203611373901367, "learning_rate": 5.265071501064626e-06, "loss": 0.2551, "step": 873200 }, { "epoch": 12.031908737703564, "grad_norm": 4.169670581817627, "learning_rate": 5.260355860723012e-06, "loss": 0.2495, "step": 873300 }, { "epoch": 12.033286489763302, "grad_norm": 0.34469807147979736, "learning_rate": 5.255642109752935e-06, "loss": 0.2493, "step": 873400 }, { "epoch": 12.034664241823041, "grad_norm": 0.015197351574897766, "learning_rate": 5.250930248554864e-06, "loss": 0.2417, "step": 873500 }, { "epoch": 
12.036041993882781, "grad_norm": 1.5760308504104614, "learning_rate": 5.246220277529103e-06, "loss": 0.2191, "step": 873600 }, { "epoch": 12.03741974594252, "grad_norm": 9.653153419494629, "learning_rate": 5.241512197075768e-06, "loss": 0.2308, "step": 873700 }, { "epoch": 12.038797498002259, "grad_norm": 2.7352209091186523, "learning_rate": 5.236806007594847e-06, "loss": 0.2294, "step": 873800 }, { "epoch": 12.040175250062, "grad_norm": 2.580380916595459, "learning_rate": 5.232101709486168e-06, "loss": 0.2309, "step": 873900 }, { "epoch": 12.041553002121738, "grad_norm": 1.1028344631195068, "learning_rate": 5.227399303149357e-06, "loss": 0.2563, "step": 874000 }, { "epoch": 12.042930754181478, "grad_norm": 1.87930428981781, "learning_rate": 5.222698788983922e-06, "loss": 0.2322, "step": 874100 }, { "epoch": 12.044308506241217, "grad_norm": NaN, "learning_rate": 5.218047144235605e-06, "loss": 0.2192, "step": 874200 }, { "epoch": 12.045686258300956, "grad_norm": 3.269401788711548, "learning_rate": 5.213350396679088e-06, "loss": 0.2469, "step": 874300 }, { "epoch": 12.047064010360696, "grad_norm": 4.109653949737549, "learning_rate": 5.208655542487473e-06, "loss": 0.1925, "step": 874400 }, { "epoch": 12.048441762420435, "grad_norm": 2.3858397006988525, "learning_rate": 5.203962582059605e-06, "loss": 0.1866, "step": 874500 }, { "epoch": 12.049819514480173, "grad_norm": 2.3483970165252686, "learning_rate": 5.199271515794183e-06, "loss": 0.2412, "step": 874600 }, { "epoch": 12.051197266539914, "grad_norm": 2.6427602767944336, "learning_rate": 5.1945823440897394e-06, "loss": 0.187, "step": 874700 }, { "epoch": 12.052575018599653, "grad_norm": 1.9800199270248413, "learning_rate": 5.189895067344634e-06, "loss": 0.2479, "step": 874800 }, { "epoch": 12.053952770659393, "grad_norm": 3.604522466659546, "learning_rate": 5.185209685957087e-06, "loss": 0.2122, "step": 874900 }, { "epoch": 12.055330522719132, "grad_norm": 0.6902385950088501, "learning_rate": 5.1805262003251395e-06, 
"loss": 0.2121, "step": 875000 }, { "epoch": 12.05670827477887, "grad_norm": 4.640689849853516, "learning_rate": 5.175844610846669e-06, "loss": 0.2789, "step": 875100 }, { "epoch": 12.05808602683861, "grad_norm": 5.568762302398682, "learning_rate": 5.171164917919408e-06, "loss": 0.2276, "step": 875200 }, { "epoch": 12.05946377889835, "grad_norm": 1.4291658401489258, "learning_rate": 5.166487121940927e-06, "loss": 0.2379, "step": 875300 }, { "epoch": 12.060841530958088, "grad_norm": 4.2890825271606445, "learning_rate": 5.16181122330862e-06, "loss": 0.1896, "step": 875400 }, { "epoch": 12.062219283017829, "grad_norm": 1.5138558149337769, "learning_rate": 5.157137222419735e-06, "loss": 0.1939, "step": 875500 }, { "epoch": 12.063597035077567, "grad_norm": 1.2165333032608032, "learning_rate": 5.15246511967134e-06, "loss": 0.2266, "step": 875600 }, { "epoch": 12.064974787137308, "grad_norm": 2.8036587238311768, "learning_rate": 5.1477949154603696e-06, "loss": 0.2747, "step": 875700 }, { "epoch": 12.066352539197046, "grad_norm": 1.6879605054855347, "learning_rate": 5.14312661018357e-06, "loss": 0.2572, "step": 875800 }, { "epoch": 12.067730291256785, "grad_norm": 2.993696689605713, "learning_rate": 5.138460204237547e-06, "loss": 0.223, "step": 875900 }, { "epoch": 12.069108043316525, "grad_norm": 4.333001613616943, "learning_rate": 5.133795698018722e-06, "loss": 0.2698, "step": 876000 }, { "epoch": 12.070485795376264, "grad_norm": 1.8231858015060425, "learning_rate": 5.1291330919233875e-06, "loss": 0.2684, "step": 876100 }, { "epoch": 12.071863547436003, "grad_norm": 7.267999649047852, "learning_rate": 5.124472386347634e-06, "loss": 0.2417, "step": 876200 }, { "epoch": 12.073241299495743, "grad_norm": 3.1360344886779785, "learning_rate": 5.119813581687435e-06, "loss": 0.211, "step": 876300 }, { "epoch": 12.074619051555482, "grad_norm": 1.487412691116333, "learning_rate": 5.1151566783385594e-06, "loss": 0.2242, "step": 876400 }, { "epoch": 12.07599680361522, "grad_norm": 
1.644063115119934, "learning_rate": 5.110501676696643e-06, "loss": 0.2221, "step": 876500 }, { "epoch": 12.077374555674961, "grad_norm": 2.2688229084014893, "learning_rate": 5.105848577157169e-06, "loss": 0.2127, "step": 876600 }, { "epoch": 12.0787523077347, "grad_norm": 3.4036290645599365, "learning_rate": 5.101197380115413e-06, "loss": 0.2438, "step": 876700 }, { "epoch": 12.08013005979444, "grad_norm": 1.6512223482131958, "learning_rate": 5.096548085966529e-06, "loss": 0.2217, "step": 876800 }, { "epoch": 12.081507811854179, "grad_norm": 0.5855816602706909, "learning_rate": 5.091900695105509e-06, "loss": 0.1952, "step": 876900 }, { "epoch": 12.082885563913917, "grad_norm": 0.8231651186943054, "learning_rate": 5.087255207927157e-06, "loss": 0.2064, "step": 877000 }, { "epoch": 12.084263315973658, "grad_norm": 3.6263163089752197, "learning_rate": 5.082611624826135e-06, "loss": 0.2598, "step": 877100 }, { "epoch": 12.085641068033397, "grad_norm": 2.1534066200256348, "learning_rate": 5.077969946196951e-06, "loss": 0.2222, "step": 877200 }, { "epoch": 12.087018820093135, "grad_norm": 1.4784257411956787, "learning_rate": 5.0733301724339315e-06, "loss": 0.2467, "step": 877300 }, { "epoch": 12.088396572152876, "grad_norm": 2.992199659347534, "learning_rate": 5.068692303931241e-06, "loss": 0.2829, "step": 877400 }, { "epoch": 12.089774324212614, "grad_norm": 0.5653468370437622, "learning_rate": 5.064056341082895e-06, "loss": 0.2068, "step": 877500 }, { "epoch": 12.091152076272355, "grad_norm": 1.0328912734985352, "learning_rate": 5.0594222842827525e-06, "loss": 0.1839, "step": 877600 }, { "epoch": 12.092529828332093, "grad_norm": 2.5018107891082764, "learning_rate": 5.054790133924492e-06, "loss": 0.2418, "step": 877700 }, { "epoch": 12.093907580391832, "grad_norm": 6.622915744781494, "learning_rate": 5.050159890401631e-06, "loss": 0.2373, "step": 877800 }, { "epoch": 12.095285332451573, "grad_norm": 3.7919318675994873, "learning_rate": 5.045531554107548e-06, "loss": 
0.2533, "step": 877900 }, { "epoch": 12.096663084511311, "grad_norm": 2.1864354610443115, "learning_rate": 5.040905125435428e-06, "loss": 0.2453, "step": 878000 }, { "epoch": 12.09804083657105, "grad_norm": 1.983436107635498, "learning_rate": 5.0362806047783165e-06, "loss": 0.1929, "step": 878100 }, { "epoch": 12.09941858863079, "grad_norm": 2.7104105949401855, "learning_rate": 5.031657992529101e-06, "loss": 0.2468, "step": 878200 }, { "epoch": 12.100796340690529, "grad_norm": 0.5343512892723083, "learning_rate": 5.0270372890804854e-06, "loss": 0.2232, "step": 878300 }, { "epoch": 12.10217409275027, "grad_norm": 2.8182480335235596, "learning_rate": 5.022418494825014e-06, "loss": 0.2684, "step": 878400 }, { "epoch": 12.103551844810008, "grad_norm": 2.4777164459228516, "learning_rate": 5.0178016101550885e-06, "loss": 0.2103, "step": 878500 }, { "epoch": 12.104929596869747, "grad_norm": 1.2576487064361572, "learning_rate": 5.0131866354629395e-06, "loss": 0.2323, "step": 878600 }, { "epoch": 12.106307348929487, "grad_norm": 7.165399551391602, "learning_rate": 5.008573571140623e-06, "loss": 0.2498, "step": 878700 }, { "epoch": 12.107685100989226, "grad_norm": 2.007286310195923, "learning_rate": 5.0039624175800425e-06, "loss": 0.1984, "step": 878800 }, { "epoch": 12.109062853048965, "grad_norm": 2.9858615398406982, "learning_rate": 4.9993531751729625e-06, "loss": 0.2142, "step": 878900 }, { "epoch": 12.110440605108705, "grad_norm": 15.602959632873535, "learning_rate": 4.994745844310928e-06, "loss": 0.2963, "step": 879000 }, { "epoch": 12.111818357168444, "grad_norm": 3.1856021881103516, "learning_rate": 4.990140425385366e-06, "loss": 0.2316, "step": 879100 }, { "epoch": 12.113196109228184, "grad_norm": 13.943312644958496, "learning_rate": 4.985582944386211e-06, "loss": 0.2195, "step": 879200 }, { "epoch": 12.114573861287923, "grad_norm": 3.4007277488708496, "learning_rate": 4.980981331378088e-06, "loss": 0.246, "step": 879300 }, { "epoch": 12.115951613347661, 
"grad_norm": 2.927015542984009, "learning_rate": 4.976381631475808e-06, "loss": 0.239, "step": 879400 }, { "epoch": 12.117329365407402, "grad_norm": 1.080040693283081, "learning_rate": 4.971783845070132e-06, "loss": 0.2341, "step": 879500 }, { "epoch": 12.11870711746714, "grad_norm": 2.457324981689453, "learning_rate": 4.96718797255167e-06, "loss": 0.2177, "step": 879600 }, { "epoch": 12.12008486952688, "grad_norm": 2.072075366973877, "learning_rate": 4.962594014310875e-06, "loss": 0.2554, "step": 879700 }, { "epoch": 12.12146262158662, "grad_norm": 2.2446069717407227, "learning_rate": 4.95800197073801e-06, "loss": 0.2392, "step": 879800 }, { "epoch": 12.122840373646358, "grad_norm": 0.6934237480163574, "learning_rate": 4.9534118422232095e-06, "loss": 0.2147, "step": 879900 }, { "epoch": 12.124218125706099, "grad_norm": 0.6362463235855103, "learning_rate": 4.948823629156422e-06, "loss": 0.2125, "step": 880000 }, { "epoch": 12.125595877765837, "grad_norm": 1.511073350906372, "learning_rate": 4.94423733192743e-06, "loss": 0.2167, "step": 880100 }, { "epoch": 12.126973629825576, "grad_norm": 3.7890899181365967, "learning_rate": 4.939652950925873e-06, "loss": 0.2275, "step": 880200 }, { "epoch": 12.128351381885317, "grad_norm": 2.48779559135437, "learning_rate": 4.93507048654122e-06, "loss": 0.2282, "step": 880300 }, { "epoch": 12.129729133945055, "grad_norm": 2.0339746475219727, "learning_rate": 4.930489939162764e-06, "loss": 0.225, "step": 880400 }, { "epoch": 12.131106886004794, "grad_norm": 3.99558687210083, "learning_rate": 4.925911309179661e-06, "loss": 0.2301, "step": 880500 }, { "epoch": 12.132484638064534, "grad_norm": 3.0762410163879395, "learning_rate": 4.9213345969808755e-06, "loss": 0.2474, "step": 880600 }, { "epoch": 12.133862390124273, "grad_norm": 9.100939750671387, "learning_rate": 4.916759802955232e-06, "loss": 0.2379, "step": 880700 }, { "epoch": 12.135240142184012, "grad_norm": 2.1262528896331787, "learning_rate": 4.91218692749137e-06, "loss": 
0.2263, "step": 880800 }, { "epoch": 12.136617894243752, "grad_norm": 2.44498872756958, "learning_rate": 4.9076159709777966e-06, "loss": 0.2089, "step": 880900 }, { "epoch": 12.13799564630349, "grad_norm": 1.489201307296753, "learning_rate": 4.903046933802819e-06, "loss": 0.2558, "step": 881000 }, { "epoch": 12.139373398363231, "grad_norm": 3.68166446685791, "learning_rate": 4.8984798163546165e-06, "loss": 0.2055, "step": 881100 }, { "epoch": 12.14075115042297, "grad_norm": 3.419917583465576, "learning_rate": 4.893914619021177e-06, "loss": 0.2586, "step": 881200 }, { "epoch": 12.142128902482709, "grad_norm": 2.3484318256378174, "learning_rate": 4.889396965450891e-06, "loss": 0.243, "step": 881300 }, { "epoch": 12.143506654542449, "grad_norm": 4.913581371307373, "learning_rate": 4.88483559029952e-06, "loss": 0.2082, "step": 881400 }, { "epoch": 12.144884406602188, "grad_norm": 0.4809964895248413, "learning_rate": 4.880276136422061e-06, "loss": 0.2868, "step": 881500 }, { "epoch": 12.146262158661926, "grad_norm": 3.337538242340088, "learning_rate": 4.875718604205867e-06, "loss": 0.2334, "step": 881600 }, { "epoch": 12.147639910721667, "grad_norm": 1.8974077701568604, "learning_rate": 4.871162994038128e-06, "loss": 0.2057, "step": 881700 }, { "epoch": 12.149017662781405, "grad_norm": 3.166837215423584, "learning_rate": 4.86660930630585e-06, "loss": 0.2001, "step": 881800 }, { "epoch": 12.150395414841146, "grad_norm": 9.224626541137695, "learning_rate": 4.862057541395904e-06, "loss": 0.1922, "step": 881900 }, { "epoch": 12.151773166900885, "grad_norm": 9.962299346923828, "learning_rate": 4.857507699694992e-06, "loss": 0.2057, "step": 882000 }, { "epoch": 12.153150918960623, "grad_norm": 4.584589958190918, "learning_rate": 4.852959781589634e-06, "loss": 0.2164, "step": 882100 }, { "epoch": 12.154528671020364, "grad_norm": 2.1607563495635986, "learning_rate": 4.848413787466205e-06, "loss": 0.2065, "step": 882200 }, { "epoch": 12.155906423080102, "grad_norm": 
0.4329064190387726, "learning_rate": 4.843869717710909e-06, "loss": 0.2134, "step": 882300 }, { "epoch": 12.157284175139841, "grad_norm": 3.1414289474487305, "learning_rate": 4.839327572709792e-06, "loss": 0.2389, "step": 882400 }, { "epoch": 12.158661927199581, "grad_norm": 1.0964374542236328, "learning_rate": 4.834787352848724e-06, "loss": 0.2629, "step": 882500 }, { "epoch": 12.16003967925932, "grad_norm": 1.6723058223724365, "learning_rate": 4.830249058513425e-06, "loss": 0.2363, "step": 882600 }, { "epoch": 12.16141743131906, "grad_norm": 3.7809197902679443, "learning_rate": 4.825712690089459e-06, "loss": 0.228, "step": 882700 }, { "epoch": 12.1627951833788, "grad_norm": 2.670841693878174, "learning_rate": 4.821178247962187e-06, "loss": 0.2436, "step": 882800 }, { "epoch": 12.164172935438538, "grad_norm": 3.943086624145508, "learning_rate": 4.816645732516846e-06, "loss": 0.2013, "step": 882900 }, { "epoch": 12.165550687498278, "grad_norm": 2.231783151626587, "learning_rate": 4.812115144138505e-06, "loss": 0.2307, "step": 883000 }, { "epoch": 12.166928439558017, "grad_norm": 1.2964115142822266, "learning_rate": 4.807586483212047e-06, "loss": 0.244, "step": 883100 }, { "epoch": 12.168306191617756, "grad_norm": 5.346635818481445, "learning_rate": 4.80305975012221e-06, "loss": 0.2537, "step": 883200 }, { "epoch": 12.169683943677496, "grad_norm": 2.408895254135132, "learning_rate": 4.798534945253569e-06, "loss": 0.236, "step": 883300 }, { "epoch": 12.171061695737235, "grad_norm": 0.8133533596992493, "learning_rate": 4.7940120689905274e-06, "loss": 0.2125, "step": 883400 }, { "epoch": 12.172439447796975, "grad_norm": 1.6550309658050537, "learning_rate": 4.789491121717313e-06, "loss": 0.2146, "step": 883500 }, { "epoch": 12.173817199856714, "grad_norm": 1.2474266290664673, "learning_rate": 4.784972103818018e-06, "loss": 0.2861, "step": 883600 }, { "epoch": 12.175194951916453, "grad_norm": 1.8229119777679443, "learning_rate": 4.780500177004414e-06, "loss": 0.2229, 
"step": 883700 }, { "epoch": 12.176572703976193, "grad_norm": 2.0488181114196777, "learning_rate": 4.775984999701215e-06, "loss": 0.2405, "step": 883800 }, { "epoch": 12.177950456035932, "grad_norm": 2.033342123031616, "learning_rate": 4.771471752919341e-06, "loss": 0.2396, "step": 883900 }, { "epoch": 12.17932820809567, "grad_norm": 1.845656156539917, "learning_rate": 4.766960437042222e-06, "loss": 0.2318, "step": 884000 }, { "epoch": 12.18070596015541, "grad_norm": 8.718055725097656, "learning_rate": 4.762451052453121e-06, "loss": 0.2059, "step": 884100 }, { "epoch": 12.18208371221515, "grad_norm": 2.8062326908111572, "learning_rate": 4.757943599535119e-06, "loss": 0.233, "step": 884200 }, { "epoch": 12.18346146427489, "grad_norm": 1.0413531064987183, "learning_rate": 4.753438078671156e-06, "loss": 0.1812, "step": 884300 }, { "epoch": 12.184839216334629, "grad_norm": 0.8240475654602051, "learning_rate": 4.748934490244007e-06, "loss": 0.2503, "step": 884400 }, { "epoch": 12.186216968394367, "grad_norm": 3.3376643657684326, "learning_rate": 4.744432834636266e-06, "loss": 0.2409, "step": 884500 }, { "epoch": 12.187594720454108, "grad_norm": 1.5429868698120117, "learning_rate": 4.739933112230367e-06, "loss": 0.2177, "step": 884600 }, { "epoch": 12.188972472513846, "grad_norm": 1.6091680526733398, "learning_rate": 4.735435323408594e-06, "loss": 0.1963, "step": 884700 }, { "epoch": 12.190350224573585, "grad_norm": 1.2839714288711548, "learning_rate": 4.730939468553048e-06, "loss": 0.2544, "step": 884800 }, { "epoch": 12.191727976633326, "grad_norm": 1.3931279182434082, "learning_rate": 4.726445548045685e-06, "loss": 0.2169, "step": 884900 }, { "epoch": 12.193105728693064, "grad_norm": 1.524802803993225, "learning_rate": 4.721953562268275e-06, "loss": 0.245, "step": 885000 }, { "epoch": 12.194483480752803, "grad_norm": 11.86330509185791, "learning_rate": 4.717463511602451e-06, "loss": 0.2022, "step": 885100 }, { "epoch": 12.195861232812543, "grad_norm": 
2.1775686740875244, "learning_rate": 4.712975396429649e-06, "loss": 0.2351, "step": 885200 }, { "epoch": 12.197238984872282, "grad_norm": 2.8879382610321045, "learning_rate": 4.708489217131163e-06, "loss": 0.2248, "step": 885300 }, { "epoch": 12.198616736932022, "grad_norm": 2.509336471557617, "learning_rate": 4.704004974088124e-06, "loss": 0.2066, "step": 885400 }, { "epoch": 12.199994488991761, "grad_norm": 3.2225842475891113, "learning_rate": 4.69952266768149e-06, "loss": 0.2198, "step": 885500 }, { "epoch": 12.2013722410515, "grad_norm": 2.8396997451782227, "learning_rate": 4.6950422982920416e-06, "loss": 0.2396, "step": 885600 }, { "epoch": 12.20274999311124, "grad_norm": 4.007664203643799, "learning_rate": 4.690563866300429e-06, "loss": 0.2499, "step": 885700 }, { "epoch": 12.204127745170979, "grad_norm": 2.9423089027404785, "learning_rate": 4.686087372087101e-06, "loss": 0.2396, "step": 885800 }, { "epoch": 12.205505497230718, "grad_norm": 3.4315288066864014, "learning_rate": 4.681612816032364e-06, "loss": 0.2249, "step": 885900 }, { "epoch": 12.206883249290458, "grad_norm": 3.3260977268218994, "learning_rate": 4.677140198516363e-06, "loss": 0.2616, "step": 886000 }, { "epoch": 12.208261001350197, "grad_norm": 0.037134140729904175, "learning_rate": 4.672669519919064e-06, "loss": 0.2274, "step": 886100 }, { "epoch": 12.209638753409937, "grad_norm": 1.937679648399353, "learning_rate": 4.6682007806202665e-06, "loss": 0.2588, "step": 886200 }, { "epoch": 12.211016505469676, "grad_norm": 3.2251245975494385, "learning_rate": 4.663733980999618e-06, "loss": 0.2281, "step": 886300 }, { "epoch": 12.212394257529414, "grad_norm": 0.41416215896606445, "learning_rate": 4.659269121436607e-06, "loss": 0.2491, "step": 886400 }, { "epoch": 12.213772009589155, "grad_norm": 2.800989866256714, "learning_rate": 4.654806202310528e-06, "loss": 0.1746, "step": 886500 }, { "epoch": 12.215149761648894, "grad_norm": 2.3401637077331543, "learning_rate": 4.650345224000545e-06, "loss": 
0.2385, "step": 886600 }, { "epoch": 12.216527513708632, "grad_norm": 1.9223545789718628, "learning_rate": 4.645886186885631e-06, "loss": 0.2076, "step": 886700 }, { "epoch": 12.217905265768373, "grad_norm": 1.6307142972946167, "learning_rate": 4.641429091344601e-06, "loss": 0.2321, "step": 886800 }, { "epoch": 12.219283017828111, "grad_norm": 2.2302122116088867, "learning_rate": 4.636973937756115e-06, "loss": 0.2296, "step": 886900 }, { "epoch": 12.220660769887852, "grad_norm": 0.6077303290367126, "learning_rate": 4.632520726498663e-06, "loss": 0.1791, "step": 887000 }, { "epoch": 12.22203852194759, "grad_norm": 3.4959311485290527, "learning_rate": 4.628069457950566e-06, "loss": 0.2605, "step": 887100 }, { "epoch": 12.223416274007329, "grad_norm": 2.5908241271972656, "learning_rate": 4.6236201324899745e-06, "loss": 0.2163, "step": 887200 }, { "epoch": 12.22479402606707, "grad_norm": 2.3392996788024902, "learning_rate": 4.619172750494888e-06, "loss": 0.2211, "step": 887300 }, { "epoch": 12.226171778126808, "grad_norm": 1.9364537000656128, "learning_rate": 4.614771757101388e-06, "loss": 0.2107, "step": 887400 }, { "epoch": 12.227549530186547, "grad_norm": 1.2026662826538086, "learning_rate": 4.610328243726554e-06, "loss": 0.2417, "step": 887500 }, { "epoch": 12.228927282246287, "grad_norm": 3.282721757888794, "learning_rate": 4.605886674946436e-06, "loss": 0.231, "step": 887600 }, { "epoch": 12.230305034306026, "grad_norm": 2.1140096187591553, "learning_rate": 4.601447051138372e-06, "loss": 0.2234, "step": 887700 }, { "epoch": 12.231682786365766, "grad_norm": 0.3606036901473999, "learning_rate": 4.597009372679533e-06, "loss": 0.2164, "step": 887800 }, { "epoch": 12.233060538425505, "grad_norm": 2.3173654079437256, "learning_rate": 4.592573639946912e-06, "loss": 0.2653, "step": 887900 }, { "epoch": 12.234438290485244, "grad_norm": 2.7412893772125244, "learning_rate": 4.5881398533173525e-06, "loss": 0.2733, "step": 888000 }, { "epoch": 12.235816042544984, "grad_norm": 
1.1821815967559814, "learning_rate": 4.583708013167536e-06, "loss": 0.1725, "step": 888100 }, { "epoch": 12.237193794604723, "grad_norm": 3.445291757583618, "learning_rate": 4.579278119873964e-06, "loss": 0.3038, "step": 888200 }, { "epoch": 12.238571546664462, "grad_norm": 1.0347822904586792, "learning_rate": 4.5748501738129666e-06, "loss": 0.2278, "step": 888300 }, { "epoch": 12.239949298724202, "grad_norm": 1.1081361770629883, "learning_rate": 4.570424175360736e-06, "loss": 0.2082, "step": 888400 }, { "epoch": 12.24132705078394, "grad_norm": 0.13742347061634064, "learning_rate": 4.566000124893283e-06, "loss": 0.199, "step": 888500 }, { "epoch": 12.242704802843681, "grad_norm": 3.644533395767212, "learning_rate": 4.561578022786443e-06, "loss": 0.1849, "step": 888600 }, { "epoch": 12.24408255490342, "grad_norm": 2.0544638633728027, "learning_rate": 4.557157869415907e-06, "loss": 0.2204, "step": 888700 }, { "epoch": 12.245460306963158, "grad_norm": 1.9735850095748901, "learning_rate": 4.5527396651571854e-06, "loss": 0.2315, "step": 888800 }, { "epoch": 12.246838059022899, "grad_norm": 1.8074002265930176, "learning_rate": 4.548323410385621e-06, "loss": 0.2105, "step": 888900 }, { "epoch": 12.248215811082638, "grad_norm": 2.581815004348755, "learning_rate": 4.543909105476401e-06, "loss": 0.2099, "step": 889000 }, { "epoch": 12.249593563142376, "grad_norm": 4.641282081604004, "learning_rate": 4.539496750804555e-06, "loss": 0.2868, "step": 889100 }, { "epoch": 12.250971315202117, "grad_norm": 3.4576730728149414, "learning_rate": 4.5350863467449174e-06, "loss": 0.2512, "step": 889200 }, { "epoch": 12.252349067261855, "grad_norm": 0.7717374563217163, "learning_rate": 4.5306778936721865e-06, "loss": 0.2304, "step": 889300 }, { "epoch": 12.253726819321596, "grad_norm": 2.422650098800659, "learning_rate": 4.526271391960874e-06, "loss": 0.2352, "step": 889400 }, { "epoch": 12.255104571381334, "grad_norm": 1.3010914325714111, "learning_rate": 4.521866841985348e-06, "loss": 
0.256, "step": 889500 }, { "epoch": 12.256482323441073, "grad_norm": 2.878810167312622, "learning_rate": 4.51746424411978e-06, "loss": 0.2366, "step": 889600 }, { "epoch": 12.257860075500814, "grad_norm": 2.018059253692627, "learning_rate": 4.513063598738207e-06, "loss": 0.2004, "step": 889700 }, { "epoch": 12.259237827560552, "grad_norm": 3.264045238494873, "learning_rate": 4.508664906214478e-06, "loss": 0.2386, "step": 889800 }, { "epoch": 12.260615579620291, "grad_norm": 2.5459461212158203, "learning_rate": 4.504268166922295e-06, "loss": 0.187, "step": 889900 }, { "epoch": 12.261993331680031, "grad_norm": 5.724930763244629, "learning_rate": 4.499873381235169e-06, "loss": 0.2518, "step": 890000 }, { "epoch": 12.26337108373977, "grad_norm": 2.125060558319092, "learning_rate": 4.495480549526476e-06, "loss": 0.2456, "step": 890100 }, { "epoch": 12.264748835799509, "grad_norm": 3.596890926361084, "learning_rate": 4.491133571267698e-06, "loss": 0.2602, "step": 890200 }, { "epoch": 12.26612658785925, "grad_norm": 5.890937805175781, "learning_rate": 4.486744629086165e-06, "loss": 0.2585, "step": 890300 }, { "epoch": 12.267504339918988, "grad_norm": 1.3215614557266235, "learning_rate": 4.482357641998419e-06, "loss": 0.2216, "step": 890400 }, { "epoch": 12.268882091978728, "grad_norm": 3.3703508377075195, "learning_rate": 4.477972610377151e-06, "loss": 0.2246, "step": 890500 }, { "epoch": 12.270259844038467, "grad_norm": 3.8755743503570557, "learning_rate": 4.473589534594881e-06, "loss": 0.2704, "step": 890600 }, { "epoch": 12.271637596098206, "grad_norm": 1.4406187534332275, "learning_rate": 4.46920841502398e-06, "loss": 0.2184, "step": 890700 }, { "epoch": 12.273015348157946, "grad_norm": 2.0199484825134277, "learning_rate": 4.464829252036659e-06, "loss": 0.2556, "step": 890800 }, { "epoch": 12.274393100217685, "grad_norm": 6.4199538230896, "learning_rate": 4.460452046004932e-06, "loss": 0.2037, "step": 890900 }, { "epoch": 12.275770852277423, "grad_norm": 
2.7342798709869385, "learning_rate": 4.45607679730068e-06, "loss": 0.2416, "step": 891000 }, { "epoch": 12.277148604337164, "grad_norm": 0.003172761993482709, "learning_rate": 4.451747229513811e-06, "loss": 0.2181, "step": 891100 }, { "epoch": 12.278526356396902, "grad_norm": 3.498115062713623, "learning_rate": 4.447375876996878e-06, "loss": 0.2134, "step": 891200 }, { "epoch": 12.279904108456643, "grad_norm": 0.88975590467453, "learning_rate": 4.443006482918307e-06, "loss": 0.2293, "step": 891300 }, { "epoch": 12.281281860516382, "grad_norm": 2.969186544418335, "learning_rate": 4.4386390476492855e-06, "loss": 0.2407, "step": 891400 }, { "epoch": 12.28265961257612, "grad_norm": 1.2038480043411255, "learning_rate": 4.434273571560864e-06, "loss": 0.2118, "step": 891500 }, { "epoch": 12.28403736463586, "grad_norm": 1.5316039323806763, "learning_rate": 4.4299100550239095e-06, "loss": 0.2167, "step": 891600 }, { "epoch": 12.2854151166956, "grad_norm": 5.393610954284668, "learning_rate": 4.42554849840911e-06, "loss": 0.2733, "step": 891700 }, { "epoch": 12.286792868755338, "grad_norm": 2.724255323410034, "learning_rate": 4.421188902087019e-06, "loss": 0.2297, "step": 891800 }, { "epoch": 12.288170620815078, "grad_norm": 1.8789863586425781, "learning_rate": 4.416831266428005e-06, "loss": 0.229, "step": 891900 }, { "epoch": 12.289548372874817, "grad_norm": 1.1820642948150635, "learning_rate": 4.412475591802261e-06, "loss": 0.2215, "step": 892000 }, { "epoch": 12.290926124934558, "grad_norm": 2.520380973815918, "learning_rate": 4.4081218785798366e-06, "loss": 0.1738, "step": 892100 }, { "epoch": 12.292303876994296, "grad_norm": 0.02531392127275467, "learning_rate": 4.403770127130591e-06, "loss": 0.2548, "step": 892200 }, { "epoch": 12.293681629054035, "grad_norm": 2.480377674102783, "learning_rate": 4.399420337824237e-06, "loss": 0.2467, "step": 892300 }, { "epoch": 12.295059381113775, "grad_norm": 2.417511224746704, "learning_rate": 4.395072511030302e-06, "loss": 0.1968, 
"step": 892400 }, { "epoch": 12.296437133173514, "grad_norm": 2.6826090812683105, "learning_rate": 4.390726647118168e-06, "loss": 0.2393, "step": 892500 }, { "epoch": 12.297814885233253, "grad_norm": 11.318028450012207, "learning_rate": 4.386382746457031e-06, "loss": 0.2612, "step": 892600 }, { "epoch": 12.299192637292993, "grad_norm": 3.5370519161224365, "learning_rate": 4.382040809415922e-06, "loss": 0.2466, "step": 892700 }, { "epoch": 12.300570389352732, "grad_norm": 1.5823495388031006, "learning_rate": 4.377700836363717e-06, "loss": 0.2041, "step": 892800 }, { "epoch": 12.30194814141247, "grad_norm": 1.4828613996505737, "learning_rate": 4.373362827669124e-06, "loss": 0.2203, "step": 892900 }, { "epoch": 12.303325893472211, "grad_norm": 2.4236152172088623, "learning_rate": 4.3690267837006685e-06, "loss": 0.2482, "step": 893000 }, { "epoch": 12.30470364553195, "grad_norm": 1.5310337543487549, "learning_rate": 4.364692704826726e-06, "loss": 0.2862, "step": 893100 }, { "epoch": 12.30608139759169, "grad_norm": 3.797976016998291, "learning_rate": 4.3603605914155015e-06, "loss": 0.2368, "step": 893200 }, { "epoch": 12.307459149651429, "grad_norm": 2.7901999950408936, "learning_rate": 4.356030443835029e-06, "loss": 0.216, "step": 893300 }, { "epoch": 12.308836901711167, "grad_norm": 0.10551207512617111, "learning_rate": 4.351702262453166e-06, "loss": 0.2539, "step": 893400 }, { "epoch": 12.310214653770908, "grad_norm": 4.287848472595215, "learning_rate": 4.347376047637629e-06, "loss": 0.272, "step": 893500 }, { "epoch": 12.311592405830647, "grad_norm": 8.090595245361328, "learning_rate": 4.343051799755937e-06, "loss": 0.2153, "step": 893600 }, { "epoch": 12.312970157890387, "grad_norm": 3.8656468391418457, "learning_rate": 4.3387295191754736e-06, "loss": 0.2262, "step": 893700 }, { "epoch": 12.314347909950126, "grad_norm": 1.2874608039855957, "learning_rate": 4.3344523996513814e-06, "loss": 0.1788, "step": 893800 }, { "epoch": 12.315725662009864, "grad_norm": 
1.7229597568511963, "learning_rate": 4.33013403509261e-06, "loss": 0.2682, "step": 893900 }, { "epoch": 12.317103414069605, "grad_norm": 0.9462223052978516, "learning_rate": 4.325817638932496e-06, "loss": 0.2544, "step": 894000 }, { "epoch": 12.318481166129343, "grad_norm": 2.1751644611358643, "learning_rate": 4.321503211537727e-06, "loss": 0.1984, "step": 894100 }, { "epoch": 12.319858918189082, "grad_norm": 2.7860021591186523, "learning_rate": 4.317190753274837e-06, "loss": 0.265, "step": 894200 }, { "epoch": 12.321236670248823, "grad_norm": 1.043503761291504, "learning_rate": 4.312880264510211e-06, "loss": 0.235, "step": 894300 }, { "epoch": 12.322614422308561, "grad_norm": 2.568237781524658, "learning_rate": 4.308571745610018e-06, "loss": 0.2363, "step": 894400 }, { "epoch": 12.3239921743683, "grad_norm": 1.396524429321289, "learning_rate": 4.3042651969403006e-06, "loss": 0.2025, "step": 894500 }, { "epoch": 12.32536992642804, "grad_norm": 1.2532455921173096, "learning_rate": 4.2999606188669285e-06, "loss": 0.2292, "step": 894600 }, { "epoch": 12.326747678487779, "grad_norm": 3.6814794540405273, "learning_rate": 4.295658011755589e-06, "loss": 0.223, "step": 894700 }, { "epoch": 12.32812543054752, "grad_norm": 3.2660319805145264, "learning_rate": 4.291357375971819e-06, "loss": 0.2176, "step": 894800 }, { "epoch": 12.329503182607258, "grad_norm": 2.4927618503570557, "learning_rate": 4.287058711880967e-06, "loss": 0.1855, "step": 894900 }, { "epoch": 12.330880934666997, "grad_norm": 0.12143804877996445, "learning_rate": 4.282762019848241e-06, "loss": 0.1919, "step": 895000 }, { "epoch": 12.332258686726737, "grad_norm": 3.500173568725586, "learning_rate": 4.278467300238656e-06, "loss": 0.2131, "step": 895100 }, { "epoch": 12.333636438786476, "grad_norm": 0.8549736738204956, "learning_rate": 4.274174553417072e-06, "loss": 0.2439, "step": 895200 }, { "epoch": 12.335014190846215, "grad_norm": 0.2323756068944931, "learning_rate": 4.269883779748191e-06, "loss": 0.2316, 
"step": 895300 }, { "epoch": 12.336391942905955, "grad_norm": 2.947979688644409, "learning_rate": 4.265594979596523e-06, "loss": 0.203, "step": 895400 }, { "epoch": 12.337769694965694, "grad_norm": 3.2621753215789795, "learning_rate": 4.261308153326424e-06, "loss": 0.2663, "step": 895500 }, { "epoch": 12.339147447025434, "grad_norm": 2.5823898315429688, "learning_rate": 4.257023301302094e-06, "loss": 0.2162, "step": 895600 }, { "epoch": 12.340525199085173, "grad_norm": 4.950196266174316, "learning_rate": 4.252740423887534e-06, "loss": 0.1999, "step": 895700 }, { "epoch": 12.341902951144911, "grad_norm": 1.231783151626587, "learning_rate": 4.248459521446612e-06, "loss": 0.1752, "step": 895800 }, { "epoch": 12.343280703204652, "grad_norm": 0.034924864768981934, "learning_rate": 4.24418059434301e-06, "loss": 0.2263, "step": 895900 }, { "epoch": 12.34465845526439, "grad_norm": 2.1649532318115234, "learning_rate": 4.239903642940245e-06, "loss": 0.2122, "step": 896000 }, { "epoch": 12.34603620732413, "grad_norm": 1.5345295667648315, "learning_rate": 4.235628667601659e-06, "loss": 0.1807, "step": 896100 }, { "epoch": 12.34741395938387, "grad_norm": 0.6377744078636169, "learning_rate": 4.231355668690435e-06, "loss": 0.1801, "step": 896200 }, { "epoch": 12.348791711443608, "grad_norm": 2.3109936714172363, "learning_rate": 4.227084646569593e-06, "loss": 0.2436, "step": 896300 }, { "epoch": 12.350169463503349, "grad_norm": 2.32199764251709, "learning_rate": 4.22281560160197e-06, "loss": 0.242, "step": 896400 }, { "epoch": 12.351547215563087, "grad_norm": 2.498436212539673, "learning_rate": 4.21854853415025e-06, "loss": 0.265, "step": 896500 }, { "epoch": 12.352924967622826, "grad_norm": 2.88193941116333, "learning_rate": 4.214283444576942e-06, "loss": 0.2731, "step": 896600 }, { "epoch": 12.354302719682567, "grad_norm": 0.9335037469863892, "learning_rate": 4.210020333244378e-06, "loss": 0.2004, "step": 896700 }, { "epoch": 12.355680471742305, "grad_norm": 1.1421068906784058, 
"learning_rate": 4.2057592005147375e-06, "loss": 0.2313, "step": 896800 }, { "epoch": 12.357058223802044, "grad_norm": 2.845714569091797, "learning_rate": 4.20150004675003e-06, "loss": 0.2327, "step": 896900 }, { "epoch": 12.358435975861784, "grad_norm": 2.658376932144165, "learning_rate": 4.197242872312088e-06, "loss": 0.2203, "step": 897000 }, { "epoch": 12.359813727921523, "grad_norm": 9.237821578979492, "learning_rate": 4.192987677562576e-06, "loss": 0.2354, "step": 897100 }, { "epoch": 12.361191479981262, "grad_norm": 1.3623300790786743, "learning_rate": 4.188734462863e-06, "loss": 0.2281, "step": 897200 }, { "epoch": 12.362569232041002, "grad_norm": 2.9656741619110107, "learning_rate": 4.184483228574694e-06, "loss": 0.2365, "step": 897300 }, { "epoch": 12.36394698410074, "grad_norm": 1.0035396814346313, "learning_rate": 4.180233975058813e-06, "loss": 0.2448, "step": 897400 }, { "epoch": 12.365324736160481, "grad_norm": 3.20845365524292, "learning_rate": 4.175986702676366e-06, "loss": 0.2278, "step": 897500 }, { "epoch": 12.36670248822022, "grad_norm": 0.6244051456451416, "learning_rate": 4.1717838548874674e-06, "loss": 0.2104, "step": 897600 }, { "epoch": 12.368080240279959, "grad_norm": 2.60233211517334, "learning_rate": 4.167540526033856e-06, "loss": 0.2557, "step": 897700 }, { "epoch": 12.369457992339699, "grad_norm": 1.045715570449829, "learning_rate": 4.163299179392036e-06, "loss": 0.1759, "step": 897800 }, { "epoch": 12.370835744399438, "grad_norm": 3.569092273712158, "learning_rate": 4.159059815322343e-06, "loss": 0.2475, "step": 897900 }, { "epoch": 12.372213496459178, "grad_norm": 1.6751595735549927, "learning_rate": 4.1548224341849355e-06, "loss": 0.2556, "step": 898000 }, { "epoch": 12.373591248518917, "grad_norm": 1.4152687788009644, "learning_rate": 4.150587036339794e-06, "loss": 0.233, "step": 898100 }, { "epoch": 12.374969000578655, "grad_norm": 0.4961412250995636, "learning_rate": 4.1463536221467336e-06, "loss": 0.2386, "step": 898200 }, { 
"epoch": 12.376346752638396, "grad_norm": 6.542469501495361, "learning_rate": 4.142122191965414e-06, "loss": 0.229, "step": 898300 }, { "epoch": 12.377724504698135, "grad_norm": 0.7540145516395569, "learning_rate": 4.137892746155309e-06, "loss": 0.2058, "step": 898400 }, { "epoch": 12.379102256757873, "grad_norm": 1.9386975765228271, "learning_rate": 4.133665285075733e-06, "loss": 0.2001, "step": 898500 }, { "epoch": 12.380480008817614, "grad_norm": 0.07088484615087509, "learning_rate": 4.129439809085837e-06, "loss": 0.2456, "step": 898600 }, { "epoch": 12.381857760877352, "grad_norm": 1.2496007680892944, "learning_rate": 4.125216318544596e-06, "loss": 0.2389, "step": 898700 }, { "epoch": 12.383235512937091, "grad_norm": 2.4799609184265137, "learning_rate": 4.120994813810806e-06, "loss": 0.2254, "step": 898800 }, { "epoch": 12.384613264996831, "grad_norm": 1.1736985445022583, "learning_rate": 4.1167752952431094e-06, "loss": 0.2178, "step": 898900 }, { "epoch": 12.38599101705657, "grad_norm": 1.794600248336792, "learning_rate": 4.112557763199989e-06, "loss": 0.2218, "step": 899000 }, { "epoch": 12.38736876911631, "grad_norm": 3.3393659591674805, "learning_rate": 4.10834221803973e-06, "loss": 0.2787, "step": 899100 }, { "epoch": 12.38874652117605, "grad_norm": 1.3142123222351074, "learning_rate": 4.104128660120468e-06, "loss": 0.2515, "step": 899200 }, { "epoch": 12.390124273235788, "grad_norm": 2.613729238510132, "learning_rate": 4.099917089800181e-06, "loss": 0.2493, "step": 899300 }, { "epoch": 12.391502025295528, "grad_norm": 1.1995364427566528, "learning_rate": 4.095707507436643e-06, "loss": 0.2546, "step": 899400 }, { "epoch": 12.392879777355267, "grad_norm": 1.927012324333191, "learning_rate": 4.091499913387485e-06, "loss": 0.2654, "step": 899500 }, { "epoch": 12.394257529415006, "grad_norm": 4.1164326667785645, "learning_rate": 4.087294308010172e-06, "loss": 0.2524, "step": 899600 }, { "epoch": 12.395635281474746, "grad_norm": 2.5458950996398926, 
"learning_rate": 4.08309069166198e-06, "loss": 0.1971, "step": 899700 }, { "epoch": 12.397013033534485, "grad_norm": 3.189155101776123, "learning_rate": 4.078889064700041e-06, "loss": 0.2665, "step": 899800 }, { "epoch": 12.398390785594225, "grad_norm": 2.174677848815918, "learning_rate": 4.074689427481289e-06, "loss": 0.1797, "step": 899900 }, { "epoch": 12.399768537653964, "grad_norm": 0.23012645542621613, "learning_rate": 4.070491780362524e-06, "loss": 0.2148, "step": 900000 }, { "epoch": 12.399768537653964, "eval_accuracy": 0.8890996644481934, "eval_cer": 0.05624914639098662, "eval_loss": 0.321673184633255, "eval_runtime": 8874.9383, "eval_samples_per_second": 6.078, "eval_steps_per_second": 0.38, "eval_wer": 0.1209215257687301, "step": 900000 }, { "epoch": 12.401146289713703, "grad_norm": 3.005718231201172, "learning_rate": 4.066296123700337e-06, "loss": 0.2209, "step": 900100 }, { "epoch": 12.402524041773443, "grad_norm": 5.2009687423706055, "learning_rate": 4.06210245785118e-06, "loss": 0.2225, "step": 900200 }, { "epoch": 12.403901793833182, "grad_norm": 1.7844475507736206, "learning_rate": 4.057952690060678e-06, "loss": 0.2415, "step": 900300 }, { "epoch": 12.40527954589292, "grad_norm": 4.088395595550537, "learning_rate": 4.053762986989224e-06, "loss": 0.2257, "step": 900400 }, { "epoch": 12.40665729795266, "grad_norm": 2.0080440044403076, "learning_rate": 4.049575275795548e-06, "loss": 0.1994, "step": 900500 }, { "epoch": 12.4080350500124, "grad_norm": 1.5811798572540283, "learning_rate": 4.045389556835425e-06, "loss": 0.2094, "step": 900600 }, { "epoch": 12.40941280207214, "grad_norm": 1.148227572441101, "learning_rate": 4.041205830464457e-06, "loss": 0.259, "step": 900700 }, { "epoch": 12.410790554131879, "grad_norm": 2.7475385665893555, "learning_rate": 4.037024097038062e-06, "loss": 0.2099, "step": 900800 }, { "epoch": 12.412168306191617, "grad_norm": 4.51393461227417, "learning_rate": 4.0328443569115135e-06, "loss": 0.229, "step": 900900 }, { 
"epoch": 12.413546058251358, "grad_norm": 1.3130977153778076, "learning_rate": 4.028666610439884e-06, "loss": 0.2277, "step": 901000 }, { "epoch": 12.414923810311096, "grad_norm": 1.9018644094467163, "learning_rate": 4.024490857978115e-06, "loss": 0.2268, "step": 901100 }, { "epoch": 12.416301562370835, "grad_norm": 1.2803382873535156, "learning_rate": 4.020317099880938e-06, "loss": 0.2331, "step": 901200 }, { "epoch": 12.417679314430575, "grad_norm": 6.235776424407959, "learning_rate": 4.016145336502953e-06, "loss": 0.2287, "step": 901300 }, { "epoch": 12.419057066490314, "grad_norm": 4.757741451263428, "learning_rate": 4.011975568198557e-06, "loss": 0.2029, "step": 901400 }, { "epoch": 12.420434818550053, "grad_norm": 2.5834996700286865, "learning_rate": 4.007807795322006e-06, "loss": 0.2198, "step": 901500 }, { "epoch": 12.421812570609793, "grad_norm": 4.512655735015869, "learning_rate": 4.003642018227363e-06, "loss": 0.2549, "step": 901600 }, { "epoch": 12.423190322669532, "grad_norm": 1.9069846868515015, "learning_rate": 3.999478237268544e-06, "loss": 0.253, "step": 901700 }, { "epoch": 12.424568074729272, "grad_norm": 1.8341480493545532, "learning_rate": 3.995316452799269e-06, "loss": 0.2333, "step": 901800 }, { "epoch": 12.425945826789011, "grad_norm": 2.6564137935638428, "learning_rate": 3.99115666517311e-06, "loss": 0.1904, "step": 901900 }, { "epoch": 12.42732357884875, "grad_norm": 1.4518638849258423, "learning_rate": 3.986998874743479e-06, "loss": 0.2384, "step": 902000 }, { "epoch": 12.42870133090849, "grad_norm": 0.8784197568893433, "learning_rate": 3.98284308186357e-06, "loss": 0.2003, "step": 902100 }, { "epoch": 12.430079082968229, "grad_norm": 3.6955509185791016, "learning_rate": 3.978689286886453e-06, "loss": 0.2796, "step": 902200 }, { "epoch": 12.43145683502797, "grad_norm": 3.228609800338745, "learning_rate": 3.974537490165021e-06, "loss": 0.2464, "step": 902300 }, { "epoch": 12.432834587087708, "grad_norm": 1.7909231185913086, 
"learning_rate": 3.970429180138843e-06, "loss": 0.2529, "step": 902400 }, { "epoch": 12.434212339147447, "grad_norm": 1.9172004461288452, "learning_rate": 3.9662813609953965e-06, "loss": 0.2404, "step": 902500 }, { "epoch": 12.435590091207187, "grad_norm": 0.07181792706251144, "learning_rate": 3.96213554116174e-06, "loss": 0.2176, "step": 902600 }, { "epoch": 12.436967843266926, "grad_norm": 4.632223129272461, "learning_rate": 3.95799172099009e-06, "loss": 0.222, "step": 902700 }, { "epoch": 12.438345595326664, "grad_norm": 1.0618491172790527, "learning_rate": 3.953849900832488e-06, "loss": 0.2516, "step": 902800 }, { "epoch": 12.439723347386405, "grad_norm": 2.681830644607544, "learning_rate": 3.94971008104079e-06, "loss": 0.2139, "step": 902900 }, { "epoch": 12.441101099446144, "grad_norm": 4.660548210144043, "learning_rate": 3.945572261966706e-06, "loss": 0.1928, "step": 903000 }, { "epoch": 12.442478851505882, "grad_norm": 1.443814992904663, "learning_rate": 3.9414364439617714e-06, "loss": 0.2347, "step": 903100 }, { "epoch": 12.443856603565623, "grad_norm": 2.1228506565093994, "learning_rate": 3.937302627377326e-06, "loss": 0.2138, "step": 903200 }, { "epoch": 12.445234355625361, "grad_norm": 4.037171840667725, "learning_rate": 3.933170812564566e-06, "loss": 0.2817, "step": 903300 }, { "epoch": 12.446612107685102, "grad_norm": 0.15107671916484833, "learning_rate": 3.929040999874521e-06, "loss": 0.1826, "step": 903400 }, { "epoch": 12.44798985974484, "grad_norm": 2.934523105621338, "learning_rate": 3.924913189658024e-06, "loss": 0.2109, "step": 903500 }, { "epoch": 12.449367611804579, "grad_norm": 8.833013534545898, "learning_rate": 3.920787382265762e-06, "loss": 0.2436, "step": 903600 }, { "epoch": 12.45074536386432, "grad_norm": 0.7497681975364685, "learning_rate": 3.9166635780482495e-06, "loss": 0.1859, "step": 903700 }, { "epoch": 12.452123115924058, "grad_norm": 1.4005178213119507, "learning_rate": 3.912541777355814e-06, "loss": 0.2303, "step": 903800 }, { 
"epoch": 12.453500867983797, "grad_norm": 2.5171241760253906, "learning_rate": 3.9084219805386226e-06, "loss": 0.2566, "step": 903900 }, { "epoch": 12.454878620043537, "grad_norm": 3.2477614879608154, "learning_rate": 3.904304187946678e-06, "loss": 0.2437, "step": 904000 }, { "epoch": 12.456256372103276, "grad_norm": 4.606788158416748, "learning_rate": 3.900188399929813e-06, "loss": 0.2321, "step": 904100 }, { "epoch": 12.457634124163016, "grad_norm": 4.003744602203369, "learning_rate": 3.896074616837682e-06, "loss": 0.2302, "step": 904200 }, { "epoch": 12.459011876222755, "grad_norm": 2.9413046836853027, "learning_rate": 3.8919628390197585e-06, "loss": 0.2074, "step": 904300 }, { "epoch": 12.460389628282494, "grad_norm": 2.080765724182129, "learning_rate": 3.887853066825378e-06, "loss": 0.2469, "step": 904400 }, { "epoch": 12.461767380342234, "grad_norm": 2.9030494689941406, "learning_rate": 3.883745300603672e-06, "loss": 0.2506, "step": 904500 }, { "epoch": 12.463145132401973, "grad_norm": 4.502427577972412, "learning_rate": 3.879639540703623e-06, "loss": 0.2383, "step": 904600 }, { "epoch": 12.464522884461712, "grad_norm": 0.03270488604903221, "learning_rate": 3.875535787474043e-06, "loss": 0.2046, "step": 904700 }, { "epoch": 12.465900636521452, "grad_norm": 1.1360697746276855, "learning_rate": 3.8714340412635586e-06, "loss": 0.2155, "step": 904800 }, { "epoch": 12.46727838858119, "grad_norm": 4.122684001922607, "learning_rate": 3.867334302420629e-06, "loss": 0.2237, "step": 904900 }, { "epoch": 12.468656140640931, "grad_norm": 3.7630555629730225, "learning_rate": 3.863236571293553e-06, "loss": 0.2542, "step": 905000 }, { "epoch": 12.47003389270067, "grad_norm": 2.492401599884033, "learning_rate": 3.859140848230464e-06, "loss": 0.2326, "step": 905100 }, { "epoch": 12.471411644760408, "grad_norm": 1.852944254875183, "learning_rate": 3.8550471335792995e-06, "loss": 0.2512, "step": 905200 }, { "epoch": 12.472789396820149, "grad_norm": 2.078153133392334, 
"learning_rate": 3.850955427687853e-06, "loss": 0.206, "step": 905300 }, { "epoch": 12.474167148879888, "grad_norm": 1.2404680252075195, "learning_rate": 3.846865730903731e-06, "loss": 0.2035, "step": 905400 }, { "epoch": 12.475544900939626, "grad_norm": 1.0016200542449951, "learning_rate": 3.842778043574367e-06, "loss": 0.2042, "step": 905500 }, { "epoch": 12.476922652999367, "grad_norm": 1.049678087234497, "learning_rate": 3.838692366047036e-06, "loss": 0.2212, "step": 905600 }, { "epoch": 12.478300405059105, "grad_norm": 3.5079498291015625, "learning_rate": 3.834608698668847e-06, "loss": 0.1877, "step": 905700 }, { "epoch": 12.479678157118844, "grad_norm": 3.895480155944824, "learning_rate": 3.830527041786716e-06, "loss": 0.2373, "step": 905800 }, { "epoch": 12.481055909178584, "grad_norm": 2.989802122116089, "learning_rate": 3.82644739574741e-06, "loss": 0.2498, "step": 905900 }, { "epoch": 12.482433661238323, "grad_norm": 13.865406036376953, "learning_rate": 3.822369760897504e-06, "loss": 0.2434, "step": 906000 }, { "epoch": 12.483811413298064, "grad_norm": 0.863244354724884, "learning_rate": 3.818294137583426e-06, "loss": 0.204, "step": 906100 }, { "epoch": 12.485189165357802, "grad_norm": 3.711275100708008, "learning_rate": 3.8142205261514113e-06, "loss": 0.2441, "step": 906200 }, { "epoch": 12.48656691741754, "grad_norm": 0.5521007180213928, "learning_rate": 3.8101489269475444e-06, "loss": 0.2277, "step": 906300 }, { "epoch": 12.487944669477281, "grad_norm": 3.017240285873413, "learning_rate": 3.806120026220639e-06, "loss": 0.226, "step": 906400 }, { "epoch": 12.48932242153702, "grad_norm": 3.55344295501709, "learning_rate": 3.802052432379686e-06, "loss": 0.2306, "step": 906500 }, { "epoch": 12.49070017359676, "grad_norm": 5.871715068817139, "learning_rate": 3.7979868518006098e-06, "loss": 0.2471, "step": 906600 }, { "epoch": 12.492077925656499, "grad_norm": 2.2820990085601807, "learning_rate": 3.7939232848288086e-06, "loss": 0.2522, "step": 906700 }, { 
"epoch": 12.493455677716238, "grad_norm": 2.5023770332336426, "learning_rate": 3.7898617318095103e-06, "loss": 0.2637, "step": 906800 }, { "epoch": 12.494833429775978, "grad_norm": 2.548797369003296, "learning_rate": 3.785802193087753e-06, "loss": 0.2174, "step": 906900 }, { "epoch": 12.496211181835717, "grad_norm": 3.1944901943206787, "learning_rate": 3.781744669008426e-06, "loss": 0.1894, "step": 907000 }, { "epoch": 12.497588933895456, "grad_norm": 1.6214133501052856, "learning_rate": 3.777689159916232e-06, "loss": 0.2956, "step": 907100 }, { "epoch": 12.498966685955196, "grad_norm": 1.4016005992889404, "learning_rate": 3.773635666155699e-06, "loss": 0.2662, "step": 907200 }, { "epoch": 12.500344438014935, "grad_norm": 4.037769317626953, "learning_rate": 3.7695841880712e-06, "loss": 0.2751, "step": 907300 }, { "epoch": 12.501722190074673, "grad_norm": 4.5331854820251465, "learning_rate": 3.7655347260069344e-06, "loss": 0.2395, "step": 907400 }, { "epoch": 12.503099942134414, "grad_norm": 1.1057956218719482, "learning_rate": 3.76148728030692e-06, "loss": 0.2236, "step": 907500 }, { "epoch": 12.504477694194152, "grad_norm": 0.8608847856521606, "learning_rate": 3.757441851314999e-06, "loss": 0.2605, "step": 907600 }, { "epoch": 12.505855446253893, "grad_norm": 0.7966085076332092, "learning_rate": 3.7533984393748573e-06, "loss": 0.2164, "step": 907700 }, { "epoch": 12.507233198313632, "grad_norm": 4.589109897613525, "learning_rate": 3.749357044830013e-06, "loss": 0.2551, "step": 907800 }, { "epoch": 12.50861095037337, "grad_norm": 2.9132936000823975, "learning_rate": 3.745317668023788e-06, "loss": 0.2292, "step": 907900 }, { "epoch": 12.50998870243311, "grad_norm": 2.2123827934265137, "learning_rate": 3.7412803092993564e-06, "loss": 0.2141, "step": 908000 }, { "epoch": 12.51136645449285, "grad_norm": 0.7828893661499023, "learning_rate": 3.7372449689997225e-06, "loss": 0.2494, "step": 908100 }, { "epoch": 12.512744206552588, "grad_norm": 1.6209080219268799, 
"learning_rate": 3.733211647467687e-06, "loss": 0.1986, "step": 908200 }, { "epoch": 12.514121958612328, "grad_norm": 3.1257684230804443, "learning_rate": 3.7291803450459125e-06, "loss": 0.2128, "step": 908300 }, { "epoch": 12.515499710672067, "grad_norm": 3.1367995738983154, "learning_rate": 3.7251510620768827e-06, "loss": 0.247, "step": 908400 }, { "epoch": 12.516877462731808, "grad_norm": 0.22655609250068665, "learning_rate": 3.721164061535527e-06, "loss": 0.2375, "step": 908500 }, { "epoch": 12.518255214791546, "grad_norm": 3.3604624271392822, "learning_rate": 3.7171387982956686e-06, "loss": 0.2511, "step": 908600 }, { "epoch": 12.519632966851285, "grad_norm": 2.938551425933838, "learning_rate": 3.7131155555315338e-06, "loss": 0.2493, "step": 908700 }, { "epoch": 12.521010718911025, "grad_norm": 0.12210047990083694, "learning_rate": 3.7090943335849244e-06, "loss": 0.2302, "step": 908800 }, { "epoch": 12.522388470970764, "grad_norm": 4.032182216644287, "learning_rate": 3.7050751327974725e-06, "loss": 0.226, "step": 908900 }, { "epoch": 12.523766223030503, "grad_norm": 3.6658716201782227, "learning_rate": 3.7010579535106163e-06, "loss": 0.2752, "step": 909000 }, { "epoch": 12.525143975090243, "grad_norm": 0.4287591576576233, "learning_rate": 3.6970427960656452e-06, "loss": 0.2131, "step": 909100 }, { "epoch": 12.526521727149982, "grad_norm": 4.468510150909424, "learning_rate": 3.693029660803665e-06, "loss": 0.2594, "step": 909200 }, { "epoch": 12.527899479209722, "grad_norm": 2.376677989959717, "learning_rate": 3.6890185480656045e-06, "loss": 0.219, "step": 909300 }, { "epoch": 12.529277231269461, "grad_norm": 4.313483238220215, "learning_rate": 3.6850094581922346e-06, "loss": 0.1837, "step": 909400 }, { "epoch": 12.5306549833292, "grad_norm": 0.854306161403656, "learning_rate": 3.681002391524154e-06, "loss": 0.2229, "step": 909500 }, { "epoch": 12.53203273538894, "grad_norm": 0.8100183606147766, "learning_rate": 3.676997348401773e-06, "loss": 0.2147, "step": 
909600 }, { "epoch": 12.533410487448679, "grad_norm": 2.148639678955078, "learning_rate": 3.6729943291653483e-06, "loss": 0.2059, "step": 909700 }, { "epoch": 12.534788239508417, "grad_norm": 1.154579758644104, "learning_rate": 3.6689933341549483e-06, "loss": 0.2261, "step": 909800 }, { "epoch": 12.536165991568158, "grad_norm": 2.6155924797058105, "learning_rate": 3.6649943637104913e-06, "loss": 0.1948, "step": 909900 }, { "epoch": 12.537543743627896, "grad_norm": 2.6509344577789307, "learning_rate": 3.660997418171694e-06, "loss": 0.2297, "step": 910000 }, { "epoch": 12.538921495687635, "grad_norm": 2.1501147747039795, "learning_rate": 3.65700249787813e-06, "loss": 0.2085, "step": 910100 }, { "epoch": 12.540299247747376, "grad_norm": 1.3811872005462646, "learning_rate": 3.6530096031691802e-06, "loss": 0.2425, "step": 910200 }, { "epoch": 12.541676999807114, "grad_norm": 0.125303253531456, "learning_rate": 3.64901873438407e-06, "loss": 0.2037, "step": 910300 }, { "epoch": 12.543054751866855, "grad_norm": 1.7733478546142578, "learning_rate": 3.645029891861835e-06, "loss": 0.1853, "step": 910400 }, { "epoch": 12.544432503926593, "grad_norm": 1.8176164627075195, "learning_rate": 3.6410829340677655e-06, "loss": 0.1872, "step": 910500 }, { "epoch": 12.545810255986332, "grad_norm": 2.1111767292022705, "learning_rate": 3.6370981248166578e-06, "loss": 0.2447, "step": 910600 }, { "epoch": 12.547188008046072, "grad_norm": 0.6519777774810791, "learning_rate": 3.633115342841143e-06, "loss": 0.2328, "step": 910700 }, { "epoch": 12.548565760105811, "grad_norm": 2.3024773597717285, "learning_rate": 3.6291345884795865e-06, "loss": 0.225, "step": 910800 }, { "epoch": 12.549943512165552, "grad_norm": 12.88027572631836, "learning_rate": 3.625155862070168e-06, "loss": 0.2381, "step": 910900 }, { "epoch": 12.55132126422529, "grad_norm": 2.284512519836426, "learning_rate": 3.6211791639508958e-06, "loss": 0.2086, "step": 911000 }, { "epoch": 12.552699016285029, "grad_norm": 
0.705697774887085, "learning_rate": 3.6172044944596177e-06, "loss": 0.2481, "step": 911100 }, { "epoch": 12.55407676834477, "grad_norm": 1.4522850513458252, "learning_rate": 3.6132318539340078e-06, "loss": 0.2004, "step": 911200 }, { "epoch": 12.555454520404508, "grad_norm": 9.470004081726074, "learning_rate": 3.609261242711549e-06, "loss": 0.2399, "step": 911300 }, { "epoch": 12.556832272464247, "grad_norm": 1.3187912702560425, "learning_rate": 3.6052926611295787e-06, "loss": 0.2248, "step": 911400 }, { "epoch": 12.558210024523987, "grad_norm": 0.30397436022758484, "learning_rate": 3.6013261095252344e-06, "loss": 0.2389, "step": 911500 }, { "epoch": 12.559587776583726, "grad_norm": 2.7222912311553955, "learning_rate": 3.59736158823551e-06, "loss": 0.2245, "step": 911600 }, { "epoch": 12.560965528643464, "grad_norm": 2.3611936569213867, "learning_rate": 3.5933990975971936e-06, "loss": 0.2144, "step": 911700 }, { "epoch": 12.562343280703205, "grad_norm": 2.5766351222991943, "learning_rate": 3.5894386379469313e-06, "loss": 0.2337, "step": 911800 }, { "epoch": 12.563721032762944, "grad_norm": 2.4747798442840576, "learning_rate": 3.5854802096211974e-06, "loss": 0.2302, "step": 911900 }, { "epoch": 12.565098784822684, "grad_norm": 0.3169412612915039, "learning_rate": 3.5815238129562493e-06, "loss": 0.1844, "step": 912000 }, { "epoch": 12.566476536882423, "grad_norm": 3.081221580505371, "learning_rate": 3.577569448288223e-06, "loss": 0.2151, "step": 912100 }, { "epoch": 12.567854288942161, "grad_norm": 6.372676849365234, "learning_rate": 3.573617115953061e-06, "loss": 0.2274, "step": 912200 }, { "epoch": 12.569232041001902, "grad_norm": 9.849974632263184, "learning_rate": 3.569666816286527e-06, "loss": 0.2364, "step": 912300 }, { "epoch": 12.57060979306164, "grad_norm": 1.6972519159317017, "learning_rate": 3.5657185496242225e-06, "loss": 0.2398, "step": 912400 }, { "epoch": 12.57198754512138, "grad_norm": 1.846554160118103, "learning_rate": 3.561772316301583e-06, "loss": 
0.2424, "step": 912500 }, { "epoch": 12.57336529718112, "grad_norm": 2.0066514015197754, "learning_rate": 3.5578281166538505e-06, "loss": 0.2034, "step": 912600 }, { "epoch": 12.574743049240858, "grad_norm": 9.855225563049316, "learning_rate": 3.5538859510161003e-06, "loss": 0.2016, "step": 912700 }, { "epoch": 12.576120801300599, "grad_norm": 5.288704872131348, "learning_rate": 3.5499458197232475e-06, "loss": 0.2447, "step": 912800 }, { "epoch": 12.577498553360337, "grad_norm": 6.687885761260986, "learning_rate": 3.5460077231100285e-06, "loss": 0.1968, "step": 912900 }, { "epoch": 12.578876305420076, "grad_norm": 4.185986042022705, "learning_rate": 3.5420716615110043e-06, "loss": 0.2282, "step": 913000 }, { "epoch": 12.580254057479817, "grad_norm": 2.8870296478271484, "learning_rate": 3.538137635260554e-06, "loss": 0.2503, "step": 913100 }, { "epoch": 12.581631809539555, "grad_norm": 1.2261135578155518, "learning_rate": 3.5342056446929027e-06, "loss": 0.187, "step": 913200 }, { "epoch": 12.583009561599294, "grad_norm": 2.374559164047241, "learning_rate": 3.530275690142085e-06, "loss": 0.2245, "step": 913300 }, { "epoch": 12.584387313659034, "grad_norm": 0.7190760374069214, "learning_rate": 3.5263477719419775e-06, "loss": 0.242, "step": 913400 }, { "epoch": 12.585765065718773, "grad_norm": 0.626494288444519, "learning_rate": 3.5224218904262824e-06, "loss": 0.2191, "step": 913500 }, { "epoch": 12.587142817778513, "grad_norm": 3.52441668510437, "learning_rate": 3.5184980459285123e-06, "loss": 0.1824, "step": 913600 }, { "epoch": 12.588520569838252, "grad_norm": 2.043318510055542, "learning_rate": 3.5145762387820183e-06, "loss": 0.2007, "step": 913700 }, { "epoch": 12.58989832189799, "grad_norm": 3.781277656555176, "learning_rate": 3.5106564693199807e-06, "loss": 0.2322, "step": 913800 }, { "epoch": 12.591276073957731, "grad_norm": 2.291917085647583, "learning_rate": 3.506738737875412e-06, "loss": 0.2408, "step": 913900 }, { "epoch": 12.59265382601747, "grad_norm": 
2.0271263122558594, "learning_rate": 3.502823044781129e-06, "loss": 0.2152, "step": 914000 }, { "epoch": 12.594031578077209, "grad_norm": 4.473174095153809, "learning_rate": 3.4989093903698015e-06, "loss": 0.257, "step": 914100 }, { "epoch": 12.595409330136949, "grad_norm": 2.161417007446289, "learning_rate": 3.4949977749739106e-06, "loss": 0.2172, "step": 914200 }, { "epoch": 12.596787082196688, "grad_norm": 4.593111038208008, "learning_rate": 3.491088198925764e-06, "loss": 0.1643, "step": 914300 }, { "epoch": 12.598164834256426, "grad_norm": 4.854642868041992, "learning_rate": 3.4871806625575025e-06, "loss": 0.2242, "step": 914400 }, { "epoch": 12.599542586316167, "grad_norm": 14.07918643951416, "learning_rate": 3.4832751662010984e-06, "loss": 0.2112, "step": 914500 }, { "epoch": 12.600920338375905, "grad_norm": 0.7469869256019592, "learning_rate": 3.47937171018833e-06, "loss": 0.2229, "step": 914600 }, { "epoch": 12.602298090435646, "grad_norm": 3.4395291805267334, "learning_rate": 3.4754702948508324e-06, "loss": 0.2401, "step": 914700 }, { "epoch": 12.603675842495385, "grad_norm": 1.2302637100219727, "learning_rate": 3.47160990415927e-06, "loss": 0.2078, "step": 914800 }, { "epoch": 12.605053594555123, "grad_norm": 4.362504005432129, "learning_rate": 3.467712550751439e-06, "loss": 0.2112, "step": 914900 }, { "epoch": 12.606431346614864, "grad_norm": 1.5781856775283813, "learning_rate": 3.4638172390093675e-06, "loss": 0.1686, "step": 915000 }, { "epoch": 12.607809098674602, "grad_norm": 0.9203447103500366, "learning_rate": 3.459923969263991e-06, "loss": 0.2644, "step": 915100 }, { "epoch": 12.609186850734343, "grad_norm": 3.0363094806671143, "learning_rate": 3.4560327418460683e-06, "loss": 0.2001, "step": 915200 }, { "epoch": 12.610564602794081, "grad_norm": 2.0067203044891357, "learning_rate": 3.4521435570861694e-06, "loss": 0.2253, "step": 915300 }, { "epoch": 12.61194235485382, "grad_norm": 3.2417232990264893, "learning_rate": 3.4482564153146995e-06, "loss": 
0.2113, "step": 915400 }, { "epoch": 12.61332010691356, "grad_norm": 2.598731279373169, "learning_rate": 3.444371316861895e-06, "loss": 0.2063, "step": 915500 }, { "epoch": 12.6146978589733, "grad_norm": 3.812410354614258, "learning_rate": 3.4404882620578192e-06, "loss": 0.2644, "step": 915600 }, { "epoch": 12.616075611033038, "grad_norm": 4.084008693695068, "learning_rate": 3.4366460512218275e-06, "loss": 0.205, "step": 915700 }, { "epoch": 12.617453363092778, "grad_norm": 2.0450711250305176, "learning_rate": 3.4327670642599675e-06, "loss": 0.2303, "step": 915800 }, { "epoch": 12.618831115152517, "grad_norm": 2.5086967945098877, "learning_rate": 3.4288901219326684e-06, "loss": 0.185, "step": 915900 }, { "epoch": 12.620208867212256, "grad_norm": 4.411999702453613, "learning_rate": 3.425015224569309e-06, "loss": 0.2069, "step": 916000 }, { "epoch": 12.621586619271996, "grad_norm": 2.2588372230529785, "learning_rate": 3.4211423724990635e-06, "loss": 0.2363, "step": 916100 }, { "epoch": 12.622964371331735, "grad_norm": 2.837456464767456, "learning_rate": 3.41727156605096e-06, "loss": 0.2485, "step": 916200 }, { "epoch": 12.624342123391475, "grad_norm": 1.386113166809082, "learning_rate": 3.413402805553852e-06, "loss": 0.2264, "step": 916300 }, { "epoch": 12.625719875451214, "grad_norm": 1.0794447660446167, "learning_rate": 3.409536091336399e-06, "loss": 0.227, "step": 916400 }, { "epoch": 12.627097627510953, "grad_norm": 5.324759483337402, "learning_rate": 3.4056714237270965e-06, "loss": 0.228, "step": 916500 }, { "epoch": 12.628475379570693, "grad_norm": 2.1033549308776855, "learning_rate": 3.4018088030542707e-06, "loss": 0.2354, "step": 916600 }, { "epoch": 12.629853131630432, "grad_norm": 0.15262453258037567, "learning_rate": 3.3979482296460795e-06, "loss": 0.2475, "step": 916700 }, { "epoch": 12.63123088369017, "grad_norm": 3.1930205821990967, "learning_rate": 3.394089703830488e-06, "loss": 0.2416, "step": 916800 }, { "epoch": 12.63260863574991, "grad_norm": 
3.10556960105896, "learning_rate": 3.390233225935304e-06, "loss": 0.2634, "step": 916900 }, { "epoch": 12.63398638780965, "grad_norm": 0.5082910060882568, "learning_rate": 3.386378796288152e-06, "loss": 0.2421, "step": 917000 }, { "epoch": 12.63536413986939, "grad_norm": 0.2184629589319229, "learning_rate": 3.382526415216483e-06, "loss": 0.2201, "step": 917100 }, { "epoch": 12.636741891929129, "grad_norm": 5.219517230987549, "learning_rate": 3.3786760830475754e-06, "loss": 0.2477, "step": 917200 }, { "epoch": 12.638119643988867, "grad_norm": 4.889597415924072, "learning_rate": 3.374827800108548e-06, "loss": 0.2021, "step": 917300 }, { "epoch": 12.639497396048608, "grad_norm": 2.4116554260253906, "learning_rate": 3.37098156672632e-06, "loss": 0.2139, "step": 917400 }, { "epoch": 12.640875148108346, "grad_norm": 6.757554054260254, "learning_rate": 3.367137383227644e-06, "loss": 0.2159, "step": 917500 }, { "epoch": 12.642252900168085, "grad_norm": 2.021207332611084, "learning_rate": 3.3632952499391123e-06, "loss": 0.2143, "step": 917600 }, { "epoch": 12.643630652227825, "grad_norm": 1.2735322713851929, "learning_rate": 3.3594551671871387e-06, "loss": 0.2334, "step": 917700 }, { "epoch": 12.645008404287564, "grad_norm": 0.10791590809822083, "learning_rate": 3.3556171352979427e-06, "loss": 0.2308, "step": 917800 }, { "epoch": 12.646386156347305, "grad_norm": 0.05612052232027054, "learning_rate": 3.351857854107844e-06, "loss": 0.2477, "step": 917900 }, { "epoch": 12.647763908407043, "grad_norm": 1.6612247228622437, "learning_rate": 3.348023883888748e-06, "loss": 0.2338, "step": 918000 }, { "epoch": 12.649141660466782, "grad_norm": 3.7704150676727295, "learning_rate": 3.344191965503576e-06, "loss": 0.1596, "step": 918100 }, { "epoch": 12.650519412526522, "grad_norm": 0.6422763466835022, "learning_rate": 3.3403620992778815e-06, "loss": 0.2067, "step": 918200 }, { "epoch": 12.651897164586261, "grad_norm": 3.9612650871276855, "learning_rate": 3.336534285537022e-06, "loss": 
0.2644, "step": 918300 }, { "epoch": 12.653274916646, "grad_norm": 1.3923494815826416, "learning_rate": 3.3327085246061947e-06, "loss": 0.2099, "step": 918400 }, { "epoch": 12.65465266870574, "grad_norm": 3.3482720851898193, "learning_rate": 3.328884816810427e-06, "loss": 0.2368, "step": 918500 }, { "epoch": 12.656030420765479, "grad_norm": 3.072597026824951, "learning_rate": 3.325063162474543e-06, "loss": 0.2576, "step": 918600 }, { "epoch": 12.657408172825217, "grad_norm": 1.5982545614242554, "learning_rate": 3.3212435619232206e-06, "loss": 0.239, "step": 918700 }, { "epoch": 12.658785924884958, "grad_norm": 3.929764747619629, "learning_rate": 3.317426015480962e-06, "loss": 0.234, "step": 918800 }, { "epoch": 12.660163676944697, "grad_norm": 4.8823561668396, "learning_rate": 3.3136105234720753e-06, "loss": 0.188, "step": 918900 }, { "epoch": 12.661541429004437, "grad_norm": 2.1174111366271973, "learning_rate": 3.30979708622071e-06, "loss": 0.2065, "step": 919000 }, { "epoch": 12.662919181064176, "grad_norm": 1.4198808670043945, "learning_rate": 3.3059857040508436e-06, "loss": 0.1962, "step": 919100 }, { "epoch": 12.664296933123914, "grad_norm": 2.373792886734009, "learning_rate": 3.3021763772862684e-06, "loss": 0.2206, "step": 919200 }, { "epoch": 12.665674685183655, "grad_norm": 2.4396631717681885, "learning_rate": 3.298369106250599e-06, "loss": 0.2255, "step": 919300 }, { "epoch": 12.667052437243393, "grad_norm": 0.6654471158981323, "learning_rate": 3.294563891267291e-06, "loss": 0.2217, "step": 919400 }, { "epoch": 12.668430189303134, "grad_norm": 2.4668378829956055, "learning_rate": 3.2907607326596178e-06, "loss": 0.219, "step": 919500 }, { "epoch": 12.669807941362873, "grad_norm": 2.653829336166382, "learning_rate": 3.2869596307506744e-06, "loss": 0.252, "step": 919600 }, { "epoch": 12.671185693422611, "grad_norm": 1.5821677446365356, "learning_rate": 3.283160585863377e-06, "loss": 0.1857, "step": 919700 }, { "epoch": 12.672563445482352, "grad_norm": 
1.2112700939178467, "learning_rate": 3.2793635983204882e-06, "loss": 0.2284, "step": 919800 }, { "epoch": 12.67394119754209, "grad_norm": 2.146636486053467, "learning_rate": 3.275568668444567e-06, "loss": 0.2127, "step": 919900 }, { "epoch": 12.675318949601829, "grad_norm": 0.02897159568965435, "learning_rate": 3.271775796558018e-06, "loss": 0.1937, "step": 920000 }, { "epoch": 12.67669670166157, "grad_norm": 4.477741718292236, "learning_rate": 3.267984982983074e-06, "loss": 0.2606, "step": 920100 }, { "epoch": 12.678074453721308, "grad_norm": 1.341055154800415, "learning_rate": 3.2641962280417734e-06, "loss": 0.2119, "step": 920200 }, { "epoch": 12.679452205781047, "grad_norm": 3.609816551208496, "learning_rate": 3.260409532055988e-06, "loss": 0.1916, "step": 920300 }, { "epoch": 12.680829957840787, "grad_norm": 2.864924669265747, "learning_rate": 3.2566248953474217e-06, "loss": 0.2361, "step": 920400 }, { "epoch": 12.682207709900526, "grad_norm": 1.8209729194641113, "learning_rate": 3.2528423182376037e-06, "loss": 0.2382, "step": 920500 }, { "epoch": 12.683585461960266, "grad_norm": 3.0421018600463867, "learning_rate": 3.2490618010478736e-06, "loss": 0.1922, "step": 920600 }, { "epoch": 12.684963214020005, "grad_norm": 1.7290154695510864, "learning_rate": 3.2452833440994135e-06, "loss": 0.1992, "step": 920700 }, { "epoch": 12.686340966079744, "grad_norm": 2.664268732070923, "learning_rate": 3.2415069477132218e-06, "loss": 0.2129, "step": 920800 }, { "epoch": 12.687718718139484, "grad_norm": 1.547699213027954, "learning_rate": 3.237732612210116e-06, "loss": 0.2012, "step": 920900 }, { "epoch": 12.689096470199223, "grad_norm": 1.4671459197998047, "learning_rate": 3.233960337910749e-06, "loss": 0.2738, "step": 921000 }, { "epoch": 12.690474222258961, "grad_norm": 1.930884838104248, "learning_rate": 3.2301901251356016e-06, "loss": 0.234, "step": 921100 }, { "epoch": 12.691851974318702, "grad_norm": 0.25040969252586365, "learning_rate": 3.2264219742049587e-06, "loss": 
0.2381, "step": 921200 }, { "epoch": 12.69322972637844, "grad_norm": 14.818979263305664, "learning_rate": 3.222655885438959e-06, "loss": 0.2421, "step": 921300 }, { "epoch": 12.694607478438181, "grad_norm": 3.110816240310669, "learning_rate": 3.2188918591575435e-06, "loss": 0.2065, "step": 921400 }, { "epoch": 12.69598523049792, "grad_norm": 3.54752516746521, "learning_rate": 3.2151298956804805e-06, "loss": 0.2343, "step": 921500 }, { "epoch": 12.697362982557658, "grad_norm": 1.3571783304214478, "learning_rate": 3.2113699953273727e-06, "loss": 0.1953, "step": 921600 }, { "epoch": 12.698740734617399, "grad_norm": 1.5337263345718384, "learning_rate": 3.2076121584176523e-06, "loss": 0.2161, "step": 921700 }, { "epoch": 12.700118486677137, "grad_norm": 1.7826619148254395, "learning_rate": 3.203856385270556e-06, "loss": 0.2171, "step": 921800 }, { "epoch": 12.701496238736876, "grad_norm": 2.599698066711426, "learning_rate": 3.200102676205152e-06, "loss": 0.2214, "step": 921900 }, { "epoch": 12.702873990796617, "grad_norm": 1.8907020092010498, "learning_rate": 3.196351031540344e-06, "loss": 0.1873, "step": 922000 }, { "epoch": 12.704251742856355, "grad_norm": 3.0847809314727783, "learning_rate": 3.192601451594856e-06, "loss": 0.2382, "step": 922100 }, { "epoch": 12.705629494916096, "grad_norm": 0.013093199580907822, "learning_rate": 3.1888539366872256e-06, "loss": 0.1631, "step": 922200 }, { "epoch": 12.707007246975834, "grad_norm": 1.5631515979766846, "learning_rate": 3.1851084871358312e-06, "loss": 0.2269, "step": 922300 }, { "epoch": 12.708384999035573, "grad_norm": 1.1403919458389282, "learning_rate": 3.1813651032588714e-06, "loss": 0.2208, "step": 922400 }, { "epoch": 12.709762751095314, "grad_norm": 1.0077670812606812, "learning_rate": 3.177623785374352e-06, "loss": 0.25, "step": 922500 }, { "epoch": 12.711140503155052, "grad_norm": 4.851685523986816, "learning_rate": 3.173884533800124e-06, "loss": 0.2044, "step": 922600 }, { "epoch": 12.71251825521479, 
"grad_norm": 4.300386428833008, "learning_rate": 3.1701473488538606e-06, "loss": 0.2626, "step": 922700 }, { "epoch": 12.713896007274531, "grad_norm": 4.452045440673828, "learning_rate": 3.166412230853046e-06, "loss": 0.2153, "step": 922800 }, { "epoch": 12.71527375933427, "grad_norm": 2.243765354156494, "learning_rate": 3.1627165003883963e-06, "loss": 0.2166, "step": 922900 }, { "epoch": 12.716651511394009, "grad_norm": 1.0505355596542358, "learning_rate": 3.1589854965528974e-06, "loss": 0.1921, "step": 923000 }, { "epoch": 12.718029263453749, "grad_norm": 0.570681631565094, "learning_rate": 3.155256560611107e-06, "loss": 0.2383, "step": 923100 }, { "epoch": 12.719407015513488, "grad_norm": 0.11060261726379395, "learning_rate": 3.151529692879823e-06, "loss": 0.2283, "step": 923200 }, { "epoch": 12.720784767573228, "grad_norm": 5.472596168518066, "learning_rate": 3.147804893675653e-06, "loss": 0.2118, "step": 923300 }, { "epoch": 12.722162519632967, "grad_norm": 1.9719280004501343, "learning_rate": 3.1440821633150428e-06, "loss": 0.2351, "step": 923400 }, { "epoch": 12.723540271692706, "grad_norm": 4.066338539123535, "learning_rate": 3.140361502114268e-06, "loss": 0.2319, "step": 923500 }, { "epoch": 12.724918023752446, "grad_norm": 0.9245819449424744, "learning_rate": 3.136642910389396e-06, "loss": 0.2482, "step": 923600 }, { "epoch": 12.726295775812185, "grad_norm": 0.9695680737495422, "learning_rate": 3.13292638845635e-06, "loss": 0.1953, "step": 923700 }, { "epoch": 12.727673527871925, "grad_norm": 2.442044258117676, "learning_rate": 3.129211936630875e-06, "loss": 0.2758, "step": 923800 }, { "epoch": 12.729051279931664, "grad_norm": 10.180858612060547, "learning_rate": 3.1254995552285186e-06, "loss": 0.2707, "step": 923900 }, { "epoch": 12.730429031991402, "grad_norm": 1.1940913200378418, "learning_rate": 3.1217892445646795e-06, "loss": 0.2352, "step": 924000 }, { "epoch": 12.731806784051143, "grad_norm": 2.8272998332977295, "learning_rate": 
3.1180810049545587e-06, "loss": 0.1894, "step": 924100 }, { "epoch": 12.733184536110882, "grad_norm": 3.493452310562134, "learning_rate": 3.114374836713197e-06, "loss": 0.1851, "step": 924200 }, { "epoch": 12.73456228817062, "grad_norm": 2.7276203632354736, "learning_rate": 3.1106707401554478e-06, "loss": 0.2224, "step": 924300 }, { "epoch": 12.73594004023036, "grad_norm": 2.1621506214141846, "learning_rate": 3.1069687155959882e-06, "loss": 0.187, "step": 924400 }, { "epoch": 12.7373177922901, "grad_norm": 1.2508805990219116, "learning_rate": 3.1032687633493387e-06, "loss": 0.22, "step": 924500 }, { "epoch": 12.738695544349838, "grad_norm": 2.0268301963806152, "learning_rate": 3.099570883729823e-06, "loss": 0.2047, "step": 924600 }, { "epoch": 12.740073296409578, "grad_norm": 3.811948776245117, "learning_rate": 3.0958750770515818e-06, "loss": 0.265, "step": 924700 }, { "epoch": 12.741451048469317, "grad_norm": 2.4735653400421143, "learning_rate": 3.092181343628613e-06, "loss": 0.2221, "step": 924800 }, { "epoch": 12.742828800529058, "grad_norm": 2.6457033157348633, "learning_rate": 3.0885265901080436e-06, "loss": 0.2125, "step": 924900 }, { "epoch": 12.744206552588796, "grad_norm": 1.1905360221862793, "learning_rate": 3.084836983396445e-06, "loss": 0.2397, "step": 925000 }, { "epoch": 12.745584304648535, "grad_norm": 2.320091485977173, "learning_rate": 3.0811494508778544e-06, "loss": 0.1946, "step": 925100 }, { "epoch": 12.746962056708275, "grad_norm": 0.17135082185268402, "learning_rate": 3.0774639928655466e-06, "loss": 0.2581, "step": 925200 }, { "epoch": 12.748339808768014, "grad_norm": 0.7217452526092529, "learning_rate": 3.0737806096726115e-06, "loss": 0.247, "step": 925300 }, { "epoch": 12.749717560827753, "grad_norm": 0.8664499521255493, "learning_rate": 3.0700993016119753e-06, "loss": 0.1523, "step": 925400 }, { "epoch": 12.751095312887493, "grad_norm": 3.5013489723205566, "learning_rate": 3.0664200689963927e-06, "loss": 0.26, "step": 925500 }, { "epoch": 
12.752473064947232, "grad_norm": 4.590202331542969, "learning_rate": 3.0627429121384197e-06, "loss": 0.224, "step": 925600 }, { "epoch": 12.753850817006972, "grad_norm": 0.34800752997398376, "learning_rate": 3.059067831350464e-06, "loss": 0.1885, "step": 925700 }, { "epoch": 12.75522856906671, "grad_norm": 2.6511430740356445, "learning_rate": 3.05539482694473e-06, "loss": 0.2265, "step": 925800 }, { "epoch": 12.75660632112645, "grad_norm": 6.315014839172363, "learning_rate": 3.0517238992332643e-06, "loss": 0.2249, "step": 925900 }, { "epoch": 12.75798407318619, "grad_norm": 0.6136699318885803, "learning_rate": 3.0480550485279278e-06, "loss": 0.2069, "step": 926000 }, { "epoch": 12.759361825245929, "grad_norm": 181.0465087890625, "learning_rate": 3.044388275140412e-06, "loss": 0.2215, "step": 926100 }, { "epoch": 12.760739577305667, "grad_norm": 1.8276182413101196, "learning_rate": 3.040723579382236e-06, "loss": 0.2215, "step": 926200 }, { "epoch": 12.762117329365408, "grad_norm": 3.4303765296936035, "learning_rate": 3.037060961564713e-06, "loss": 0.2435, "step": 926300 }, { "epoch": 12.763495081425146, "grad_norm": 2.020491600036621, "learning_rate": 3.03340042199901e-06, "loss": 0.1892, "step": 926400 }, { "epoch": 12.764872833484887, "grad_norm": 7.699017524719238, "learning_rate": 3.0297419609961206e-06, "loss": 0.2643, "step": 926500 }, { "epoch": 12.766250585544626, "grad_norm": 2.8943793773651123, "learning_rate": 3.0260855788668337e-06, "loss": 0.2231, "step": 926600 }, { "epoch": 12.767628337604364, "grad_norm": 2.7637150287628174, "learning_rate": 3.022431275921785e-06, "loss": 0.2547, "step": 926700 }, { "epoch": 12.769006089664105, "grad_norm": 0.6149296760559082, "learning_rate": 3.0187790524714327e-06, "loss": 0.2699, "step": 926800 }, { "epoch": 12.770383841723843, "grad_norm": 3.2291924953460693, "learning_rate": 3.0151289088260443e-06, "loss": 0.194, "step": 926900 }, { "epoch": 12.771761593783582, "grad_norm": 1.6513943672180176, "learning_rate": 
3.011480845295712e-06, "loss": 0.2346, "step": 927000 }, { "epoch": 12.773139345843322, "grad_norm": 2.3989291191101074, "learning_rate": 3.0078348621903633e-06, "loss": 0.2043, "step": 927100 }, { "epoch": 12.774517097903061, "grad_norm": 2.5076005458831787, "learning_rate": 3.004190959819752e-06, "loss": 0.2165, "step": 927200 }, { "epoch": 12.7758948499628, "grad_norm": 1.2464557886123657, "learning_rate": 3.000549138493433e-06, "loss": 0.2054, "step": 927300 }, { "epoch": 12.77727260202254, "grad_norm": 0.9744536280632019, "learning_rate": 2.9969093985208093e-06, "loss": 0.2291, "step": 927400 }, { "epoch": 12.778650354082279, "grad_norm": 2.2464873790740967, "learning_rate": 2.993271740211087e-06, "loss": 0.2311, "step": 927500 }, { "epoch": 12.78002810614202, "grad_norm": 2.430288791656494, "learning_rate": 2.9896361638733004e-06, "loss": 0.229, "step": 927600 }, { "epoch": 12.781405858201758, "grad_norm": 1.9284194707870483, "learning_rate": 2.9860026698163167e-06, "loss": 0.2161, "step": 927700 }, { "epoch": 12.782783610261497, "grad_norm": 2.662445545196533, "learning_rate": 2.982371258348825e-06, "loss": 0.213, "step": 927800 }, { "epoch": 12.784161362321237, "grad_norm": 3.4701831340789795, "learning_rate": 2.9787419297793257e-06, "loss": 0.2138, "step": 927900 }, { "epoch": 12.785539114380976, "grad_norm": 0.5050792098045349, "learning_rate": 2.9751146844161422e-06, "loss": 0.2462, "step": 928000 }, { "epoch": 12.786916866440716, "grad_norm": 2.81074595451355, "learning_rate": 2.971489522567436e-06, "loss": 0.229, "step": 928100 }, { "epoch": 12.788294618500455, "grad_norm": 3.9507157802581787, "learning_rate": 2.967866444541188e-06, "loss": 0.1931, "step": 928200 }, { "epoch": 12.789672370560194, "grad_norm": 0.31623491644859314, "learning_rate": 2.9642454506451874e-06, "loss": 0.2449, "step": 928300 }, { "epoch": 12.791050122619934, "grad_norm": 2.87270188331604, "learning_rate": 2.9606265411870546e-06, "loss": 0.254, "step": 928400 }, { "epoch": 
12.792427874679673, "grad_norm": 3.3676488399505615, "learning_rate": 2.9570097164742566e-06, "loss": 0.2609, "step": 928500 }, { "epoch": 12.793805626739411, "grad_norm": 1.3202564716339111, "learning_rate": 2.953394976814031e-06, "loss": 0.2308, "step": 928600 }, { "epoch": 12.795183378799152, "grad_norm": 0.3617525100708008, "learning_rate": 2.9497823225134804e-06, "loss": 0.2109, "step": 928700 }, { "epoch": 12.79656113085889, "grad_norm": 1.5670219659805298, "learning_rate": 2.946171753879526e-06, "loss": 0.2324, "step": 928800 }, { "epoch": 12.79793888291863, "grad_norm": 5.255171298980713, "learning_rate": 2.94256327121889e-06, "loss": 0.252, "step": 928900 }, { "epoch": 12.79931663497837, "grad_norm": 1.8571407794952393, "learning_rate": 2.938992928473866e-06, "loss": 0.2673, "step": 929000 }, { "epoch": 12.800694387038108, "grad_norm": 2.7903871536254883, "learning_rate": 2.9354246307890307e-06, "loss": 0.2503, "step": 929100 }, { "epoch": 12.802072139097849, "grad_norm": 1.9524593353271484, "learning_rate": 2.9318223661461725e-06, "loss": 0.2133, "step": 929200 }, { "epoch": 12.803449891157587, "grad_norm": 3.577744722366333, "learning_rate": 2.9282221886956976e-06, "loss": 0.2361, "step": 929300 }, { "epoch": 12.804827643217326, "grad_norm": 2.7197766304016113, "learning_rate": 2.9246240987434517e-06, "loss": 0.2062, "step": 929400 }, { "epoch": 12.806205395277066, "grad_norm": 0.5694272518157959, "learning_rate": 2.921028096595122e-06, "loss": 0.2517, "step": 929500 }, { "epoch": 12.807583147336805, "grad_norm": 3.6552529335021973, "learning_rate": 2.917434182556193e-06, "loss": 0.195, "step": 929600 }, { "epoch": 12.808960899396544, "grad_norm": 1.040052890777588, "learning_rate": 2.9138423569319983e-06, "loss": 0.2327, "step": 929700 }, { "epoch": 12.810338651456284, "grad_norm": 1.8768765926361084, "learning_rate": 2.9102526200276742e-06, "loss": 0.2158, "step": 929800 }, { "epoch": 12.811716403516023, "grad_norm": 0.9721648097038269, 
"learning_rate": 2.9066649721481914e-06, "loss": 0.2361, "step": 929900 }, { "epoch": 12.813094155575763, "grad_norm": 0.9108976721763611, "learning_rate": 2.9030794135983415e-06, "loss": 0.1929, "step": 930000 }, { "epoch": 12.814471907635502, "grad_norm": 1.3948016166687012, "learning_rate": 2.89949594468273e-06, "loss": 0.1867, "step": 930100 }, { "epoch": 12.81584965969524, "grad_norm": 4.907284736633301, "learning_rate": 2.895914565705793e-06, "loss": 0.1979, "step": 930200 }, { "epoch": 12.817227411754981, "grad_norm": 0.08066895604133606, "learning_rate": 2.892335276971788e-06, "loss": 0.2316, "step": 930300 }, { "epoch": 12.81860516381472, "grad_norm": 2.7154369354248047, "learning_rate": 2.88875807878479e-06, "loss": 0.2536, "step": 930400 }, { "epoch": 12.819982915874458, "grad_norm": 1.7118443250656128, "learning_rate": 2.8852187121713565e-06, "loss": 0.1956, "step": 930500 }, { "epoch": 12.821360667934199, "grad_norm": 0.07313413172960281, "learning_rate": 2.8816456750768617e-06, "loss": 0.1884, "step": 930600 }, { "epoch": 12.822738419993938, "grad_norm": 1.630340337753296, "learning_rate": 2.8780747294375075e-06, "loss": 0.1979, "step": 930700 }, { "epoch": 12.824116172053678, "grad_norm": 5.865341663360596, "learning_rate": 2.874505875556675e-06, "loss": 0.2635, "step": 930800 }, { "epoch": 12.825493924113417, "grad_norm": 1.1222562789916992, "learning_rate": 2.8709391137375482e-06, "loss": 0.1927, "step": 930900 }, { "epoch": 12.826871676173155, "grad_norm": 2.5637333393096924, "learning_rate": 2.867374444283142e-06, "loss": 0.2584, "step": 931000 }, { "epoch": 12.828249428232896, "grad_norm": 0.18940003216266632, "learning_rate": 2.8638118674963037e-06, "loss": 0.2199, "step": 931100 }, { "epoch": 12.829627180292634, "grad_norm": 2.914417266845703, "learning_rate": 2.8602513836796854e-06, "loss": 0.2029, "step": 931200 }, { "epoch": 12.831004932352373, "grad_norm": 5.160257816314697, "learning_rate": 2.8566929931357624e-06, "loss": 0.2475, "step": 
931300 }, { "epoch": 12.832382684412114, "grad_norm": 3.4218485355377197, "learning_rate": 2.853136696166848e-06, "loss": 0.2021, "step": 931400 }, { "epoch": 12.833760436471852, "grad_norm": 1.1191118955612183, "learning_rate": 2.8495824930750595e-06, "loss": 0.207, "step": 931500 }, { "epoch": 12.835138188531591, "grad_norm": 1.8268364667892456, "learning_rate": 2.84603038416235e-06, "loss": 0.2199, "step": 931600 }, { "epoch": 12.836515940591331, "grad_norm": 0.5255799293518066, "learning_rate": 2.8424803697304924e-06, "loss": 0.2533, "step": 931700 }, { "epoch": 12.83789369265107, "grad_norm": 7.88160514831543, "learning_rate": 2.838932450081073e-06, "loss": 0.2927, "step": 931800 }, { "epoch": 12.83927144471081, "grad_norm": 1.5226892232894897, "learning_rate": 2.8353866255155047e-06, "loss": 0.2365, "step": 931900 }, { "epoch": 12.84064919677055, "grad_norm": 5.4155988693237305, "learning_rate": 2.831842896335029e-06, "loss": 0.2479, "step": 932000 }, { "epoch": 12.842026948830288, "grad_norm": 0.7205844521522522, "learning_rate": 2.8283012628406974e-06, "loss": 0.2546, "step": 932100 }, { "epoch": 12.843404700890028, "grad_norm": 2.0160913467407227, "learning_rate": 2.8247617253333918e-06, "loss": 0.2207, "step": 932200 }, { "epoch": 12.844782452949767, "grad_norm": 4.22540807723999, "learning_rate": 2.8212242841138194e-06, "loss": 0.2313, "step": 932300 }, { "epoch": 12.846160205009507, "grad_norm": 0.6966512203216553, "learning_rate": 2.8176889394825012e-06, "loss": 0.2136, "step": 932400 }, { "epoch": 12.847537957069246, "grad_norm": 2.6707377433776855, "learning_rate": 2.814155691739775e-06, "loss": 0.2458, "step": 932500 }, { "epoch": 12.848915709128985, "grad_norm": 1.7527315616607666, "learning_rate": 2.8106245411858164e-06, "loss": 0.2839, "step": 932600 }, { "epoch": 12.850293461188725, "grad_norm": 2.891397476196289, "learning_rate": 2.8070954881206187e-06, "loss": 0.2075, "step": 932700 }, { "epoch": 12.851671213248464, "grad_norm": 
1.4294129610061646, "learning_rate": 2.8035685328439852e-06, "loss": 0.1966, "step": 932800 }, { "epoch": 12.853048965308203, "grad_norm": 1.9177244901657104, "learning_rate": 2.8000436756555455e-06, "loss": 0.228, "step": 932900 }, { "epoch": 12.854426717367943, "grad_norm": 0.9147353768348694, "learning_rate": 2.7965209168547675e-06, "loss": 0.2048, "step": 933000 }, { "epoch": 12.855804469427682, "grad_norm": 1.4930061101913452, "learning_rate": 2.7930002567409135e-06, "loss": 0.2296, "step": 933100 }, { "epoch": 12.85718222148742, "grad_norm": 0.21898534893989563, "learning_rate": 2.7894816956130858e-06, "loss": 0.2954, "step": 933200 }, { "epoch": 12.85855997354716, "grad_norm": 1.4664571285247803, "learning_rate": 2.785965233770214e-06, "loss": 0.2613, "step": 933300 }, { "epoch": 12.8599377256069, "grad_norm": 1.0566760301589966, "learning_rate": 2.78245087151103e-06, "loss": 0.2428, "step": 933400 }, { "epoch": 12.86131547766664, "grad_norm": 2.049891710281372, "learning_rate": 2.778938609134095e-06, "loss": 0.1929, "step": 933500 }, { "epoch": 12.862693229726379, "grad_norm": 3.200713872909546, "learning_rate": 2.775428446937796e-06, "loss": 0.2296, "step": 933600 }, { "epoch": 12.864070981786117, "grad_norm": 0.13515597581863403, "learning_rate": 2.7719203852203487e-06, "loss": 0.2412, "step": 933700 }, { "epoch": 12.865448733845858, "grad_norm": 5.488831996917725, "learning_rate": 2.7684144242797642e-06, "loss": 0.2238, "step": 933800 }, { "epoch": 12.866826485905596, "grad_norm": 4.567749500274658, "learning_rate": 2.7649105644139016e-06, "loss": 0.2156, "step": 933900 }, { "epoch": 12.868204237965335, "grad_norm": 1.842706561088562, "learning_rate": 2.7614088059204394e-06, "loss": 0.2435, "step": 934000 }, { "epoch": 12.869581990025075, "grad_norm": 3.7605953216552734, "learning_rate": 2.757909149096851e-06, "loss": 0.2333, "step": 934100 }, { "epoch": 12.870959742084814, "grad_norm": 1.7115031480789185, "learning_rate": 2.7544115942404614e-06, "loss": 
0.2246, "step": 934200 }, { "epoch": 12.872337494144555, "grad_norm": 0.9463909864425659, "learning_rate": 2.750916141648409e-06, "loss": 0.1825, "step": 934300 }, { "epoch": 12.873715246204293, "grad_norm": 3.504456043243408, "learning_rate": 2.7474227916176386e-06, "loss": 0.1957, "step": 934400 }, { "epoch": 12.875092998264032, "grad_norm": 3.7877490520477295, "learning_rate": 2.743931544444944e-06, "loss": 0.2352, "step": 934500 }, { "epoch": 12.876470750323772, "grad_norm": 5.8775315284729, "learning_rate": 2.740442400426908e-06, "loss": 0.2593, "step": 934600 }, { "epoch": 12.877848502383511, "grad_norm": 1.6690183877944946, "learning_rate": 2.736955359859966e-06, "loss": 0.2463, "step": 934700 }, { "epoch": 12.87922625444325, "grad_norm": 5.3186726570129395, "learning_rate": 2.7334704230403464e-06, "loss": 0.2403, "step": 934800 }, { "epoch": 12.88060400650299, "grad_norm": 2.5305628776550293, "learning_rate": 2.7299875902641216e-06, "loss": 0.2154, "step": 934900 }, { "epoch": 12.881981758562729, "grad_norm": 1.443049669265747, "learning_rate": 2.7265068618271754e-06, "loss": 0.2262, "step": 935000 }, { "epoch": 12.88335951062247, "grad_norm": 1.4582191705703735, "learning_rate": 2.7230630138443222e-06, "loss": 0.1933, "step": 935100 }, { "epoch": 12.884737262682208, "grad_norm": 4.619357585906982, "learning_rate": 2.7195864739220946e-06, "loss": 0.2521, "step": 935200 }, { "epoch": 12.886115014741947, "grad_norm": 3.4242618083953857, "learning_rate": 2.7161120392227746e-06, "loss": 0.2651, "step": 935300 }, { "epoch": 12.887492766801687, "grad_norm": 0.3394777476787567, "learning_rate": 2.7126397100415338e-06, "loss": 0.2234, "step": 935400 }, { "epoch": 12.888870518861426, "grad_norm": 1.9548254013061523, "learning_rate": 2.709169486673358e-06, "loss": 0.2577, "step": 935500 }, { "epoch": 12.890248270921164, "grad_norm": 3.070148229598999, "learning_rate": 2.705701369413068e-06, "loss": 0.2371, "step": 935600 }, { "epoch": 12.891626022980905, "grad_norm": 
3.368596076965332, "learning_rate": 2.7022353585552863e-06, "loss": 0.2194, "step": 935700 }, { "epoch": 12.893003775040643, "grad_norm": 0.44243016839027405, "learning_rate": 2.6987714543944826e-06, "loss": 0.1999, "step": 935800 }, { "epoch": 12.894381527100382, "grad_norm": 1.993676781654358, "learning_rate": 2.695309657224919e-06, "loss": 0.2388, "step": 935900 }, { "epoch": 12.895759279160123, "grad_norm": 2.328325033187866, "learning_rate": 2.691849967340702e-06, "loss": 0.2731, "step": 936000 }, { "epoch": 12.897137031219861, "grad_norm": 0.3272148072719574, "learning_rate": 2.6883923850357547e-06, "loss": 0.2333, "step": 936100 }, { "epoch": 12.898514783279602, "grad_norm": 2.8061811923980713, "learning_rate": 2.684936910603805e-06, "loss": 0.1965, "step": 936200 }, { "epoch": 12.89989253533934, "grad_norm": 1.714197039604187, "learning_rate": 2.6814835443384156e-06, "loss": 0.2244, "step": 936300 }, { "epoch": 12.901270287399079, "grad_norm": 0.4128607511520386, "learning_rate": 2.6780322865329756e-06, "loss": 0.2103, "step": 936400 }, { "epoch": 12.90264803945882, "grad_norm": 1.9874459505081177, "learning_rate": 2.6745831374806725e-06, "loss": 0.2009, "step": 936500 }, { "epoch": 12.904025791518558, "grad_norm": 3.0516269207000732, "learning_rate": 2.671136097474541e-06, "loss": 0.2346, "step": 936600 }, { "epoch": 12.905403543578299, "grad_norm": 2.2331130504608154, "learning_rate": 2.6676911668074268e-06, "loss": 0.2312, "step": 936700 }, { "epoch": 12.906781295638037, "grad_norm": 59.183021545410156, "learning_rate": 2.6642483457719856e-06, "loss": 0.2966, "step": 936800 }, { "epoch": 12.908159047697776, "grad_norm": 2.3145411014556885, "learning_rate": 2.660807634660706e-06, "loss": 0.2227, "step": 936900 }, { "epoch": 12.909536799757516, "grad_norm": 4.065418720245361, "learning_rate": 2.6573690337658897e-06, "loss": 0.241, "step": 937000 }, { "epoch": 12.910914551817255, "grad_norm": 0.19847795367240906, "learning_rate": 2.653932543379677e-06, 
"loss": 0.2636, "step": 937100 }, { "epoch": 12.912292303876994, "grad_norm": 1.0193102359771729, "learning_rate": 2.6504981637939978e-06, "loss": 0.223, "step": 937200 }, { "epoch": 12.913670055936734, "grad_norm": 1.5910488367080688, "learning_rate": 2.647065895300638e-06, "loss": 0.2265, "step": 937300 }, { "epoch": 12.915047807996473, "grad_norm": 3.2115275859832764, "learning_rate": 2.6436357381911726e-06, "loss": 0.2152, "step": 937400 }, { "epoch": 12.916425560056211, "grad_norm": 2.2084672451019287, "learning_rate": 2.640207692757016e-06, "loss": 0.2171, "step": 937500 }, { "epoch": 12.917803312115952, "grad_norm": 1.376097321510315, "learning_rate": 2.636781759289394e-06, "loss": 0.2162, "step": 937600 }, { "epoch": 12.91918106417569, "grad_norm": 3.4237771034240723, "learning_rate": 2.6333579380793646e-06, "loss": 0.2232, "step": 937700 }, { "epoch": 12.920558816235431, "grad_norm": 5.032687187194824, "learning_rate": 2.629936229417797e-06, "loss": 0.2732, "step": 937800 }, { "epoch": 12.92193656829517, "grad_norm": 0.3038492798805237, "learning_rate": 2.6265166335953785e-06, "loss": 0.2202, "step": 937900 }, { "epoch": 12.923314320354908, "grad_norm": 0.8990733623504639, "learning_rate": 2.6230991509026246e-06, "loss": 0.2226, "step": 938000 }, { "epoch": 12.924692072414649, "grad_norm": 2.5408823490142822, "learning_rate": 2.619683781629865e-06, "loss": 0.2241, "step": 938100 }, { "epoch": 12.926069824474387, "grad_norm": 2.3964927196502686, "learning_rate": 2.616270526067253e-06, "loss": 0.2076, "step": 938200 }, { "epoch": 12.927447576534126, "grad_norm": 2.4010443687438965, "learning_rate": 2.61285938450477e-06, "loss": 0.2193, "step": 938300 }, { "epoch": 12.928825328593867, "grad_norm": 1.8050990104675293, "learning_rate": 2.6094503572322047e-06, "loss": 0.2464, "step": 938400 }, { "epoch": 12.930203080653605, "grad_norm": 1.831526517868042, "learning_rate": 2.606043444539167e-06, "loss": 0.25, "step": 938500 }, { "epoch": 12.931580832713346, 
"grad_norm": 0.004637610632926226, "learning_rate": 2.6026386467150916e-06, "loss": 0.2155, "step": 938600 }, { "epoch": 12.932958584773084, "grad_norm": 3.2540037631988525, "learning_rate": 2.599235964049246e-06, "loss": 0.2097, "step": 938700 }, { "epoch": 12.934336336832823, "grad_norm": 1.7191284894943237, "learning_rate": 2.5958353968306897e-06, "loss": 0.2256, "step": 938800 }, { "epoch": 12.935714088892563, "grad_norm": 3.144895553588867, "learning_rate": 2.5924369453483296e-06, "loss": 0.2248, "step": 938900 }, { "epoch": 12.937091840952302, "grad_norm": 4.147371768951416, "learning_rate": 2.5890406098908774e-06, "loss": 0.2142, "step": 939000 }, { "epoch": 12.93846959301204, "grad_norm": 1.9305498600006104, "learning_rate": 2.5856463907468673e-06, "loss": 0.2392, "step": 939100 }, { "epoch": 12.939847345071781, "grad_norm": 0.2543761134147644, "learning_rate": 2.5822542882046538e-06, "loss": 0.1918, "step": 939200 }, { "epoch": 12.94122509713152, "grad_norm": 1.9301526546478271, "learning_rate": 2.5788643025524233e-06, "loss": 0.2186, "step": 939300 }, { "epoch": 12.94260284919126, "grad_norm": 0.950579047203064, "learning_rate": 2.575476434078161e-06, "loss": 0.2142, "step": 939400 }, { "epoch": 12.943980601250999, "grad_norm": 0.687894344329834, "learning_rate": 2.5720906830696927e-06, "loss": 0.2301, "step": 939500 }, { "epoch": 12.945358353310738, "grad_norm": 5.440469741821289, "learning_rate": 2.568707049814646e-06, "loss": 0.1948, "step": 939600 }, { "epoch": 12.946736105370478, "grad_norm": 3.061732053756714, "learning_rate": 2.565393144146111e-06, "loss": 0.2262, "step": 939700 }, { "epoch": 12.948113857430217, "grad_norm": 1.799528956413269, "learning_rate": 2.5620137048907313e-06, "loss": 0.2535, "step": 939800 }, { "epoch": 12.949491609489955, "grad_norm": 2.2766990661621094, "learning_rate": 2.558636384244876e-06, "loss": 0.2397, "step": 939900 }, { "epoch": 12.950869361549696, "grad_norm": 3.5763628482818604, "learning_rate": 
2.555261182495458e-06, "loss": 0.1948, "step": 940000 }, { "epoch": 12.952247113609435, "grad_norm": 3.525674343109131, "learning_rate": 2.551888099929213e-06, "loss": 0.227, "step": 940100 }, { "epoch": 12.953624865669173, "grad_norm": 1.8106833696365356, "learning_rate": 2.5485171368327127e-06, "loss": 0.2047, "step": 940200 }, { "epoch": 12.955002617728914, "grad_norm": 10.527534484863281, "learning_rate": 2.5451482934923304e-06, "loss": 0.2699, "step": 940300 }, { "epoch": 12.956380369788652, "grad_norm": 6.805759429931641, "learning_rate": 2.5417815701942643e-06, "loss": 0.2258, "step": 940400 }, { "epoch": 12.957758121848393, "grad_norm": 5.308080673217773, "learning_rate": 2.538416967224549e-06, "loss": 0.2215, "step": 940500 }, { "epoch": 12.959135873908131, "grad_norm": 1.2921286821365356, "learning_rate": 2.5350544848690128e-06, "loss": 0.2433, "step": 940600 }, { "epoch": 12.96051362596787, "grad_norm": 5.0306396484375, "learning_rate": 2.531694123413312e-06, "loss": 0.2281, "step": 940700 }, { "epoch": 12.96189137802761, "grad_norm": 1.7793551683425903, "learning_rate": 2.528335883142937e-06, "loss": 0.1787, "step": 940800 }, { "epoch": 12.96326913008735, "grad_norm": 0.9906152486801147, "learning_rate": 2.52497976434318e-06, "loss": 0.1993, "step": 940900 }, { "epoch": 12.96464688214709, "grad_norm": 2.697572946548462, "learning_rate": 2.5216257672991616e-06, "loss": 0.1932, "step": 941000 }, { "epoch": 12.966024634206828, "grad_norm": 0.06797800213098526, "learning_rate": 2.5182738922958295e-06, "loss": 0.2159, "step": 941100 }, { "epoch": 12.967402386266567, "grad_norm": 1.9671000242233276, "learning_rate": 2.5149241396179345e-06, "loss": 0.2189, "step": 941200 }, { "epoch": 12.968780138326307, "grad_norm": 1.4161914587020874, "learning_rate": 2.5115765095500522e-06, "loss": 0.2081, "step": 941300 }, { "epoch": 12.970157890386046, "grad_norm": 1.1342270374298096, "learning_rate": 2.5082310023765853e-06, "loss": 0.2168, "step": 941400 }, { "epoch": 
12.971535642445785, "grad_norm": 0.3216767907142639, "learning_rate": 2.504887618381755e-06, "loss": 0.2311, "step": 941500 }, { "epoch": 12.972913394505525, "grad_norm": 2.3021531105041504, "learning_rate": 2.501546357849595e-06, "loss": 0.2276, "step": 941600 }, { "epoch": 12.974291146565264, "grad_norm": 4.159495830535889, "learning_rate": 2.4982072210639653e-06, "loss": 0.1994, "step": 941700 }, { "epoch": 12.975668898625003, "grad_norm": 3.3270316123962402, "learning_rate": 2.49487020830854e-06, "loss": 0.2704, "step": 941800 }, { "epoch": 12.977046650684743, "grad_norm": 2.290621519088745, "learning_rate": 2.4915353198668104e-06, "loss": 0.2257, "step": 941900 }, { "epoch": 12.978424402744482, "grad_norm": 2.0975193977355957, "learning_rate": 2.4882025560220988e-06, "loss": 0.2188, "step": 942000 }, { "epoch": 12.979802154804222, "grad_norm": 3.7833104133605957, "learning_rate": 2.4848719170575447e-06, "loss": 0.2107, "step": 942100 }, { "epoch": 12.98117990686396, "grad_norm": 5.014673709869385, "learning_rate": 2.481543403256099e-06, "loss": 0.2462, "step": 942200 }, { "epoch": 12.9825576589237, "grad_norm": 4.888100624084473, "learning_rate": 2.478217014900528e-06, "loss": 0.2307, "step": 942300 }, { "epoch": 12.98393541098344, "grad_norm": 1.8661445379257202, "learning_rate": 2.4748927522734313e-06, "loss": 0.2354, "step": 942400 }, { "epoch": 12.985313163043179, "grad_norm": 1.4494556188583374, "learning_rate": 2.471570615657231e-06, "loss": 0.1723, "step": 942500 }, { "epoch": 12.986690915102917, "grad_norm": 1.0541070699691772, "learning_rate": 2.468250605334145e-06, "loss": 0.2193, "step": 942600 }, { "epoch": 12.988068667162658, "grad_norm": 1.1592589616775513, "learning_rate": 2.4649327215862323e-06, "loss": 0.1988, "step": 942700 }, { "epoch": 12.989446419222396, "grad_norm": 0.9725094437599182, "learning_rate": 2.461616964695371e-06, "loss": 0.2019, "step": 942800 }, { "epoch": 12.990824171282137, "grad_norm": 0.08320147544145584, "learning_rate": 
2.458303334943235e-06, "loss": 0.2232, "step": 942900 }, { "epoch": 12.992201923341876, "grad_norm": 3.3967432975769043, "learning_rate": 2.4549918326113433e-06, "loss": 0.2343, "step": 943000 }, { "epoch": 12.993579675401614, "grad_norm": 1.8226722478866577, "learning_rate": 2.451682457981031e-06, "loss": 0.2472, "step": 943100 }, { "epoch": 12.994957427461355, "grad_norm": 0.3414977192878723, "learning_rate": 2.448375211333435e-06, "loss": 0.2276, "step": 943200 }, { "epoch": 12.996335179521093, "grad_norm": 3.5121781826019287, "learning_rate": 2.445070092949533e-06, "loss": 0.2186, "step": 943300 }, { "epoch": 12.997712931580832, "grad_norm": 5.230090141296387, "learning_rate": 2.4417671031101014e-06, "loss": 0.2229, "step": 943400 }, { "epoch": 12.999090683640572, "grad_norm": 2.0974578857421875, "learning_rate": 2.4384662420957556e-06, "loss": 0.2212, "step": 943500 }, { "epoch": 13.000468435700311, "grad_norm": 1.4741615056991577, "learning_rate": 2.4351675101869115e-06, "loss": 0.2435, "step": 943600 }, { "epoch": 13.001846187760052, "grad_norm": 3.1395866870880127, "learning_rate": 2.43187090766382e-06, "loss": 0.1824, "step": 943700 }, { "epoch": 13.00322393981979, "grad_norm": 1.5867871046066284, "learning_rate": 2.428576434806547e-06, "loss": 0.2141, "step": 943800 }, { "epoch": 13.004601691879529, "grad_norm": 1.372540831565857, "learning_rate": 2.4252840918949677e-06, "loss": 0.207, "step": 943900 }, { "epoch": 13.00597944393927, "grad_norm": 1.4043537378311157, "learning_rate": 2.4219938792087845e-06, "loss": 0.2296, "step": 944000 }, { "epoch": 13.007357195999008, "grad_norm": 0.22934818267822266, "learning_rate": 2.4187057970275247e-06, "loss": 0.1613, "step": 944100 }, { "epoch": 13.008734948058747, "grad_norm": 0.3385646641254425, "learning_rate": 2.415419845630515e-06, "loss": 0.2279, "step": 944200 }, { "epoch": 13.010112700118487, "grad_norm": 2.983510971069336, "learning_rate": 2.4121360252969226e-06, "loss": 0.1874, "step": 944300 }, { 
"epoch": 13.011490452178226, "grad_norm": 2.225717067718506, "learning_rate": 2.408854336305729e-06, "loss": 0.2004, "step": 944400 }, { "epoch": 13.012868204237966, "grad_norm": 2.111985921859741, "learning_rate": 2.4055747789357225e-06, "loss": 0.1925, "step": 944500 }, { "epoch": 13.014245956297705, "grad_norm": 1.034456729888916, "learning_rate": 2.4022973534655186e-06, "loss": 0.188, "step": 944600 }, { "epoch": 13.015623708357444, "grad_norm": 3.1166813373565674, "learning_rate": 2.3990220601735526e-06, "loss": 0.1915, "step": 944700 }, { "epoch": 13.017001460417184, "grad_norm": 3.778953790664673, "learning_rate": 2.3957816203898616e-06, "loss": 0.2181, "step": 944800 }, { "epoch": 13.018379212476923, "grad_norm": 2.072683334350586, "learning_rate": 2.392510570960234e-06, "loss": 0.2229, "step": 944900 }, { "epoch": 13.019756964536661, "grad_norm": 1.4086980819702148, "learning_rate": 2.3892416545402825e-06, "loss": 0.2564, "step": 945000 }, { "epoch": 13.021134716596402, "grad_norm": 0.9210970997810364, "learning_rate": 2.3859748714077122e-06, "loss": 0.27, "step": 945100 }, { "epoch": 13.02251246865614, "grad_norm": 0.13293832540512085, "learning_rate": 2.382710221840067e-06, "loss": 0.1868, "step": 945200 }, { "epoch": 13.023890220715879, "grad_norm": 4.4459614753723145, "learning_rate": 2.379447706114679e-06, "loss": 0.1813, "step": 945300 }, { "epoch": 13.02526797277562, "grad_norm": 0.6108028292655945, "learning_rate": 2.3761873245087284e-06, "loss": 0.2208, "step": 945400 }, { "epoch": 13.026645724835358, "grad_norm": 0.9959761500358582, "learning_rate": 2.3729290772991997e-06, "loss": 0.2104, "step": 945500 }, { "epoch": 13.028023476895099, "grad_norm": 0.8236709833145142, "learning_rate": 2.369672964762894e-06, "loss": 0.2571, "step": 945600 }, { "epoch": 13.029401228954837, "grad_norm": 4.525708198547363, "learning_rate": 2.3664189871764326e-06, "loss": 0.2243, "step": 945700 }, { "epoch": 13.030778981014576, "grad_norm": 1.7330988645553589, 
"learning_rate": 2.36316714481626e-06, "loss": 0.2646, "step": 945800 }, { "epoch": 13.032156733074316, "grad_norm": 0.5445881485939026, "learning_rate": 2.359917437958643e-06, "loss": 0.2063, "step": 945900 }, { "epoch": 13.033534485134055, "grad_norm": 1.733644723892212, "learning_rate": 2.35666986687965e-06, "loss": 0.1692, "step": 946000 }, { "epoch": 13.034912237193794, "grad_norm": 4.179257392883301, "learning_rate": 2.3534244318551886e-06, "loss": 0.2677, "step": 946100 }, { "epoch": 13.036289989253534, "grad_norm": 1.500658392906189, "learning_rate": 2.3501811331609723e-06, "loss": 0.2066, "step": 946200 }, { "epoch": 13.037667741313273, "grad_norm": 1.2771934270858765, "learning_rate": 2.3469399710725295e-06, "loss": 0.1976, "step": 946300 }, { "epoch": 13.039045493373013, "grad_norm": 3.5511856079101562, "learning_rate": 2.343700945865218e-06, "loss": 0.238, "step": 946400 }, { "epoch": 13.040423245432752, "grad_norm": 3.867316246032715, "learning_rate": 2.340464057814214e-06, "loss": 0.2391, "step": 946500 }, { "epoch": 13.04180099749249, "grad_norm": 3.279280662536621, "learning_rate": 2.3372293071944973e-06, "loss": 0.2014, "step": 946600 }, { "epoch": 13.043178749552231, "grad_norm": 2.9935896396636963, "learning_rate": 2.3339966942808898e-06, "loss": 0.2268, "step": 946700 }, { "epoch": 13.04455650161197, "grad_norm": 2.1359305381774902, "learning_rate": 2.330766219348014e-06, "loss": 0.2052, "step": 946800 }, { "epoch": 13.045934253671708, "grad_norm": 0.7234707474708557, "learning_rate": 2.3275378826703035e-06, "loss": 0.2043, "step": 946900 }, { "epoch": 13.047312005731449, "grad_norm": 2.754171133041382, "learning_rate": 2.324311684522033e-06, "loss": 0.2088, "step": 947000 }, { "epoch": 13.048689757791188, "grad_norm": 3.9752023220062256, "learning_rate": 2.3210876251772865e-06, "loss": 0.1975, "step": 947100 }, { "epoch": 13.050067509850928, "grad_norm": 2.000788688659668, "learning_rate": 2.3178657049099615e-06, "loss": 0.1886, "step": 947200 
}, { "epoch": 13.051445261910667, "grad_norm": 0.27197399735450745, "learning_rate": 2.31467811121225e-06, "loss": 0.2077, "step": 947300 }, { "epoch": 13.052823013970405, "grad_norm": 4.119304656982422, "learning_rate": 2.3114604485231368e-06, "loss": 0.2077, "step": 947400 }, { "epoch": 13.054200766030146, "grad_norm": 1.2764922380447388, "learning_rate": 2.308244925729319e-06, "loss": 0.2202, "step": 947500 }, { "epoch": 13.055578518089884, "grad_norm": 5.638263702392578, "learning_rate": 2.305031543103984e-06, "loss": 0.2564, "step": 947600 }, { "epoch": 13.056956270149623, "grad_norm": 0.6445918679237366, "learning_rate": 2.301820300920113e-06, "loss": 0.2113, "step": 947700 }, { "epoch": 13.058334022209364, "grad_norm": 3.727322816848755, "learning_rate": 2.2986111994505165e-06, "loss": 0.2043, "step": 947800 }, { "epoch": 13.059711774269102, "grad_norm": 1.9680469036102295, "learning_rate": 2.295404238967832e-06, "loss": 0.2156, "step": 947900 }, { "epoch": 13.061089526328843, "grad_norm": 2.260833501815796, "learning_rate": 2.2921994197445017e-06, "loss": 0.2213, "step": 948000 }, { "epoch": 13.062467278388581, "grad_norm": 7.015494346618652, "learning_rate": 2.2889967420527886e-06, "loss": 0.1871, "step": 948100 }, { "epoch": 13.06384503044832, "grad_norm": 1.0621708631515503, "learning_rate": 2.285796206164787e-06, "loss": 0.2027, "step": 948200 }, { "epoch": 13.06522278250806, "grad_norm": 4.520688056945801, "learning_rate": 2.2825978123523946e-06, "loss": 0.2413, "step": 948300 }, { "epoch": 13.0666005345678, "grad_norm": 3.529108762741089, "learning_rate": 2.2794015608873236e-06, "loss": 0.209, "step": 948400 }, { "epoch": 13.067978286627538, "grad_norm": 0.6948244571685791, "learning_rate": 2.2762074520411178e-06, "loss": 0.242, "step": 948500 }, { "epoch": 13.069356038687278, "grad_norm": 1.436937689781189, "learning_rate": 2.2730154860851375e-06, "loss": 0.2235, "step": 948600 }, { "epoch": 13.070733790747017, "grad_norm": 3.428218364715576, 
"learning_rate": 2.2698256632905457e-06, "loss": 0.2318, "step": 948700 }, { "epoch": 13.072111542806757, "grad_norm": 2.1471517086029053, "learning_rate": 2.266637983928342e-06, "loss": 0.2314, "step": 948800 }, { "epoch": 13.073489294866496, "grad_norm": 2.222318172454834, "learning_rate": 2.2634524482693445e-06, "loss": 0.2157, "step": 948900 }, { "epoch": 13.074867046926235, "grad_norm": 2.199751853942871, "learning_rate": 2.2602690565841624e-06, "loss": 0.2463, "step": 949000 }, { "epoch": 13.076244798985975, "grad_norm": 2.260749340057373, "learning_rate": 2.257087809143246e-06, "loss": 0.2114, "step": 949100 }, { "epoch": 13.077622551045714, "grad_norm": 0.5413200855255127, "learning_rate": 2.2539087062168702e-06, "loss": 0.1575, "step": 949200 }, { "epoch": 13.079000303105452, "grad_norm": 3.971824884414673, "learning_rate": 2.2507317480751008e-06, "loss": 0.2287, "step": 949300 }, { "epoch": 13.080378055165193, "grad_norm": 5.059444904327393, "learning_rate": 2.247556934987851e-06, "loss": 0.2036, "step": 949400 }, { "epoch": 13.081755807224932, "grad_norm": 2.146378755569458, "learning_rate": 2.244384267224824e-06, "loss": 0.2179, "step": 949500 }, { "epoch": 13.08313355928467, "grad_norm": 3.3127827644348145, "learning_rate": 2.241213745055564e-06, "loss": 0.2128, "step": 949600 }, { "epoch": 13.08451131134441, "grad_norm": 0.6094133853912354, "learning_rate": 2.2380453687494165e-06, "loss": 0.2573, "step": 949700 }, { "epoch": 13.08588906340415, "grad_norm": 15.943194389343262, "learning_rate": 2.234879138575556e-06, "loss": 0.209, "step": 949800 }, { "epoch": 13.08726681546389, "grad_norm": 0.180172860622406, "learning_rate": 2.2317150548029677e-06, "loss": 0.2386, "step": 949900 }, { "epoch": 13.088644567523628, "grad_norm": 1.3473585844039917, "learning_rate": 2.2285531177004605e-06, "loss": 0.1988, "step": 950000 }, { "epoch": 13.090022319583367, "grad_norm": 2.9297616481781006, "learning_rate": 2.2253933275366504e-06, "loss": 0.189, "step": 950100 
}, { "epoch": 13.091400071643108, "grad_norm": 1.48695707321167, "learning_rate": 2.222235684579982e-06, "loss": 0.205, "step": 950200 }, { "epoch": 13.092777823702846, "grad_norm": 0.36406728625297546, "learning_rate": 2.2190801890987108e-06, "loss": 0.206, "step": 950300 }, { "epoch": 13.094155575762585, "grad_norm": 1.1817207336425781, "learning_rate": 2.215926841360913e-06, "loss": 0.1814, "step": 950400 }, { "epoch": 13.095533327822325, "grad_norm": 2.4070940017700195, "learning_rate": 2.212775641634487e-06, "loss": 0.1978, "step": 950500 }, { "epoch": 13.096911079882064, "grad_norm": 1.542210340499878, "learning_rate": 2.209626590187138e-06, "loss": 0.2053, "step": 950600 }, { "epoch": 13.098288831941804, "grad_norm": 0.3750985264778137, "learning_rate": 2.20647968728639e-06, "loss": 0.2066, "step": 950700 }, { "epoch": 13.099666584001543, "grad_norm": 0.5737673044204712, "learning_rate": 2.2033349331995946e-06, "loss": 0.2242, "step": 950800 }, { "epoch": 13.101044336061282, "grad_norm": 1.8188323974609375, "learning_rate": 2.2001923281939174e-06, "loss": 0.2257, "step": 950900 }, { "epoch": 13.102422088121022, "grad_norm": 4.118083953857422, "learning_rate": 2.197051872536332e-06, "loss": 0.2651, "step": 951000 }, { "epoch": 13.103799840180761, "grad_norm": 2.199143648147583, "learning_rate": 2.193913566493641e-06, "loss": 0.1719, "step": 951100 }, { "epoch": 13.1051775922405, "grad_norm": 2.603848695755005, "learning_rate": 2.19077741033246e-06, "loss": 0.21, "step": 951200 }, { "epoch": 13.10655534430024, "grad_norm": 2.3072054386138916, "learning_rate": 2.187643404319213e-06, "loss": 0.2331, "step": 951300 }, { "epoch": 13.107933096359979, "grad_norm": 3.031574010848999, "learning_rate": 2.1845115487201563e-06, "loss": 0.2345, "step": 951400 }, { "epoch": 13.10931084841972, "grad_norm": 3.446669101715088, "learning_rate": 2.1813818438013625e-06, "loss": 0.2338, "step": 951500 }, { "epoch": 13.110688600479458, "grad_norm": 0.2870500981807709, 
"learning_rate": 2.1782542898287087e-06, "loss": 0.2383, "step": 951600 }, { "epoch": 13.112066352539197, "grad_norm": 3.003225088119507, "learning_rate": 2.175128887067895e-06, "loss": 0.2094, "step": 951700 }, { "epoch": 13.113444104598937, "grad_norm": 3.1960954666137695, "learning_rate": 2.1720056357844422e-06, "loss": 0.2479, "step": 951800 }, { "epoch": 13.114821856658676, "grad_norm": 0.12802423536777496, "learning_rate": 2.1688845362436926e-06, "loss": 0.2587, "step": 951900 }, { "epoch": 13.116199608718414, "grad_norm": 20.819339752197266, "learning_rate": 2.165765588710788e-06, "loss": 0.2292, "step": 952000 }, { "epoch": 13.117577360778155, "grad_norm": 2.5628113746643066, "learning_rate": 2.162648793450714e-06, "loss": 0.2212, "step": 952100 }, { "epoch": 13.118955112837893, "grad_norm": 3.513842821121216, "learning_rate": 2.159534150728249e-06, "loss": 0.2552, "step": 952200 }, { "epoch": 13.120332864897634, "grad_norm": 2.1069796085357666, "learning_rate": 2.156421660807994e-06, "loss": 0.1862, "step": 952300 }, { "epoch": 13.121710616957373, "grad_norm": 2.3737189769744873, "learning_rate": 2.1533113239543735e-06, "loss": 0.1936, "step": 952400 }, { "epoch": 13.123088369017111, "grad_norm": 1.7955330610275269, "learning_rate": 2.150203140431637e-06, "loss": 0.2481, "step": 952500 }, { "epoch": 13.124466121076852, "grad_norm": 0.1346665918827057, "learning_rate": 2.1471281601419422e-06, "loss": 0.2082, "step": 952600 }, { "epoch": 13.12584387313659, "grad_norm": 1.8241961002349854, "learning_rate": 2.144024262533048e-06, "loss": 0.1907, "step": 952700 }, { "epoch": 13.127221625196329, "grad_norm": 1.0534404516220093, "learning_rate": 2.1409225190440078e-06, "loss": 0.2172, "step": 952800 }, { "epoch": 13.12859937725607, "grad_norm": 2.357387065887451, "learning_rate": 2.1378539151643304e-06, "loss": 0.2096, "step": 952900 }, { "epoch": 13.129977129315808, "grad_norm": 2.7878684997558594, "learning_rate": 2.1347564591575875e-06, "loss": 0.1871, "step": 
953000 }, { "epoch": 13.131354881375549, "grad_norm": 0.9517192840576172, "learning_rate": 2.1316611580580478e-06, "loss": 0.2152, "step": 953100 }, { "epoch": 13.132732633435287, "grad_norm": 1.9640759229660034, "learning_rate": 2.1285680121286735e-06, "loss": 0.1884, "step": 953200 }, { "epoch": 13.134110385495026, "grad_norm": 1.5480353832244873, "learning_rate": 2.1254770216322544e-06, "loss": 0.2142, "step": 953300 }, { "epoch": 13.135488137554766, "grad_norm": 2.0094008445739746, "learning_rate": 2.122388186831374e-06, "loss": 0.2851, "step": 953400 }, { "epoch": 13.136865889614505, "grad_norm": 18.14215087890625, "learning_rate": 2.119301507988445e-06, "loss": 0.2144, "step": 953500 }, { "epoch": 13.138243641674244, "grad_norm": 3.405684471130371, "learning_rate": 2.1162169853656987e-06, "loss": 0.2239, "step": 953600 }, { "epoch": 13.139621393733984, "grad_norm": 3.234578847885132, "learning_rate": 2.1131346192251863e-06, "loss": 0.2252, "step": 953700 }, { "epoch": 13.140999145793723, "grad_norm": 0.4654708504676819, "learning_rate": 2.110054409828768e-06, "loss": 0.2141, "step": 953800 }, { "epoch": 13.142376897853463, "grad_norm": 5.575819969177246, "learning_rate": 2.1069763574381168e-06, "loss": 0.2165, "step": 953900 }, { "epoch": 13.143754649913202, "grad_norm": 2.115338087081909, "learning_rate": 2.103900462314738e-06, "loss": 0.1974, "step": 954000 }, { "epoch": 13.14513240197294, "grad_norm": 0.2580846846103668, "learning_rate": 2.100826724719935e-06, "loss": 0.1772, "step": 954100 }, { "epoch": 13.146510154032681, "grad_norm": 4.814138889312744, "learning_rate": 2.0977551449148443e-06, "loss": 0.1916, "step": 954200 }, { "epoch": 13.14788790609242, "grad_norm": 2.7448670864105225, "learning_rate": 2.0946857231604145e-06, "loss": 0.1971, "step": 954300 }, { "epoch": 13.149265658152158, "grad_norm": 0.6680305600166321, "learning_rate": 2.091618459717407e-06, "loss": 0.1799, "step": 954400 }, { "epoch": 13.150643410211899, "grad_norm": 
0.2847000062465668, "learning_rate": 2.0885533548463924e-06, "loss": 0.1846, "step": 954500 }, { "epoch": 13.152021162271637, "grad_norm": 1.1524426937103271, "learning_rate": 2.0854904088077765e-06, "loss": 0.174, "step": 954600 }, { "epoch": 13.153398914331376, "grad_norm": 1.1759401559829712, "learning_rate": 2.0824296218617734e-06, "loss": 0.2072, "step": 954700 }, { "epoch": 13.154776666391117, "grad_norm": 1.0690033435821533, "learning_rate": 2.0793709942684073e-06, "loss": 0.2452, "step": 954800 }, { "epoch": 13.156154418450855, "grad_norm": 2.0779855251312256, "learning_rate": 2.0763145262875295e-06, "loss": 0.2358, "step": 954900 }, { "epoch": 13.157532170510596, "grad_norm": 2.2275679111480713, "learning_rate": 2.0732602181787972e-06, "loss": 0.1873, "step": 955000 }, { "epoch": 13.158909922570334, "grad_norm": 2.71479868888855, "learning_rate": 2.07023858098796e-06, "loss": 0.2167, "step": 955100 }, { "epoch": 13.160287674630073, "grad_norm": 2.918341875076294, "learning_rate": 2.067188571796582e-06, "loss": 0.2021, "step": 955200 }, { "epoch": 13.161665426689813, "grad_norm": 1.2662862539291382, "learning_rate": 2.0641407232526478e-06, "loss": 0.187, "step": 955300 }, { "epoch": 13.163043178749552, "grad_norm": 2.664578676223755, "learning_rate": 2.061095035615093e-06, "loss": 0.1996, "step": 955400 }, { "epoch": 13.16442093080929, "grad_norm": 1.7334948778152466, "learning_rate": 2.058051509142659e-06, "loss": 0.193, "step": 955500 }, { "epoch": 13.165798682869031, "grad_norm": 2.573725700378418, "learning_rate": 2.055010144093905e-06, "loss": 0.2287, "step": 955600 }, { "epoch": 13.16717643492877, "grad_norm": 3.56535005569458, "learning_rate": 2.051970940727222e-06, "loss": 0.251, "step": 955700 }, { "epoch": 13.16855418698851, "grad_norm": 0.7944051027297974, "learning_rate": 2.0489338993007905e-06, "loss": 0.1961, "step": 955800 }, { "epoch": 13.169931939048249, "grad_norm": 1.9641187191009521, "learning_rate": 2.0458990200726322e-06, "loss": 
0.2402, "step": 955900 }, { "epoch": 13.171309691107988, "grad_norm": 2.6603341102600098, "learning_rate": 2.04286630330058e-06, "loss": 0.2279, "step": 956000 }, { "epoch": 13.172687443167728, "grad_norm": 2.809593915939331, "learning_rate": 2.039835749242273e-06, "loss": 0.1981, "step": 956100 }, { "epoch": 13.174065195227467, "grad_norm": 1.3900868892669678, "learning_rate": 2.0368073581551683e-06, "loss": 0.1711, "step": 956200 }, { "epoch": 13.175442947287205, "grad_norm": 0.32162657380104065, "learning_rate": 2.033781130296546e-06, "loss": 0.1836, "step": 956300 }, { "epoch": 13.176820699346946, "grad_norm": 1.0811519622802734, "learning_rate": 2.0307570659235053e-06, "loss": 0.2027, "step": 956400 }, { "epoch": 13.178198451406685, "grad_norm": 2.0750279426574707, "learning_rate": 2.0277351652929506e-06, "loss": 0.2139, "step": 956500 }, { "epoch": 13.179576203466425, "grad_norm": 1.5830872058868408, "learning_rate": 2.0247154286616093e-06, "loss": 0.165, "step": 956600 }, { "epoch": 13.180953955526164, "grad_norm": 2.5654876232147217, "learning_rate": 2.021697856286025e-06, "loss": 0.2213, "step": 956700 }, { "epoch": 13.182331707585902, "grad_norm": 3.174516439437866, "learning_rate": 2.0186824484225493e-06, "loss": 0.2213, "step": 956800 }, { "epoch": 13.183709459645643, "grad_norm": 2.442420721054077, "learning_rate": 2.01566920532736e-06, "loss": 0.2125, "step": 956900 }, { "epoch": 13.185087211705381, "grad_norm": 3.190927743911743, "learning_rate": 2.0126581272564545e-06, "loss": 0.1767, "step": 957000 }, { "epoch": 13.18646496376512, "grad_norm": 0.8228018879890442, "learning_rate": 2.0096492144656316e-06, "loss": 0.2207, "step": 957100 }, { "epoch": 13.18784271582486, "grad_norm": 2.233006238937378, "learning_rate": 2.0066424672105135e-06, "loss": 0.2723, "step": 957200 }, { "epoch": 13.1892204678846, "grad_norm": 2.4445199966430664, "learning_rate": 2.0036378857465387e-06, "loss": 0.1895, "step": 957300 }, { "epoch": 13.19059821994434, "grad_norm": 
1.401798129081726, "learning_rate": 2.000635470328972e-06, "loss": 0.2085, "step": 957400 }, { "epoch": 13.191975972004078, "grad_norm": 2.0793230533599854, "learning_rate": 1.997665212980002e-06, "loss": 0.2387, "step": 957500 }, { "epoch": 13.193353724063817, "grad_norm": 0.5570358037948608, "learning_rate": 1.9946671087534297e-06, "loss": 0.2172, "step": 957600 }, { "epoch": 13.194731476123557, "grad_norm": 1.3516061305999756, "learning_rate": 1.991671171335378e-06, "loss": 0.1906, "step": 957700 }, { "epoch": 13.196109228183296, "grad_norm": 1.9212381839752197, "learning_rate": 1.98867740098036e-06, "loss": 0.2601, "step": 957800 }, { "epoch": 13.197486980243035, "grad_norm": 3.625758171081543, "learning_rate": 1.985685797942706e-06, "loss": 0.2119, "step": 957900 }, { "epoch": 13.198864732302775, "grad_norm": 0.3776724338531494, "learning_rate": 1.9826963624765757e-06, "loss": 0.2288, "step": 958000 }, { "epoch": 13.200242484362514, "grad_norm": 1.0738612413406372, "learning_rate": 1.979709094835936e-06, "loss": 0.1992, "step": 958100 }, { "epoch": 13.201620236422254, "grad_norm": 0.25447648763656616, "learning_rate": 1.9767239952745746e-06, "loss": 0.23, "step": 958200 }, { "epoch": 13.202997988481993, "grad_norm": 0.29381680488586426, "learning_rate": 1.973741064046079e-06, "loss": 0.2077, "step": 958300 }, { "epoch": 13.204375740541732, "grad_norm": 1.5760165452957153, "learning_rate": 1.9707603014038735e-06, "loss": 0.2302, "step": 958400 }, { "epoch": 13.205753492601472, "grad_norm": 1.596274971961975, "learning_rate": 1.9677817076011866e-06, "loss": 0.2719, "step": 958500 }, { "epoch": 13.20713124466121, "grad_norm": 0.08021339029073715, "learning_rate": 1.9648052828910627e-06, "loss": 0.213, "step": 958600 }, { "epoch": 13.20850899672095, "grad_norm": 0.735159158706665, "learning_rate": 1.9618310275263734e-06, "loss": 0.1986, "step": 958700 }, { "epoch": 13.20988674878069, "grad_norm": 5.046648025512695, "learning_rate": 1.9588589417597907e-06, "loss": 
0.2363, "step": 958800 }, { "epoch": 13.211264500840429, "grad_norm": 3.062516927719116, "learning_rate": 1.9558890258438046e-06, "loss": 0.269, "step": 958900 }, { "epoch": 13.212642252900167, "grad_norm": 2.159027099609375, "learning_rate": 1.9529212800307275e-06, "loss": 0.1784, "step": 959000 }, { "epoch": 13.214020004959908, "grad_norm": 3.4646830558776855, "learning_rate": 1.949955704572688e-06, "loss": 0.2355, "step": 959100 }, { "epoch": 13.215397757019646, "grad_norm": 6.0226311683654785, "learning_rate": 1.9469922997216237e-06, "loss": 0.2051, "step": 959200 }, { "epoch": 13.216775509079387, "grad_norm": 1.7110134363174438, "learning_rate": 1.9440310657292935e-06, "loss": 0.2245, "step": 959300 }, { "epoch": 13.218153261139125, "grad_norm": 3.0870511531829834, "learning_rate": 1.941072002847271e-06, "loss": 0.2163, "step": 959400 }, { "epoch": 13.219531013198864, "grad_norm": 2.9375531673431396, "learning_rate": 1.938115111326935e-06, "loss": 0.235, "step": 959500 }, { "epoch": 13.220908765258605, "grad_norm": 1.4334698915481567, "learning_rate": 1.935160391419492e-06, "loss": 0.2279, "step": 959600 }, { "epoch": 13.222286517318343, "grad_norm": 0.6312920451164246, "learning_rate": 1.9322078433759688e-06, "loss": 0.2083, "step": 959700 }, { "epoch": 13.223664269378082, "grad_norm": 2.858332872390747, "learning_rate": 1.9292574674471884e-06, "loss": 0.2322, "step": 959800 }, { "epoch": 13.225042021437822, "grad_norm": 1.0100257396697998, "learning_rate": 1.926309263883808e-06, "loss": 0.2033, "step": 959900 }, { "epoch": 13.226419773497561, "grad_norm": 3.617982864379883, "learning_rate": 1.923392682490494e-06, "loss": 0.1817, "step": 960000 }, { "epoch": 13.227797525557301, "grad_norm": 4.050951957702637, "learning_rate": 1.9204488026792146e-06, "loss": 0.1905, "step": 960100 }, { "epoch": 13.22917527761704, "grad_norm": 3.6145012378692627, "learning_rate": 1.917507095981678e-06, "loss": 0.1909, "step": 960200 }, { "epoch": 13.230553029676779, 
"grad_norm": 0.3611353039741516, "learning_rate": 1.9145675626477906e-06, "loss": 0.2711, "step": 960300 }, { "epoch": 13.23193078173652, "grad_norm": 1.8225626945495605, "learning_rate": 1.9116302029272848e-06, "loss": 0.2352, "step": 960400 }, { "epoch": 13.233308533796258, "grad_norm": 3.953374147415161, "learning_rate": 1.9086950170697037e-06, "loss": 0.2131, "step": 960500 }, { "epoch": 13.234686285855997, "grad_norm": 1.9514292478561401, "learning_rate": 1.9057620053244007e-06, "loss": 0.2047, "step": 960600 }, { "epoch": 13.236064037915737, "grad_norm": 1.420623779296875, "learning_rate": 1.9028311679405533e-06, "loss": 0.1969, "step": 960700 }, { "epoch": 13.237441789975476, "grad_norm": 12.847402572631836, "learning_rate": 1.8999025051671578e-06, "loss": 0.2108, "step": 960800 }, { "epoch": 13.238819542035216, "grad_norm": 1.4847880601882935, "learning_rate": 1.8969760172530095e-06, "loss": 0.2143, "step": 960900 }, { "epoch": 13.240197294094955, "grad_norm": 2.621593475341797, "learning_rate": 1.8940517044467387e-06, "loss": 0.1947, "step": 961000 }, { "epoch": 13.241575046154693, "grad_norm": 1.2573065757751465, "learning_rate": 1.891129566996768e-06, "loss": 0.1817, "step": 961100 }, { "epoch": 13.242952798214434, "grad_norm": 2.4884815216064453, "learning_rate": 1.8882096051513612e-06, "loss": 0.2152, "step": 961200 }, { "epoch": 13.244330550274173, "grad_norm": 0.638947606086731, "learning_rate": 1.8852918191585716e-06, "loss": 0.2189, "step": 961300 }, { "epoch": 13.245708302333911, "grad_norm": 1.6240088939666748, "learning_rate": 1.8823762092662847e-06, "loss": 0.195, "step": 961400 }, { "epoch": 13.247086054393652, "grad_norm": 1.3978030681610107, "learning_rate": 1.8794627757222086e-06, "loss": 0.1933, "step": 961500 }, { "epoch": 13.24846380645339, "grad_norm": 1.0554022789001465, "learning_rate": 1.8765515187738347e-06, "loss": 0.1826, "step": 961600 }, { "epoch": 13.24984155851313, "grad_norm": 2.806983709335327, "learning_rate": 
1.873642438668496e-06, "loss": 0.243, "step": 961700 }, { "epoch": 13.25121931057287, "grad_norm": 0.32036328315734863, "learning_rate": 1.8707355356533415e-06, "loss": 0.1708, "step": 961800 }, { "epoch": 13.252597062632608, "grad_norm": 3.681459903717041, "learning_rate": 1.8678308099753197e-06, "loss": 0.2217, "step": 961900 }, { "epoch": 13.253974814692349, "grad_norm": 1.3223588466644287, "learning_rate": 1.8649282618811983e-06, "loss": 0.2476, "step": 962000 }, { "epoch": 13.255352566752087, "grad_norm": 7.37537956237793, "learning_rate": 1.8620278916175774e-06, "loss": 0.2365, "step": 962100 }, { "epoch": 13.256730318811826, "grad_norm": 2.0773894786834717, "learning_rate": 1.859129699430849e-06, "loss": 0.2179, "step": 962200 }, { "epoch": 13.258108070871566, "grad_norm": 7.191392421722412, "learning_rate": 1.8562336855672288e-06, "loss": 0.1976, "step": 962300 }, { "epoch": 13.259485822931305, "grad_norm": 5.158681869506836, "learning_rate": 1.8533398502727455e-06, "loss": 0.2421, "step": 962400 }, { "epoch": 13.260863574991046, "grad_norm": 4.292149066925049, "learning_rate": 1.8504481937932543e-06, "loss": 0.2222, "step": 962500 }, { "epoch": 13.262241327050784, "grad_norm": 1.4448665380477905, "learning_rate": 1.8475587163744149e-06, "loss": 0.169, "step": 962600 }, { "epoch": 13.263619079110523, "grad_norm": 2.8384995460510254, "learning_rate": 1.8446714182616912e-06, "loss": 0.2924, "step": 962700 }, { "epoch": 13.264996831170263, "grad_norm": 1.0152028799057007, "learning_rate": 1.841815140096413e-06, "loss": 0.2406, "step": 962800 }, { "epoch": 13.266374583230002, "grad_norm": 0.9772372841835022, "learning_rate": 1.8389321795324526e-06, "loss": 0.2323, "step": 962900 }, { "epoch": 13.26775233528974, "grad_norm": 2.675663948059082, "learning_rate": 1.8360513990074804e-06, "loss": 0.2386, "step": 963000 }, { "epoch": 13.269130087349481, "grad_norm": 0.9422595500946045, "learning_rate": 1.8331727987662369e-06, "loss": 0.2058, "step": 963100 }, { 
"epoch": 13.27050783940922, "grad_norm": 2.2564454078674316, "learning_rate": 1.830296379053286e-06, "loss": 0.2187, "step": 963200 }, { "epoch": 13.271885591468958, "grad_norm": 4.06761360168457, "learning_rate": 1.827422140112971e-06, "loss": 0.2481, "step": 963300 }, { "epoch": 13.273263343528699, "grad_norm": 4.00928258895874, "learning_rate": 1.8245500821894862e-06, "loss": 0.2098, "step": 963400 }, { "epoch": 13.274641095588438, "grad_norm": 1.5838277339935303, "learning_rate": 1.8216802055268333e-06, "loss": 0.2395, "step": 963500 }, { "epoch": 13.276018847648178, "grad_norm": 2.5445423126220703, "learning_rate": 1.8188125103688097e-06, "loss": 0.255, "step": 963600 }, { "epoch": 13.277396599707917, "grad_norm": 0.20837311446666718, "learning_rate": 1.8159469969590506e-06, "loss": 0.1614, "step": 963700 }, { "epoch": 13.278774351767655, "grad_norm": 0.6879075169563293, "learning_rate": 1.81308366554099e-06, "loss": 0.2176, "step": 963800 }, { "epoch": 13.280152103827396, "grad_norm": 1.2926067113876343, "learning_rate": 1.8102225163578907e-06, "loss": 0.206, "step": 963900 }, { "epoch": 13.281529855887134, "grad_norm": 0.3677089810371399, "learning_rate": 1.807363549652811e-06, "loss": 0.2267, "step": 964000 }, { "epoch": 13.282907607946873, "grad_norm": 0.8048468232154846, "learning_rate": 1.8045067656686416e-06, "loss": 0.1896, "step": 964100 }, { "epoch": 13.284285360006614, "grad_norm": 2.8021392822265625, "learning_rate": 1.8016521646480833e-06, "loss": 0.1946, "step": 964200 }, { "epoch": 13.285663112066352, "grad_norm": 3.0912606716156006, "learning_rate": 1.7987997468336451e-06, "loss": 0.2375, "step": 964300 }, { "epoch": 13.287040864126093, "grad_norm": 0.29730942845344543, "learning_rate": 1.7959495124676495e-06, "loss": 0.2421, "step": 964400 }, { "epoch": 13.288418616185831, "grad_norm": 0.2854321002960205, "learning_rate": 1.7931014617922512e-06, "loss": 0.1812, "step": 964500 }, { "epoch": 13.28979636824557, "grad_norm": 3.0549330711364746, 
"learning_rate": 1.790255595049394e-06, "loss": 0.259, "step": 964600 }, { "epoch": 13.29117412030531, "grad_norm": 3.5647294521331787, "learning_rate": 1.7874119124808511e-06, "loss": 0.2054, "step": 964700 }, { "epoch": 13.292551872365049, "grad_norm": 2.9089515209198, "learning_rate": 1.7845704143282181e-06, "loss": 0.2317, "step": 964800 }, { "epoch": 13.293929624424788, "grad_norm": 0.3399277627468109, "learning_rate": 1.7817594831529938e-06, "loss": 0.2213, "step": 964900 }, { "epoch": 13.295307376484528, "grad_norm": 2.450773239135742, "learning_rate": 1.7789223327059987e-06, "loss": 0.2322, "step": 965000 }, { "epoch": 13.296685128544267, "grad_norm": 1.8029911518096924, "learning_rate": 1.7760873673961383e-06, "loss": 0.1924, "step": 965100 }, { "epoch": 13.298062880604007, "grad_norm": 0.9154857993125916, "learning_rate": 1.7732545874642578e-06, "loss": 0.2255, "step": 965200 }, { "epoch": 13.299440632663746, "grad_norm": 3.634655714035034, "learning_rate": 1.7704239931510217e-06, "loss": 0.203, "step": 965300 }, { "epoch": 13.300818384723485, "grad_norm": 2.506727933883667, "learning_rate": 1.7675955846968992e-06, "loss": 0.195, "step": 965400 }, { "epoch": 13.302196136783225, "grad_norm": 2.1380488872528076, "learning_rate": 1.7647693623421734e-06, "loss": 0.2426, "step": 965500 }, { "epoch": 13.303573888842964, "grad_norm": 2.5635151863098145, "learning_rate": 1.7619453263269567e-06, "loss": 0.2361, "step": 965600 }, { "epoch": 13.304951640902702, "grad_norm": 0.0536142997443676, "learning_rate": 1.7591234768911533e-06, "loss": 0.2229, "step": 965700 }, { "epoch": 13.306329392962443, "grad_norm": 1.680090308189392, "learning_rate": 1.7563038142744998e-06, "loss": 0.2149, "step": 965800 }, { "epoch": 13.307707145022182, "grad_norm": 1.297087550163269, "learning_rate": 1.7534863387165465e-06, "loss": 0.2408, "step": 965900 }, { "epoch": 13.309084897081922, "grad_norm": 4.679940223693848, "learning_rate": 1.7506710504566452e-06, "loss": 0.2246, "step": 
966000 }, { "epoch": 13.31046264914166, "grad_norm": 0.42619386315345764, "learning_rate": 1.747857949733968e-06, "loss": 0.1815, "step": 966100 }, { "epoch": 13.3118404012014, "grad_norm": 2.690516471862793, "learning_rate": 1.7450470367875094e-06, "loss": 0.2053, "step": 966200 }, { "epoch": 13.31321815326114, "grad_norm": 3.1921546459198, "learning_rate": 1.7422383118560623e-06, "loss": 0.2479, "step": 966300 }, { "epoch": 13.314595905320878, "grad_norm": 3.8973655700683594, "learning_rate": 1.739431775178246e-06, "loss": 0.2306, "step": 966400 }, { "epoch": 13.315973657380617, "grad_norm": 2.7223026752471924, "learning_rate": 1.7366274269924964e-06, "loss": 0.2305, "step": 966500 }, { "epoch": 13.317351409440358, "grad_norm": 2.297441005706787, "learning_rate": 1.733825267537048e-06, "loss": 0.2017, "step": 966600 }, { "epoch": 13.318729161500096, "grad_norm": 1.2714729309082031, "learning_rate": 1.7310252970499614e-06, "loss": 0.2593, "step": 966700 }, { "epoch": 13.320106913559837, "grad_norm": 2.4037818908691406, "learning_rate": 1.7282275157691072e-06, "loss": 0.23, "step": 966800 }, { "epoch": 13.321484665619575, "grad_norm": 1.806141972541809, "learning_rate": 1.7254319239321768e-06, "loss": 0.2197, "step": 966900 }, { "epoch": 13.322862417679314, "grad_norm": 0.31874725222587585, "learning_rate": 1.7226385217766686e-06, "loss": 0.2088, "step": 967000 }, { "epoch": 13.324240169739054, "grad_norm": 1.0153437852859497, "learning_rate": 1.719847309539886e-06, "loss": 0.1966, "step": 967100 }, { "epoch": 13.325617921798793, "grad_norm": 2.779550313949585, "learning_rate": 1.7170582874589701e-06, "loss": 0.2138, "step": 967200 }, { "epoch": 13.326995673858532, "grad_norm": 0.6179196238517761, "learning_rate": 1.7142714557708552e-06, "loss": 0.1844, "step": 967300 }, { "epoch": 13.328373425918272, "grad_norm": 0.23961561918258667, "learning_rate": 1.7114868147122978e-06, "loss": 0.1959, "step": 967400 }, { "epoch": 13.329751177978011, "grad_norm": 
1.4346020221710205, "learning_rate": 1.7087043645198743e-06, "loss": 0.2056, "step": 967500 }, { "epoch": 13.33112893003775, "grad_norm": 1.4697591066360474, "learning_rate": 1.7059241054299602e-06, "loss": 0.2327, "step": 967600 }, { "epoch": 13.33250668209749, "grad_norm": 0.1433299332857132, "learning_rate": 1.703146037678753e-06, "loss": 0.2196, "step": 967700 }, { "epoch": 13.333884434157229, "grad_norm": 2.8740367889404297, "learning_rate": 1.7003701615022648e-06, "loss": 0.1792, "step": 967800 }, { "epoch": 13.33526218621697, "grad_norm": 1.9139193296432495, "learning_rate": 1.6975964771363243e-06, "loss": 0.1985, "step": 967900 }, { "epoch": 13.336639938276708, "grad_norm": 1.7368520498275757, "learning_rate": 1.6948249848165642e-06, "loss": 0.1736, "step": 968000 }, { "epoch": 13.338017690336446, "grad_norm": 1.6556569337844849, "learning_rate": 1.692055684778438e-06, "loss": 0.202, "step": 968100 }, { "epoch": 13.339395442396187, "grad_norm": 3.377519130706787, "learning_rate": 1.6892885772572245e-06, "loss": 0.2611, "step": 968200 }, { "epoch": 13.340773194455926, "grad_norm": 6.001559257507324, "learning_rate": 1.686523662487986e-06, "loss": 0.2483, "step": 968300 }, { "epoch": 13.342150946515664, "grad_norm": 0.0038635018281638622, "learning_rate": 1.6837609407056198e-06, "loss": 0.2409, "step": 968400 }, { "epoch": 13.343528698575405, "grad_norm": 2.0864150524139404, "learning_rate": 1.6810004121448434e-06, "loss": 0.2225, "step": 968500 }, { "epoch": 13.344906450635143, "grad_norm": 3.923978567123413, "learning_rate": 1.678242077040166e-06, "loss": 0.2032, "step": 968600 }, { "epoch": 13.346284202694884, "grad_norm": 2.1420841217041016, "learning_rate": 1.6754859356259328e-06, "loss": 0.1897, "step": 968700 }, { "epoch": 13.347661954754622, "grad_norm": 0.8581820130348206, "learning_rate": 1.6727319881362807e-06, "loss": 0.1966, "step": 968800 }, { "epoch": 13.349039706814361, "grad_norm": 1.9391624927520752, "learning_rate": 1.669980234805182e-06, 
"loss": 0.2227, "step": 968900 }, { "epoch": 13.350417458874102, "grad_norm": null, "learning_rate": 1.6672581605927847e-06, "loss": 0.2526, "step": 969000 }, { "epoch": 13.35179521093384, "grad_norm": 2.4621922969818115, "learning_rate": 1.6645382373309544e-06, "loss": 0.2089, "step": 969100 }, { "epoch": 13.353172962993579, "grad_norm": 2.3735692501068115, "learning_rate": 1.6617930239779405e-06, "loss": 0.1911, "step": 969200 }, { "epoch": 13.35455071505332, "grad_norm": 2.3573076725006104, "learning_rate": 1.6590500057127898e-06, "loss": 0.2427, "step": 969300 }, { "epoch": 13.355928467113058, "grad_norm": 0.6580946445465088, "learning_rate": 1.6563091827685462e-06, "loss": 0.2079, "step": 969400 }, { "epoch": 13.357306219172798, "grad_norm": 1.3953064680099487, "learning_rate": 1.6535705553780494e-06, "loss": 0.2252, "step": 969500 }, { "epoch": 13.358683971232537, "grad_norm": 1.5804858207702637, "learning_rate": 1.6508341237739653e-06, "loss": 0.185, "step": 969600 }, { "epoch": 13.360061723292276, "grad_norm": 2.270874500274658, "learning_rate": 1.6480998881887665e-06, "loss": 0.1747, "step": 969700 }, { "epoch": 13.361439475352016, "grad_norm": 0.686627984046936, "learning_rate": 1.6453678488547405e-06, "loss": 0.2254, "step": 969800 }, { "epoch": 13.362817227411755, "grad_norm": 0.15113112330436707, "learning_rate": 1.6426380060039848e-06, "loss": 0.2203, "step": 969900 }, { "epoch": 13.364194979471494, "grad_norm": 2.3919641971588135, "learning_rate": 1.6399103598684198e-06, "loss": 0.1706, "step": 970000 }, { "epoch": 13.365572731531234, "grad_norm": 6.637493133544922, "learning_rate": 1.6371849106797648e-06, "loss": 0.2116, "step": 970100 }, { "epoch": 13.366950483590973, "grad_norm": 2.3451459407806396, "learning_rate": 1.6344888803128737e-06, "loss": 0.1782, "step": 970200 }, { "epoch": 13.368328235650713, "grad_norm": 1.4470887184143066, "learning_rate": 1.6317678037372435e-06, "loss": 0.2063, "step": 970300 }, { "epoch": 13.369705987710452,
"grad_norm": 1.908995270729065, "learning_rate": 1.6290489248002763e-06, "loss": 0.2095, "step": 970400 }, { "epoch": 13.37108373977019, "grad_norm": 1.9326345920562744, "learning_rate": 1.6263322437329615e-06, "loss": 0.1824, "step": 970500 }, { "epoch": 13.372461491829931, "grad_norm": 4.54675817489624, "learning_rate": 1.6236177607660867e-06, "loss": 0.2222, "step": 970600 }, { "epoch": 13.37383924388967, "grad_norm": 8.290882110595703, "learning_rate": 1.6209054761302664e-06, "loss": 0.2496, "step": 970700 }, { "epoch": 13.375216995949408, "grad_norm": 0.9323617219924927, "learning_rate": 1.6181953900559245e-06, "loss": 0.2073, "step": 970800 }, { "epoch": 13.376594748009149, "grad_norm": 2.27738356590271, "learning_rate": 1.6154875027732968e-06, "loss": 0.1709, "step": 970900 }, { "epoch": 13.377972500068887, "grad_norm": 10.14094066619873, "learning_rate": 1.612781814512426e-06, "loss": 0.1959, "step": 971000 }, { "epoch": 13.379350252128628, "grad_norm": 2.2707149982452393, "learning_rate": 1.610078325503184e-06, "loss": 0.1881, "step": 971100 }, { "epoch": 13.380728004188366, "grad_norm": 1.2219228744506836, "learning_rate": 1.607377035975232e-06, "loss": 0.2055, "step": 971200 }, { "epoch": 13.382105756248105, "grad_norm": 2.8613808155059814, "learning_rate": 1.60467794615807e-06, "loss": 0.209, "step": 971300 }, { "epoch": 13.383483508307846, "grad_norm": 1.4485281705856323, "learning_rate": 1.6019810562809983e-06, "loss": 0.2442, "step": 971400 }, { "epoch": 13.384861260367584, "grad_norm": 2.4416680335998535, "learning_rate": 1.5992863665731291e-06, "loss": 0.196, "step": 971500 }, { "epoch": 13.386239012427323, "grad_norm": 3.0236451625823975, "learning_rate": 1.5965938772633879e-06, "loss": 0.2033, "step": 971600 }, { "epoch": 13.387616764487063, "grad_norm": 4.0460052490234375, "learning_rate": 1.5939035885805168e-06, "loss": 0.1949, "step": 971700 }, { "epoch": 13.388994516546802, "grad_norm": 3.5044283866882324, "learning_rate": 
1.5912155007530718e-06, "loss": 0.1987, "step": 971800 }, { "epoch": 13.39037226860654, "grad_norm": 0.32671087980270386, "learning_rate": 1.5885296140094169e-06, "loss": 0.215, "step": 971900 }, { "epoch": 13.391750020666281, "grad_norm": 0.9594303369522095, "learning_rate": 1.5858459285777386e-06, "loss": 0.2355, "step": 972000 }, { "epoch": 13.39312777272602, "grad_norm": 0.8852246403694153, "learning_rate": 1.583164444686025e-06, "loss": 0.2103, "step": 972100 }, { "epoch": 13.39450552478576, "grad_norm": 1.6586456298828125, "learning_rate": 1.580485162562075e-06, "loss": 0.2076, "step": 972200 }, { "epoch": 13.395883276845499, "grad_norm": 3.041682481765747, "learning_rate": 1.5778080824335136e-06, "loss": 0.2326, "step": 972300 }, { "epoch": 13.397261028905238, "grad_norm": 5.47319221496582, "learning_rate": 1.5751332045277794e-06, "loss": 0.2231, "step": 972400 }, { "epoch": 13.398638780964978, "grad_norm": 1.3653414249420166, "learning_rate": 1.5724605290721094e-06, "loss": 0.2244, "step": 972500 }, { "epoch": 13.400016533024717, "grad_norm": 2.543056011199951, "learning_rate": 1.5697900562935546e-06, "loss": 0.1928, "step": 972600 }, { "epoch": 13.401394285084455, "grad_norm": 4.055856704711914, "learning_rate": 1.5671217864189948e-06, "loss": 0.207, "step": 972700 }, { "epoch": 13.402772037144196, "grad_norm": 1.5557111501693726, "learning_rate": 1.5644557196751177e-06, "loss": 0.2174, "step": 972800 }, { "epoch": 13.404149789203935, "grad_norm": 0.6921003460884094, "learning_rate": 1.5617918562884062e-06, "loss": 0.2233, "step": 972900 }, { "epoch": 13.405527541263675, "grad_norm": 2.644334554672241, "learning_rate": 1.5591568021747277e-06, "loss": 0.2124, "step": 973000 }, { "epoch": 13.406905293323414, "grad_norm": 2.2516720294952393, "learning_rate": 1.5564973241418872e-06, "loss": 0.2312, "step": 973100 }, { "epoch": 13.408283045383152, "grad_norm": 3.5470683574676514, "learning_rate": 1.5538400501423274e-06, "loss": 0.2675, "step": 973200 }, { 
"epoch": 13.409660797442893, "grad_norm": 9.808758735656738, "learning_rate": 1.5511849804017895e-06, "loss": 0.3152, "step": 973300 }, { "epoch": 13.411038549502631, "grad_norm": 1.2279080152511597, "learning_rate": 1.54853211514584e-06, "loss": 0.2358, "step": 973400 }, { "epoch": 13.41241630156237, "grad_norm": 5.738792419433594, "learning_rate": 1.5458814545998604e-06, "loss": 0.2255, "step": 973500 }, { "epoch": 13.41379405362211, "grad_norm": 1.5541588068008423, "learning_rate": 1.5432329989890295e-06, "loss": 0.2018, "step": 973600 }, { "epoch": 13.41517180568185, "grad_norm": 5.201451778411865, "learning_rate": 1.5405867485383467e-06, "loss": 0.2686, "step": 973700 }, { "epoch": 13.41654955774159, "grad_norm": 1.6326016187667847, "learning_rate": 1.5379427034726312e-06, "loss": 0.1861, "step": 973800 }, { "epoch": 13.417927309801328, "grad_norm": 3.761275053024292, "learning_rate": 1.5353008640165003e-06, "loss": 0.2297, "step": 973900 }, { "epoch": 13.419305061861067, "grad_norm": 1.3702479600906372, "learning_rate": 1.5326612303943946e-06, "loss": 0.2145, "step": 974000 }, { "epoch": 13.420682813920807, "grad_norm": 2.6068058013916016, "learning_rate": 1.5300238028305682e-06, "loss": 0.2391, "step": 974100 }, { "epoch": 13.422060565980546, "grad_norm": 1.8667982816696167, "learning_rate": 1.5273885815490831e-06, "loss": 0.2306, "step": 974200 }, { "epoch": 13.423438318040285, "grad_norm": 0.05423307791352272, "learning_rate": 1.5247555667738088e-06, "loss": 0.2275, "step": 974300 }, { "epoch": 13.424816070100025, "grad_norm": 1.0515514612197876, "learning_rate": 1.522124758728438e-06, "loss": 0.2424, "step": 974400 }, { "epoch": 13.426193822159764, "grad_norm": 2.653064489364624, "learning_rate": 1.5194961576364732e-06, "loss": 0.167, "step": 974500 }, { "epoch": 13.427571574219504, "grad_norm": 4.222901821136475, "learning_rate": 1.5168697637212229e-06, "loss": 0.1886, "step": 974600 }, { "epoch": 13.428949326279243, "grad_norm": 0.6469887495040894, 
"learning_rate": 1.5142455772058172e-06, "loss": 0.2236, "step": 974700 }, { "epoch": 13.430327078338982, "grad_norm": 0.8529631495475769, "learning_rate": 1.5116235983131948e-06, "loss": 0.2271, "step": 974800 }, { "epoch": 13.431704830398722, "grad_norm": 0.9263003468513489, "learning_rate": 1.5090038272660955e-06, "loss": 0.1843, "step": 974900 }, { "epoch": 13.43308258245846, "grad_norm": 0.8160406351089478, "learning_rate": 1.5063862642870942e-06, "loss": 0.2261, "step": 975000 }, { "epoch": 13.4344603345182, "grad_norm": 0.4991409182548523, "learning_rate": 1.5037709095985647e-06, "loss": 0.229, "step": 975100 }, { "epoch": 13.43583808657794, "grad_norm": 3.1712472438812256, "learning_rate": 1.5011577634226878e-06, "loss": 0.215, "step": 975200 }, { "epoch": 13.437215838637679, "grad_norm": 2.931771993637085, "learning_rate": 1.498546825981471e-06, "loss": 0.1871, "step": 975300 }, { "epoch": 13.438593590697419, "grad_norm": 2.5258891582489014, "learning_rate": 1.4959380974967194e-06, "loss": 0.2022, "step": 975400 }, { "epoch": 13.439971342757158, "grad_norm": 2.1388068199157715, "learning_rate": 1.493331578190068e-06, "loss": 0.2185, "step": 975500 }, { "epoch": 13.441349094816896, "grad_norm": 0.913186252117157, "learning_rate": 1.490727268282944e-06, "loss": 0.2461, "step": 975600 }, { "epoch": 13.442726846876637, "grad_norm": 1.7823710441589355, "learning_rate": 1.4881251679966002e-06, "loss": 0.2024, "step": 975700 }, { "epoch": 13.444104598936375, "grad_norm": 1.3149183988571167, "learning_rate": 1.4855252775521064e-06, "loss": 0.1734, "step": 975800 }, { "epoch": 13.445482350996114, "grad_norm": 1.0991275310516357, "learning_rate": 1.4829275971703248e-06, "loss": 0.1982, "step": 975900 }, { "epoch": 13.446860103055855, "grad_norm": 2.2895333766937256, "learning_rate": 1.480332127071947e-06, "loss": 0.1653, "step": 976000 }, { "epoch": 13.448237855115593, "grad_norm": 3.155470848083496, "learning_rate": 1.4777388674774713e-06, "loss": 0.2139, "step": 
976100 }, { "epoch": 13.449615607175332, "grad_norm": 2.3851096630096436, "learning_rate": 1.4751478186072048e-06, "loss": 0.2386, "step": 976200 }, { "epoch": 13.450993359235072, "grad_norm": 3.124310255050659, "learning_rate": 1.4725589806812736e-06, "loss": 0.2577, "step": 976300 }, { "epoch": 13.452371111294811, "grad_norm": 1.242732048034668, "learning_rate": 1.469972353919615e-06, "loss": 0.2735, "step": 976400 }, { "epoch": 13.453748863354551, "grad_norm": 2.21231746673584, "learning_rate": 1.4673879385419768e-06, "loss": 0.2592, "step": 976500 }, { "epoch": 13.45512661541429, "grad_norm": 4.025476455688477, "learning_rate": 1.4648057347679082e-06, "loss": 0.2109, "step": 976600 }, { "epoch": 13.456504367474029, "grad_norm": 0.8984253406524658, "learning_rate": 1.4622257428167877e-06, "loss": 0.2303, "step": 976700 }, { "epoch": 13.45788211953377, "grad_norm": 1.9153467416763306, "learning_rate": 1.4596479629078045e-06, "loss": 0.1826, "step": 976800 }, { "epoch": 13.459259871593508, "grad_norm": 1.0118606090545654, "learning_rate": 1.4570723952599429e-06, "loss": 0.2025, "step": 976900 }, { "epoch": 13.460637623653247, "grad_norm": 5.3036274909973145, "learning_rate": 1.454499040092023e-06, "loss": 0.2601, "step": 977000 }, { "epoch": 13.462015375712987, "grad_norm": 4.049126625061035, "learning_rate": 1.451953598093768e-06, "loss": 0.2297, "step": 977100 }, { "epoch": 13.463393127772726, "grad_norm": 2.906615972518921, "learning_rate": 1.4493846464111365e-06, "loss": 0.2111, "step": 977200 }, { "epoch": 13.464770879832466, "grad_norm": 0.4944547116756439, "learning_rate": 1.4468179078615502e-06, "loss": 0.2222, "step": 977300 }, { "epoch": 13.466148631892205, "grad_norm": 2.989488124847412, "learning_rate": 1.4442533826630684e-06, "loss": 0.236, "step": 977400 }, { "epoch": 13.467526383951943, "grad_norm": 2.149967908859253, "learning_rate": 1.4416910710335643e-06, "loss": 0.2114, "step": 977500 }, { "epoch": 13.468904136011684, "grad_norm": 
3.3817970752716064, "learning_rate": 1.4391565632101897e-06, "loss": 0.1915, "step": 977600 }, { "epoch": 13.470281888071423, "grad_norm": 3.9822964668273926, "learning_rate": 1.436598657230375e-06, "loss": 0.212, "step": 977700 }, { "epoch": 13.471659640131161, "grad_norm": 2.298999547958374, "learning_rate": 1.434042965469842e-06, "loss": 0.1999, "step": 977800 }, { "epoch": 13.473037392190902, "grad_norm": 2.3098762035369873, "learning_rate": 1.4314894881457131e-06, "loss": 0.2194, "step": 977900 }, { "epoch": 13.47441514425064, "grad_norm": 0.9896675944328308, "learning_rate": 1.4289382254749208e-06, "loss": 0.1758, "step": 978000 }, { "epoch": 13.47579289631038, "grad_norm": 2.371516227722168, "learning_rate": 1.4263891776742e-06, "loss": 0.2214, "step": 978100 }, { "epoch": 13.47717064837012, "grad_norm": 2.395801305770874, "learning_rate": 1.4238423449601107e-06, "loss": 0.2366, "step": 978200 }, { "epoch": 13.478548400429858, "grad_norm": 3.5755250453948975, "learning_rate": 1.421297727549024e-06, "loss": 0.2622, "step": 978300 }, { "epoch": 13.479926152489599, "grad_norm": 3.4193718433380127, "learning_rate": 1.4187553256571067e-06, "loss": 0.1903, "step": 978400 }, { "epoch": 13.481303904549337, "grad_norm": 0.9844844341278076, "learning_rate": 1.4162151395003572e-06, "loss": 0.1894, "step": 978500 }, { "epoch": 13.482681656609076, "grad_norm": 2.724477767944336, "learning_rate": 1.413677169294582e-06, "loss": 0.262, "step": 978600 }, { "epoch": 13.484059408668816, "grad_norm": 0.596457302570343, "learning_rate": 1.4111414152553823e-06, "loss": 0.2137, "step": 978700 }, { "epoch": 13.485437160728555, "grad_norm": 4.129903316497803, "learning_rate": 1.4086078775981861e-06, "loss": 0.2445, "step": 978800 }, { "epoch": 13.486814912788295, "grad_norm": 0.13706065714359283, "learning_rate": 1.406076556538238e-06, "loss": 0.2244, "step": 978900 }, { "epoch": 13.488192664848034, "grad_norm": 2.330181837081909, "learning_rate": 1.4035474522905778e-06, "loss": 
0.2507, "step": 979000 }, { "epoch": 13.489570416907773, "grad_norm": 3.4639618396759033, "learning_rate": 1.4010205650700744e-06, "loss": 0.1634, "step": 979100 }, { "epoch": 13.490948168967513, "grad_norm": 6.897192478179932, "learning_rate": 1.3984958950913924e-06, "loss": 0.2109, "step": 979200 }, { "epoch": 13.492325921027252, "grad_norm": 4.98928165435791, "learning_rate": 1.3959734425690218e-06, "loss": 0.1964, "step": 979300 }, { "epoch": 13.49370367308699, "grad_norm": 1.832289695739746, "learning_rate": 1.393453207717252e-06, "loss": 0.2792, "step": 979400 }, { "epoch": 13.495081425146731, "grad_norm": 0.4960339367389679, "learning_rate": 1.390935190750191e-06, "loss": 0.2485, "step": 979500 }, { "epoch": 13.49645917720647, "grad_norm": 3.3870530128479004, "learning_rate": 1.388419391881762e-06, "loss": 0.2613, "step": 979600 }, { "epoch": 13.49783692926621, "grad_norm": 0.7139763832092285, "learning_rate": 1.3859058113256947e-06, "loss": 0.197, "step": 979700 }, { "epoch": 13.499214681325949, "grad_norm": 4.498809337615967, "learning_rate": 1.3833944492955212e-06, "loss": 0.2105, "step": 979800 }, { "epoch": 13.500592433385687, "grad_norm": 1.9053908586502075, "learning_rate": 1.3808853060046047e-06, "loss": 0.251, "step": 979900 }, { "epoch": 13.501970185445428, "grad_norm": 0.7259538173675537, "learning_rate": 1.3783783816661019e-06, "loss": 0.21, "step": 980000 }, { "epoch": 13.503347937505167, "grad_norm": 4.234583377838135, "learning_rate": 1.3758736764929916e-06, "loss": 0.206, "step": 980100 }, { "epoch": 13.504725689564905, "grad_norm": 0.6961900591850281, "learning_rate": 1.3733711906980699e-06, "loss": 0.2574, "step": 980200 }, { "epoch": 13.506103441624646, "grad_norm": 2.5326108932495117, "learning_rate": 1.3708709244939247e-06, "loss": 0.1808, "step": 980300 }, { "epoch": 13.507481193684384, "grad_norm": 2.8855934143066406, "learning_rate": 1.3683728780929705e-06, "loss": 0.2214, "step": 980400 }, { "epoch": 13.508858945744123, "grad_norm": 
0.2484571486711502, "learning_rate": 1.3658770517074262e-06, "loss": 0.1636, "step": 980500 }, { "epoch": 13.510236697803863, "grad_norm": 1.1764426231384277, "learning_rate": 1.3633834455493304e-06, "loss": 0.1869, "step": 980600 }, { "epoch": 13.511614449863602, "grad_norm": 2.2267537117004395, "learning_rate": 1.3608920598305235e-06, "loss": 0.2249, "step": 980700 }, { "epoch": 13.512992201923343, "grad_norm": 2.6020455360412598, "learning_rate": 1.3584028947626656e-06, "loss": 0.1859, "step": 980800 }, { "epoch": 13.514369953983081, "grad_norm": 0.9432273507118225, "learning_rate": 1.3559159505572213e-06, "loss": 0.1966, "step": 980900 }, { "epoch": 13.51574770604282, "grad_norm": 0.07814761251211166, "learning_rate": 1.3534312274254663e-06, "loss": 0.2162, "step": 981000 }, { "epoch": 13.51712545810256, "grad_norm": 1.8752347230911255, "learning_rate": 1.3509487255784925e-06, "loss": 0.2358, "step": 981100 }, { "epoch": 13.518503210162299, "grad_norm": 0.3885021209716797, "learning_rate": 1.348468445227206e-06, "loss": 0.197, "step": 981200 }, { "epoch": 13.519880962222038, "grad_norm": 4.643098831176758, "learning_rate": 1.3459903865823118e-06, "loss": 0.1794, "step": 981300 }, { "epoch": 13.521258714281778, "grad_norm": 2.188894748687744, "learning_rate": 1.3435145498543427e-06, "loss": 0.2237, "step": 981400 }, { "epoch": 13.522636466341517, "grad_norm": 2.5729386806488037, "learning_rate": 1.341040935253622e-06, "loss": 0.2553, "step": 981500 }, { "epoch": 13.524014218401257, "grad_norm": 2.813927173614502, "learning_rate": 1.3385695429903078e-06, "loss": 0.2104, "step": 981600 }, { "epoch": 13.525391970460996, "grad_norm": 4.45389461517334, "learning_rate": 1.3361003732743472e-06, "loss": 0.2118, "step": 981700 }, { "epoch": 13.526769722520735, "grad_norm": 2.084127426147461, "learning_rate": 1.3336334263155134e-06, "loss": 0.2364, "step": 981800 }, { "epoch": 13.528147474580475, "grad_norm": 1.1524569988250732, "learning_rate": 1.3311687023233938e-06, 
"loss": 0.1987, "step": 981900 }, { "epoch": 13.529525226640214, "grad_norm": 2.9172592163085938, "learning_rate": 1.3287062015073645e-06, "loss": 0.2349, "step": 982000 }, { "epoch": 13.530902978699952, "grad_norm": 1.9078935384750366, "learning_rate": 1.3262459240766343e-06, "loss": 0.1774, "step": 982100 }, { "epoch": 13.532280730759693, "grad_norm": 1.2338684797286987, "learning_rate": 1.3237878702402221e-06, "loss": 0.2046, "step": 982200 }, { "epoch": 13.533658482819432, "grad_norm": 1.4152686595916748, "learning_rate": 1.3213320402069396e-06, "loss": 0.1813, "step": 982300 }, { "epoch": 13.535036234879172, "grad_norm": 4.223006248474121, "learning_rate": 1.318902959236107e-06, "loss": 0.2005, "step": 982400 }, { "epoch": 13.53641398693891, "grad_norm": 0.9802374243736267, "learning_rate": 1.3164515551915884e-06, "loss": 0.1695, "step": 982500 }, { "epoch": 13.53779173899865, "grad_norm": 0.04625668004155159, "learning_rate": 1.3140023755734621e-06, "loss": 0.2189, "step": 982600 }, { "epoch": 13.53916949105839, "grad_norm": 8.70906925201416, "learning_rate": 1.3115554205898006e-06, "loss": 0.2712, "step": 982700 }, { "epoch": 13.540547243118128, "grad_norm": 0.3222040832042694, "learning_rate": 1.3091106904484846e-06, "loss": 0.1882, "step": 982800 }, { "epoch": 13.541924995177867, "grad_norm": 1.4261754751205444, "learning_rate": 1.306668185357208e-06, "loss": 0.1873, "step": 982900 }, { "epoch": 13.543302747237608, "grad_norm": 2.110771417617798, "learning_rate": 1.3042279055234757e-06, "loss": 0.1871, "step": 983000 }, { "epoch": 13.544680499297346, "grad_norm": 0.22240127623081207, "learning_rate": 1.3017898511545972e-06, "loss": 0.2234, "step": 983100 }, { "epoch": 13.546058251357085, "grad_norm": 2.1917483806610107, "learning_rate": 1.2993540224576958e-06, "loss": 0.2447, "step": 983200 }, { "epoch": 13.547436003416825, "grad_norm": 0.8603069186210632, "learning_rate": 1.2969204196397204e-06, "loss": 0.2014, "step": 983300 }, { "epoch": 
13.548813755476564, "grad_norm": 0.83849036693573, "learning_rate": 1.2944890429074009e-06, "loss": 0.2598, "step": 983400 }, { "epoch": 13.550191507536304, "grad_norm": 1.582377552986145, "learning_rate": 1.292059892467307e-06, "loss": 0.2073, "step": 983500 }, { "epoch": 13.551569259596043, "grad_norm": 1.7436193227767944, "learning_rate": 1.2896329685258119e-06, "loss": 0.2114, "step": 983600 }, { "epoch": 13.552947011655782, "grad_norm": 1.3982709646224976, "learning_rate": 1.2872082712890762e-06, "loss": 0.231, "step": 983700 }, { "epoch": 13.554324763715522, "grad_norm": 0.29163679480552673, "learning_rate": 1.284785800963106e-06, "loss": 0.1872, "step": 983800 }, { "epoch": 13.55570251577526, "grad_norm": 0.5178783535957336, "learning_rate": 1.2823655577536998e-06, "loss": 0.2617, "step": 983900 }, { "epoch": 13.557080267835001, "grad_norm": 3.3753626346588135, "learning_rate": 1.2799475418664628e-06, "loss": 0.2044, "step": 984000 }, { "epoch": 13.55845801989474, "grad_norm": 2.0288450717926025, "learning_rate": 1.2775317535068301e-06, "loss": 0.1904, "step": 984100 }, { "epoch": 13.559835771954479, "grad_norm": 6.1607537269592285, "learning_rate": 1.275118192880023e-06, "loss": 0.1998, "step": 984200 }, { "epoch": 13.561213524014219, "grad_norm": 0.929041862487793, "learning_rate": 1.2727068601910942e-06, "loss": 0.221, "step": 984300 }, { "epoch": 13.562591276073958, "grad_norm": 1.6923526525497437, "learning_rate": 1.27029775564489e-06, "loss": 0.2305, "step": 984400 }, { "epoch": 13.563969028133696, "grad_norm": 2.6941702365875244, "learning_rate": 1.2678908794460844e-06, "loss": 0.2258, "step": 984500 }, { "epoch": 13.565346780193437, "grad_norm": 3.6822738647460938, "learning_rate": 1.265486231799154e-06, "loss": 0.201, "step": 984600 }, { "epoch": 13.566724532253176, "grad_norm": 1.1845561265945435, "learning_rate": 1.2630838129083852e-06, "loss": 0.2061, "step": 984700 }, { "epoch": 13.568102284312914, "grad_norm": 2.522339105606079, 
"learning_rate": 1.2606836229778669e-06, "loss": 0.2448, "step": 984800 }, { "epoch": 13.569480036372655, "grad_norm": 4.268677234649658, "learning_rate": 1.2582856622115191e-06, "loss": 0.2089, "step": 984900 }, { "epoch": 13.570857788432393, "grad_norm": 2.647533416748047, "learning_rate": 1.255889930813052e-06, "loss": 0.2057, "step": 985000 }, { "epoch": 13.572235540492134, "grad_norm": 0.933820366859436, "learning_rate": 1.2534964289860013e-06, "loss": 0.2017, "step": 985100 }, { "epoch": 13.573613292551872, "grad_norm": 3.9596025943756104, "learning_rate": 1.2511051569337073e-06, "loss": 0.2423, "step": 985200 }, { "epoch": 13.574991044611611, "grad_norm": 0.7913817763328552, "learning_rate": 1.2487161148593181e-06, "loss": 0.1956, "step": 985300 }, { "epoch": 13.576368796671352, "grad_norm": 3.467693328857422, "learning_rate": 1.2463293029657927e-06, "loss": 0.219, "step": 985400 }, { "epoch": 13.57774654873109, "grad_norm": 3.1986021995544434, "learning_rate": 1.2439447214559032e-06, "loss": 0.1895, "step": 985500 }, { "epoch": 13.579124300790829, "grad_norm": 0.7990707159042358, "learning_rate": 1.2415623705322424e-06, "loss": 0.1707, "step": 985600 }, { "epoch": 13.58050205285057, "grad_norm": 1.7855857610702515, "learning_rate": 1.2391822503971887e-06, "loss": 0.2125, "step": 985700 }, { "epoch": 13.581879804910308, "grad_norm": 8.833712577819824, "learning_rate": 1.236804361252959e-06, "loss": 0.196, "step": 985800 }, { "epoch": 13.583257556970048, "grad_norm": 1.7128103971481323, "learning_rate": 1.2344287033015594e-06, "loss": 0.2051, "step": 985900 }, { "epoch": 13.584635309029787, "grad_norm": 3.971196174621582, "learning_rate": 1.23205527674481e-06, "loss": 0.192, "step": 986000 }, { "epoch": 13.586013061089526, "grad_norm": 1.8333990573883057, "learning_rate": 1.2296840817843508e-06, "loss": 0.2049, "step": 986100 }, { "epoch": 13.587390813149266, "grad_norm": 2.9046077728271484, "learning_rate": 1.2273151186216318e-06, "loss": 0.1688, "step": 
986200 }, { "epoch": 13.588768565209005, "grad_norm": 2.5722877979278564, "learning_rate": 1.2249483874579026e-06, "loss": 0.2952, "step": 986300 }, { "epoch": 13.590146317268744, "grad_norm": 0.1348728984594345, "learning_rate": 1.2225838884942286e-06, "loss": 0.1694, "step": 986400 }, { "epoch": 13.591524069328484, "grad_norm": 1.7274844646453857, "learning_rate": 1.2202216219314836e-06, "loss": 0.2174, "step": 986500 }, { "epoch": 13.592901821388223, "grad_norm": 1.9118367433547974, "learning_rate": 1.2178615879703668e-06, "loss": 0.164, "step": 986600 }, { "epoch": 13.594279573447963, "grad_norm": 1.7584048509597778, "learning_rate": 1.2155037868113608e-06, "loss": 0.1618, "step": 986700 }, { "epoch": 13.595657325507702, "grad_norm": 3.2685728073120117, "learning_rate": 1.2131717632823233e-06, "loss": 0.2098, "step": 986800 }, { "epoch": 13.59703507756744, "grad_norm": 1.5389854907989502, "learning_rate": 1.2108184059952712e-06, "loss": 0.2153, "step": 986900 }, { "epoch": 13.598412829627181, "grad_norm": 1.4094879627227783, "learning_rate": 1.2084672821086915e-06, "loss": 0.2123, "step": 987000 }, { "epoch": 13.59979058168692, "grad_norm": 2.0222158432006836, "learning_rate": 1.2061183918223156e-06, "loss": 0.2132, "step": 987100 }, { "epoch": 13.601168333746658, "grad_norm": 3.6543211936950684, "learning_rate": 1.2037717353357044e-06, "loss": 0.2079, "step": 987200 }, { "epoch": 13.602546085806399, "grad_norm": 1.1921967267990112, "learning_rate": 1.201427312848214e-06, "loss": 0.2015, "step": 987300 }, { "epoch": 13.603923837866137, "grad_norm": 0.08158989250659943, "learning_rate": 1.19908512455902e-06, "loss": 0.2636, "step": 987400 }, { "epoch": 13.605301589925876, "grad_norm": 1.0798386335372925, "learning_rate": 1.1967451706670942e-06, "loss": 0.2592, "step": 987500 }, { "epoch": 13.606679341985616, "grad_norm": 4.925256729125977, "learning_rate": 1.194407451371237e-06, "loss": 0.2021, "step": 987600 }, { "epoch": 13.608057094045355, "grad_norm": 
0.1371840387582779, "learning_rate": 1.1920719668700408e-06, "loss": 0.2036, "step": 987700 }, { "epoch": 13.609434846105096, "grad_norm": 1.224448561668396, "learning_rate": 1.1897387173619217e-06, "loss": 0.2313, "step": 987800 }, { "epoch": 13.610812598164834, "grad_norm": 2.2813880443573, "learning_rate": 1.187431002123421e-06, "loss": 0.2392, "step": 987900 }, { "epoch": 13.612190350224573, "grad_norm": 1.0893574953079224, "learning_rate": 1.1851022008410625e-06, "loss": 0.1959, "step": 988000 }, { "epoch": 13.613568102284313, "grad_norm": 0.4824713468551636, "learning_rate": 1.1827756351438956e-06, "loss": 0.2337, "step": 988100 }, { "epoch": 13.614945854344052, "grad_norm": 2.2070438861846924, "learning_rate": 1.1804513052295729e-06, "loss": 0.234, "step": 988200 }, { "epoch": 13.616323606403792, "grad_norm": 2.84987211227417, "learning_rate": 1.1781524211661543e-06, "loss": 0.249, "step": 988300 }, { "epoch": 13.617701358463531, "grad_norm": 2.451464891433716, "learning_rate": 1.175832541046981e-06, "loss": 0.2389, "step": 988400 }, { "epoch": 13.61907911052327, "grad_norm": 0.5440343022346497, "learning_rate": 1.173514897300507e-06, "loss": 0.1602, "step": 988500 }, { "epoch": 13.62045686258301, "grad_norm": 3.420504093170166, "learning_rate": 1.1711994901236248e-06, "loss": 0.2052, "step": 988600 }, { "epoch": 13.621834614642749, "grad_norm": 3.50215220451355, "learning_rate": 1.1688863197130433e-06, "loss": 0.2116, "step": 988700 }, { "epoch": 13.623212366702488, "grad_norm": 1.4052149057388306, "learning_rate": 1.1665753862652795e-06, "loss": 0.231, "step": 988800 }, { "epoch": 13.624590118762228, "grad_norm": 0.8020164370536804, "learning_rate": 1.1642666899766544e-06, "loss": 0.2174, "step": 988900 }, { "epoch": 13.625967870821967, "grad_norm": 2.215566635131836, "learning_rate": 1.1619602310433091e-06, "loss": 0.2302, "step": 989000 }, { "epoch": 13.627345622881705, "grad_norm": 1.5796433687210083, "learning_rate": 1.1596560096611898e-06, "loss": 
0.2336, "step": 989100 }, { "epoch": 13.628723374941446, "grad_norm": 1.0190300941467285, "learning_rate": 1.1573540260260463e-06, "loss": 0.2074, "step": 989200 }, { "epoch": 13.630101127001184, "grad_norm": 3.5776174068450928, "learning_rate": 1.155054280333443e-06, "loss": 0.2182, "step": 989300 }, { "epoch": 13.631478879060925, "grad_norm": 10.981512069702148, "learning_rate": 1.1527567727787639e-06, "loss": 0.2357, "step": 989400 }, { "epoch": 13.632856631120664, "grad_norm": 3.514577865600586, "learning_rate": 1.1504615035571853e-06, "loss": 0.2244, "step": 989500 }, { "epoch": 13.634234383180402, "grad_norm": 3.2168173789978027, "learning_rate": 1.1481684728637064e-06, "loss": 0.2155, "step": 989600 }, { "epoch": 13.635612135240143, "grad_norm": 5.93967866897583, "learning_rate": 1.1458776808931315e-06, "loss": 0.2627, "step": 989700 }, { "epoch": 13.636989887299881, "grad_norm": 1.6903858184814453, "learning_rate": 1.1435891278400748e-06, "loss": 0.2069, "step": 989800 }, { "epoch": 13.63836763935962, "grad_norm": 2.683765411376953, "learning_rate": 1.1413028138989556e-06, "loss": 0.1935, "step": 989900 }, { "epoch": 13.63974539141936, "grad_norm": 2.301539659500122, "learning_rate": 1.139018739264013e-06, "loss": 0.1958, "step": 990000 }, { "epoch": 13.6411231434791, "grad_norm": 1.6838740110397339, "learning_rate": 1.136736904129297e-06, "loss": 0.1662, "step": 990100 }, { "epoch": 13.64250089553884, "grad_norm": 3.7607204914093018, "learning_rate": 1.1344573086886522e-06, "loss": 0.1705, "step": 990200 }, { "epoch": 13.643878647598578, "grad_norm": 3.1095991134643555, "learning_rate": 1.1321799531357412e-06, "loss": 0.1986, "step": 990300 }, { "epoch": 13.645256399658317, "grad_norm": 1.0303610563278198, "learning_rate": 1.1299048376640428e-06, "loss": 0.2119, "step": 990400 }, { "epoch": 13.646634151718057, "grad_norm": 3.0687148571014404, "learning_rate": 1.127631962466834e-06, "loss": 0.179, "step": 990500 }, { "epoch": 13.648011903777796, 
"grad_norm": 4.130395412445068, "learning_rate": 1.1253613277372093e-06, "loss": 0.2061, "step": 990600 }, { "epoch": 13.649389655837535, "grad_norm": 2.410235643386841, "learning_rate": 1.1230929336680765e-06, "loss": 0.2268, "step": 990700 }, { "epoch": 13.650767407897275, "grad_norm": 0.22902274131774902, "learning_rate": 1.120849430891443e-06, "loss": 0.2119, "step": 990800 }, { "epoch": 13.652145159957014, "grad_norm": 2.778848171234131, "learning_rate": 1.118585496309818e-06, "loss": 0.2246, "step": 990900 }, { "epoch": 13.653522912016754, "grad_norm": 8.463459968566895, "learning_rate": 1.1163238029643188e-06, "loss": 0.2076, "step": 991000 }, { "epoch": 13.654900664076493, "grad_norm": 0.40749770402908325, "learning_rate": 1.1140643510470902e-06, "loss": 0.2262, "step": 991100 }, { "epoch": 13.656278416136232, "grad_norm": 1.7824000120162964, "learning_rate": 1.1118071407500874e-06, "loss": 0.2231, "step": 991200 }, { "epoch": 13.657656168195972, "grad_norm": 2.1625523567199707, "learning_rate": 1.1095521722650704e-06, "loss": 0.2067, "step": 991300 }, { "epoch": 13.65903392025571, "grad_norm": 0.9409769177436829, "learning_rate": 1.1072994457836037e-06, "loss": 0.1769, "step": 991400 }, { "epoch": 13.66041167231545, "grad_norm": 5.067215442657471, "learning_rate": 1.1050489614970777e-06, "loss": 0.2467, "step": 991500 }, { "epoch": 13.66178942437519, "grad_norm": 1.7076265811920166, "learning_rate": 1.1028007195966754e-06, "loss": 0.2279, "step": 991600 }, { "epoch": 13.663167176434929, "grad_norm": 2.7417099475860596, "learning_rate": 1.1005547202733997e-06, "loss": 0.1992, "step": 991700 }, { "epoch": 13.664544928494667, "grad_norm": 0.27878376841545105, "learning_rate": 1.0983109637180612e-06, "loss": 0.2397, "step": 991800 }, { "epoch": 13.665922680554408, "grad_norm": 1.955369472503662, "learning_rate": 1.0960694501212743e-06, "loss": 0.2357, "step": 991900 }, { "epoch": 13.667300432614146, "grad_norm": 1.3347461223602295, "learning_rate": 
1.0938301796734682e-06, "loss": 0.2288, "step": 992000 }, { "epoch": 13.668678184673887, "grad_norm": 2.3524982929229736, "learning_rate": 1.0915931525648822e-06, "loss": 0.1911, "step": 992100 }, { "epoch": 13.670055936733625, "grad_norm": 2.027970314025879, "learning_rate": 1.0893583689855638e-06, "loss": 0.2098, "step": 992200 }, { "epoch": 13.671433688793364, "grad_norm": 0.11719391494989395, "learning_rate": 1.0871258291253674e-06, "loss": 0.2284, "step": 992300 }, { "epoch": 13.672811440853105, "grad_norm": 2.669922351837158, "learning_rate": 1.0848955331739587e-06, "loss": 0.2456, "step": 992400 }, { "epoch": 13.674189192912843, "grad_norm": 1.5742334127426147, "learning_rate": 1.082667481320817e-06, "loss": 0.1785, "step": 992500 }, { "epoch": 13.675566944972584, "grad_norm": 4.7714738845825195, "learning_rate": 1.080441673755217e-06, "loss": 0.1965, "step": 992600 }, { "epoch": 13.676944697032322, "grad_norm": 4.0279645919799805, "learning_rate": 1.0782181106662591e-06, "loss": 0.1919, "step": 992700 }, { "epoch": 13.678322449092061, "grad_norm": 1.4492229223251343, "learning_rate": 1.075996792242849e-06, "loss": 0.1907, "step": 992800 }, { "epoch": 13.679700201151801, "grad_norm": 1.688649296760559, "learning_rate": 1.0737777186736964e-06, "loss": 0.2021, "step": 992900 }, { "epoch": 13.68107795321154, "grad_norm": 4.979525089263916, "learning_rate": 1.0715608901473188e-06, "loss": 0.2204, "step": 993000 }, { "epoch": 13.682455705271279, "grad_norm": 2.755791664123535, "learning_rate": 1.0693463068520505e-06, "loss": 0.1683, "step": 993100 }, { "epoch": 13.68383345733102, "grad_norm": 1.8954098224639893, "learning_rate": 1.0671339689760334e-06, "loss": 0.254, "step": 993200 }, { "epoch": 13.685211209390758, "grad_norm": 2.886615753173828, "learning_rate": 1.0649238767072145e-06, "loss": 0.2435, "step": 993300 }, { "epoch": 13.686588961450497, "grad_norm": 3.025954246520996, "learning_rate": 1.062716030233354e-06, "loss": 0.2532, "step": 993400 }, { 
"epoch": 13.687966713510237, "grad_norm": 2.97761607170105, "learning_rate": 1.0605104297420196e-06, "loss": 0.226, "step": 993500 }, { "epoch": 13.689344465569976, "grad_norm": 0.6922687888145447, "learning_rate": 1.0583070754205843e-06, "loss": 0.1939, "step": 993600 }, { "epoch": 13.690722217629716, "grad_norm": 1.5756398439407349, "learning_rate": 1.0561059674562404e-06, "loss": 0.2093, "step": 993700 }, { "epoch": 13.692099969689455, "grad_norm": 2.02323579788208, "learning_rate": 1.053907106035982e-06, "loss": 0.1889, "step": 993800 }, { "epoch": 13.693477721749193, "grad_norm": 4.709471225738525, "learning_rate": 1.0517104913466076e-06, "loss": 0.2284, "step": 993900 }, { "epoch": 13.694855473808934, "grad_norm": 1.2264903783798218, "learning_rate": 1.0495161235747392e-06, "loss": 0.2226, "step": 994000 }, { "epoch": 13.696233225868673, "grad_norm": 3.4645321369171143, "learning_rate": 1.0473240029067932e-06, "loss": 0.221, "step": 994100 }, { "epoch": 13.697610977928411, "grad_norm": 12.041801452636719, "learning_rate": 1.0451341295290098e-06, "loss": 0.202, "step": 994200 }, { "epoch": 13.698988729988152, "grad_norm": 3.8063607215881348, "learning_rate": 1.0429683687608186e-06, "loss": 0.2177, "step": 994300 }, { "epoch": 13.70036648204789, "grad_norm": 0.010017875581979752, "learning_rate": 1.0407829680437347e-06, "loss": 0.2172, "step": 994400 }, { "epoch": 13.70174423410763, "grad_norm": 1.863673448562622, "learning_rate": 1.0385998151725046e-06, "loss": 0.2285, "step": 994500 }, { "epoch": 13.70312198616737, "grad_norm": 2.777113914489746, "learning_rate": 1.0364189103325986e-06, "loss": 0.2135, "step": 994600 }, { "epoch": 13.704499738227108, "grad_norm": 9.070704460144043, "learning_rate": 1.0342402537092889e-06, "loss": 0.233, "step": 994700 }, { "epoch": 13.705877490286849, "grad_norm": 1.3899420499801636, "learning_rate": 1.0320638454876672e-06, "loss": 0.2038, "step": 994800 }, { "epoch": 13.707255242346587, "grad_norm": 1.8444868326187134, 
"learning_rate": 1.0298896858526334e-06, "loss": 0.1944, "step": 994900 }, { "epoch": 13.708632994406326, "grad_norm": 2.5521552562713623, "learning_rate": 1.0277177749888883e-06, "loss": 0.2217, "step": 995000 }, { "epoch": 13.710010746466066, "grad_norm": 1.6306613683700562, "learning_rate": 1.0255481130809532e-06, "loss": 0.2268, "step": 995100 }, { "epoch": 13.711388498525805, "grad_norm": 1.5699774026870728, "learning_rate": 1.0233807003131475e-06, "loss": 0.2458, "step": 995200 }, { "epoch": 13.712766250585545, "grad_norm": 4.638448715209961, "learning_rate": 1.0212155368696048e-06, "loss": 0.2135, "step": 995300 }, { "epoch": 13.714144002645284, "grad_norm": 3.931145191192627, "learning_rate": 1.0190526229342623e-06, "loss": 0.2232, "step": 995400 }, { "epoch": 13.715521754705023, "grad_norm": 4.689014434814453, "learning_rate": 1.0168919586908812e-06, "loss": 0.2145, "step": 995500 }, { "epoch": 13.716899506764763, "grad_norm": 3.678219795227051, "learning_rate": 1.0147335443230115e-06, "loss": 0.1754, "step": 995600 }, { "epoch": 13.718277258824502, "grad_norm": 2.0498316287994385, "learning_rate": 1.0125773800140264e-06, "loss": 0.2468, "step": 995700 }, { "epoch": 13.71965501088424, "grad_norm": 1.4617855548858643, "learning_rate": 1.0104449939484708e-06, "loss": 0.234, "step": 995800 }, { "epoch": 13.721032762943981, "grad_norm": 4.918128490447998, "learning_rate": 1.0082933078014345e-06, "loss": 0.2375, "step": 995900 }, { "epoch": 13.72241051500372, "grad_norm": 1.2750688791275024, "learning_rate": 1.0061438722604128e-06, "loss": 0.1992, "step": 996000 }, { "epoch": 13.723788267063458, "grad_norm": 3.687493085861206, "learning_rate": 1.0039966875080122e-06, "loss": 0.2197, "step": 996100 }, { "epoch": 13.725166019123199, "grad_norm": 3.3317105770111084, "learning_rate": 1.0018517537266474e-06, "loss": 0.2298, "step": 996200 }, { "epoch": 13.726543771182937, "grad_norm": 0.7477369904518127, "learning_rate": 9.997090710985374e-07, "loss": 0.232, "step": 
996300 }, { "epoch": 13.727921523242678, "grad_norm": 5.2168288230896, "learning_rate": 9.975686398057147e-07, "loss": 0.2517, "step": 996400 }, { "epoch": 13.729299275302417, "grad_norm": 11.2040376663208, "learning_rate": 9.954304600300226e-07, "loss": 0.2599, "step": 996500 }, { "epoch": 13.730677027362155, "grad_norm": 3.6550655364990234, "learning_rate": 9.932945319531126e-07, "loss": 0.1728, "step": 996600 }, { "epoch": 13.732054779421896, "grad_norm": 3.051327705383301, "learning_rate": 9.91160855756434e-07, "loss": 0.1992, "step": 996700 }, { "epoch": 13.733432531481634, "grad_norm": 1.389105200767517, "learning_rate": 9.890294316212653e-07, "loss": 0.2389, "step": 996800 }, { "epoch": 13.734810283541375, "grad_norm": 1.5840896368026733, "learning_rate": 9.869002597286686e-07, "loss": 0.1737, "step": 996900 }, { "epoch": 13.736188035601113, "grad_norm": 1.3473501205444336, "learning_rate": 9.847733402595408e-07, "loss": 0.2269, "step": 997000 }, { "epoch": 13.737565787660852, "grad_norm": 1.070029616355896, "learning_rate": 9.82648673394565e-07, "loss": 0.2188, "step": 997100 }, { "epoch": 13.738943539720593, "grad_norm": 10.101550102233887, "learning_rate": 9.805262593142505e-07, "loss": 0.2078, "step": 997200 }, { "epoch": 13.740321291780331, "grad_norm": 3.133347511291504, "learning_rate": 9.78406098198902e-07, "loss": 0.2422, "step": 997300 }, { "epoch": 13.74169904384007, "grad_norm": 3.3848319053649902, "learning_rate": 9.76288190228635e-07, "loss": 0.1921, "step": 997400 }, { "epoch": 13.74307679589981, "grad_norm": 0.38242432475090027, "learning_rate": 9.741725355833814e-07, "loss": 0.2518, "step": 997500 }, { "epoch": 13.744454547959549, "grad_norm": 1.6492681503295898, "learning_rate": 9.72059134442881e-07, "loss": 0.1939, "step": 997600 }, { "epoch": 13.745832300019288, "grad_norm": 1.232793927192688, "learning_rate": 9.699479869866666e-07, "loss": 0.2167, "step": 997700 }, { "epoch": 13.747210052079028, "grad_norm": 0.5539990663528442, 
"learning_rate": 9.678390933941017e-07, "loss": 0.2003, "step": 997800 }, { "epoch": 13.748587804138767, "grad_norm": 1.1356955766677856, "learning_rate": 9.657324538443434e-07, "loss": 0.2069, "step": 997900 }, { "epoch": 13.749965556198507, "grad_norm": 2.537792682647705, "learning_rate": 9.636280685163622e-07, "loss": 0.2454, "step": 998000 }, { "epoch": 13.751343308258246, "grad_norm": 0.834906280040741, "learning_rate": 9.615259375889333e-07, "loss": 0.2072, "step": 998100 }, { "epoch": 13.752721060317985, "grad_norm": 2.8623859882354736, "learning_rate": 9.59426061240648e-07, "loss": 0.2287, "step": 998200 }, { "epoch": 13.754098812377725, "grad_norm": 1.4833942651748657, "learning_rate": 9.57328439649897e-07, "loss": 0.2205, "step": 998300 }, { "epoch": 13.755476564437464, "grad_norm": 19.024581909179688, "learning_rate": 9.552330729948877e-07, "loss": 0.1983, "step": 998400 }, { "epoch": 13.756854316497202, "grad_norm": 2.09698224067688, "learning_rate": 9.531399614536315e-07, "loss": 0.1883, "step": 998500 }, { "epoch": 13.758232068556943, "grad_norm": 2.571943998336792, "learning_rate": 9.510700026021655e-07, "loss": 0.2129, "step": 998600 }, { "epoch": 13.759609820616681, "grad_norm": 0.8022386431694031, "learning_rate": 9.48981379266117e-07, "loss": 0.1529, "step": 998700 }, { "epoch": 13.760987572676422, "grad_norm": 0.9395287036895752, "learning_rate": 9.468950115749292e-07, "loss": 0.2552, "step": 998800 }, { "epoch": 13.76236532473616, "grad_norm": 1.4781537055969238, "learning_rate": 9.448108997058536e-07, "loss": 0.2411, "step": 998900 }, { "epoch": 13.7637430767959, "grad_norm": 8.282505989074707, "learning_rate": 9.427290438359553e-07, "loss": 0.1979, "step": 999000 }, { "epoch": 13.76512082885564, "grad_norm": 6.674598693847656, "learning_rate": 9.406494441420862e-07, "loss": 0.2053, "step": 999100 }, { "epoch": 13.766498580915378, "grad_norm": 1.7220368385314941, "learning_rate": 9.385721008009207e-07, "loss": 0.2095, "step": 999200 }, { 
"epoch": 13.767876332975117, "grad_norm": 0.21435314416885376, "learning_rate": 9.36497013988947e-07, "loss": 0.1936, "step": 999300 }, { "epoch": 13.769254085034857, "grad_norm": 3.895564556121826, "learning_rate": 9.344241838824458e-07, "loss": 0.2279, "step": 999400 }, { "epoch": 13.770631837094596, "grad_norm": 4.139699459075928, "learning_rate": 9.323536106575236e-07, "loss": 0.2265, "step": 999500 }, { "epoch": 13.772009589154337, "grad_norm": 3.963125228881836, "learning_rate": 9.302852944900767e-07, "loss": 0.2422, "step": 999600 }, { "epoch": 13.773387341214075, "grad_norm": 6.898953914642334, "learning_rate": 9.282192355558267e-07, "loss": 0.207, "step": 999700 }, { "epoch": 13.774765093273814, "grad_norm": 2.6803648471832275, "learning_rate": 9.261554340302911e-07, "loss": 0.1944, "step": 999800 }, { "epoch": 13.776142845333554, "grad_norm": 2.988354206085205, "learning_rate": 9.240938900888011e-07, "loss": 0.2193, "step": 999900 }, { "epoch": 13.777520597393293, "grad_norm": 5.72231388092041, "learning_rate": 9.220346039065014e-07, "loss": 0.2411, "step": 1000000 }, { "epoch": 13.778898349453032, "grad_norm": 1.2038748264312744, "learning_rate": 9.199775756583326e-07, "loss": 0.2343, "step": 1000100 }, { "epoch": 13.780276101512772, "grad_norm": 0.06246373429894447, "learning_rate": 9.179228055190486e-07, "loss": 0.2434, "step": 1000200 }, { "epoch": 13.78165385357251, "grad_norm": 0.20930127799510956, "learning_rate": 9.158702936632174e-07, "loss": 0.1959, "step": 1000300 }, { "epoch": 13.78303160563225, "grad_norm": 1.3775873184204102, "learning_rate": 9.138200402652053e-07, "loss": 0.2168, "step": 1000400 }, { "epoch": 13.78440935769199, "grad_norm": 1.4312326908111572, "learning_rate": 9.117720454991954e-07, "loss": 0.2183, "step": 1000500 }, { "epoch": 13.785787109751729, "grad_norm": 1.8419893980026245, "learning_rate": 9.097263095391786e-07, "loss": 0.2076, "step": 1000600 }, { "epoch": 13.787164861811469, "grad_norm": 1.177031397819519, 
"learning_rate": 9.076828325589473e-07, "loss": 0.2087, "step": 1000700 }, { "epoch": 13.788542613871208, "grad_norm": 1.5196750164031982, "learning_rate": 9.056416147320984e-07, "loss": 0.1741, "step": 1000800 }, { "epoch": 13.789920365930946, "grad_norm": 0.05920976772904396, "learning_rate": 9.036026562320518e-07, "loss": 0.1747, "step": 1000900 }, { "epoch": 13.791298117990687, "grad_norm": 1.096441626548767, "learning_rate": 9.01565957232032e-07, "loss": 0.2272, "step": 1001000 }, { "epoch": 13.792675870050426, "grad_norm": 1.5265923738479614, "learning_rate": 8.995315179050528e-07, "loss": 0.2286, "step": 1001100 }, { "epoch": 13.794053622110166, "grad_norm": 0.43982061743736267, "learning_rate": 8.974993384239633e-07, "loss": 0.2073, "step": 1001200 }, { "epoch": 13.795431374169905, "grad_norm": 1.9879732131958008, "learning_rate": 8.954694189614046e-07, "loss": 0.2374, "step": 1001300 }, { "epoch": 13.796809126229643, "grad_norm": 0.34522032737731934, "learning_rate": 8.934417596898198e-07, "loss": 0.1905, "step": 1001400 }, { "epoch": 13.798186878289384, "grad_norm": 2.9486215114593506, "learning_rate": 8.914163607814777e-07, "loss": 0.2642, "step": 1001500 }, { "epoch": 13.799564630349122, "grad_norm": 1.0944488048553467, "learning_rate": 8.893932224084486e-07, "loss": 0.2213, "step": 1001600 }, { "epoch": 13.800942382408861, "grad_norm": 1.9349406957626343, "learning_rate": 8.873723447425986e-07, "loss": 0.2633, "step": 1001700 }, { "epoch": 13.802320134468602, "grad_norm": 0.47075796127319336, "learning_rate": 8.853537279556223e-07, "loss": 0.168, "step": 1001800 }, { "epoch": 13.80369788652834, "grad_norm": 2.7314095497131348, "learning_rate": 8.833373722190011e-07, "loss": 0.2101, "step": 1001900 }, { "epoch": 13.805075638588079, "grad_norm": 8.204957962036133, "learning_rate": 8.813232777040451e-07, "loss": 0.2512, "step": 1002000 }, { "epoch": 13.80645339064782, "grad_norm": 0.17167074978351593, "learning_rate": 8.793114445818537e-07, "loss": 
0.2208, "step": 1002100 }, { "epoch": 13.807831142707558, "grad_norm": 3.848662853240967, "learning_rate": 8.773018730233465e-07, "loss": 0.1929, "step": 1002200 }, { "epoch": 13.809208894767298, "grad_norm": 1.5230530500411987, "learning_rate": 8.752945631992473e-07, "loss": 0.182, "step": 1002300 }, { "epoch": 13.810586646827037, "grad_norm": 3.1771936416625977, "learning_rate": 8.732895152800849e-07, "loss": 0.219, "step": 1002400 }, { "epoch": 13.811964398886776, "grad_norm": 2.234166383743286, "learning_rate": 8.712867294361984e-07, "loss": 0.2348, "step": 1002500 }, { "epoch": 13.813342150946516, "grad_norm": 3.100041151046753, "learning_rate": 8.692862058377379e-07, "loss": 0.2369, "step": 1002600 }, { "epoch": 13.814719903006255, "grad_norm": 0.47130268812179565, "learning_rate": 8.67287944654652e-07, "loss": 0.2623, "step": 1002700 }, { "epoch": 13.816097655065994, "grad_norm": 1.7812787294387817, "learning_rate": 8.652919460567089e-07, "loss": 0.1906, "step": 1002800 }, { "epoch": 13.817475407125734, "grad_norm": 3.329913854598999, "learning_rate": 8.632982102134814e-07, "loss": 0.1965, "step": 1002900 }, { "epoch": 13.818853159185473, "grad_norm": 0.4932354986667633, "learning_rate": 8.613067372943415e-07, "loss": 0.1987, "step": 1003000 }, { "epoch": 13.820230911245213, "grad_norm": 1.7606476545333862, "learning_rate": 8.593175274684739e-07, "loss": 0.209, "step": 1003100 }, { "epoch": 13.821608663304952, "grad_norm": 0.14409102499485016, "learning_rate": 8.57330580904881e-07, "loss": 0.2578, "step": 1003200 }, { "epoch": 13.82298641536469, "grad_norm": 1.501898169517517, "learning_rate": 8.553458977723508e-07, "loss": 0.2093, "step": 1003300 }, { "epoch": 13.82436416742443, "grad_norm": 3.4398627281188965, "learning_rate": 8.53363478239507e-07, "loss": 0.1864, "step": 1003400 }, { "epoch": 13.82574191948417, "grad_norm": 2.122560739517212, "learning_rate": 8.51383322474753e-07, "loss": 0.1831, "step": 1003500 }, { "epoch": 13.827119671543908, 
"grad_norm": 2.6567516326904297, "learning_rate": 8.494054306463217e-07, "loss": 0.2242, "step": 1003600 }, { "epoch": 13.828497423603649, "grad_norm": 2.4729018211364746, "learning_rate": 8.47429802922241e-07, "loss": 0.2249, "step": 1003700 }, { "epoch": 13.829875175663387, "grad_norm": 0.2673816978931427, "learning_rate": 8.454564394703529e-07, "loss": 0.213, "step": 1003800 }, { "epoch": 13.831252927723128, "grad_norm": 3.0601062774658203, "learning_rate": 8.434853404583068e-07, "loss": 0.1995, "step": 1003900 }, { "epoch": 13.832630679782866, "grad_norm": 0.12234704941511154, "learning_rate": 8.415165060535537e-07, "loss": 0.2038, "step": 1004000 }, { "epoch": 13.834008431842605, "grad_norm": 1.2257379293441772, "learning_rate": 8.395499364233556e-07, "loss": 0.2476, "step": 1004100 }, { "epoch": 13.835386183902346, "grad_norm": 0.32526853680610657, "learning_rate": 8.375856317347847e-07, "loss": 0.1825, "step": 1004200 }, { "epoch": 13.836763935962084, "grad_norm": 1.6427152156829834, "learning_rate": 8.356235921547181e-07, "loss": 0.2387, "step": 1004300 }, { "epoch": 13.838141688021823, "grad_norm": 1.9942232370376587, "learning_rate": 8.336638178498407e-07, "loss": 0.2403, "step": 1004400 }, { "epoch": 13.839519440081563, "grad_norm": 4.2254767417907715, "learning_rate": 8.317063089866478e-07, "loss": 0.2026, "step": 1004500 }, { "epoch": 13.840897192141302, "grad_norm": 3.479304790496826, "learning_rate": 8.297510657314394e-07, "loss": 0.1911, "step": 1004600 }, { "epoch": 13.84227494420104, "grad_norm": 0.5361469984054565, "learning_rate": 8.277980882503202e-07, "loss": 0.201, "step": 1004700 }, { "epoch": 13.843652696260781, "grad_norm": 2.803344249725342, "learning_rate": 8.258473767092057e-07, "loss": 0.2316, "step": 1004800 }, { "epoch": 13.84503044832052, "grad_norm": 4.163804054260254, "learning_rate": 8.238989312738246e-07, "loss": 0.2546, "step": 1004900 }, { "epoch": 13.84640820038026, "grad_norm": 0.056751832365989685, "learning_rate": 
8.219527521096987e-07, "loss": 0.2349, "step": 1005000 }, { "epoch": 13.847785952439999, "grad_norm": 0.1597357541322708, "learning_rate": 8.200088393821753e-07, "loss": 0.2149, "step": 1005100 }, { "epoch": 13.849163704499738, "grad_norm": 2.5986826419830322, "learning_rate": 8.180865984974265e-07, "loss": 0.2138, "step": 1005200 }, { "epoch": 13.850541456559478, "grad_norm": 3.167032480239868, "learning_rate": 8.161471964698605e-07, "loss": 0.1619, "step": 1005300 }, { "epoch": 13.851919208619217, "grad_norm": 4.284632682800293, "learning_rate": 8.142100613720971e-07, "loss": 0.2072, "step": 1005400 }, { "epoch": 13.853296960678957, "grad_norm": 1.47388756275177, "learning_rate": 8.122751933687142e-07, "loss": 0.2735, "step": 1005500 }, { "epoch": 13.854674712738696, "grad_norm": 0.350028932094574, "learning_rate": 8.103425926240885e-07, "loss": 0.1751, "step": 1005600 }, { "epoch": 13.856052464798434, "grad_norm": 2.342237710952759, "learning_rate": 8.084122593024041e-07, "loss": 0.226, "step": 1005700 }, { "epoch": 13.857430216858175, "grad_norm": 2.5162465572357178, "learning_rate": 8.064841935676498e-07, "loss": 0.2213, "step": 1005800 }, { "epoch": 13.858807968917914, "grad_norm": 1.6593633890151978, "learning_rate": 8.045583955836252e-07, "loss": 0.1827, "step": 1005900 }, { "epoch": 13.860185720977652, "grad_norm": 0.9531621932983398, "learning_rate": 8.026348655139406e-07, "loss": 0.2231, "step": 1006000 }, { "epoch": 13.861563473037393, "grad_norm": 2.44447660446167, "learning_rate": 8.007136035220047e-07, "loss": 0.2264, "step": 1006100 }, { "epoch": 13.862941225097131, "grad_norm": 1.9165898561477661, "learning_rate": 7.98794609771043e-07, "loss": 0.1865, "step": 1006200 }, { "epoch": 13.86431897715687, "grad_norm": 0.11040189117193222, "learning_rate": 7.968778844240826e-07, "loss": 0.1818, "step": 1006300 }, { "epoch": 13.86569672921661, "grad_norm": 0.032923463732004166, "learning_rate": 7.949634276439552e-07, "loss": 0.2265, "step": 1006400 }, { 
"epoch": 13.86707448127635, "grad_norm": 4.999039649963379, "learning_rate": 7.930512395933063e-07, "loss": 0.2294, "step": 1006500 }, { "epoch": 13.86845223333609, "grad_norm": 2.618626594543457, "learning_rate": 7.911413204345888e-07, "loss": 0.1847, "step": 1006600 }, { "epoch": 13.869829985395828, "grad_norm": 2.1467878818511963, "learning_rate": 7.892527355987517e-07, "loss": 0.2381, "step": 1006700 }, { "epoch": 13.871207737455567, "grad_norm": 1.197317361831665, "learning_rate": 7.873473320175051e-07, "loss": 0.2774, "step": 1006800 }, { "epoch": 13.872585489515307, "grad_norm": 3.0321991443634033, "learning_rate": 7.854441978127649e-07, "loss": 0.245, "step": 1006900 }, { "epoch": 13.873963241575046, "grad_norm": 3.3284738063812256, "learning_rate": 7.835433331462119e-07, "loss": 0.2107, "step": 1007000 }, { "epoch": 13.875340993634785, "grad_norm": 2.685920476913452, "learning_rate": 7.816447381793312e-07, "loss": 0.2048, "step": 1007100 }, { "epoch": 13.876718745694525, "grad_norm": 2.6791586875915527, "learning_rate": 7.797484130734189e-07, "loss": 0.2281, "step": 1007200 }, { "epoch": 13.878096497754264, "grad_norm": 0.05113573744893074, "learning_rate": 7.778543579895847e-07, "loss": 0.1838, "step": 1007300 }, { "epoch": 13.879474249814004, "grad_norm": 0.7338482141494751, "learning_rate": 7.759625730887277e-07, "loss": 0.2053, "step": 1007400 }, { "epoch": 13.880852001873743, "grad_norm": 2.5004451274871826, "learning_rate": 7.740730585315697e-07, "loss": 0.1948, "step": 1007500 }, { "epoch": 13.882229753933482, "grad_norm": 0.45705825090408325, "learning_rate": 7.721858144786345e-07, "loss": 0.2043, "step": 1007600 }, { "epoch": 13.883607505993222, "grad_norm": 2.3425233364105225, "learning_rate": 7.703008410902563e-07, "loss": 0.1871, "step": 1007700 }, { "epoch": 13.88498525805296, "grad_norm": 2.4338724613189697, "learning_rate": 7.684181385265676e-07, "loss": 0.2233, "step": 1007800 }, { "epoch": 13.8863630101127, "grad_norm": 1.2534384727478027, 
"learning_rate": 7.665377069475121e-07, "loss": 0.2093, "step": 1007900 }, { "epoch": 13.88774076217244, "grad_norm": 2.705702543258667, "learning_rate": 7.646595465128531e-07, "loss": 0.2155, "step": 1008000 }, { "epoch": 13.889118514232178, "grad_norm": 0.4062190651893616, "learning_rate": 7.62783657382137e-07, "loss": 0.2067, "step": 1008100 }, { "epoch": 13.890496266291919, "grad_norm": 0.5641285181045532, "learning_rate": 7.609100397147364e-07, "loss": 0.2015, "step": 1008200 }, { "epoch": 13.891874018351658, "grad_norm": 0.33847561478614807, "learning_rate": 7.590386936698254e-07, "loss": 0.2376, "step": 1008300 }, { "epoch": 13.893251770411396, "grad_norm": 1.2037060260772705, "learning_rate": 7.571696194063857e-07, "loss": 0.2193, "step": 1008400 }, { "epoch": 13.894629522471137, "grad_norm": 1.8664073944091797, "learning_rate": 7.553028170831974e-07, "loss": 0.2396, "step": 1008500 }, { "epoch": 13.896007274530875, "grad_norm": 2.2433054447174072, "learning_rate": 7.534382868588607e-07, "loss": 0.1571, "step": 1008600 }, { "epoch": 13.897385026590614, "grad_norm": 5.6116766929626465, "learning_rate": 7.515760288917803e-07, "loss": 0.2249, "step": 1008700 }, { "epoch": 13.898762778650354, "grad_norm": 3.0396549701690674, "learning_rate": 7.497160433401531e-07, "loss": 0.2012, "step": 1008800 }, { "epoch": 13.900140530710093, "grad_norm": 0.0740162581205368, "learning_rate": 7.478583303620084e-07, "loss": 0.1661, "step": 1008900 }, { "epoch": 13.901518282769832, "grad_norm": 1.833280324935913, "learning_rate": 7.460028901151617e-07, "loss": 0.183, "step": 1009000 }, { "epoch": 13.902896034829572, "grad_norm": 2.4394686222076416, "learning_rate": 7.441497227572359e-07, "loss": 0.1971, "step": 1009100 }, { "epoch": 13.904273786889311, "grad_norm": 2.9945454597473145, "learning_rate": 7.422988284456769e-07, "loss": 0.2536, "step": 1009200 }, { "epoch": 13.905651538949051, "grad_norm": 3.1405553817749023, "learning_rate": 7.404686822959275e-07, "loss": 0.2015, 
"step": 1009300 }, { "epoch": 13.90702929100879, "grad_norm": 1.5918724536895752, "learning_rate": 7.38622311814249e-07, "loss": 0.2287, "step": 1009400 }, { "epoch": 13.908407043068529, "grad_norm": 1.7790374755859375, "learning_rate": 7.367782148485125e-07, "loss": 0.2501, "step": 1009500 }, { "epoch": 13.90978479512827, "grad_norm": 8.950652122497559, "learning_rate": 7.349363915553883e-07, "loss": 0.2294, "step": 1009600 }, { "epoch": 13.911162547188008, "grad_norm": 3.0867457389831543, "learning_rate": 7.330968420913425e-07, "loss": 0.2601, "step": 1009700 }, { "epoch": 13.912540299247748, "grad_norm": 0.4622548818588257, "learning_rate": 7.31259566612664e-07, "loss": 0.2155, "step": 1009800 }, { "epoch": 13.913918051307487, "grad_norm": 0.2727733552455902, "learning_rate": 7.294245652754278e-07, "loss": 0.211, "step": 1009900 }, { "epoch": 13.915295803367226, "grad_norm": 2.8583593368530273, "learning_rate": 7.275918382355381e-07, "loss": 0.1938, "step": 1010000 }, { "epoch": 13.916673555426966, "grad_norm": 3.9037747383117676, "learning_rate": 7.257613856486856e-07, "loss": 0.2063, "step": 1010100 }, { "epoch": 13.918051307486705, "grad_norm": 2.1687731742858887, "learning_rate": 7.239332076703773e-07, "loss": 0.2308, "step": 1010200 }, { "epoch": 13.919429059546443, "grad_norm": 3.1149818897247314, "learning_rate": 7.221073044559281e-07, "loss": 0.1956, "step": 1010300 }, { "epoch": 13.920806811606184, "grad_norm": 1.0580086708068848, "learning_rate": 7.202836761604639e-07, "loss": 0.197, "step": 1010400 }, { "epoch": 13.922184563665922, "grad_norm": 0.49563419818878174, "learning_rate": 7.184623229388995e-07, "loss": 0.1805, "step": 1010500 }, { "epoch": 13.923562315725661, "grad_norm": 1.5151976346969604, "learning_rate": 7.166432449459789e-07, "loss": 0.1768, "step": 1010600 }, { "epoch": 13.924940067785402, "grad_norm": 4.352602958679199, "learning_rate": 7.148264423362325e-07, "loss": 0.2123, "step": 1010700 }, { "epoch": 13.92631781984514, 
"grad_norm": 1.7908936738967896, "learning_rate": 7.130119152640164e-07, "loss": 0.196, "step": 1010800 }, { "epoch": 13.92769557190488, "grad_norm": 1.6136554479599, "learning_rate": 7.111996638834761e-07, "loss": 0.1815, "step": 1010900 }, { "epoch": 13.92907332396462, "grad_norm": 0.3812917470932007, "learning_rate": 7.093896883485774e-07, "loss": 0.2002, "step": 1011000 }, { "epoch": 13.930451076024358, "grad_norm": 1.072643518447876, "learning_rate": 7.075819888130869e-07, "loss": 0.199, "step": 1011100 }, { "epoch": 13.931828828084099, "grad_norm": 1.3197944164276123, "learning_rate": 7.057765654305703e-07, "loss": 0.2064, "step": 1011200 }, { "epoch": 13.933206580143837, "grad_norm": 2.436164617538452, "learning_rate": 7.039734183544131e-07, "loss": 0.1635, "step": 1011300 }, { "epoch": 13.934584332203576, "grad_norm": 2.967456579208374, "learning_rate": 7.021725477378049e-07, "loss": 0.2232, "step": 1011400 }, { "epoch": 13.935962084263316, "grad_norm": 3.6786534786224365, "learning_rate": 7.003739537337314e-07, "loss": 0.1966, "step": 1011500 }, { "epoch": 13.937339836323055, "grad_norm": 2.5281527042388916, "learning_rate": 6.985776364949977e-07, "loss": 0.2103, "step": 1011600 }, { "epoch": 13.938717588382795, "grad_norm": 2.496382236480713, "learning_rate": 6.967835961742108e-07, "loss": 0.2035, "step": 1011700 }, { "epoch": 13.940095340442534, "grad_norm": 21.985082626342773, "learning_rate": 6.94991832923782e-07, "loss": 0.2613, "step": 1011800 }, { "epoch": 13.941473092502273, "grad_norm": 3.655707359313965, "learning_rate": 6.932023468959303e-07, "loss": 0.2185, "step": 1011900 }, { "epoch": 13.942850844562013, "grad_norm": 3.2711689472198486, "learning_rate": 6.914151382426826e-07, "loss": 0.2301, "step": 1012000 }, { "epoch": 13.944228596621752, "grad_norm": 1.7792415618896484, "learning_rate": 6.896302071158669e-07, "loss": 0.195, "step": 1012100 }, { "epoch": 13.94560634868149, "grad_norm": 1.6625088453292847, "learning_rate": 
6.878475536671255e-07, "loss": 0.2309, "step": 1012200 }, { "epoch": 13.946984100741231, "grad_norm": 3.4740829467773438, "learning_rate": 6.860671780479111e-07, "loss": 0.1751, "step": 1012300 }, { "epoch": 13.94836185280097, "grad_norm": 1.5779759883880615, "learning_rate": 6.843068501093472e-07, "loss": 0.3096, "step": 1012400 }, { "epoch": 13.94973960486071, "grad_norm": 0.8223806023597717, "learning_rate": 6.825310078206668e-07, "loss": 0.1736, "step": 1012500 }, { "epoch": 13.951117356920449, "grad_norm": 1.7619963884353638, "learning_rate": 6.807574438131756e-07, "loss": 0.1928, "step": 1012600 }, { "epoch": 13.952495108980187, "grad_norm": 2.0087437629699707, "learning_rate": 6.789861582375418e-07, "loss": 0.2031, "step": 1012700 }, { "epoch": 13.953872861039928, "grad_norm": 2.0862858295440674, "learning_rate": 6.772171512442563e-07, "loss": 0.1802, "step": 1012800 }, { "epoch": 13.955250613099667, "grad_norm": 2.0899112224578857, "learning_rate": 6.754504229835934e-07, "loss": 0.2334, "step": 1012900 }, { "epoch": 13.956628365159405, "grad_norm": 2.539127826690674, "learning_rate": 6.736859736056503e-07, "loss": 0.1871, "step": 1013000 }, { "epoch": 13.958006117219146, "grad_norm": 2.8806445598602295, "learning_rate": 6.719238032603286e-07, "loss": 0.2357, "step": 1013100 }, { "epoch": 13.959383869278884, "grad_norm": 2.864682197570801, "learning_rate": 6.701639120973288e-07, "loss": 0.21, "step": 1013200 }, { "epoch": 13.960761621338623, "grad_norm": 2.4140005111694336, "learning_rate": 6.684063002661678e-07, "loss": 0.1821, "step": 1013300 }, { "epoch": 13.962139373398363, "grad_norm": 5.064892768859863, "learning_rate": 6.666509679161581e-07, "loss": 0.2281, "step": 1013400 }, { "epoch": 13.963517125458102, "grad_norm": 2.8160386085510254, "learning_rate": 6.648979151964291e-07, "loss": 0.1904, "step": 1013500 }, { "epoch": 13.964894877517843, "grad_norm": 1.712924838066101, "learning_rate": 6.631471422559085e-07, "loss": 0.2553, "step": 1013600 }, { 
"epoch": 13.966272629577581, "grad_norm": 5.080384254455566, "learning_rate": 6.613986492433349e-07, "loss": 0.1726, "step": 1013700 }, { "epoch": 13.96765038163732, "grad_norm": 2.5360894203186035, "learning_rate": 6.596524363072515e-07, "loss": 0.1918, "step": 1013800 }, { "epoch": 13.96902813369706, "grad_norm": 1.59514582157135, "learning_rate": 6.57908503596012e-07, "loss": 0.2174, "step": 1013900 }, { "epoch": 13.970405885756799, "grad_norm": 1.158673882484436, "learning_rate": 6.561668512577626e-07, "loss": 0.226, "step": 1014000 }, { "epoch": 13.97178363781654, "grad_norm": 3.0452091693878174, "learning_rate": 6.544274794404726e-07, "loss": 0.1834, "step": 1014100 }, { "epoch": 13.973161389876278, "grad_norm": 3.675121784210205, "learning_rate": 6.526903882919095e-07, "loss": 0.2099, "step": 1014200 }, { "epoch": 13.974539141936017, "grad_norm": 1.7934209108352661, "learning_rate": 6.509555779596485e-07, "loss": 0.2266, "step": 1014300 }, { "epoch": 13.975916893995757, "grad_norm": 3.093475580215454, "learning_rate": 6.492230485910697e-07, "loss": 0.2081, "step": 1014400 }, { "epoch": 13.977294646055496, "grad_norm": 0.9700685143470764, "learning_rate": 6.474928003333636e-07, "loss": 0.2426, "step": 1014500 }, { "epoch": 13.978672398115235, "grad_norm": 4.574120998382568, "learning_rate": 6.457648333335162e-07, "loss": 0.24, "step": 1014600 }, { "epoch": 13.980050150174975, "grad_norm": 2.2313599586486816, "learning_rate": 6.440391477383304e-07, "loss": 0.2406, "step": 1014700 }, { "epoch": 13.981427902234714, "grad_norm": 1.7652546167373657, "learning_rate": 6.423157436944166e-07, "loss": 0.2368, "step": 1014800 }, { "epoch": 13.982805654294452, "grad_norm": 0.9916251301765442, "learning_rate": 6.405946213481809e-07, "loss": 0.1892, "step": 1014900 }, { "epoch": 13.984183406354193, "grad_norm": 4.213309288024902, "learning_rate": 6.38875780845849e-07, "loss": 0.2334, "step": 1015000 }, { "epoch": 13.985561158413931, "grad_norm": 1.217786431312561, 
"learning_rate": 6.37159222333436e-07, "loss": 0.1665, "step": 1015100 }, { "epoch": 13.986938910473672, "grad_norm": 0.12633667886257172, "learning_rate": 6.354449459567772e-07, "loss": 0.1934, "step": 1015200 }, { "epoch": 13.98831666253341, "grad_norm": 5.602499961853027, "learning_rate": 6.337329518615056e-07, "loss": 0.2053, "step": 1015300 }, { "epoch": 13.98969441459315, "grad_norm": 4.697601318359375, "learning_rate": 6.32023240193072e-07, "loss": 0.2348, "step": 1015400 }, { "epoch": 13.99107216665289, "grad_norm": 2.557914972305298, "learning_rate": 6.303158110967158e-07, "loss": 0.1661, "step": 1015500 }, { "epoch": 13.992449918712628, "grad_norm": 0.2315978705883026, "learning_rate": 6.286106647174967e-07, "loss": 0.22, "step": 1015600 }, { "epoch": 13.993827670772367, "grad_norm": 3.4364569187164307, "learning_rate": 6.269078012002754e-07, "loss": 0.2358, "step": 1015700 }, { "epoch": 13.995205422832107, "grad_norm": 5.167016983032227, "learning_rate": 6.252072206897181e-07, "loss": 0.2474, "step": 1015800 }, { "epoch": 13.996583174891846, "grad_norm": 4.080760955810547, "learning_rate": 6.235089233302977e-07, "loss": 0.3004, "step": 1015900 }, { "epoch": 13.997960926951587, "grad_norm": 2.131598711013794, "learning_rate": 6.218129092662922e-07, "loss": 0.2469, "step": 1016000 }, { "epoch": 13.999338679011325, "grad_norm": 3.778364419937134, "learning_rate": 6.201191786417934e-07, "loss": 0.1964, "step": 1016100 }, { "epoch": 14.000716431071064, "grad_norm": 1.4112939834594727, "learning_rate": 6.184277316006823e-07, "loss": 0.2074, "step": 1016200 }, { "epoch": 14.002094183130804, "grad_norm": 1.1307002305984497, "learning_rate": 6.167385682866597e-07, "loss": 0.1817, "step": 1016300 }, { "epoch": 14.003471935190543, "grad_norm": 2.6714022159576416, "learning_rate": 6.150516888432312e-07, "loss": 0.2157, "step": 1016400 }, { "epoch": 14.004849687250282, "grad_norm": 1.5242327451705933, "learning_rate": 6.133670934137037e-07, "loss": 0.2606, "step": 
1016500 }, { "epoch": 14.006227439310022, "grad_norm": 3.3772196769714355, "learning_rate": 6.117015939468687e-07, "loss": 0.2208, "step": 1016600 }, { "epoch": 14.00760519136976, "grad_norm": 12.760124206542969, "learning_rate": 6.100215441305898e-07, "loss": 0.244, "step": 1016700 }, { "epoch": 14.008982943429501, "grad_norm": 0.41571125388145447, "learning_rate": 6.083437787555502e-07, "loss": 0.2474, "step": 1016800 }, { "epoch": 14.01036069548924, "grad_norm": 1.3224139213562012, "learning_rate": 6.066682979642813e-07, "loss": 0.2156, "step": 1016900 }, { "epoch": 14.011738447548979, "grad_norm": 1.5913996696472168, "learning_rate": 6.049951018991254e-07, "loss": 0.1787, "step": 1017000 }, { "epoch": 14.013116199608719, "grad_norm": 2.211167335510254, "learning_rate": 6.033241907022294e-07, "loss": 0.2308, "step": 1017100 }, { "epoch": 14.014493951668458, "grad_norm": 0.6107139587402344, "learning_rate": 6.016722394661467e-07, "loss": 0.2251, "step": 1017200 }, { "epoch": 14.015871703728196, "grad_norm": 0.005240975879132748, "learning_rate": 6.000058755792151e-07, "loss": 0.2176, "step": 1017300 }, { "epoch": 14.017249455787937, "grad_norm": 3.663726806640625, "learning_rate": 5.98341796984401e-07, "loss": 0.2221, "step": 1017400 }, { "epoch": 14.018627207847675, "grad_norm": 0.14198751747608185, "learning_rate": 5.966800038230819e-07, "loss": 0.2232, "step": 1017500 }, { "epoch": 14.020004959907416, "grad_norm": 2.2149317264556885, "learning_rate": 5.950204962364336e-07, "loss": 0.1956, "step": 1017600 }, { "epoch": 14.021382711967155, "grad_norm": 0.46175140142440796, "learning_rate": 5.933632743654369e-07, "loss": 0.1594, "step": 1017700 }, { "epoch": 14.022760464026893, "grad_norm": 4.762292861938477, "learning_rate": 5.917083383508828e-07, "loss": 0.2216, "step": 1017800 }, { "epoch": 14.024138216086634, "grad_norm": 0.09483277052640915, "learning_rate": 5.900556883333705e-07, "loss": 0.2253, "step": 1017900 }, { "epoch": 14.025515968146372, "grad_norm": 
1.2487127780914307, "learning_rate": 5.884053244532941e-07, "loss": 0.215, "step": 1018000 }, { "epoch": 14.026893720206111, "grad_norm": 2.1889774799346924, "learning_rate": 5.86757246850865e-07, "loss": 0.2072, "step": 1018100 }, { "epoch": 14.028271472265851, "grad_norm": 0.42818793654441833, "learning_rate": 5.851114556660958e-07, "loss": 0.1606, "step": 1018200 }, { "epoch": 14.02964922432559, "grad_norm": 0.7637725472450256, "learning_rate": 5.834679510388069e-07, "loss": 0.24, "step": 1018300 }, { "epoch": 14.03102697638533, "grad_norm": 2.0872621536254883, "learning_rate": 5.818267331086142e-07, "loss": 0.1759, "step": 1018400 }, { "epoch": 14.03240472844507, "grad_norm": 1.8280794620513916, "learning_rate": 5.801878020149562e-07, "loss": 0.1676, "step": 1018500 }, { "epoch": 14.033782480504808, "grad_norm": 3.9696295261383057, "learning_rate": 5.785511578970642e-07, "loss": 0.1973, "step": 1018600 }, { "epoch": 14.035160232564548, "grad_norm": 1.1307194232940674, "learning_rate": 5.769168008939802e-07, "loss": 0.2402, "step": 1018700 }, { "epoch": 14.036537984624287, "grad_norm": 2.9659740924835205, "learning_rate": 5.752847311445533e-07, "loss": 0.1905, "step": 1018800 }, { "epoch": 14.037915736684026, "grad_norm": 0.9227334260940552, "learning_rate": 5.736549487874348e-07, "loss": 0.1929, "step": 1018900 }, { "epoch": 14.039293488743766, "grad_norm": 3.533618450164795, "learning_rate": 5.720274539610804e-07, "loss": 0.2527, "step": 1019000 }, { "epoch": 14.040671240803505, "grad_norm": 2.9905810356140137, "learning_rate": 5.704022468037562e-07, "loss": 0.2228, "step": 1019100 }, { "epoch": 14.042048992863243, "grad_norm": 3.513597249984741, "learning_rate": 5.687793274535332e-07, "loss": 0.2106, "step": 1019200 }, { "epoch": 14.043426744922984, "grad_norm": 0.49585723876953125, "learning_rate": 5.671586960482841e-07, "loss": 0.2263, "step": 1019300 }, { "epoch": 14.044804496982723, "grad_norm": 1.862160086631775, "learning_rate": 5.65540352725695e-07, 
"loss": 0.2132, "step": 1019400 }, { "epoch": 14.046182249042463, "grad_norm": 4.592262268066406, "learning_rate": 5.639404468471296e-07, "loss": 0.2217, "step": 1019500 }, { "epoch": 14.047560001102202, "grad_norm": 4.919544219970703, "learning_rate": 5.623266572178626e-07, "loss": 0.2057, "step": 1019600 }, { "epoch": 14.04893775316194, "grad_norm": 2.0715248584747314, "learning_rate": 5.607151560817589e-07, "loss": 0.1741, "step": 1019700 }, { "epoch": 14.05031550522168, "grad_norm": 2.83178448677063, "learning_rate": 5.591059435757201e-07, "loss": 0.214, "step": 1019800 }, { "epoch": 14.05169325728142, "grad_norm": 0.12600405514240265, "learning_rate": 5.574990198364646e-07, "loss": 0.1995, "step": 1019900 }, { "epoch": 14.053071009341158, "grad_norm": 3.3065154552459717, "learning_rate": 5.558943850005033e-07, "loss": 0.2187, "step": 1020000 }, { "epoch": 14.054448761400899, "grad_norm": 0.8885064721107483, "learning_rate": 5.542920392041545e-07, "loss": 0.1924, "step": 1020100 }, { "epoch": 14.055826513460637, "grad_norm": 2.5127980709075928, "learning_rate": 5.526919825835536e-07, "loss": 0.1906, "step": 1020200 }, { "epoch": 14.057204265520378, "grad_norm": 0.5123410224914551, "learning_rate": 5.510942152746313e-07, "loss": 0.1973, "step": 1020300 }, { "epoch": 14.058582017580116, "grad_norm": 0.9705925583839417, "learning_rate": 5.494987374131227e-07, "loss": 0.2141, "step": 1020400 }, { "epoch": 14.059959769639855, "grad_norm": 2.7959299087524414, "learning_rate": 5.479055491345772e-07, "loss": 0.1816, "step": 1020500 }, { "epoch": 14.061337521699595, "grad_norm": 1.8931347131729126, "learning_rate": 5.463146505743391e-07, "loss": 0.1953, "step": 1020600 }, { "epoch": 14.062715273759334, "grad_norm": 1.8882026672363281, "learning_rate": 5.447260418675668e-07, "loss": 0.2321, "step": 1020700 }, { "epoch": 14.064093025819073, "grad_norm": 5.866182327270508, "learning_rate": 5.431397231492169e-07, "loss": 0.2262, "step": 1020800 }, { "epoch": 
14.065470777878813, "grad_norm": 0.38003942370414734, "learning_rate": 5.415556945540601e-07, "loss": 0.2101, "step": 1020900 }, { "epoch": 14.066848529938552, "grad_norm": 1.1763534545898438, "learning_rate": 5.399739562166653e-07, "loss": 0.2222, "step": 1021000 }, { "epoch": 14.068226281998292, "grad_norm": 1.1725133657455444, "learning_rate": 5.383945082714092e-07, "loss": 0.215, "step": 1021100 }, { "epoch": 14.069604034058031, "grad_norm": 1.883790373802185, "learning_rate": 5.368173508524762e-07, "loss": 0.2252, "step": 1021200 }, { "epoch": 14.07098178611777, "grad_norm": 1.001843810081482, "learning_rate": 5.352424840938492e-07, "loss": 0.1866, "step": 1021300 }, { "epoch": 14.07235953817751, "grad_norm": 4.79423189163208, "learning_rate": 5.336699081293245e-07, "loss": 0.2031, "step": 1021400 }, { "epoch": 14.073737290237249, "grad_norm": 2.1388587951660156, "learning_rate": 5.320996230925004e-07, "loss": 0.2071, "step": 1021500 }, { "epoch": 14.075115042296988, "grad_norm": 7.936727046966553, "learning_rate": 5.305316291167828e-07, "loss": 0.1927, "step": 1021600 }, { "epoch": 14.076492794356728, "grad_norm": 3.701247215270996, "learning_rate": 5.289659263353758e-07, "loss": 0.2305, "step": 1021700 }, { "epoch": 14.077870546416467, "grad_norm": 6.462217807769775, "learning_rate": 5.274025148812944e-07, "loss": 0.2144, "step": 1021800 }, { "epoch": 14.079248298476207, "grad_norm": 4.1912007331848145, "learning_rate": 5.258413948873614e-07, "loss": 0.2159, "step": 1021900 }, { "epoch": 14.080626050535946, "grad_norm": 1.2235301733016968, "learning_rate": 5.242825664862008e-07, "loss": 0.2348, "step": 1022000 }, { "epoch": 14.082003802595684, "grad_norm": 1.829254388809204, "learning_rate": 5.227260298102415e-07, "loss": 0.1844, "step": 1022100 }, { "epoch": 14.083381554655425, "grad_norm": 2.716498374938965, "learning_rate": 5.211717849917261e-07, "loss": 0.2225, "step": 1022200 }, { "epoch": 14.084759306715164, "grad_norm": 2.048232078552246, 
"learning_rate": 5.196198321626864e-07, "loss": 0.2164, "step": 1022300 }, { "epoch": 14.086137058774902, "grad_norm": 1.251952052116394, "learning_rate": 5.18070171454971e-07, "loss": 0.2067, "step": 1022400 }, { "epoch": 14.087514810834643, "grad_norm": 1.424296259880066, "learning_rate": 5.165228030002364e-07, "loss": 0.2389, "step": 1022500 }, { "epoch": 14.088892562894381, "grad_norm": 3.7526659965515137, "learning_rate": 5.149777269299344e-07, "loss": 0.2216, "step": 1022600 }, { "epoch": 14.090270314954122, "grad_norm": 0.20451049506664276, "learning_rate": 5.134349433753305e-07, "loss": 0.2002, "step": 1022700 }, { "epoch": 14.09164806701386, "grad_norm": 3.32766056060791, "learning_rate": 5.118944524674887e-07, "loss": 0.1986, "step": 1022800 }, { "epoch": 14.093025819073599, "grad_norm": 2.618166446685791, "learning_rate": 5.10356254337287e-07, "loss": 0.223, "step": 1022900 }, { "epoch": 14.09440357113334, "grad_norm": 0.6933869123458862, "learning_rate": 5.088203491153953e-07, "loss": 0.1785, "step": 1023000 }, { "epoch": 14.095781323193078, "grad_norm": 2.1524736881256104, "learning_rate": 5.072867369323008e-07, "loss": 0.2094, "step": 1023100 }, { "epoch": 14.097159075252817, "grad_norm": 1.6353240013122559, "learning_rate": 5.057554179182953e-07, "loss": 0.226, "step": 1023200 }, { "epoch": 14.098536827312557, "grad_norm": 1.0612972974777222, "learning_rate": 5.042263922034684e-07, "loss": 0.203, "step": 1023300 }, { "epoch": 14.099914579372296, "grad_norm": 3.8534348011016846, "learning_rate": 5.026996599177153e-07, "loss": 0.1985, "step": 1023400 }, { "epoch": 14.101292331432035, "grad_norm": 3.1799464225769043, "learning_rate": 5.011752211907442e-07, "loss": 0.2081, "step": 1023500 }, { "epoch": 14.102670083491775, "grad_norm": 5.872501373291016, "learning_rate": 4.996530761520652e-07, "loss": 0.1871, "step": 1023600 }, { "epoch": 14.104047835551514, "grad_norm": 4.943983554840088, "learning_rate": 4.981332249309898e-07, "loss": 0.2084, "step": 
1023700 }, { "epoch": 14.105425587611254, "grad_norm": 0.810875415802002, "learning_rate": 4.966156676566372e-07, "loss": 0.1928, "step": 1023800 }, { "epoch": 14.106803339670993, "grad_norm": 3.209590196609497, "learning_rate": 4.951155457338216e-07, "loss": 0.2027, "step": 1023900 }, { "epoch": 14.108181091730732, "grad_norm": 3.388915538787842, "learning_rate": 4.936025537968158e-07, "loss": 0.1498, "step": 1024000 }, { "epoch": 14.109558843790472, "grad_norm": 2.952042579650879, "learning_rate": 4.920918561914329e-07, "loss": 0.2354, "step": 1024100 }, { "epoch": 14.11093659585021, "grad_norm": 0.910190224647522, "learning_rate": 4.905834530460163e-07, "loss": 0.1662, "step": 1024200 }, { "epoch": 14.11231434790995, "grad_norm": 1.0930262804031372, "learning_rate": 4.890773444887206e-07, "loss": 0.17, "step": 1024300 }, { "epoch": 14.11369209996969, "grad_norm": 1.4029775857925415, "learning_rate": 4.875735306474896e-07, "loss": 0.2043, "step": 1024400 }, { "epoch": 14.115069852029428, "grad_norm": 4.97995662689209, "learning_rate": 4.860720116500778e-07, "loss": 0.232, "step": 1024500 }, { "epoch": 14.116447604089169, "grad_norm": 2.449876546859741, "learning_rate": 4.845727876240563e-07, "loss": 0.2092, "step": 1024600 }, { "epoch": 14.117825356148908, "grad_norm": 0.9248343110084534, "learning_rate": 4.830758586967829e-07, "loss": 0.2432, "step": 1024700 }, { "epoch": 14.119203108208646, "grad_norm": 1.7604528665542603, "learning_rate": 4.815812249954318e-07, "loss": 0.18, "step": 1024800 }, { "epoch": 14.120580860268387, "grad_norm": 1.6153812408447266, "learning_rate": 4.800888866469852e-07, "loss": 0.2024, "step": 1024900 }, { "epoch": 14.121958612328125, "grad_norm": 1.0927902460098267, "learning_rate": 4.785988437782204e-07, "loss": 0.171, "step": 1025000 }, { "epoch": 14.123336364387864, "grad_norm": 0.6424771547317505, "learning_rate": 4.771110965157197e-07, "loss": 0.1928, "step": 1025100 }, { "epoch": 14.124714116447604, "grad_norm": 
3.5146942138671875, "learning_rate": 4.756256449858817e-07, "loss": 0.2516, "step": 1025200 }, { "epoch": 14.126091868507343, "grad_norm": 2.9304721355438232, "learning_rate": 4.7414248931490404e-07, "loss": 0.2025, "step": 1025300 }, { "epoch": 14.127469620567084, "grad_norm": 3.652332067489624, "learning_rate": 4.726616296287795e-07, "loss": 0.2029, "step": 1025400 }, { "epoch": 14.128847372626822, "grad_norm": 0.7800684571266174, "learning_rate": 4.711830660533267e-07, "loss": 0.2122, "step": 1025500 }, { "epoch": 14.130225124686561, "grad_norm": 2.5150647163391113, "learning_rate": 4.697067987141479e-07, "loss": 0.2148, "step": 1025600 }, { "epoch": 14.131602876746301, "grad_norm": 0.49390849471092224, "learning_rate": 4.6823282773666184e-07, "loss": 0.1895, "step": 1025700 }, { "epoch": 14.13298062880604, "grad_norm": 0.7920058369636536, "learning_rate": 4.667611532460919e-07, "loss": 0.179, "step": 1025800 }, { "epoch": 14.134358380865779, "grad_norm": 0.6743540167808533, "learning_rate": 4.652917753674632e-07, "loss": 0.2115, "step": 1025900 }, { "epoch": 14.13573613292552, "grad_norm": 1.0219744443893433, "learning_rate": 4.638246942256083e-07, "loss": 0.2658, "step": 1026000 }, { "epoch": 14.137113884985258, "grad_norm": 2.968315601348877, "learning_rate": 4.623599099451584e-07, "loss": 0.1751, "step": 1026100 }, { "epoch": 14.138491637044998, "grad_norm": 2.6218507289886475, "learning_rate": 4.608974226505583e-07, "loss": 0.2254, "step": 1026200 }, { "epoch": 14.139869389104737, "grad_norm": 1.8848381042480469, "learning_rate": 4.5943723246605453e-07, "loss": 0.1708, "step": 1026300 }, { "epoch": 14.141247141164476, "grad_norm": 1.5641224384307861, "learning_rate": 4.5797933951569506e-07, "loss": 0.2044, "step": 1026400 }, { "epoch": 14.142624893224216, "grad_norm": 1.3623244762420654, "learning_rate": 4.565237439233355e-07, "loss": 0.2322, "step": 1026500 }, { "epoch": 14.144002645283955, "grad_norm": 1.5846881866455078, "learning_rate": 
4.5507044581264535e-07, "loss": 0.1639, "step": 1026600 }, { "epoch": 14.145380397343693, "grad_norm": 2.354372024536133, "learning_rate": 4.536194453070741e-07, "loss": 0.2222, "step": 1026700 }, { "epoch": 14.146758149403434, "grad_norm": 1.9994189739227295, "learning_rate": 4.521707425299004e-07, "loss": 0.2047, "step": 1026800 }, { "epoch": 14.148135901463172, "grad_norm": 0.7828368544578552, "learning_rate": 4.5072433760419835e-07, "loss": 0.2253, "step": 1026900 }, { "epoch": 14.149513653522913, "grad_norm": 1.2019603252410889, "learning_rate": 4.492802306528465e-07, "loss": 0.178, "step": 1027000 }, { "epoch": 14.150891405582652, "grad_norm": 1.1777639389038086, "learning_rate": 4.478384217985343e-07, "loss": 0.198, "step": 1027100 }, { "epoch": 14.15226915764239, "grad_norm": 0.26375922560691833, "learning_rate": 4.4639891116374036e-07, "loss": 0.2648, "step": 1027200 }, { "epoch": 14.15364690970213, "grad_norm": 2.4622080326080322, "learning_rate": 4.449616988707664e-07, "loss": 0.1996, "step": 1027300 }, { "epoch": 14.15502466176187, "grad_norm": 3.7776124477386475, "learning_rate": 4.4352678504170937e-07, "loss": 0.2053, "step": 1027400 }, { "epoch": 14.156402413821608, "grad_norm": 1.59080970287323, "learning_rate": 4.420941697984679e-07, "loss": 0.2273, "step": 1027500 }, { "epoch": 14.157780165881348, "grad_norm": 3.6957881450653076, "learning_rate": 4.406638532627605e-07, "loss": 0.2413, "step": 1027600 }, { "epoch": 14.159157917941087, "grad_norm": 0.9523907899856567, "learning_rate": 4.392358355560888e-07, "loss": 0.225, "step": 1027700 }, { "epoch": 14.160535670000826, "grad_norm": 2.3711841106414795, "learning_rate": 4.3781011679977753e-07, "loss": 0.209, "step": 1027800 }, { "epoch": 14.161913422060566, "grad_norm": 2.091735363006592, "learning_rate": 4.363866971149436e-07, "loss": 0.2291, "step": 1027900 }, { "epoch": 14.163291174120305, "grad_norm": 7.050431251525879, "learning_rate": 4.3496557662251773e-07, "loss": 0.2145, "step": 1028000 }, 
{ "epoch": 14.164668926180045, "grad_norm": 0.635335385799408, "learning_rate": 4.3356093227302526e-07, "loss": 0.2004, "step": 1028100 }, { "epoch": 14.166046678239784, "grad_norm": 2.8612728118896484, "learning_rate": 4.3214438753247904e-07, "loss": 0.1923, "step": 1028200 }, { "epoch": 14.167424430299523, "grad_norm": 1.9540824890136719, "learning_rate": 4.307442734134412e-07, "loss": 0.1946, "step": 1028300 }, { "epoch": 14.168802182359263, "grad_norm": 1.2414122819900513, "learning_rate": 4.293464132026928e-07, "loss": 0.2225, "step": 1028400 }, { "epoch": 14.170179934419002, "grad_norm": 52.50362777709961, "learning_rate": 4.2793672148380147e-07, "loss": 0.2644, "step": 1028500 }, { "epoch": 14.17155768647874, "grad_norm": 1.7245001792907715, "learning_rate": 4.265293296751785e-07, "loss": 0.2321, "step": 1028600 }, { "epoch": 14.172935438538481, "grad_norm": 2.8156352043151855, "learning_rate": 4.2512423789639903e-07, "loss": 0.1948, "step": 1028700 }, { "epoch": 14.17431319059822, "grad_norm": 4.342079162597656, "learning_rate": 4.2372144626683067e-07, "loss": 0.2536, "step": 1028800 }, { "epoch": 14.17569094265796, "grad_norm": 2.5000643730163574, "learning_rate": 4.223209549056456e-07, "loss": 0.2202, "step": 1028900 }, { "epoch": 14.177068694717699, "grad_norm": 0.9814404249191284, "learning_rate": 4.209227639318236e-07, "loss": 0.2067, "step": 1029000 }, { "epoch": 14.178446446777437, "grad_norm": 0.12182557582855225, "learning_rate": 4.1952687346414916e-07, "loss": 0.1907, "step": 1029100 }, { "epoch": 14.179824198837178, "grad_norm": 3.392672538757324, "learning_rate": 4.181332836212082e-07, "loss": 0.1708, "step": 1029200 }, { "epoch": 14.181201950896916, "grad_norm": 0.6692277789115906, "learning_rate": 4.1674199452139737e-07, "loss": 0.2452, "step": 1029300 }, { "epoch": 14.182579702956655, "grad_norm": 2.9966447353363037, "learning_rate": 4.153530062829089e-07, "loss": 0.205, "step": 1029400 }, { "epoch": 14.183957455016396, "grad_norm": 
1.4956156015396118, "learning_rate": 4.139663190237486e-07, "loss": 0.199, "step": 1029500 }, { "epoch": 14.185335207076134, "grad_norm": 1.746561884880066, "learning_rate": 4.125819328617178e-07, "loss": 0.2582, "step": 1029600 }, { "epoch": 14.186712959135875, "grad_norm": 1.776915192604065, "learning_rate": 4.1119984791443163e-07, "loss": 0.1487, "step": 1029700 }, { "epoch": 14.188090711195613, "grad_norm": 2.4574389457702637, "learning_rate": 4.0982006429930356e-07, "loss": 0.2156, "step": 1029800 }, { "epoch": 14.189468463255352, "grad_norm": 0.12091220915317535, "learning_rate": 4.0844258213355493e-07, "loss": 0.1912, "step": 1029900 }, { "epoch": 14.190846215315092, "grad_norm": 2.144056797027588, "learning_rate": 4.0706740153420854e-07, "loss": 0.2134, "step": 1030000 }, { "epoch": 14.192223967374831, "grad_norm": 1.8108185529708862, "learning_rate": 4.056945226180886e-07, "loss": 0.1911, "step": 1030100 }, { "epoch": 14.19360171943457, "grad_norm": 1.456079363822937, "learning_rate": 4.0432394550183634e-07, "loss": 0.2564, "step": 1030200 }, { "epoch": 14.19497947149431, "grad_norm": 0.5807664394378662, "learning_rate": 4.0295567030188526e-07, "loss": 0.2304, "step": 1030300 }, { "epoch": 14.196357223554049, "grad_norm": 1.4189187288284302, "learning_rate": 4.0158969713447654e-07, "loss": 0.2176, "step": 1030400 }, { "epoch": 14.19773497561379, "grad_norm": 0.04461994394659996, "learning_rate": 4.0022602611565903e-07, "loss": 0.2117, "step": 1030500 }, { "epoch": 14.199112727673528, "grad_norm": 2.271803855895996, "learning_rate": 3.9886465736128006e-07, "loss": 0.2387, "step": 1030600 }, { "epoch": 14.200490479733267, "grad_norm": 0.8087264895439148, "learning_rate": 3.9750559098699763e-07, "loss": 0.1875, "step": 1030700 }, { "epoch": 14.201868231793007, "grad_norm": 2.196885347366333, "learning_rate": 3.961488271082714e-07, "loss": 0.212, "step": 1030800 }, { "epoch": 14.203245983852746, "grad_norm": 4.261157035827637, "learning_rate": 
3.9479436584036247e-07, "loss": 0.2407, "step": 1030900 }, { "epoch": 14.204623735912485, "grad_norm": 0.02784602902829647, "learning_rate": 3.9344220729834567e-07, "loss": 0.1892, "step": 1031000 }, { "epoch": 14.206001487972225, "grad_norm": 3.5351369380950928, "learning_rate": 3.9209235159708535e-07, "loss": 0.2243, "step": 1031100 }, { "epoch": 14.207379240031964, "grad_norm": 3.901085376739502, "learning_rate": 3.907447988512655e-07, "loss": 0.2107, "step": 1031200 }, { "epoch": 14.208756992091704, "grad_norm": 1.521775722503662, "learning_rate": 3.893995491753627e-07, "loss": 0.1966, "step": 1031300 }, { "epoch": 14.210134744151443, "grad_norm": 1.7670669555664062, "learning_rate": 3.8805660268367015e-07, "loss": 0.1676, "step": 1031400 }, { "epoch": 14.211512496211181, "grad_norm": 2.405217409133911, "learning_rate": 3.867159594902675e-07, "loss": 0.218, "step": 1031500 }, { "epoch": 14.212890248270922, "grad_norm": 3.611844301223755, "learning_rate": 3.8537761970905723e-07, "loss": 0.218, "step": 1031600 }, { "epoch": 14.21426800033066, "grad_norm": 2.2119975090026855, "learning_rate": 3.840415834537342e-07, "loss": 0.2458, "step": 1031700 }, { "epoch": 14.2156457523904, "grad_norm": 2.266141891479492, "learning_rate": 3.8270785083780405e-07, "loss": 0.1986, "step": 1031800 }, { "epoch": 14.21702350445014, "grad_norm": 3.746082305908203, "learning_rate": 3.813764219745708e-07, "loss": 0.205, "step": 1031900 }, { "epoch": 14.218401256509878, "grad_norm": 1.9543709754943848, "learning_rate": 3.800472969771493e-07, "loss": 0.1745, "step": 1032000 }, { "epoch": 14.219779008569617, "grad_norm": 0.3729618191719055, "learning_rate": 3.7872047595845894e-07, "loss": 0.2688, "step": 1032100 }, { "epoch": 14.221156760629357, "grad_norm": 3.6639039516448975, "learning_rate": 3.773959590312115e-07, "loss": 0.1647, "step": 1032200 }, { "epoch": 14.222534512689096, "grad_norm": 2.0133955478668213, "learning_rate": 3.760737463079386e-07, "loss": 0.2309, "step": 1032300 }, 
{ "epoch": 14.223912264748837, "grad_norm": 2.9937479496002197, "learning_rate": 3.7475383790096437e-07, "loss": 0.127, "step": 1032400 }, { "epoch": 14.225290016808575, "grad_norm": 1.7002424001693726, "learning_rate": 3.734493985549198e-07, "loss": 0.1855, "step": 1032500 }, { "epoch": 14.226667768868314, "grad_norm": 0.47065451741218567, "learning_rate": 3.721340760707928e-07, "loss": 0.1921, "step": 1032600 }, { "epoch": 14.228045520928054, "grad_norm": 2.343005895614624, "learning_rate": 3.708210582376589e-07, "loss": 0.2018, "step": 1032700 }, { "epoch": 14.229423272987793, "grad_norm": 1.2562779188156128, "learning_rate": 3.695103451670698e-07, "loss": 0.2436, "step": 1032800 }, { "epoch": 14.230801025047532, "grad_norm": 4.102870941162109, "learning_rate": 3.682019369703754e-07, "loss": 0.1779, "step": 1032900 }, { "epoch": 14.232178777107272, "grad_norm": 2.85564923286438, "learning_rate": 3.6689583375873056e-07, "loss": 0.1897, "step": 1033000 }, { "epoch": 14.23355652916701, "grad_norm": 4.5034403800964355, "learning_rate": 3.6559203564309754e-07, "loss": 0.2206, "step": 1033100 }, { "epoch": 14.234934281226751, "grad_norm": 2.7022995948791504, "learning_rate": 3.642905427342433e-07, "loss": 0.1476, "step": 1033200 }, { "epoch": 14.23631203328649, "grad_norm": 2.3958001136779785, "learning_rate": 3.629913551427272e-07, "loss": 0.2131, "step": 1033300 }, { "epoch": 14.237689785346229, "grad_norm": 2.8164851665496826, "learning_rate": 3.6169447297892845e-07, "loss": 0.2254, "step": 1033400 }, { "epoch": 14.239067537405969, "grad_norm": 0.8474321961402893, "learning_rate": 3.603998963530247e-07, "loss": 0.2243, "step": 1033500 }, { "epoch": 14.240445289465708, "grad_norm": 2.588263988494873, "learning_rate": 3.591076253749953e-07, "loss": 0.2303, "step": 1033600 }, { "epoch": 14.241823041525446, "grad_norm": 3.9786689281463623, "learning_rate": 3.5783054839296976e-07, "loss": 0.2251, "step": 1033700 }, { "epoch": 14.243200793585187, "grad_norm": 
0.8122533559799194, "learning_rate": 3.565428659806297e-07, "loss": 0.1943, "step": 1033800 }, { "epoch": 14.244578545644925, "grad_norm": 1.5650519132614136, "learning_rate": 3.552574895438393e-07, "loss": 0.2648, "step": 1033900 }, { "epoch": 14.245956297704666, "grad_norm": 1.2846943140029907, "learning_rate": 3.5397441919180233e-07, "loss": 0.2161, "step": 1034000 }, { "epoch": 14.247334049764405, "grad_norm": 3.9933454990386963, "learning_rate": 3.526936550335089e-07, "loss": 0.2523, "step": 1034100 }, { "epoch": 14.248711801824143, "grad_norm": 1.4556498527526855, "learning_rate": 3.51415197177781e-07, "loss": 0.1964, "step": 1034200 }, { "epoch": 14.250089553883884, "grad_norm": 2.9921977519989014, "learning_rate": 3.5013904573322096e-07, "loss": 0.2148, "step": 1034300 }, { "epoch": 14.251467305943622, "grad_norm": 0.990747332572937, "learning_rate": 3.488652008082449e-07, "loss": 0.1995, "step": 1034400 }, { "epoch": 14.252845058003361, "grad_norm": 1.2467118501663208, "learning_rate": 3.475936625110734e-07, "loss": 0.2419, "step": 1034500 }, { "epoch": 14.254222810063101, "grad_norm": 0.8108887672424316, "learning_rate": 3.4632443094973487e-07, "loss": 0.1927, "step": 1034600 }, { "epoch": 14.25560056212284, "grad_norm": 1.928206205368042, "learning_rate": 3.450575062320499e-07, "loss": 0.2018, "step": 1034700 }, { "epoch": 14.25697831418258, "grad_norm": 1.2115625143051147, "learning_rate": 3.43792888465656e-07, "loss": 0.1865, "step": 1034800 }, { "epoch": 14.25835606624232, "grad_norm": 0.04710285738110542, "learning_rate": 3.425305777579893e-07, "loss": 0.1874, "step": 1034900 }, { "epoch": 14.259733818302058, "grad_norm": 5.471766948699951, "learning_rate": 3.4127057421628113e-07, "loss": 0.2167, "step": 1035000 }, { "epoch": 14.261111570361798, "grad_norm": 1.6544393301010132, "learning_rate": 3.400128779475828e-07, "loss": 0.1954, "step": 1035100 }, { "epoch": 14.262489322421537, "grad_norm": 3.087660074234009, "learning_rate": 
3.38757489058738e-07, "loss": 0.2379, "step": 1035200 }, { "epoch": 14.263867074481276, "grad_norm": 5.120940208435059, "learning_rate": 3.375044076564043e-07, "loss": 0.2025, "step": 1035300 }, { "epoch": 14.265244826541016, "grad_norm": 2.240692138671875, "learning_rate": 3.362536338470315e-07, "loss": 0.1794, "step": 1035400 }, { "epoch": 14.266622578600755, "grad_norm": 3.7346484661102295, "learning_rate": 3.350051677368801e-07, "loss": 0.2205, "step": 1035500 }, { "epoch": 14.268000330660495, "grad_norm": 0.34074950218200684, "learning_rate": 3.3375900943201835e-07, "loss": 0.2057, "step": 1035600 }, { "epoch": 14.269378082720234, "grad_norm": 2.1143715381622314, "learning_rate": 3.325151590383069e-07, "loss": 0.215, "step": 1035700 }, { "epoch": 14.270755834779973, "grad_norm": 3.158684015274048, "learning_rate": 3.3127361666142e-07, "loss": 0.1385, "step": 1035800 }, { "epoch": 14.272133586839713, "grad_norm": 2.260822057723999, "learning_rate": 3.300343824068397e-07, "loss": 0.1932, "step": 1035900 }, { "epoch": 14.273511338899452, "grad_norm": 11.548611640930176, "learning_rate": 3.287974563798343e-07, "loss": 0.2551, "step": 1036000 }, { "epoch": 14.27488909095919, "grad_norm": 3.603050470352173, "learning_rate": 3.2756283868549204e-07, "loss": 0.2327, "step": 1036100 }, { "epoch": 14.27626684301893, "grad_norm": 2.7603049278259277, "learning_rate": 3.2633052942869947e-07, "loss": 0.2235, "step": 1036200 }, { "epoch": 14.27764459507867, "grad_norm": 1.892252802848816, "learning_rate": 3.2510052871414785e-07, "loss": 0.1934, "step": 1036300 }, { "epoch": 14.279022347138408, "grad_norm": 0.1928088665008545, "learning_rate": 3.238728366463331e-07, "loss": 0.174, "step": 1036400 }, { "epoch": 14.280400099198149, "grad_norm": 0.5428099632263184, "learning_rate": 3.2264745332955566e-07, "loss": 0.2188, "step": 1036500 }, { "epoch": 14.281777851257887, "grad_norm": 1.455741047859192, "learning_rate": 3.214243788679114e-07, "loss": 0.2832, "step": 1036600 }, { 
"epoch": 14.283155603317628, "grad_norm": 3.8896660804748535, "learning_rate": 3.2020361336531303e-07, "loss": 0.2049, "step": 1036700 }, { "epoch": 14.284533355377366, "grad_norm": 2.817736864089966, "learning_rate": 3.1898515692546563e-07, "loss": 0.1822, "step": 1036800 }, { "epoch": 14.285911107437105, "grad_norm": 2.8720290660858154, "learning_rate": 3.177690096518912e-07, "loss": 0.2856, "step": 1036900 }, { "epoch": 14.287288859496845, "grad_norm": 1.3023791313171387, "learning_rate": 3.1655517164790093e-07, "loss": 0.2144, "step": 1037000 }, { "epoch": 14.288666611556584, "grad_norm": 1.185239553451538, "learning_rate": 3.1534364301661677e-07, "loss": 0.1773, "step": 1037100 }, { "epoch": 14.290044363616323, "grad_norm": 5.254380702972412, "learning_rate": 3.1413442386096836e-07, "loss": 0.2329, "step": 1037200 }, { "epoch": 14.291422115676063, "grad_norm": 1.3584585189819336, "learning_rate": 3.129275142836807e-07, "loss": 0.2291, "step": 1037300 }, { "epoch": 14.292799867735802, "grad_norm": 7.368195533752441, "learning_rate": 3.1172291438728657e-07, "loss": 0.2296, "step": 1037400 }, { "epoch": 14.294177619795542, "grad_norm": 2.3495419025421143, "learning_rate": 3.105206242741293e-07, "loss": 0.1709, "step": 1037500 }, { "epoch": 14.295555371855281, "grad_norm": 0.49491262435913086, "learning_rate": 3.0932064404634475e-07, "loss": 0.232, "step": 1037600 }, { "epoch": 14.29693312391502, "grad_norm": 2.5167791843414307, "learning_rate": 3.081229738058763e-07, "loss": 0.1803, "step": 1037700 }, { "epoch": 14.29831087597476, "grad_norm": 1.661225438117981, "learning_rate": 3.069276136544752e-07, "loss": 0.2078, "step": 1037800 }, { "epoch": 14.299688628034499, "grad_norm": 0.7844659686088562, "learning_rate": 3.0573456369369093e-07, "loss": 0.1832, "step": 1037900 }, { "epoch": 14.301066380094237, "grad_norm": 1.6307048797607422, "learning_rate": 3.0454382402488075e-07, "loss": 0.1902, "step": 1038000 }, { "epoch": 14.302444132153978, "grad_norm": 
4.161242485046387, "learning_rate": 3.033553947492035e-07, "loss": 0.1761, "step": 1038100 }, { "epoch": 14.303821884213717, "grad_norm": 0.62209552526474, "learning_rate": 3.0216927596762564e-07, "loss": 0.1998, "step": 1038200 }, { "epoch": 14.305199636273457, "grad_norm": 1.5336649417877197, "learning_rate": 3.00985467780909e-07, "loss": 0.1986, "step": 1038300 }, { "epoch": 14.306577388333196, "grad_norm": 2.5388731956481934, "learning_rate": 2.9980397028962325e-07, "loss": 0.1673, "step": 1038400 }, { "epoch": 14.307955140392934, "grad_norm": 1.5935280323028564, "learning_rate": 2.9862478359414856e-07, "loss": 0.229, "step": 1038500 }, { "epoch": 14.309332892452675, "grad_norm": 3.3074448108673096, "learning_rate": 2.9744790779466067e-07, "loss": 0.2502, "step": 1038600 }, { "epoch": 14.310710644512413, "grad_norm": 4.7681450843811035, "learning_rate": 2.9627334299113685e-07, "loss": 0.239, "step": 1038700 }, { "epoch": 14.312088396572152, "grad_norm": 1.3199881315231323, "learning_rate": 2.9510108928336806e-07, "loss": 0.2078, "step": 1038800 }, { "epoch": 14.313466148631893, "grad_norm": 2.629986047744751, "learning_rate": 2.9393114677094076e-07, "loss": 0.2086, "step": 1038900 }, { "epoch": 14.314843900691631, "grad_norm": 2.506103515625, "learning_rate": 2.92763515553249e-07, "loss": 0.231, "step": 1039000 }, { "epoch": 14.316221652751372, "grad_norm": 2.377462148666382, "learning_rate": 2.9159819572948553e-07, "loss": 0.2545, "step": 1039100 }, { "epoch": 14.31759940481111, "grad_norm": 2.1929492950439453, "learning_rate": 2.904351873986535e-07, "loss": 0.1833, "step": 1039200 }, { "epoch": 14.318977156870849, "grad_norm": 2.1182632446289062, "learning_rate": 2.892744906595579e-07, "loss": 0.1721, "step": 1039300 }, { "epoch": 14.32035490893059, "grad_norm": 1.0325742959976196, "learning_rate": 2.88116105610802e-07, "loss": 0.2236, "step": 1039400 }, { "epoch": 14.321732660990328, "grad_norm": 1.8284990787506104, "learning_rate": 2.8696003235079677e-07, 
"loss": 0.2339, "step": 1039500 }, { "epoch": 14.323110413050067, "grad_norm": 1.4432164430618286, "learning_rate": 2.858062709777579e-07, "loss": 0.1798, "step": 1039600 }, { "epoch": 14.324488165109807, "grad_norm": 0.15447640419006348, "learning_rate": 2.846548215897024e-07, "loss": 0.1717, "step": 1039700 }, { "epoch": 14.325865917169546, "grad_norm": 3.6791911125183105, "learning_rate": 2.8351716421237823e-07, "loss": 0.1807, "step": 1039800 }, { "epoch": 14.327243669229286, "grad_norm": 3.0506296157836914, "learning_rate": 2.8237031596527296e-07, "loss": 0.1826, "step": 1039900 }, { "epoch": 14.328621421289025, "grad_norm": 2.111396074295044, "learning_rate": 2.8122577999505325e-07, "loss": 0.2454, "step": 1040000 }, { "epoch": 14.329999173348764, "grad_norm": 3.7517457008361816, "learning_rate": 2.8008355639895457e-07, "loss": 0.211, "step": 1040100 }, { "epoch": 14.331376925408504, "grad_norm": 2.1078410148620605, "learning_rate": 2.7894364527401714e-07, "loss": 0.2764, "step": 1040200 }, { "epoch": 14.332754677468243, "grad_norm": 1.7874021530151367, "learning_rate": 2.7780604671707345e-07, "loss": 0.1847, "step": 1040300 }, { "epoch": 14.334132429527982, "grad_norm": 0.3600746989250183, "learning_rate": 2.76670760824779e-07, "loss": 0.2282, "step": 1040400 }, { "epoch": 14.335510181587722, "grad_norm": 2.2421083450317383, "learning_rate": 2.7553778769357245e-07, "loss": 0.2402, "step": 1040500 }, { "epoch": 14.33688793364746, "grad_norm": 4.3449015617370605, "learning_rate": 2.7440712741971545e-07, "loss": 0.2527, "step": 1040600 }, { "epoch": 14.3382656857072, "grad_norm": 0.47316887974739075, "learning_rate": 2.732900521230247e-07, "loss": 0.1971, "step": 1040700 }, { "epoch": 14.33964343776694, "grad_norm": 1.0531458854675293, "learning_rate": 2.721639947208546e-07, "loss": 0.2036, "step": 1040800 }, { "epoch": 14.341021189826678, "grad_norm": 3.1056787967681885, "learning_rate": 2.7104025046265414e-07, "loss": 0.2267, "step": 1040900 }, { "epoch": 
14.342398941886419, "grad_norm": 4.312679767608643, "learning_rate": 2.699188194438851e-07, "loss": 0.2364, "step": 1041000 }, { "epoch": 14.343776693946158, "grad_norm": 2.8823482990264893, "learning_rate": 2.6879970175981984e-07, "loss": 0.1825, "step": 1041100 }, { "epoch": 14.345154446005896, "grad_norm": 0.7732188701629639, "learning_rate": 2.6768289750553234e-07, "loss": 0.2046, "step": 1041200 }, { "epoch": 14.346532198065637, "grad_norm": 2.2512500286102295, "learning_rate": 2.665684067759042e-07, "loss": 0.2477, "step": 1041300 }, { "epoch": 14.347909950125375, "grad_norm": 0.3430367708206177, "learning_rate": 2.6545622966561546e-07, "loss": 0.2026, "step": 1041400 }, { "epoch": 14.349287702185114, "grad_norm": 3.5744383335113525, "learning_rate": 2.6434636626915087e-07, "loss": 0.2179, "step": 1041500 }, { "epoch": 14.350665454244854, "grad_norm": 1.2464557886123657, "learning_rate": 2.632388166807967e-07, "loss": 0.2235, "step": 1041600 }, { "epoch": 14.352043206304593, "grad_norm": 3.767289638519287, "learning_rate": 2.621335809946499e-07, "loss": 0.1569, "step": 1041700 }, { "epoch": 14.353420958364334, "grad_norm": 1.6704542636871338, "learning_rate": 2.6103065930460275e-07, "loss": 0.1786, "step": 1041800 }, { "epoch": 14.354798710424072, "grad_norm": 4.37132453918457, "learning_rate": 2.5993005170435236e-07, "loss": 0.2047, "step": 1041900 }, { "epoch": 14.35617646248381, "grad_norm": 2.1292221546173096, "learning_rate": 2.5883175828740947e-07, "loss": 0.2634, "step": 1042000 }, { "epoch": 14.357554214543551, "grad_norm": 3.1232669353485107, "learning_rate": 2.57735779147068e-07, "loss": 0.2209, "step": 1042100 }, { "epoch": 14.35893196660329, "grad_norm": 0.2823559641838074, "learning_rate": 2.56642114376445e-07, "loss": 0.2517, "step": 1042200 }, { "epoch": 14.360309718663029, "grad_norm": 5.655496120452881, "learning_rate": 2.555507640684496e-07, "loss": 0.1845, "step": 1042300 }, { "epoch": 14.361687470722769, "grad_norm": 1.1234709024429321, 
"learning_rate": 2.544617283157988e-07, "loss": 0.2384, "step": 1042400 }, { "epoch": 14.363065222782508, "grad_norm": 1.0338001251220703, "learning_rate": 2.533750072110111e-07, "loss": 0.19, "step": 1042500 }, { "epoch": 14.364442974842248, "grad_norm": 0.4390679895877838, "learning_rate": 2.522906008464096e-07, "loss": 0.2171, "step": 1042600 }, { "epoch": 14.365820726901987, "grad_norm": 3.9069883823394775, "learning_rate": 2.51208509314122e-07, "loss": 0.2064, "step": 1042700 }, { "epoch": 14.367198478961726, "grad_norm": 1.9762722253799438, "learning_rate": 2.501287327060714e-07, "loss": 0.2292, "step": 1042800 }, { "epoch": 14.368576231021466, "grad_norm": 2.0985963344573975, "learning_rate": 2.490512711139978e-07, "loss": 0.216, "step": 1042900 }, { "epoch": 14.369953983081205, "grad_norm": 0.6314952969551086, "learning_rate": 2.4797612462943044e-07, "loss": 0.1884, "step": 1043000 }, { "epoch": 14.371331735140943, "grad_norm": 3.535656452178955, "learning_rate": 2.4690329334371544e-07, "loss": 0.2505, "step": 1043100 }, { "epoch": 14.372709487200684, "grad_norm": 1.6125764846801758, "learning_rate": 2.4583277734798823e-07, "loss": 0.1939, "step": 1043200 }, { "epoch": 14.374087239260422, "grad_norm": 2.146817445755005, "learning_rate": 2.4476457673320104e-07, "loss": 0.1761, "step": 1043300 }, { "epoch": 14.375464991320163, "grad_norm": 1.5963634252548218, "learning_rate": 2.4369869159009546e-07, "loss": 0.2002, "step": 1043400 }, { "epoch": 14.376842743379902, "grad_norm": 0.5893296003341675, "learning_rate": 2.426351220092329e-07, "loss": 0.2026, "step": 1043500 }, { "epoch": 14.37822049543964, "grad_norm": 4.40700101852417, "learning_rate": 2.415738680809612e-07, "loss": 0.247, "step": 1043600 }, { "epoch": 14.37959824749938, "grad_norm": 2.344078540802002, "learning_rate": 2.405149298954448e-07, "loss": 0.2117, "step": 1043700 }, { "epoch": 14.38097599955912, "grad_norm": 4.357010364532471, "learning_rate": 2.3945830754264377e-07, "loss": 0.231, 
"step": 1043800 }, { "epoch": 14.382353751618858, "grad_norm": 1.6458518505096436, "learning_rate": 2.384145327125134e-07, "loss": 0.2255, "step": 1043900 }, { "epoch": 14.383731503678598, "grad_norm": 1.5913604497909546, "learning_rate": 2.373625191336809e-07, "loss": 0.1868, "step": 1044000 }, { "epoch": 14.385109255738337, "grad_norm": 1.3978865146636963, "learning_rate": 2.3631282165537366e-07, "loss": 0.1935, "step": 1044100 }, { "epoch": 14.386487007798078, "grad_norm": 0.16621717810630798, "learning_rate": 2.3526544036677312e-07, "loss": 0.215, "step": 1044200 }, { "epoch": 14.387864759857816, "grad_norm": 1.7409483194351196, "learning_rate": 2.342203753568592e-07, "loss": 0.2474, "step": 1044300 }, { "epoch": 14.389242511917555, "grad_norm": 1.9124393463134766, "learning_rate": 2.3317762671441646e-07, "loss": 0.1718, "step": 1044400 }, { "epoch": 14.390620263977295, "grad_norm": 7.344611644744873, "learning_rate": 2.321371945280279e-07, "loss": 0.2719, "step": 1044500 }, { "epoch": 14.391998016037034, "grad_norm": 2.652233839035034, "learning_rate": 2.3109907888608727e-07, "loss": 0.2683, "step": 1044600 }, { "epoch": 14.393375768096773, "grad_norm": 1.747808575630188, "learning_rate": 2.3006327987678683e-07, "loss": 0.2501, "step": 1044700 }, { "epoch": 14.394753520156513, "grad_norm": 1.9243394136428833, "learning_rate": 2.2902979758812336e-07, "loss": 0.1941, "step": 1044800 }, { "epoch": 14.396131272216252, "grad_norm": 11.358366966247559, "learning_rate": 2.2799863210789529e-07, "loss": 0.209, "step": 1044900 }, { "epoch": 14.39750902427599, "grad_norm": 1.5777690410614014, "learning_rate": 2.2696978352370558e-07, "loss": 0.2328, "step": 1045000 }, { "epoch": 14.39888677633573, "grad_norm": 1.2036610841751099, "learning_rate": 2.2594325192295874e-07, "loss": 0.1965, "step": 1045100 }, { "epoch": 14.40026452839547, "grad_norm": 2.236189842224121, "learning_rate": 2.2491903739287e-07, "loss": 0.2193, "step": 1045200 }, { "epoch": 14.40164228045521, 
"grad_norm": 3.3195059299468994, "learning_rate": 2.2389714002044398e-07, "loss": 0.1785, "step": 1045300 }, { "epoch": 14.403020032514949, "grad_norm": 2.5880942344665527, "learning_rate": 2.2287755989250203e-07, "loss": 0.221, "step": 1045400 }, { "epoch": 14.404397784574687, "grad_norm": 2.9236528873443604, "learning_rate": 2.2186029709565791e-07, "loss": 0.2898, "step": 1045500 }, { "epoch": 14.405775536634428, "grad_norm": 1.9024989604949951, "learning_rate": 2.2084535171633613e-07, "loss": 0.2406, "step": 1045600 }, { "epoch": 14.407153288694166, "grad_norm": 2.324446439743042, "learning_rate": 2.198327238407627e-07, "loss": 0.2347, "step": 1045700 }, { "epoch": 14.408531040753905, "grad_norm": 0.6329560875892639, "learning_rate": 2.1882241355496213e-07, "loss": 0.1909, "step": 1045800 }, { "epoch": 14.409908792813646, "grad_norm": 1.1651264429092407, "learning_rate": 2.1781442094476968e-07, "loss": 0.216, "step": 1045900 }, { "epoch": 14.411286544873384, "grad_norm": 3.119415283203125, "learning_rate": 2.1681879137110339e-07, "loss": 0.1995, "step": 1046000 }, { "epoch": 14.412664296933125, "grad_norm": 2.7042925357818604, "learning_rate": 2.1581541118993852e-07, "loss": 0.1916, "step": 1046100 }, { "epoch": 14.414042048992863, "grad_norm": 0.9542060494422913, "learning_rate": 2.1481434893983576e-07, "loss": 0.1834, "step": 1046200 }, { "epoch": 14.415419801052602, "grad_norm": 1.2856370210647583, "learning_rate": 2.1381560470584566e-07, "loss": 0.1932, "step": 1046300 }, { "epoch": 14.416797553112342, "grad_norm": 3.630920648574829, "learning_rate": 2.128291313592698e-07, "loss": 0.2054, "step": 1046400 }, { "epoch": 14.418175305172081, "grad_norm": 1.5007926225662231, "learning_rate": 2.1183500022957657e-07, "loss": 0.1866, "step": 1046500 }, { "epoch": 14.41955305723182, "grad_norm": 3.446928024291992, "learning_rate": 2.108431873691022e-07, "loss": 0.2541, "step": 1046600 }, { "epoch": 14.42093080929156, "grad_norm": 10.052985191345215, "learning_rate": 
2.0985369286211253e-07, "loss": 0.1924, "step": 1046700 }, { "epoch": 14.422308561351299, "grad_norm": 1.9225763082504272, "learning_rate": 2.0886651679266593e-07, "loss": 0.196, "step": 1046800 }, { "epoch": 14.42368631341104, "grad_norm": 1.3549712896347046, "learning_rate": 2.0788165924463144e-07, "loss": 0.2095, "step": 1046900 }, { "epoch": 14.425064065470778, "grad_norm": 1.2966724634170532, "learning_rate": 2.0689912030167347e-07, "loss": 0.1784, "step": 1047000 }, { "epoch": 14.426441817530517, "grad_norm": 1.7539267539978027, "learning_rate": 2.0591890004726416e-07, "loss": 0.169, "step": 1047100 }, { "epoch": 14.427819569590257, "grad_norm": 2.6548092365264893, "learning_rate": 2.049409985646833e-07, "loss": 0.1441, "step": 1047200 }, { "epoch": 14.429197321649996, "grad_norm": 0.853512167930603, "learning_rate": 2.0396541593700302e-07, "loss": 0.1988, "step": 1047300 }, { "epoch": 14.430575073709734, "grad_norm": 1.3353285789489746, "learning_rate": 2.029921522471062e-07, "loss": 0.1963, "step": 1047400 }, { "epoch": 14.431952825769475, "grad_norm": 1.4992989301681519, "learning_rate": 2.0202120757768032e-07, "loss": 0.1771, "step": 1047500 }, { "epoch": 14.433330577829214, "grad_norm": 3.35964298248291, "learning_rate": 2.0105258201120525e-07, "loss": 0.1991, "step": 1047600 }, { "epoch": 14.434708329888954, "grad_norm": 1.370593547821045, "learning_rate": 2.0008627562997456e-07, "loss": 0.207, "step": 1047700 }, { "epoch": 14.436086081948693, "grad_norm": 3.285459518432617, "learning_rate": 1.9912228851608038e-07, "loss": 0.216, "step": 1047800 }, { "epoch": 14.437463834008431, "grad_norm": 2.591484546661377, "learning_rate": 1.981606207514164e-07, "loss": 0.2149, "step": 1047900 }, { "epoch": 14.438841586068172, "grad_norm": 0.5036186575889587, "learning_rate": 1.9720127241768081e-07, "loss": 0.2712, "step": 1048000 }, { "epoch": 14.44021933812791, "grad_norm": 2.6519598960876465, "learning_rate": 1.9624424359638266e-07, "loss": 0.249, "step": 1048100 
}, { "epoch": 14.44159709018765, "grad_norm": 1.8656333684921265, "learning_rate": 1.952895343688202e-07, "loss": 0.1891, "step": 1048200 }, { "epoch": 14.44297484224739, "grad_norm": 4.056741237640381, "learning_rate": 1.9433714481609938e-07, "loss": 0.2325, "step": 1048300 }, { "epoch": 14.444352594307128, "grad_norm": 1.381493091583252, "learning_rate": 1.9338707501913383e-07, "loss": 0.2016, "step": 1048400 }, { "epoch": 14.445730346366869, "grad_norm": 3.2945094108581543, "learning_rate": 1.9243932505863565e-07, "loss": 0.1888, "step": 1048500 }, { "epoch": 14.447108098426607, "grad_norm": 4.610701084136963, "learning_rate": 1.9149389501512155e-07, "loss": 0.2216, "step": 1048600 }, { "epoch": 14.448485850486346, "grad_norm": 2.909604787826538, "learning_rate": 1.9055078496891282e-07, "loss": 0.1726, "step": 1048700 }, { "epoch": 14.449863602546086, "grad_norm": 1.805124282836914, "learning_rate": 1.896099950001262e-07, "loss": 0.1931, "step": 1048800 }, { "epoch": 14.451241354605825, "grad_norm": 3.58864688873291, "learning_rate": 1.886715251886892e-07, "loss": 0.2032, "step": 1048900 }, { "epoch": 14.452619106665564, "grad_norm": 3.1392288208007812, "learning_rate": 1.8773537561433075e-07, "loss": 0.2271, "step": 1049000 }, { "epoch": 14.453996858725304, "grad_norm": 0.464388370513916, "learning_rate": 1.8680154635658142e-07, "loss": 0.2278, "step": 1049100 }, { "epoch": 14.455374610785043, "grad_norm": 1.773710012435913, "learning_rate": 1.858700374947764e-07, "loss": 0.1921, "step": 1049200 }, { "epoch": 14.456752362844782, "grad_norm": 1.4253270626068115, "learning_rate": 1.849408491080462e-07, "loss": 0.2238, "step": 1049300 }, { "epoch": 14.458130114904522, "grad_norm": 0.6480837464332581, "learning_rate": 1.840139812753322e-07, "loss": 0.1597, "step": 1049400 }, { "epoch": 14.45950786696426, "grad_norm": 0.861703097820282, "learning_rate": 1.830894340753833e-07, "loss": 0.2195, "step": 1049500 }, { "epoch": 14.460885619024001, "grad_norm": 
2.280820846557617, "learning_rate": 1.8216720758673478e-07, "loss": 0.2362, "step": 1049600 }, { "epoch": 14.46226337108374, "grad_norm": 0.08329978585243225, "learning_rate": 1.8124730188774168e-07, "loss": 0.2407, "step": 1049700 }, { "epoch": 14.463641123143478, "grad_norm": 1.8552747964859009, "learning_rate": 1.8032971705655155e-07, "loss": 0.1436, "step": 1049800 }, { "epoch": 14.465018875203219, "grad_norm": 0.7528455257415771, "learning_rate": 1.794144531711195e-07, "loss": 0.1894, "step": 1049900 }, { "epoch": 14.466396627262958, "grad_norm": 1.3289313316345215, "learning_rate": 1.7850151030919915e-07, "loss": 0.2204, "step": 1050000 }, { "epoch": 14.467774379322696, "grad_norm": 0.7229489088058472, "learning_rate": 1.7759088854835183e-07, "loss": 0.1797, "step": 1050100 }, { "epoch": 14.469152131382437, "grad_norm": 4.091299057006836, "learning_rate": 1.766825879659373e-07, "loss": 0.1949, "step": 1050200 }, { "epoch": 14.470529883442175, "grad_norm": 3.66206955909729, "learning_rate": 1.7577660863912604e-07, "loss": 0.1907, "step": 1050300 }, { "epoch": 14.471907635501916, "grad_norm": 1.943058967590332, "learning_rate": 1.748729506448779e-07, "loss": 0.1852, "step": 1050400 }, { "epoch": 14.473285387561655, "grad_norm": 5.444952487945557, "learning_rate": 1.7397161405996953e-07, "loss": 0.2327, "step": 1050500 }, { "epoch": 14.474663139621393, "grad_norm": 2.428382396697998, "learning_rate": 1.7307259896096994e-07, "loss": 0.2187, "step": 1050600 }, { "epoch": 14.476040891681134, "grad_norm": 1.6242905855178833, "learning_rate": 1.7217590542425582e-07, "loss": 0.1826, "step": 1050700 }, { "epoch": 14.477418643740872, "grad_norm": 2.304988384246826, "learning_rate": 1.712815335260115e-07, "loss": 0.2175, "step": 1050800 }, { "epoch": 14.478796395800611, "grad_norm": 12.82276439666748, "learning_rate": 1.7038948334221066e-07, "loss": 0.213, "step": 1050900 }, { "epoch": 14.480174147860351, "grad_norm": 0.9803490042686462, "learning_rate": 
1.6950864073946554e-07, "loss": 0.1998, "step": 1051000 }, { "epoch": 14.48155189992009, "grad_norm": 2.679659128189087, "learning_rate": 1.686212109926788e-07, "loss": 0.1887, "step": 1051100 }, { "epoch": 14.48292965197983, "grad_norm": 0.6335778832435608, "learning_rate": 1.6773610318635142e-07, "loss": 0.2328, "step": 1051200 }, { "epoch": 14.48430740403957, "grad_norm": 3.6239070892333984, "learning_rate": 1.6685331739566934e-07, "loss": 0.1979, "step": 1051300 }, { "epoch": 14.485685156099308, "grad_norm": 4.630417346954346, "learning_rate": 1.6597285369563536e-07, "loss": 0.2133, "step": 1051400 }, { "epoch": 14.487062908159048, "grad_norm": 2.279193878173828, "learning_rate": 1.6509471216104772e-07, "loss": 0.1993, "step": 1051500 }, { "epoch": 14.488440660218787, "grad_norm": 1.0107131004333496, "learning_rate": 1.6421889286650616e-07, "loss": 0.2402, "step": 1051600 }, { "epoch": 14.489818412278526, "grad_norm": 6.315639972686768, "learning_rate": 1.6334539588642116e-07, "loss": 0.2363, "step": 1051700 }, { "epoch": 14.491196164338266, "grad_norm": 0.8674686551094055, "learning_rate": 1.6247422129499562e-07, "loss": 0.2261, "step": 1051800 }, { "epoch": 14.492573916398005, "grad_norm": 2.2753660678863525, "learning_rate": 1.6160536916624614e-07, "loss": 0.2249, "step": 1051900 }, { "epoch": 14.493951668457745, "grad_norm": 0.1291486620903015, "learning_rate": 1.6073883957397866e-07, "loss": 0.1907, "step": 1052000 }, { "epoch": 14.495329420517484, "grad_norm": 2.8889153003692627, "learning_rate": 1.59874632591816e-07, "loss": 0.242, "step": 1052100 }, { "epoch": 14.496707172577223, "grad_norm": 2.4183170795440674, "learning_rate": 1.5901274829317635e-07, "loss": 0.1841, "step": 1052200 }, { "epoch": 14.498084924636963, "grad_norm": 2.2250900268554688, "learning_rate": 1.5815318675127644e-07, "loss": 0.2223, "step": 1052300 }, { "epoch": 14.499462676696702, "grad_norm": 1.2737767696380615, "learning_rate": 1.5729594803914365e-07, "loss": 0.2176, "step": 
1052400 }, { "epoch": 14.50084042875644, "grad_norm": 1.689103126525879, "learning_rate": 1.56441032229607e-07, "loss": 0.216, "step": 1052500 }, { "epoch": 14.50221818081618, "grad_norm": 0.18839338421821594, "learning_rate": 1.5558843939529083e-07, "loss": 0.2649, "step": 1052600 }, { "epoch": 14.50359593287592, "grad_norm": 4.971494674682617, "learning_rate": 1.5473816960862724e-07, "loss": 0.1975, "step": 1052700 }, { "epoch": 14.50497368493566, "grad_norm": 1.2953412532806396, "learning_rate": 1.53890222941859e-07, "loss": 0.2142, "step": 1052800 }, { "epoch": 14.506351436995399, "grad_norm": 4.013678073883057, "learning_rate": 1.5304459946701517e-07, "loss": 0.1891, "step": 1052900 }, { "epoch": 14.507729189055137, "grad_norm": 1.1028567552566528, "learning_rate": 1.5220129925593852e-07, "loss": 0.2217, "step": 1053000 }, { "epoch": 14.509106941114878, "grad_norm": 1.6715927124023438, "learning_rate": 1.5136872064828445e-07, "loss": 0.223, "step": 1053100 }, { "epoch": 14.510484693174616, "grad_norm": 0.015158869326114655, "learning_rate": 1.5053004394504939e-07, "loss": 0.22, "step": 1053200 }, { "epoch": 14.511862445234355, "grad_norm": 3.99090313911438, "learning_rate": 1.4969369071921091e-07, "loss": 0.207, "step": 1053300 }, { "epoch": 14.513240197294095, "grad_norm": 3.285388231277466, "learning_rate": 1.4885966104181191e-07, "loss": 0.2281, "step": 1053400 }, { "epoch": 14.514617949353834, "grad_norm": 3.254018783569336, "learning_rate": 1.4802795498371519e-07, "loss": 0.2487, "step": 1053500 }, { "epoch": 14.515995701413573, "grad_norm": 1.1891891956329346, "learning_rate": 1.4719857261557585e-07, "loss": 0.2147, "step": 1053600 }, { "epoch": 14.517373453473313, "grad_norm": 1.6442031860351562, "learning_rate": 1.4637151400785366e-07, "loss": 0.2143, "step": 1053700 }, { "epoch": 14.518751205533052, "grad_norm": 1.2956016063690186, "learning_rate": 1.4554677923080988e-07, "loss": 0.161, "step": 1053800 }, { "epoch": 14.520128957592792, "grad_norm": 
0.8181973099708557, "learning_rate": 1.4472436835451352e-07, "loss": 0.2314, "step": 1053900 }, { "epoch": 14.521506709652531, "grad_norm": 3.5142598152160645, "learning_rate": 1.4390428144882894e-07, "loss": 0.1941, "step": 1054000 }, { "epoch": 14.52288446171227, "grad_norm": 3.131718873977661, "learning_rate": 1.4308651858343125e-07, "loss": 0.21, "step": 1054100 }, { "epoch": 14.52426221377201, "grad_norm": 0.9682480692863464, "learning_rate": 1.4227107982778792e-07, "loss": 0.255, "step": 1054200 }, { "epoch": 14.525639965831749, "grad_norm": 0.4797624945640564, "learning_rate": 1.4145796525117717e-07, "loss": 0.2381, "step": 1054300 }, { "epoch": 14.527017717891487, "grad_norm": 3.029003143310547, "learning_rate": 1.406471749226787e-07, "loss": 0.2386, "step": 1054400 }, { "epoch": 14.528395469951228, "grad_norm": 0.8405803442001343, "learning_rate": 1.3983870891117384e-07, "loss": 0.2399, "step": 1054500 }, { "epoch": 14.529773222010967, "grad_norm": 3.6884028911590576, "learning_rate": 1.3903256728534236e-07, "loss": 0.1855, "step": 1054600 }, { "epoch": 14.531150974070707, "grad_norm": 1.092498779296875, "learning_rate": 1.3822875011367474e-07, "loss": 0.1926, "step": 1054700 }, { "epoch": 14.532528726130446, "grad_norm": 0.5491926074028015, "learning_rate": 1.374272574644539e-07, "loss": 0.1999, "step": 1054800 }, { "epoch": 14.533906478190184, "grad_norm": 3.4490442276000977, "learning_rate": 1.3662808940577336e-07, "loss": 0.2043, "step": 1054900 }, { "epoch": 14.535284230249925, "grad_norm": 3.400589942932129, "learning_rate": 1.3583124600552833e-07, "loss": 0.1838, "step": 1055000 }, { "epoch": 14.536661982309663, "grad_norm": 2.4204816818237305, "learning_rate": 1.3503672733141237e-07, "loss": 0.2028, "step": 1055100 }, { "epoch": 14.538039734369402, "grad_norm": 0.10947739332914352, "learning_rate": 1.3424453345092376e-07, "loss": 0.2289, "step": 1055200 }, { "epoch": 14.539417486429143, "grad_norm": 1.800245761871338, "learning_rate": 
1.334546644313653e-07, "loss": 0.2524, "step": 1055300 }, { "epoch": 14.540795238488881, "grad_norm": 3.0236077308654785, "learning_rate": 1.326671203398383e-07, "loss": 0.2177, "step": 1055400 }, { "epoch": 14.542172990548622, "grad_norm": 1.1610027551651, "learning_rate": 1.3188190124325177e-07, "loss": 0.1819, "step": 1055500 }, { "epoch": 14.54355074260836, "grad_norm": 2.971428871154785, "learning_rate": 1.3109900720831015e-07, "loss": 0.2543, "step": 1055600 }, { "epoch": 14.544928494668099, "grad_norm": 2.1985690593719482, "learning_rate": 1.303262324809934e-07, "loss": 0.2089, "step": 1055700 }, { "epoch": 14.54630624672784, "grad_norm": 1.9219588041305542, "learning_rate": 1.2954796551640946e-07, "loss": 0.2119, "step": 1055800 }, { "epoch": 14.547683998787578, "grad_norm": 1.1498929262161255, "learning_rate": 1.287720238117518e-07, "loss": 0.1835, "step": 1055900 }, { "epoch": 14.549061750847317, "grad_norm": 0.21879388391971588, "learning_rate": 1.2799840743294016e-07, "loss": 0.213, "step": 1056000 }, { "epoch": 14.550439502907057, "grad_norm": 0.8429954648017883, "learning_rate": 1.2722711644569895e-07, "loss": 0.2225, "step": 1056100 }, { "epoch": 14.551817254966796, "grad_norm": 1.203163504600525, "learning_rate": 1.2645815091555103e-07, "loss": 0.2079, "step": 1056200 }, { "epoch": 14.553195007026535, "grad_norm": 2.9745748043060303, "learning_rate": 1.256915109078269e-07, "loss": 0.2331, "step": 1056300 }, { "epoch": 14.554572759086275, "grad_norm": 1.7408654689788818, "learning_rate": 1.2492719648765554e-07, "loss": 0.2146, "step": 1056400 }, { "epoch": 14.555950511146014, "grad_norm": 2.820443868637085, "learning_rate": 1.2417281609545278e-07, "loss": 0.2372, "step": 1056500 }, { "epoch": 14.557328263205754, "grad_norm": 1.4155102968215942, "learning_rate": 1.2341312978748964e-07, "loss": 0.2082, "step": 1056600 }, { "epoch": 14.558706015265493, "grad_norm": 4.878104209899902, "learning_rate": 1.2265576926064365e-07, "loss": 0.19, "step": 1056700 
}, { "epoch": 14.560083767325231, "grad_norm": 5.549022197723389, "learning_rate": 1.2190073457924694e-07, "loss": 0.2395, "step": 1056800 }, { "epoch": 14.561461519384972, "grad_norm": 4.503925323486328, "learning_rate": 1.2114802580745155e-07, "loss": 0.2035, "step": 1056900 }, { "epoch": 14.56283927144471, "grad_norm": 1.402661681175232, "learning_rate": 1.2039764300920187e-07, "loss": 0.1996, "step": 1057000 }, { "epoch": 14.564217023504451, "grad_norm": 5.117333889007568, "learning_rate": 1.1964958624824386e-07, "loss": 0.2313, "step": 1057100 }, { "epoch": 14.56559477556419, "grad_norm": 4.271954536437988, "learning_rate": 1.1890385558812806e-07, "loss": 0.1967, "step": 1057200 }, { "epoch": 14.566972527623928, "grad_norm": 2.0071210861206055, "learning_rate": 1.1816045109221268e-07, "loss": 0.19, "step": 1057300 }, { "epoch": 14.568350279683669, "grad_norm": 1.0216478109359741, "learning_rate": 1.174193728236514e-07, "loss": 0.2271, "step": 1057400 }, { "epoch": 14.569728031743407, "grad_norm": 3.0407822132110596, "learning_rate": 1.1668062084540243e-07, "loss": 0.2184, "step": 1057500 }, { "epoch": 14.571105783803146, "grad_norm": 2.05863356590271, "learning_rate": 1.1594419522022865e-07, "loss": 0.2326, "step": 1057600 }, { "epoch": 14.572483535862887, "grad_norm": 1.6558059453964233, "learning_rate": 1.1521009601068833e-07, "loss": 0.2138, "step": 1057700 }, { "epoch": 14.573861287922625, "grad_norm": 2.2125537395477295, "learning_rate": 1.1447832327915048e-07, "loss": 0.2411, "step": 1057800 }, { "epoch": 14.575239039982364, "grad_norm": 1.699074149131775, "learning_rate": 1.1374887708778565e-07, "loss": 0.1932, "step": 1057900 }, { "epoch": 14.576616792042104, "grad_norm": 2.9789435863494873, "learning_rate": 1.1302175749855676e-07, "loss": 0.1811, "step": 1058000 }, { "epoch": 14.577994544101843, "grad_norm": 2.0149011611938477, "learning_rate": 1.1229696457324357e-07, "loss": 0.1888, "step": 1058100 }, { "epoch": 14.579372296161583, "grad_norm": 
1.9105106592178345, "learning_rate": 1.1157449837341516e-07, "loss": 0.194, "step": 1058200 }, { "epoch": 14.580750048221322, "grad_norm": 1.6746035814285278, "learning_rate": 1.1085435896045437e-07, "loss": 0.2688, "step": 1058300 }, { "epoch": 14.58212780028106, "grad_norm": 2.158827066421509, "learning_rate": 1.1013654639553644e-07, "loss": 0.2634, "step": 1058400 }, { "epoch": 14.583505552340801, "grad_norm": 0.6203633546829224, "learning_rate": 1.0942106073964425e-07, "loss": 0.1886, "step": 1058500 }, { "epoch": 14.58488330440054, "grad_norm": 5.5968217849731445, "learning_rate": 1.0870790205356223e-07, "loss": 0.1952, "step": 1058600 }, { "epoch": 14.586261056460279, "grad_norm": 0.3262626826763153, "learning_rate": 1.0799707039787638e-07, "loss": 0.2004, "step": 1058700 }, { "epoch": 14.587638808520019, "grad_norm": 3.674304723739624, "learning_rate": 1.072885658329742e-07, "loss": 0.2524, "step": 1058800 }, { "epoch": 14.589016560579758, "grad_norm": 1.5658506155014038, "learning_rate": 1.0658238841905088e-07, "loss": 0.2123, "step": 1058900 }, { "epoch": 14.590394312639498, "grad_norm": 4.069824695587158, "learning_rate": 1.0587853821609705e-07, "loss": 0.2105, "step": 1059000 }, { "epoch": 14.591772064699237, "grad_norm": 1.7818678617477417, "learning_rate": 1.05177015283911e-07, "loss": 0.1986, "step": 1059100 }, { "epoch": 14.593149816758975, "grad_norm": 10.689050674438477, "learning_rate": 1.0447781968208336e-07, "loss": 0.1888, "step": 1059200 }, { "epoch": 14.594527568818716, "grad_norm": 0.26548296213150024, "learning_rate": 1.0378095147002164e-07, "loss": 0.1931, "step": 1059300 }, { "epoch": 14.595905320878455, "grad_norm": 1.739495873451233, "learning_rate": 1.0308641070692875e-07, "loss": 0.1732, "step": 1059400 }, { "epoch": 14.597283072938193, "grad_norm": 1.6456894874572754, "learning_rate": 1.0239419745180611e-07, "loss": 0.1993, "step": 1059500 }, { "epoch": 14.598660824997934, "grad_norm": 3.7128641605377197, "learning_rate": 
1.0170431176345974e-07, "loss": 0.2487, "step": 1059600 }, { "epoch": 14.600038577057672, "grad_norm": 2.0437982082366943, "learning_rate": 1.0101675370050329e-07, "loss": 0.1728, "step": 1059700 }, { "epoch": 14.601416329117413, "grad_norm": 5.03501033782959, "learning_rate": 1.0033152332134282e-07, "loss": 0.2589, "step": 1059800 }, { "epoch": 14.602794081177151, "grad_norm": 1.3488279581069946, "learning_rate": 9.964862068419512e-08, "loss": 0.2037, "step": 1059900 }, { "epoch": 14.60417183323689, "grad_norm": 2.6241307258605957, "learning_rate": 9.897484007264951e-08, "loss": 0.1876, "step": 1060000 }, { "epoch": 14.60554958529663, "grad_norm": 1.7825360298156738, "learning_rate": 9.829656981451524e-08, "loss": 0.2385, "step": 1060100 }, { "epoch": 14.60692733735637, "grad_norm": 1.813906192779541, "learning_rate": 9.762062747127626e-08, "loss": 0.2377, "step": 1060200 }, { "epoch": 14.608305089416108, "grad_norm": 2.9254839420318604, "learning_rate": 9.694701310035248e-08, "loss": 0.1574, "step": 1060300 }, { "epoch": 14.609682841475848, "grad_norm": 1.04872727394104, "learning_rate": 9.627572675897145e-08, "loss": 0.1972, "step": 1060400 }, { "epoch": 14.611060593535587, "grad_norm": 0.01565225049853325, "learning_rate": 9.560676850416534e-08, "loss": 0.1872, "step": 1060500 }, { "epoch": 14.612438345595326, "grad_norm": 1.5857715606689453, "learning_rate": 9.494013839276173e-08, "loss": 0.245, "step": 1060600 }, { "epoch": 14.613816097655066, "grad_norm": 12.168656349182129, "learning_rate": 9.427583648140198e-08, "loss": 0.1845, "step": 1060700 }, { "epoch": 14.615193849714805, "grad_norm": 4.860556125640869, "learning_rate": 9.361386282651374e-08, "loss": 0.1731, "step": 1060800 }, { "epoch": 14.616571601774545, "grad_norm": 3.238278388977051, "learning_rate": 9.295421748433536e-08, "loss": 0.2143, "step": 1060900 }, { "epoch": 14.617949353834284, "grad_norm": 2.28961443901062, "learning_rate": 9.229690051091283e-08, "loss": 0.1696, "step": 1061000 }, { 
"epoch": 14.619327105894023, "grad_norm": 1.147296667098999, "learning_rate": 9.164191196208455e-08, "loss": 0.218, "step": 1061100 }, { "epoch": 14.620704857953763, "grad_norm": 0.628308117389679, "learning_rate": 9.098925189349655e-08, "loss": 0.1883, "step": 1061200 }, { "epoch": 14.622082610013502, "grad_norm": 2.5557363033294678, "learning_rate": 9.033892036059643e-08, "loss": 0.1671, "step": 1061300 }, { "epoch": 14.623460362073242, "grad_norm": 2.456542491912842, "learning_rate": 8.969091741863028e-08, "loss": 0.2625, "step": 1061400 }, { "epoch": 14.62483811413298, "grad_norm": 2.364161491394043, "learning_rate": 8.904524312265184e-08, "loss": 0.1918, "step": 1061500 }, { "epoch": 14.62621586619272, "grad_norm": 1.5944273471832275, "learning_rate": 8.840189752751027e-08, "loss": 0.1826, "step": 1061600 }, { "epoch": 14.62759361825246, "grad_norm": 1.6017801761627197, "learning_rate": 8.776088068786852e-08, "loss": 0.2363, "step": 1061700 }, { "epoch": 14.628971370312199, "grad_norm": 2.457019329071045, "learning_rate": 8.712219265817889e-08, "loss": 0.2035, "step": 1061800 }, { "epoch": 14.630349122371937, "grad_norm": 0.2548597455024719, "learning_rate": 8.648583349270128e-08, "loss": 0.2163, "step": 1061900 }, { "epoch": 14.631726874431678, "grad_norm": 1.0474525690078735, "learning_rate": 8.585180324549718e-08, "loss": 0.225, "step": 1062000 }, { "epoch": 14.633104626491416, "grad_norm": 0.396475225687027, "learning_rate": 8.522010197043572e-08, "loss": 0.2023, "step": 1062100 }, { "epoch": 14.634482378551155, "grad_norm": 4.0248703956604, "learning_rate": 8.459072972117843e-08, "loss": 0.2102, "step": 1062200 }, { "epoch": 14.635860130610896, "grad_norm": 4.017994403839111, "learning_rate": 8.396368655119446e-08, "loss": 0.1701, "step": 1062300 }, { "epoch": 14.637237882670634, "grad_norm": 4.064123153686523, "learning_rate": 8.333897251375455e-08, "loss": 0.1793, "step": 1062400 }, { "epoch": 14.638615634730375, "grad_norm": 1.98574960231781, 
"learning_rate": 8.271658766193402e-08, "loss": 0.208, "step": 1062500 }, { "epoch": 14.639993386790113, "grad_norm": 0.863537073135376, "learning_rate": 8.209653204860057e-08, "loss": 0.2334, "step": 1062600 }, { "epoch": 14.641371138849852, "grad_norm": 0.8978185653686523, "learning_rate": 8.147880572644178e-08, "loss": 0.1951, "step": 1062700 }, { "epoch": 14.642748890909592, "grad_norm": 1.044740080833435, "learning_rate": 8.086340874792542e-08, "loss": 0.2432, "step": 1062800 }, { "epoch": 14.644126642969331, "grad_norm": 2.7601523399353027, "learning_rate": 8.025034116533908e-08, "loss": 0.2094, "step": 1062900 }, { "epoch": 14.64550439502907, "grad_norm": 1.0072929859161377, "learning_rate": 7.963960303076584e-08, "loss": 0.1743, "step": 1063000 }, { "epoch": 14.64688214708881, "grad_norm": 1.8595679998397827, "learning_rate": 7.90311943960903e-08, "loss": 0.2207, "step": 1063100 }, { "epoch": 14.648259899148549, "grad_norm": 7.076990604400635, "learning_rate": 7.842511531299862e-08, "loss": 0.2471, "step": 1063200 }, { "epoch": 14.64963765120829, "grad_norm": 1.6304078102111816, "learning_rate": 7.782136583298155e-08, "loss": 0.2289, "step": 1063300 }, { "epoch": 14.651015403268028, "grad_norm": 2.103257417678833, "learning_rate": 7.72199460073314e-08, "loss": 0.1702, "step": 1063400 }, { "epoch": 14.652393155327767, "grad_norm": 1.096548318862915, "learning_rate": 7.662085588713896e-08, "loss": 0.2147, "step": 1063500 }, { "epoch": 14.653770907387507, "grad_norm": 0.15656167268753052, "learning_rate": 7.602409552330269e-08, "loss": 0.1845, "step": 1063600 }, { "epoch": 14.655148659447246, "grad_norm": 1.7596184015274048, "learning_rate": 7.542966496651954e-08, "loss": 0.2288, "step": 1063700 }, { "epoch": 14.656526411506984, "grad_norm": 1.2369574308395386, "learning_rate": 7.4837564267288e-08, "loss": 0.19, "step": 1063800 }, { "epoch": 14.657904163566725, "grad_norm": 1.3500478267669678, "learning_rate": 7.42477934759142e-08, "loss": 0.2014, "step": 
1063900 }, { "epoch": 14.659281915626464, "grad_norm": 1.9070484638214111, "learning_rate": 7.36603526424967e-08, "loss": 0.2189, "step": 1064000 }, { "epoch": 14.660659667686204, "grad_norm": 2.2372629642486572, "learning_rate": 7.30752418169478e-08, "loss": 0.2175, "step": 1064100 }, { "epoch": 14.662037419745943, "grad_norm": 2.9730851650238037, "learning_rate": 7.249246104896912e-08, "loss": 0.1969, "step": 1064200 }, { "epoch": 14.663415171805681, "grad_norm": 9.116437911987305, "learning_rate": 7.191201038807604e-08, "loss": 0.1958, "step": 1064300 }, { "epoch": 14.664792923865422, "grad_norm": 0.8251709938049316, "learning_rate": 7.133388988357942e-08, "loss": 0.2244, "step": 1064400 }, { "epoch": 14.66617067592516, "grad_norm": 0.7052600979804993, "learning_rate": 7.075809958459161e-08, "loss": 0.1703, "step": 1064500 }, { "epoch": 14.667548427984899, "grad_norm": 2.356926679611206, "learning_rate": 7.018463954002963e-08, "loss": 0.1958, "step": 1064600 }, { "epoch": 14.66892618004464, "grad_norm": 2.4160759449005127, "learning_rate": 6.961350979861503e-08, "loss": 0.2274, "step": 1064700 }, { "epoch": 14.670303932104378, "grad_norm": 3.823357343673706, "learning_rate": 6.904471040886484e-08, "loss": 0.1798, "step": 1064800 }, { "epoch": 14.671681684164117, "grad_norm": 7.0631303787231445, "learning_rate": 6.847824141910069e-08, "loss": 0.2236, "step": 1064900 }, { "epoch": 14.673059436223857, "grad_norm": 0.5727594494819641, "learning_rate": 6.791410287744881e-08, "loss": 0.2236, "step": 1065000 }, { "epoch": 14.674437188283596, "grad_norm": 1.1596064567565918, "learning_rate": 6.735229483183697e-08, "loss": 0.1897, "step": 1065100 }, { "epoch": 14.675814940343336, "grad_norm": 3.9859416484832764, "learning_rate": 6.679281732999143e-08, "loss": 0.2029, "step": 1065200 }, { "epoch": 14.677192692403075, "grad_norm": 0.9898383021354675, "learning_rate": 6.624123035196606e-08, "loss": 0.2259, "step": 1065300 }, { "epoch": 14.678570444462814, "grad_norm": 
1.6187522411346436, "learning_rate": 6.568639077342897e-08, "loss": 0.2377, "step": 1065400 }, { "epoch": 14.679948196522554, "grad_norm": 1.5964347124099731, "learning_rate": 6.513388188018526e-08, "loss": 0.2121, "step": 1065500 }, { "epoch": 14.681325948582293, "grad_norm": 0.2603021264076233, "learning_rate": 6.45837037191735e-08, "loss": 0.1998, "step": 1065600 }, { "epoch": 14.682703700642033, "grad_norm": 1.0766738653182983, "learning_rate": 6.403585633713682e-08, "loss": 0.2192, "step": 1065700 }, { "epoch": 14.684081452701772, "grad_norm": 1.3574103116989136, "learning_rate": 6.349033978061386e-08, "loss": 0.2341, "step": 1065800 }, { "epoch": 14.68545920476151, "grad_norm": 2.3115594387054443, "learning_rate": 6.294715409595086e-08, "loss": 0.204, "step": 1065900 }, { "epoch": 14.686836956821251, "grad_norm": 1.500071406364441, "learning_rate": 6.240629932929259e-08, "loss": 0.2029, "step": 1066000 }, { "epoch": 14.68821470888099, "grad_norm": 4.5576324462890625, "learning_rate": 6.186777552659145e-08, "loss": 0.1847, "step": 1066100 }, { "epoch": 14.689592460940728, "grad_norm": 2.3135814666748047, "learning_rate": 6.133693312287581e-08, "loss": 0.1796, "step": 1066200 }, { "epoch": 14.690970213000469, "grad_norm": 1.7553790807724, "learning_rate": 6.080304807436043e-08, "loss": 0.1912, "step": 1066300 }, { "epoch": 14.692347965060208, "grad_norm": 1.0685458183288574, "learning_rate": 6.027149412600158e-08, "loss": 0.1607, "step": 1066400 }, { "epoch": 14.693725717119946, "grad_norm": 0.9714058041572571, "learning_rate": 5.97422713229609e-08, "loss": 0.2066, "step": 1066500 }, { "epoch": 14.695103469179687, "grad_norm": 2.1571431159973145, "learning_rate": 5.921537971019852e-08, "loss": 0.1889, "step": 1066600 }, { "epoch": 14.696481221239425, "grad_norm": 0.6031253337860107, "learning_rate": 5.869081933247311e-08, "loss": 0.2272, "step": 1066700 }, { "epoch": 14.697858973299166, "grad_norm": 3.8849663734436035, "learning_rate": 5.8168590234354003e-08, 
"loss": 0.2067, "step": 1066800 }, { "epoch": 14.699236725358904, "grad_norm": 2.919374465942383, "learning_rate": 5.7648692460202936e-08, "loss": 0.2238, "step": 1066900 }, { "epoch": 14.700614477418643, "grad_norm": 3.165018320083618, "learning_rate": 5.71311260541893e-08, "loss": 0.1995, "step": 1067000 }, { "epoch": 14.701992229478384, "grad_norm": 3.9404499530792236, "learning_rate": 5.6615891060284034e-08, "loss": 0.2541, "step": 1067100 }, { "epoch": 14.703369981538122, "grad_norm": 9.033873558044434, "learning_rate": 5.610298752225962e-08, "loss": 0.2389, "step": 1067200 }, { "epoch": 14.704747733597861, "grad_norm": 4.983729839324951, "learning_rate": 5.559241548368704e-08, "loss": 0.205, "step": 1067300 }, { "epoch": 14.706125485657601, "grad_norm": 1.9892619848251343, "learning_rate": 5.508417498794493e-08, "loss": 0.1708, "step": 1067400 }, { "epoch": 14.70750323771734, "grad_norm": 2.351402759552002, "learning_rate": 5.457826607820737e-08, "loss": 0.222, "step": 1067500 }, { "epoch": 14.70888098977708, "grad_norm": 4.581531524658203, "learning_rate": 5.407468879745608e-08, "loss": 0.2354, "step": 1067600 }, { "epoch": 14.71025874183682, "grad_norm": 1.7210137844085693, "learning_rate": 5.3573443188474345e-08, "loss": 0.2035, "step": 1067700 }, { "epoch": 14.711636493896558, "grad_norm": 1.5514737367630005, "learning_rate": 5.307452929384393e-08, "loss": 0.2351, "step": 1067800 }, { "epoch": 14.713014245956298, "grad_norm": 0.25403597950935364, "learning_rate": 5.257794715594816e-08, "loss": 0.2048, "step": 1067900 }, { "epoch": 14.714391998016037, "grad_norm": 2.524069309234619, "learning_rate": 5.2083696816974945e-08, "loss": 0.2079, "step": 1068000 }, { "epoch": 14.715769750075776, "grad_norm": 3.748978853225708, "learning_rate": 5.1591778318913766e-08, "loss": 0.2187, "step": 1068100 }, { "epoch": 14.717147502135516, "grad_norm": 3.0733766555786133, "learning_rate": 5.1102191703558676e-08, "loss": 0.2276, "step": 1068200 }, { "epoch": 
14.718525254195255, "grad_norm": 0.0589502677321434, "learning_rate": 5.06149370124992e-08, "loss": 0.1545, "step": 1068300 }, { "epoch": 14.719903006254995, "grad_norm": 0.9088813066482544, "learning_rate": 5.0130014287129436e-08, "loss": 0.2433, "step": 1068400 }, { "epoch": 14.721280758314734, "grad_norm": 1.7251689434051514, "learning_rate": 4.964742356865115e-08, "loss": 0.2171, "step": 1068500 }, { "epoch": 14.722658510374472, "grad_norm": 0.9921254515647888, "learning_rate": 4.916716489805545e-08, "loss": 0.1837, "step": 1068600 }, { "epoch": 14.724036262434213, "grad_norm": 1.2840731143951416, "learning_rate": 4.868923831614719e-08, "loss": 0.2241, "step": 1068700 }, { "epoch": 14.725414014493952, "grad_norm": 1.6657763719558716, "learning_rate": 4.821364386352972e-08, "loss": 0.2346, "step": 1068800 }, { "epoch": 14.72679176655369, "grad_norm": 1.3147339820861816, "learning_rate": 4.7740381580604896e-08, "loss": 0.2284, "step": 1068900 }, { "epoch": 14.72816951861343, "grad_norm": 24.405963897705078, "learning_rate": 4.7269451507579166e-08, "loss": 0.2302, "step": 1069000 }, { "epoch": 14.72954727067317, "grad_norm": 0.21909376978874207, "learning_rate": 4.6800853684460523e-08, "loss": 0.1833, "step": 1069100 }, { "epoch": 14.730925022732908, "grad_norm": 1.447287678718567, "learning_rate": 4.6339239261428966e-08, "loss": 0.2394, "step": 1069200 }, { "epoch": 14.732302774792648, "grad_norm": 0.07940816879272461, "learning_rate": 4.587528273386793e-08, "loss": 0.1938, "step": 1069300 }, { "epoch": 14.733680526852387, "grad_norm": 0.9355390071868896, "learning_rate": 4.5413658574656456e-08, "loss": 0.2453, "step": 1069400 }, { "epoch": 14.735058278912128, "grad_norm": 4.140065670013428, "learning_rate": 4.495436682301179e-08, "loss": 0.213, "step": 1069500 }, { "epoch": 14.736436030971866, "grad_norm": 2.872631311416626, "learning_rate": 4.4497407517952725e-08, "loss": 0.2331, "step": 1069600 }, { "epoch": 14.737813783031605, "grad_norm": 2.5901694297790527, 
"learning_rate": 4.404278069829654e-08, "loss": 0.2099, "step": 1069700 }, { "epoch": 14.739191535091345, "grad_norm": 1.375916600227356, "learning_rate": 4.3590486402674266e-08, "loss": 0.2176, "step": 1069800 }, { "epoch": 14.740569287151084, "grad_norm": 0.6927363276481628, "learning_rate": 4.314052466950325e-08, "loss": 0.1963, "step": 1069900 }, { "epoch": 14.741947039210824, "grad_norm": 1.8821014165878296, "learning_rate": 4.2692895537014554e-08, "loss": 0.1848, "step": 1070000 }, { "epoch": 14.743324791270563, "grad_norm": 2.1611859798431396, "learning_rate": 4.224759904323472e-08, "loss": 0.2128, "step": 1070100 }, { "epoch": 14.744702543330302, "grad_norm": 1.9220291376113892, "learning_rate": 4.180463522599487e-08, "loss": 0.1986, "step": 1070200 }, { "epoch": 14.746080295390042, "grad_norm": 1.5371798276901245, "learning_rate": 4.136400412292768e-08, "loss": 0.1789, "step": 1070300 }, { "epoch": 14.747458047449781, "grad_norm": 0.46057751774787903, "learning_rate": 4.093007720773368e-08, "loss": 0.1826, "step": 1070400 }, { "epoch": 14.74883579950952, "grad_norm": 1.2110824584960938, "learning_rate": 4.0494088317040435e-08, "loss": 0.2037, "step": 1070500 }, { "epoch": 14.75021355156926, "grad_norm": 3.3303277492523193, "learning_rate": 4.006043225185469e-08, "loss": 0.2121, "step": 1070600 }, { "epoch": 14.751591303628999, "grad_norm": 1.4010446071624756, "learning_rate": 3.962910904902139e-08, "loss": 0.1754, "step": 1070700 }, { "epoch": 14.752969055688737, "grad_norm": 1.2539904117584229, "learning_rate": 3.920011874517793e-08, "loss": 0.1949, "step": 1070800 }, { "epoch": 14.754346807748478, "grad_norm": 1.4284390211105347, "learning_rate": 3.87734613767754e-08, "loss": 0.1975, "step": 1070900 }, { "epoch": 14.755724559808217, "grad_norm": 1.6033798456192017, "learning_rate": 3.834913698005732e-08, "loss": 0.2314, "step": 1071000 }, { "epoch": 14.757102311867957, "grad_norm": 0.05855090916156769, "learning_rate": 3.79271455910718e-08, "loss": 
0.1851, "step": 1071100 }, { "epoch": 14.758480063927696, "grad_norm": 1.4979583024978638, "learning_rate": 3.750748724567154e-08, "loss": 0.179, "step": 1071200 }, { "epoch": 14.759857815987434, "grad_norm": 0.3084957003593445, "learning_rate": 3.709016197950777e-08, "loss": 0.2041, "step": 1071300 }, { "epoch": 14.761235568047175, "grad_norm": 3.2187559604644775, "learning_rate": 3.6675169828033204e-08, "loss": 0.1783, "step": 1071400 }, { "epoch": 14.762613320106913, "grad_norm": 0.8990810513496399, "learning_rate": 3.6262510826505224e-08, "loss": 0.2162, "step": 1071500 }, { "epoch": 14.763991072166652, "grad_norm": 0.5769587159156799, "learning_rate": 3.585218500997967e-08, "loss": 0.1756, "step": 1071600 }, { "epoch": 14.765368824226393, "grad_norm": 2.4321980476379395, "learning_rate": 3.544419241331698e-08, "loss": 0.1889, "step": 1071700 }, { "epoch": 14.766746576286131, "grad_norm": 2.0992283821105957, "learning_rate": 3.5038533071176106e-08, "loss": 0.1913, "step": 1071800 }, { "epoch": 14.768124328345872, "grad_norm": 3.6120078563690186, "learning_rate": 3.463520701802364e-08, "loss": 0.2296, "step": 1071900 }, { "epoch": 14.76950208040561, "grad_norm": 2.59912109375, "learning_rate": 3.423421428812162e-08, "loss": 0.2362, "step": 1072000 }, { "epoch": 14.770879832465349, "grad_norm": 2.28425669670105, "learning_rate": 3.3835554915536693e-08, "loss": 0.2052, "step": 1072100 }, { "epoch": 14.77225758452509, "grad_norm": 7.000552177429199, "learning_rate": 3.343922893414009e-08, "loss": 0.2059, "step": 1072200 }, { "epoch": 14.773635336584828, "grad_norm": 11.770084381103516, "learning_rate": 3.3045236377595434e-08, "loss": 0.2147, "step": 1072300 }, { "epoch": 14.775013088644567, "grad_norm": 1.35153329372406, "learning_rate": 3.265357727938012e-08, "loss": 0.2329, "step": 1072400 }, { "epoch": 14.776390840704307, "grad_norm": 1.5613043308258057, "learning_rate": 3.226425167276392e-08, "loss": 0.2304, "step": 1072500 }, { "epoch": 14.777768592764046, 
"grad_norm": 2.1376333236694336, "learning_rate": 3.187725959082427e-08, "loss": 0.2047, "step": 1072600 }, { "epoch": 14.779146344823786, "grad_norm": 3.5426976680755615, "learning_rate": 3.149260106643709e-08, "loss": 0.2647, "step": 1072700 }, { "epoch": 14.780524096883525, "grad_norm": 2.5625240802764893, "learning_rate": 3.11102761322829e-08, "loss": 0.199, "step": 1072800 }, { "epoch": 14.781901848943264, "grad_norm": 2.60762095451355, "learning_rate": 3.0730284820837675e-08, "loss": 0.2011, "step": 1072900 }, { "epoch": 14.783279601003004, "grad_norm": 2.644148349761963, "learning_rate": 3.035262716438808e-08, "loss": 0.2481, "step": 1073000 }, { "epoch": 14.784657353062743, "grad_norm": 7.524131774902344, "learning_rate": 2.997730319501624e-08, "loss": 0.2187, "step": 1073100 }, { "epoch": 14.786035105122481, "grad_norm": 2.492258310317993, "learning_rate": 2.9604312944608858e-08, "loss": 0.1927, "step": 1073200 }, { "epoch": 14.787412857182222, "grad_norm": 2.18353009223938, "learning_rate": 2.9233656444854208e-08, "loss": 0.1852, "step": 1073300 }, { "epoch": 14.78879060924196, "grad_norm": 0.9161109328269958, "learning_rate": 2.886533372723904e-08, "loss": 0.2131, "step": 1073400 }, { "epoch": 14.7901683613017, "grad_norm": 2.2895407676696777, "learning_rate": 2.8499344823051664e-08, "loss": 0.2557, "step": 1073500 }, { "epoch": 14.79154611336144, "grad_norm": 0.7073013186454773, "learning_rate": 2.8135689763394146e-08, "loss": 0.2226, "step": 1073600 }, { "epoch": 14.792923865421178, "grad_norm": 1.5413271188735962, "learning_rate": 2.7774368579148725e-08, "loss": 0.2377, "step": 1073700 }, { "epoch": 14.794301617480919, "grad_norm": 0.024904552847146988, "learning_rate": 2.741538130102056e-08, "loss": 0.2036, "step": 1073800 }, { "epoch": 14.795679369540657, "grad_norm": 4.637572765350342, "learning_rate": 2.7058727959507203e-08, "loss": 0.188, "step": 1073900 }, { "epoch": 14.797057121600396, "grad_norm": 0.07714895904064178, "learning_rate": 
2.670440858490164e-08, "loss": 0.2196, "step": 1074000 }, { "epoch": 14.798434873660137, "grad_norm": 2.255117416381836, "learning_rate": 2.6352423207310628e-08, "loss": 0.2038, "step": 1074100 }, { "epoch": 14.799812625719875, "grad_norm": 7.185609817504883, "learning_rate": 2.6002771856633302e-08, "loss": 0.2429, "step": 1074200 }, { "epoch": 14.801190377779616, "grad_norm": 3.932537794113159, "learning_rate": 2.5655454562579507e-08, "loss": 0.2382, "step": 1074300 }, { "epoch": 14.802568129839354, "grad_norm": 5.433434009552002, "learning_rate": 2.5310471354651478e-08, "loss": 0.2214, "step": 1074400 }, { "epoch": 14.803945881899093, "grad_norm": 1.2009291648864746, "learning_rate": 2.49678222621591e-08, "loss": 0.2264, "step": 1074500 }, { "epoch": 14.805323633958833, "grad_norm": 2.134117364883423, "learning_rate": 2.4627507314210762e-08, "loss": 0.2095, "step": 1074600 }, { "epoch": 14.806701386018572, "grad_norm": 0.10432631522417068, "learning_rate": 2.4289526539719444e-08, "loss": 0.2169, "step": 1074700 }, { "epoch": 14.80807913807831, "grad_norm": 0.342690110206604, "learning_rate": 2.395387996739662e-08, "loss": 0.2105, "step": 1074800 }, { "epoch": 14.809456890138051, "grad_norm": 1.82905113697052, "learning_rate": 2.3620567625758373e-08, "loss": 0.1896, "step": 1074900 }, { "epoch": 14.81083464219779, "grad_norm": 3.019597053527832, "learning_rate": 2.3289589543122324e-08, "loss": 0.1681, "step": 1075000 }, { "epoch": 14.812212394257529, "grad_norm": 2.151141405105591, "learning_rate": 2.2960945747604602e-08, "loss": 0.1567, "step": 1075100 }, { "epoch": 14.813590146317269, "grad_norm": 5.149314880371094, "learning_rate": 2.263463626712592e-08, "loss": 0.2242, "step": 1075200 }, { "epoch": 14.814967898377008, "grad_norm": 1.173034906387329, "learning_rate": 2.231066112940855e-08, "loss": 0.2111, "step": 1075300 }, { "epoch": 14.816345650436748, "grad_norm": 6.555375576019287, "learning_rate": 2.1989020361973256e-08, "loss": 0.2075, "step": 1075400 }, 
{ "epoch": 14.817723402496487, "grad_norm": 1.99422287940979, "learning_rate": 2.1669713992151508e-08, "loss": 0.2053, "step": 1075500 }, { "epoch": 14.819101154556225, "grad_norm": 3.53043270111084, "learning_rate": 2.1352742047061058e-08, "loss": 0.1721, "step": 1075600 }, { "epoch": 14.820478906615966, "grad_norm": 0.13173238933086395, "learning_rate": 2.1038104553639524e-08, "loss": 0.1758, "step": 1075700 }, { "epoch": 14.821856658675705, "grad_norm": 1.1103326082229614, "learning_rate": 2.072580153860776e-08, "loss": 0.2443, "step": 1075800 }, { "epoch": 14.823234410735443, "grad_norm": 1.9461760520935059, "learning_rate": 2.041583302850647e-08, "loss": 0.2006, "step": 1075900 }, { "epoch": 14.824612162795184, "grad_norm": 2.5116820335388184, "learning_rate": 2.0108199049662658e-08, "loss": 0.2304, "step": 1076000 }, { "epoch": 14.825989914854922, "grad_norm": 1.13321852684021, "learning_rate": 1.9802899628214022e-08, "loss": 0.2242, "step": 1076100 }, { "epoch": 14.827367666914663, "grad_norm": 3.4111907482147217, "learning_rate": 1.9499934790096764e-08, "loss": 0.2218, "step": 1076200 }, { "epoch": 14.828745418974401, "grad_norm": 4.788069248199463, "learning_rate": 1.9199304561048624e-08, "loss": 0.2265, "step": 1076300 }, { "epoch": 14.83012317103414, "grad_norm": 1.3524816036224365, "learning_rate": 1.8901008966611954e-08, "loss": 0.2136, "step": 1076400 }, { "epoch": 14.83150092309388, "grad_norm": 1.0137866735458374, "learning_rate": 1.8605048032124537e-08, "loss": 0.2123, "step": 1076500 }, { "epoch": 14.83287867515362, "grad_norm": 1.0612092018127441, "learning_rate": 1.831142178273487e-08, "loss": 0.2048, "step": 1076600 }, { "epoch": 14.834256427213358, "grad_norm": 2.682030200958252, "learning_rate": 1.8020130243383837e-08, "loss": 0.2071, "step": 1076700 }, { "epoch": 14.835634179273098, "grad_norm": 0.1536763608455658, "learning_rate": 1.773117343881997e-08, "loss": 0.1803, "step": 1076800 }, { "epoch": 14.837011931332837, "grad_norm": 
2.6570160388946533, "learning_rate": 1.74445513935903e-08, "loss": 0.1824, "step": 1076900 }, { "epoch": 14.838389683392577, "grad_norm": 0.8415933847427368, "learning_rate": 1.7160264132046465e-08, "loss": 0.2261, "step": 1077000 }, { "epoch": 14.839767435452316, "grad_norm": 1.7778992652893066, "learning_rate": 1.6878311678338587e-08, "loss": 0.1959, "step": 1077100 }, { "epoch": 14.841145187512055, "grad_norm": 3.090742349624634, "learning_rate": 1.6598694056421394e-08, "loss": 0.2064, "step": 1077200 }, { "epoch": 14.842522939571795, "grad_norm": 1.2722197771072388, "learning_rate": 1.632141129005116e-08, "loss": 0.2408, "step": 1077300 }, { "epoch": 14.843900691631534, "grad_norm": 7.700870037078857, "learning_rate": 1.6046463402779597e-08, "loss": 0.2148, "step": 1077400 }, { "epoch": 14.845278443691273, "grad_norm": 1.0658072233200073, "learning_rate": 1.577385041797219e-08, "loss": 0.1818, "step": 1077500 }, { "epoch": 14.846656195751013, "grad_norm": 7.456512451171875, "learning_rate": 1.5506263581414047e-08, "loss": 0.214, "step": 1077600 }, { "epoch": 14.848033947810752, "grad_norm": 9.465802192687988, "learning_rate": 1.5238297121207003e-08, "loss": 0.2303, "step": 1077700 }, { "epoch": 14.84941169987049, "grad_norm": 2.7150468826293945, "learning_rate": 1.4972665632116413e-08, "loss": 0.2153, "step": 1077800 }, { "epoch": 14.85078945193023, "grad_norm": 1.6926552057266235, "learning_rate": 1.4709369136710892e-08, "loss": 0.2293, "step": 1077900 }, { "epoch": 14.85216720398997, "grad_norm": 0.7267380952835083, "learning_rate": 1.4448407657357555e-08, "loss": 0.1856, "step": 1078000 }, { "epoch": 14.85354495604971, "grad_norm": 2.963726282119751, "learning_rate": 1.4189781216228104e-08, "loss": 0.2041, "step": 1078100 }, { "epoch": 14.854922708109449, "grad_norm": 2.1079416275024414, "learning_rate": 1.3933489835292745e-08, "loss": 0.195, "step": 1078200 }, { "epoch": 14.856300460169187, "grad_norm": 3.451512336730957, "learning_rate": 
1.3679533536326283e-08, "loss": 0.1698, "step": 1078300 }, { "epoch": 14.857678212228928, "grad_norm": 3.796050786972046, "learning_rate": 1.3427912340902016e-08, "loss": 0.2202, "step": 1078400 }, { "epoch": 14.859055964288666, "grad_norm": 2.5348920822143555, "learning_rate": 1.3178626270394789e-08, "loss": 0.1946, "step": 1078500 }, { "epoch": 14.860433716348407, "grad_norm": 3.499793767929077, "learning_rate": 1.2931675345987105e-08, "loss": 0.2607, "step": 1078600 }, { "epoch": 14.861811468408145, "grad_norm": 4.89334774017334, "learning_rate": 1.2687059588653849e-08, "loss": 0.195, "step": 1078700 }, { "epoch": 14.863189220467884, "grad_norm": 1.0662612915039062, "learning_rate": 1.2444779019183672e-08, "loss": 0.2004, "step": 1078800 }, { "epoch": 14.864566972527625, "grad_norm": 1.6833910942077637, "learning_rate": 1.2204833658151504e-08, "loss": 0.2387, "step": 1078900 }, { "epoch": 14.865944724587363, "grad_norm": 0.4727188050746918, "learning_rate": 1.1967223525946036e-08, "loss": 0.1838, "step": 1079000 }, { "epoch": 14.867322476647102, "grad_norm": 3.2705109119415283, "learning_rate": 1.17319486427514e-08, "loss": 0.2367, "step": 1079100 }, { "epoch": 14.868700228706842, "grad_norm": 1.2091560363769531, "learning_rate": 1.1499009028559382e-08, "loss": 0.2003, "step": 1079200 }, { "epoch": 14.870077980766581, "grad_norm": 4.0432353019714355, "learning_rate": 1.1268404703154156e-08, "loss": 0.2124, "step": 1079300 }, { "epoch": 14.87145573282632, "grad_norm": 3.511845827102661, "learning_rate": 1.1040135686130603e-08, "loss": 0.2148, "step": 1079400 }, { "epoch": 14.87283348488606, "grad_norm": 2.3320748805999756, "learning_rate": 1.08142019968821e-08, "loss": 0.2155, "step": 1079500 }, { "epoch": 14.874211236945799, "grad_norm": 3.4457144737243652, "learning_rate": 1.0590603654597464e-08, "loss": 0.1825, "step": 1079600 }, { "epoch": 14.87558898900554, "grad_norm": 2.767977714538574, "learning_rate": 1.036934067827927e-08, "loss": 0.19, "step": 1079700 
}, { "epoch": 14.876966741065278, "grad_norm": 7.836250305175781, "learning_rate": 1.0150413086719434e-08, "loss": 0.1909, "step": 1079800 }, { "epoch": 14.878344493125017, "grad_norm": 2.2359414100646973, "learning_rate": 9.933820898520573e-09, "loss": 0.2338, "step": 1079900 }, { "epoch": 14.879722245184757, "grad_norm": 0.8423241376876831, "learning_rate": 9.719564132077696e-09, "loss": 0.2579, "step": 1080000 }, { "epoch": 14.881099997244496, "grad_norm": 5.39343786239624, "learning_rate": 9.507642805602623e-09, "loss": 0.165, "step": 1080100 }, { "epoch": 14.882477749304234, "grad_norm": 2.855870246887207, "learning_rate": 9.298056937090405e-09, "loss": 0.1553, "step": 1080200 }, { "epoch": 14.883855501363975, "grad_norm": 0.983640193939209, "learning_rate": 9.09080654434985e-09, "loss": 0.1924, "step": 1080300 }, { "epoch": 14.885233253423714, "grad_norm": 3.0346145629882812, "learning_rate": 8.885891644988265e-09, "loss": 0.1918, "step": 1080400 }, { "epoch": 14.886611005483454, "grad_norm": 1.5431973934173584, "learning_rate": 8.6833122564145e-09, "loss": 0.199, "step": 1080500 }, { "epoch": 14.887988757543193, "grad_norm": 0.37619274854660034, "learning_rate": 8.483068395838955e-09, "loss": 0.2262, "step": 1080600 }, { "epoch": 14.889366509602931, "grad_norm": 1.83773672580719, "learning_rate": 8.287127602424338e-09, "loss": 0.2508, "step": 1080700 }, { "epoch": 14.890744261662672, "grad_norm": 3.050055503845215, "learning_rate": 8.091531492976368e-09, "loss": 0.194, "step": 1080800 }, { "epoch": 14.89212201372241, "grad_norm": 3.382828950881958, "learning_rate": 7.898270961802235e-09, "loss": 0.1849, "step": 1080900 }, { "epoch": 14.893499765782149, "grad_norm": 1.7832273244857788, "learning_rate": 7.70734602531853e-09, "loss": 0.2015, "step": 1081000 }, { "epoch": 14.89487751784189, "grad_norm": 2.680617570877075, "learning_rate": 7.518756699746442e-09, "loss": 0.2336, "step": 1081100 }, { "epoch": 14.896255269901628, "grad_norm": 1.1461893320083618, 
"learning_rate": 7.3325030011056615e-09, "loss": 0.2221, "step": 1081200 }, { "epoch": 14.897633021961369, "grad_norm": 0.8372901082038879, "learning_rate": 7.148584945220471e-09, "loss": 0.2151, "step": 1081300 }, { "epoch": 14.899010774021107, "grad_norm": 6.631724834442139, "learning_rate": 6.968806810128403e-09, "loss": 0.2686, "step": 1081400 }, { "epoch": 14.900388526080846, "grad_norm": 2.7890655994415283, "learning_rate": 6.789536729615853e-09, "loss": 0.2132, "step": 1081500 }, { "epoch": 14.901766278140586, "grad_norm": 1.8802226781845093, "learning_rate": 6.61260233798533e-09, "loss": 0.1949, "step": 1081600 }, { "epoch": 14.903144030200325, "grad_norm": 4.693185806274414, "learning_rate": 6.438003650273416e-09, "loss": 0.2887, "step": 1081700 }, { "epoch": 14.904521782260064, "grad_norm": 3.1848788261413574, "learning_rate": 6.265740681306031e-09, "loss": 0.2087, "step": 1081800 }, { "epoch": 14.905899534319804, "grad_norm": 4.509172439575195, "learning_rate": 6.0958134457198e-09, "loss": 0.2162, "step": 1081900 }, { "epoch": 14.907277286379543, "grad_norm": 2.687525987625122, "learning_rate": 5.9282219579528956e-09, "loss": 0.2687, "step": 1082000 }, { "epoch": 14.908655038439282, "grad_norm": 1.9521030187606812, "learning_rate": 5.762966232245037e-09, "loss": 0.1885, "step": 1082100 }, { "epoch": 14.910032790499022, "grad_norm": 3.8227665424346924, "learning_rate": 5.600046282628335e-09, "loss": 0.215, "step": 1082200 }, { "epoch": 14.91141054255876, "grad_norm": 1.545579433441162, "learning_rate": 5.439462122951711e-09, "loss": 0.1997, "step": 1082300 }, { "epoch": 14.912788294618501, "grad_norm": 1.51990807056427, "learning_rate": 5.281213766853421e-09, "loss": 0.2156, "step": 1082400 }, { "epoch": 14.91416604667824, "grad_norm": 0.6883875131607056, "learning_rate": 5.125301227776324e-09, "loss": 0.1959, "step": 1082500 }, { "epoch": 14.915543798737978, "grad_norm": 0.03276563435792923, "learning_rate": 4.97172451896788e-09, "loss": 0.2152, "step": 
1082600 }, { "epoch": 14.916921550797719, "grad_norm": 3.164340019226074, "learning_rate": 4.820483653477092e-09, "loss": 0.2196, "step": 1082700 }, { "epoch": 14.918299302857458, "grad_norm": 0.6809417605400085, "learning_rate": 4.671578644148411e-09, "loss": 0.2363, "step": 1082800 }, { "epoch": 14.919677054917198, "grad_norm": 1.171447515487671, "learning_rate": 4.52500950363699e-09, "loss": 0.246, "step": 1082900 }, { "epoch": 14.921054806976937, "grad_norm": 0.0383504256606102, "learning_rate": 4.380776244390372e-09, "loss": 0.1976, "step": 1083000 }, { "epoch": 14.922432559036675, "grad_norm": 2.2809674739837646, "learning_rate": 4.238878878660702e-09, "loss": 0.187, "step": 1083100 }, { "epoch": 14.923810311096416, "grad_norm": 4.4233598709106445, "learning_rate": 4.099317418507775e-09, "loss": 0.1974, "step": 1083200 }, { "epoch": 14.925188063156154, "grad_norm": 1.016135334968567, "learning_rate": 3.962091875786833e-09, "loss": 0.2372, "step": 1083300 }, { "epoch": 14.926565815215893, "grad_norm": 2.259232759475708, "learning_rate": 3.827202262154661e-09, "loss": 0.1914, "step": 1083400 }, { "epoch": 14.927943567275634, "grad_norm": 0.9471726417541504, "learning_rate": 3.6946485890726466e-09, "loss": 0.2094, "step": 1083500 }, { "epoch": 14.929321319335372, "grad_norm": 0.5536746382713318, "learning_rate": 3.564430867797619e-09, "loss": 0.1755, "step": 1083600 }, { "epoch": 14.93069907139511, "grad_norm": 7.622779369354248, "learning_rate": 3.4365491093971136e-09, "loss": 0.2402, "step": 1083700 }, { "epoch": 14.932076823454851, "grad_norm": 2.0714480876922607, "learning_rate": 3.3110033247310547e-09, "loss": 0.2251, "step": 1083800 }, { "epoch": 14.93345457551459, "grad_norm": 5.540711879730225, "learning_rate": 3.187793524470073e-09, "loss": 0.2017, "step": 1083900 }, { "epoch": 14.93483232757433, "grad_norm": 2.172900915145874, "learning_rate": 3.066919719077188e-09, "loss": 0.2436, "step": 1084000 }, { "epoch": 14.936210079634069, "grad_norm": 
2.606396436691284, "learning_rate": 2.9483819188230733e-09, "loss": 0.1817, "step": 1084100 }, { "epoch": 14.937587831693808, "grad_norm": 1.8836302757263184, "learning_rate": 2.832180133776896e-09, "loss": 0.2187, "step": 1084200 }, { "epoch": 14.938965583753548, "grad_norm": 1.4190055131912231, "learning_rate": 2.718314373812425e-09, "loss": 0.2018, "step": 1084300 }, { "epoch": 14.940343335813287, "grad_norm": 0.7984558939933777, "learning_rate": 2.606784648601923e-09, "loss": 0.2236, "step": 1084400 }, { "epoch": 14.941721087873026, "grad_norm": 0.8938072323799133, "learning_rate": 2.497590967619201e-09, "loss": 0.2266, "step": 1084500 }, { "epoch": 14.943098839932766, "grad_norm": 0.49862140417099, "learning_rate": 2.3907333401457232e-09, "loss": 0.2348, "step": 1084600 }, { "epoch": 14.944476591992505, "grad_norm": 2.713656187057495, "learning_rate": 2.2862117752553426e-09, "loss": 0.1814, "step": 1084700 }, { "epoch": 14.945854344052245, "grad_norm": 2.6186256408691406, "learning_rate": 2.1840262818265123e-09, "loss": 0.1899, "step": 1084800 }, { "epoch": 14.947232096111984, "grad_norm": 2.87308931350708, "learning_rate": 2.08417686854534e-09, "loss": 0.1576, "step": 1084900 }, { "epoch": 14.948609848171722, "grad_norm": 2.1665875911712646, "learning_rate": 1.9866635438903213e-09, "loss": 0.2265, "step": 1085000 }, { "epoch": 14.949987600231463, "grad_norm": 0.17462141811847687, "learning_rate": 1.891486316147606e-09, "loss": 0.1973, "step": 1085100 }, { "epoch": 14.951365352291202, "grad_norm": 4.120203495025635, "learning_rate": 1.7986451934018378e-09, "loss": 0.1834, "step": 1085200 }, { "epoch": 14.95274310435094, "grad_norm": 1.6549655199050903, "learning_rate": 1.7081401835422616e-09, "loss": 0.2599, "step": 1085300 }, { "epoch": 14.95412085641068, "grad_norm": 2.00540828704834, "learning_rate": 1.6199712942535638e-09, "loss": 0.2118, "step": 1085400 }, { "epoch": 14.95549860847042, "grad_norm": 4.009399890899658, "learning_rate": 
1.5349852967871948e-09, "loss": 0.2142, "step": 1085500 }, { "epoch": 14.95687636053016, "grad_norm": 3.2442891597747803, "learning_rate": 1.4514653095307195e-09, "loss": 0.2155, "step": 1085600 }, { "epoch": 14.958254112589898, "grad_norm": 0.10502032190561295, "learning_rate": 1.3702814646565643e-09, "loss": 0.1879, "step": 1085700 }, { "epoch": 14.959631864649637, "grad_norm": 0.705955445766449, "learning_rate": 1.2914337690556056e-09, "loss": 0.1655, "step": 1085800 }, { "epoch": 14.961009616709378, "grad_norm": 2.398799419403076, "learning_rate": 1.2149222294324803e-09, "loss": 0.2199, "step": 1085900 }, { "epoch": 14.962387368769116, "grad_norm": 1.5619040727615356, "learning_rate": 1.1407468522842136e-09, "loss": 0.2179, "step": 1086000 }, { "epoch": 14.963765120828855, "grad_norm": 0.13245314359664917, "learning_rate": 1.0689076439124313e-09, "loss": 0.1967, "step": 1086100 }, { "epoch": 14.965142872888595, "grad_norm": 3.4771838188171387, "learning_rate": 9.99404610420307e-10, "loss": 0.1999, "step": 1086200 }, { "epoch": 14.966520624948334, "grad_norm": 1.209195852279663, "learning_rate": 9.32237757712562e-10, "loss": 0.1733, "step": 1086300 }, { "epoch": 14.967898377008073, "grad_norm": 1.1353882551193237, "learning_rate": 8.674070914954647e-10, "loss": 0.191, "step": 1086400 }, { "epoch": 14.969276129067813, "grad_norm": 3.6483943462371826, "learning_rate": 8.049126172768317e-10, "loss": 0.1901, "step": 1086500 }, { "epoch": 14.970653881127552, "grad_norm": 1.3389338254928589, "learning_rate": 7.447543403660274e-10, "loss": 0.2129, "step": 1086600 }, { "epoch": 14.972031633187292, "grad_norm": 2.003077983856201, "learning_rate": 6.869322658739629e-10, "loss": 0.2038, "step": 1086700 }, { "epoch": 14.973409385247031, "grad_norm": 0.5058888792991638, "learning_rate": 6.314463987130981e-10, "loss": 0.2021, "step": 1086800 }, { "epoch": 14.97478713730677, "grad_norm": 3.1928555965423584, "learning_rate": 5.782967435943865e-10, "loss": 0.1892, "step": 
1086900 }, { "epoch": 14.97616488936651, "grad_norm": 1.9641062021255493, "learning_rate": 5.274833050364358e-10, "loss": 0.1995, "step": 1087000 }, { "epoch": 14.977542641426249, "grad_norm": 0.17440913617610931, "learning_rate": 4.790060873563485e-10, "loss": 0.15, "step": 1087100 }, { "epoch": 14.97892039348599, "grad_norm": 4.006913185119629, "learning_rate": 4.3286509466972104e-10, "loss": 0.1976, "step": 1087200 }, { "epoch": 14.980298145545728, "grad_norm": 0.13254722952842712, "learning_rate": 3.890603308998042e-10, "loss": 0.2745, "step": 1087300 }, { "epoch": 14.981675897605466, "grad_norm": 2.0417821407318115, "learning_rate": 3.4759179976528997e-10, "loss": 0.1987, "step": 1087400 }, { "epoch": 14.983053649665207, "grad_norm": 1.5335716009140015, "learning_rate": 3.0845950478947117e-10, "loss": 0.2327, "step": 1087500 }, { "epoch": 14.984431401724946, "grad_norm": 0.9508352279663086, "learning_rate": 2.716634492971881e-10, "loss": 0.2147, "step": 1087600 }, { "epoch": 14.985809153784684, "grad_norm": 1.4289687871932983, "learning_rate": 2.37203636417882e-10, "loss": 0.164, "step": 1087700 }, { "epoch": 14.987186905844425, "grad_norm": 0.03259569779038429, "learning_rate": 2.0508006907338228e-10, "loss": 0.215, "step": 1087800 }, { "epoch": 14.988564657904163, "grad_norm": 5.39491081237793, "learning_rate": 1.7529274999622536e-10, "loss": 0.2425, "step": 1087900 }, { "epoch": 14.989942409963902, "grad_norm": 1.7730236053466797, "learning_rate": 1.4810462794806468e-10, "loss": 0.1963, "step": 1088000 }, { "epoch": 14.991320162023642, "grad_norm": 1.2670625448226929, "learning_rate": 1.2296645025500207e-10, "loss": 0.2181, "step": 1088100 }, { "epoch": 14.992697914083381, "grad_norm": 0.2452702820301056, "learning_rate": 1.0016452780259311e-10, "loss": 0.218, "step": 1088200 }, { "epoch": 14.994075666143122, "grad_norm": 2.5829875469207764, "learning_rate": 7.969886252956471e-11, "loss": 0.2539, "step": 1088300 }, { "epoch": 14.99545341820286, "grad_norm": 
0.038913942873477936, "learning_rate": 6.156945617619147e-11, "loss": 0.2639, "step": 1088400 }, { "epoch": 14.996831170262599, "grad_norm": 2.4882044792175293, "learning_rate": 4.577631028124252e-11, "loss": 0.2202, "step": 1088500 }, { "epoch": 14.99820892232234, "grad_norm": 0.9612501263618469, "learning_rate": 3.23194261880877e-11, "loss": 0.2061, "step": 1088600 }, { "epoch": 14.999586674382078, "grad_norm": 1.4983134269714355, "learning_rate": 2.1198805035538284e-11, "loss": 0.1861, "step": 1088700 } ], "logging_steps": 100, "max_steps": 1088730, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.189236164022104e+22, "train_batch_size": 8, "trial_name": null, "trial_params": null }